In [1]:
import numpy as np
import tensorflow
from tensorflow.keras.datasets import mnist

np.set_printoptions(threshold=np.inf)

  from ._conv import register_converters as _register_converters


### Define some functions for converting from label to one hot encoding and vice versa:

In [2]:
def labels_to_onehotvector(labels):
    unique_labels, label_counts = np.unique(labels, return_counts=True)
    onehotvector = np.zeros((len(labels), len(unique_labels)))
    for index, label in enumerate(labels):
        onehotvector[int(index), int(label)] = int(1)
    return onehotvector

In [3]:
def onehotvectors_to_labels(onehotvectors):
    labels = np.zeros(onehotvectors.shape[0])
    labels = onehotvectors.argmax(axis=1)
    return labels

### Next we define some functions for better readability in the later parts. First is the ReLU function for the hidden layer activations
ReLU(z) = max(0, z)

In [4]:
def relu(Z):
    return np.maximum(0,Z)

### ... and a function for getting the gradient of the ReLU activations at the hidden layers
d ReLU(Z)/dz = 1 if Z>0 ; 0 otherwise

In [5]:
def delta_relu(Z):
    return (Z>0).astype(int)

### Next is a function for the output layer activation: Softmax:
softmax(Z) = exp(Z)/(SUM(exp(Z))

To avoid python returning NaN due to exceeding of the max float64 value, we multiply a constant C to both the numerator and denominator. Setting the C to be the negative maximum value among all the softmax inputs, we have...

stable_softmax(Z) = exp(Z+max(Z))/(SUM(exp(Z+max(Z))

In [6]:
def softmax(Z):
    exps = np.exp(Z + np.max(Z))
    exps = exps / np.sum(exps,axis=1).reshape(-1,1)
    return exps

### We define a function for computing the categorical cross-entropy Loss of the whole network: L
L = -SUM(y*log(y_pred))

where **y** is the one-hot encoded true labels and **y_pred** is the prediction of the network (i.e. output of the softmax output activation layer).

***To save on computation time:***
Since **y** are in one-hot encoded format, we can instead compute the -log(y_pred) only at the column indices that are '1' and then proceed to computing the summation. This is much faster compared to matrix multiplication

In [7]:
def cross_entropy(y_pred, y):
    
    batch_size = y.shape[0]
    
    #log_likelihood = -np.multiply(y,np.log(y_pred)) #too slow
    
    # Get the negative log likelihoods of only the column(class) arg of y where y=1
    log_likelihood = y_pred[range(batch_size), np.argmax(y, axis=1).reshape(1,-1)]
    log_likelihood = -np.log(log_likelihood)
    
    batch_loss = np.sum(log_likelihood)/batch_size
    
    # Clip the categorical cross entropy loss at 20 (i.e. y_pred=0.000000001 when y=1) to prevent exploding gradients
    LOSS_CLIP_VALUE = 20.0
    batch_loss = min(batch_loss, LOSS_CLIP_VALUE)
                             
    #print('y', y[0:3])
    #print('y_pred', y_pred[0:3])
    #print(log_likelihood.shape,'log_likelihood', log_likelihood[0,0:3])
    #print('batch_loss:', batch_loss)
    return batch_loss

### ... and its corresponding gradient:
dL/dy_pred = y_pred - y

where once again, **y** is the one-hot encoded true labels and **y_pred** is the prediction of the network (i.e. output of the softmax output activation layer)

For computation speed, compute for "y_pred - 1" only at the index where y=1 (e.g. y.argmax()) and leave the rest of y_pred unchanged.

In [8]:
def delta_cross_entropy(y_pred, y):
    batch_size = y.shape[0]
    y_pred[y.argmax()] = y_pred[y.argmax()] - 1
    delta = y_pred/batch_size
    
    return delta

### Next we define a method for initializing the network weights and biases:
For the initialization, we use the default setting of keras **Dense** class:

Weight initialization: Glorot Uniform: (-6/sqrt(m+n) , 6/sqrt(m+n))
where m=number of inputs, n=number of outputs

Bias initialization: all zeros

In [9]:
def init_neurons(num_input, num_hidden1_neurons, num_hidden2_neurons, num_output):
    #INPUT LAYER
    x_in = np.zeros((num_input, 1))
    
    #HIDDEN LAYER 1: num_input inputs, num_hidden1_neurons outputs
    init = 6/np.sqrt(num_input+num_hidden1_neurons)
    w_h1 = np.random.uniform(low=-init, high=init, size=(num_hidden1_neurons, num_input))
    b_h1 = np.zeros([num_hidden1_neurons, 1])

    #HIDDEN LAYER 2: num_hidden1_neurons inputs, num_hidden2_neurons outputs
    init = 6/np.sqrt(num_hidden1_neurons+num_hidden2_neurons)
    w_h2 = np.random.uniform(low=-init, high=init, size=(num_hidden2_neurons, num_hidden1_neurons))
    b_h2 = np.zeros([num_hidden2_neurons, 1])

    #OUTPUT LAYER: num_hidden2_neurons inputs, num_output outputs
    init = 6/np.sqrt(num_hidden2_neurons+num_output)
    w_out = np.random.uniform(low=-init, high=init, size=(num_output, num_hidden2_neurons))
    b_out = np.random.uniform(low=-init, high=init, size=(num_output, 1))
    
    return x_in, w_h1, b_h1, w_h2, b_h2, w_out, b_out

### We also define a method for generating data and their corresponding labels for getting a batch:

In [10]:
def batch(X, Y, batch_size):
    length = X.shape[0]
    for i in np.arange(0, length, batch_size):
        yield (X[i:min(i+batch_size, length)], Y[i:min(i+batch_size, length)])

### Method for forward-pass yielding the prediction **y_pred**

NOTE: dropout value means probability that a neuron will be **included** in the network during training

In [11]:
def predict_batch(x_batch, w_h1, b_h1, w_h2, b_h2, w_out, b_out, y_batch, dropout=1.0):
    #print(x_batch.shape, y_batch.shape)
    #(128, 784) (128, 10)
    
    #print(w_h1.shape, b_h1.shape, w_h2.shape, b_h2.shape, w_out.shape, b_out.shape)
    #     (256, 784) (256, 1)     (256, 256) (256, 1)    (10, 256)     (10, 1)
    
    batch_size = y_batch.shape[0]
    ##### FORWARD PASS #####
    A_h1 = relu(np.dot(x_batch, np.transpose(w_h1)) + np.transpose(b_h1))
    #A_h1 = np.multiply(A_h1, np.random.choice([0,1], size=A_h1.shape, p=[(1-dropout), dropout]))
    #print(A_h1.shape) #(128, 256)
    A_h2 = relu(np.dot(A_h1, np.transpose(w_h2)) + np.transpose(b_h2))
    #A_h2 = np.multiply(A_h2, np.random.choice([0,1], size=A_h2.shape, p=[(1-dropout), dropout]))
    #print(A_h2.shape) #(128, 256)
    y_pred = softmax(np.dot(A_h2, np.transpose(w_out)) + np.transpose(b_out))
    #print(y_pred.shape) #(128, 10)
    
    ##### COMPUTE LOSS AND GRADIENT AT OUTPUT FOR BACKPROPAGATION USE #####
    batch_loss = cross_entropy(y_pred, y_batch)
    #print(batch_loss)
    batch_delta_out = (1/batch_size)*np.sum(delta_cross_entropy(y_pred, y_batch), axis=0) #sum along the columns (expected shape is (10,1))
    batch_delta_out = batch_delta_out.reshape(1,-1)
    #print("batch_delta_out.shape:", batch_delta_out.shape) #(128, 10)
    
    return batch_loss, y_pred, A_h1, A_h2, batch_delta_out

### Create a method for learning rate scheduling:

In [12]:
def get_LR(current_error, initial_LR):
    LR = initial_LR
    #if current_error>2.316 and current_error<2.345:
    if current_error>3.405 and current_error<3.76:
        LR = initial_LR/2
    elif current_error>3.22 and current_error<3.405:
        LR = initial_LR/3
    elif current_error>3.0 and current_error<3.22:
        LR = initial_LR/4
    elif current_error>2.85 and current_error<3.0:
        LR = initial_LR/5
    elif current_error>2.752 and current_error<2.85:
        LR = initial_LR/6
    elif current_error>0 and current_error<2.752:
        LR = initial_LR/7
    return LR

### Declare the weights and biases as globally shared parameters for ease of testing

In [13]:
w_h1, b_h1, w_h2, b_h2, w_out, b_out = np.zeros(6)

### Define a method for the training the artificial neural network: ann_fit()

In [14]:
def ann_fit(train_data, train_labels, num_input, num_hidden1_neurons, num_hidden2_neurons, num_output,
            batch_size=128, LR=0.1, dropout=1.0):
    global w_h1, b_h1, w_h2, b_h2, w_out, b_out 
    
    # Initialize the input, output, and hidden layer neurons (i.e. their weights and biases matrices)
    x_in, w_h1, b_h1, w_h2, b_h2, w_out, b_out = init_neurons(num_input, num_hidden1_neurons,
                                                            num_hidden2_neurons, num_output)
    # Remember the Initial LR for LR scheduling purposes:
    initial_LR = LR
    
    MAX_EPOCH=500
    
    # Generate a vector for the epoch numbers
    epochs = range(0, MAX_EPOCH)
    
    # Stop training if the current total training error goes below this value
    ERR_TERMINATION_COND = -np.log(0.99) # i.e. cross entropy when y_pred = 0.999 and y = 1
    
    total_error = 0.0
    batch_loss = 0.0
    num_batches_processed = 0.0
    
    for epoch_index in epochs:
        print('\n=========================================================\nEPOCH # %d' % (epoch_index+1))
        randomized_train_indices = np.random.permutation(train_data.shape[0])
        randomized_x_train = np.take(x_train, randomized_train_indices, axis=0);
        randomized_y_train = np.take(y_train, randomized_train_indices, axis=0);
        for x_batch, y_batch in batch(randomized_x_train, randomized_y_train, batch_size):
            batch_size = y_batch.shape[0]
            batch_loss, y_pred, A_h1, A_h2, batch_delta_out = predict_batch(x_batch, w_h1, b_h1, w_h2, b_h2, w_out, b_out, y_batch, dropout=dropout)
            
            total_error = total_error + batch_loss
            num_batches_processed += 1
            
            ##### BACK PROPAGATION (PERFORMED AT AFTER EACH BATCH PROCESSING) #####
            
            ### Compute batch gradients at each layer (Don't forget to divide by batch_size!)
            #print(A_h2.shape, w_out.shape, batch_delta_out.shape)
            #      (128, 256) (10, 256)     (1, 10)
            delta_h2 = delta_relu(A_h2)*(np.dot(batch_delta_out, w_out))
            delta_h2 = (np.sum(delta_h2, axis=0)/batch_size).reshape(1,-1)
            #print(delta_h2.shape) #(1,256)
            
            #print(A_h1.shape, w_h2.shape, delta_h2.shape)
            #     (128, 256)  (256, 256)    (1, 256)
            delta_h1 = delta_relu(A_h1)*(np.dot(delta_h2, w_h2))
            delta_h1 = (np.sum(delta_h2, axis=0)/batch_size).reshape(1,-1)
            #print(delta_h1.shape) #(1,256)

            ## Update the weights and biases
            a_h2 = A_h2[-1,:].reshape(1,-1) # Get last output of layer 2 from the batch for parameter update purposes
            #print(w_out.shape, batch_delta_out.shape, a_h2.shape)
            #    (10, 256)     (1, 10)               (1, 256)
            w_out = w_out + LR*np.dot(batch_delta_out.reshape(-1,1), a_h2)
            b_out = b_out + LR*batch_delta_out.reshape(-1,1)

            a_h1 = A_h1[-1,:].reshape(1,-1) # Get last output of layer 1 from the batchfor parameter update purposes
            w_h2 = w_h2 - LR*np.dot(delta_h2.reshape(-1,1), a_h1)
            b_h2 = b_h2 - LR*delta_h2.reshape(-1,1)

            x = x_batch[-1,:].reshape(1,-1) # Get last input from batch for parameter update purposes
            w_h1 = w_h1 - LR*np.dot(delta_h1.reshape(-1,1), x)
            b_h1 = b_h1 - LR*delta_h1.reshape(-1,1)
            
            
        total_average_error = total_error/num_batches_processed
        print("\ttotal average error: %f @ LR=%f" % (total_error/num_batches_processed, LR))
        print("\tcurrent validation error: %f" % (batch_loss))
        LR = get_LR(total_average_error, initial_LR)
        if( total_average_error < ERR_TERMINATION_COND):
            print('TRAINING ERROR TARGET REACHED! STOPPING TRAINING...')
            break
                
    

### Load the data and reshape the training and test data to 1x(28*28), then normalize:

In [15]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
print('x_train shape: ', x_train.shape)
print('y_train shape: ', y_train.shape)
print('x_test shape: ', x_test.shape)
print('y_test shape: ', y_test.shape)

x_train = np.reshape(x_train, [-1, x_train.shape[1]*x_train.shape[2]])
x_train = x_train.astype('float64')/np.max(x_train)
x_test = np.reshape(x_test, [-1, x_test.shape[1]*x_test.shape[2]])
x_test = x_test.astype('float64')/np.max(x_test)

print('new x_train shape after reshaping: ', x_train.shape)
print('new x_test shape after reshaping: ', x_test.shape)

num_labels = len(np.unique(y_train))
y_train = labels_to_onehotvector(y_train)
y_test = labels_to_onehotvector(y_test)
print('new y_train shape after onehot vector encoding: ', y_train.shape)
print('new y_test shape after onehot vector encoding: ', y_test.shape)

x_train shape:  (60000, 28, 28)
y_train shape:  (60000,)
x_test shape:  (10000, 28, 28)
y_test shape:  (10000,)
new x_train shape after reshaping:  (60000, 784)
new x_test shape after reshaping:  (10000, 784)
new y_train shape after onehot vector encoding:  (60000, 10)
new y_test shape after onehot vector encoding:  (10000, 10)


# Main training loop

In [16]:
# Define number of neurons per layer
NUM_INPUT = x_train.shape[1]
NUM_HIDDEN1_NEURONS = 256
NUM_HIDDEN2_NEURONS = 256
NUM_OUTPUT = y_train.shape[1]

# Initial Learning Rate of Keras SGD Optimizer: https://keras.io/api/optimizers/sgd/
LR = 0.1

# Define the maximum number of epochs to run before stopping even if the stopping criteria is not met
MAX_EPOCH = 30000

# Define Dropout probability, i.e. probability that a hidden neuron will be INCLUDED in the training:
DROPOUT = 0.45

ann_fit(x_train, y_train, num_input=NUM_INPUT, num_hidden1_neurons=NUM_HIDDEN1_NEURONS,
            num_hidden2_neurons=NUM_HIDDEN2_NEURONS, num_output=NUM_OUTPUT,
            batch_size=128, LR=LR, dropout=DROPOUT)


EPOCH # 1
	total average error: 6.359850 @ LR=0.100000
	current validation error: 8.534270

EPOCH # 2
	total average error: 6.463504 @ LR=0.100000
	current validation error: 3.470510

EPOCH # 3
	total average error: 5.313887 @ LR=0.100000
	current validation error: 2.803547

EPOCH # 4
	total average error: 4.647262 @ LR=0.100000
	current validation error: 2.731085

EPOCH # 5
	total average error: 4.226903 @ LR=0.100000
	current validation error: 2.754256

EPOCH # 6
	total average error: 3.949715 @ LR=0.100000
	current validation error: 2.641761

EPOCH # 7
	total average error: 3.752294 @ LR=0.100000
	current validation error: 2.655037

EPOCH # 8
	total average error: 3.602591 @ LR=0.050000
	current validation error: 2.541911

EPOCH # 9
	total average error: 3.486538 @ LR=0.050000
	current validation error: 2.645055

EPOCH # 10
	total average error: 3.394375 @ LR=0.050000
	current validation error: 2.531394

EPOCH # 11
	total average error: 3.319744 @ LR=0.033333
	current validation er

	total average error: 3.065922 @ LR=0.025000
	current validation error: 4.238893

EPOCH # 56
	total average error: 3.083101 @ LR=0.025000
	current validation error: 4.046547

EPOCH # 57
	total average error: 3.101297 @ LR=0.025000
	current validation error: 4.178633

EPOCH # 58
	total average error: 3.120732 @ LR=0.025000
	current validation error: 4.437294

EPOCH # 59
	total average error: 3.141678 @ LR=0.025000
	current validation error: 4.404308

EPOCH # 60
	total average error: 3.164257 @ LR=0.025000
	current validation error: 4.621054

EPOCH # 61
	total average error: 3.188323 @ LR=0.025000
	current validation error: 4.809807

EPOCH # 62
	total average error: 3.213621 @ LR=0.025000
	current validation error: 4.501545

EPOCH # 63
	total average error: 3.240087 @ LR=0.025000
	current validation error: 4.934796

EPOCH # 64
	total average error: 3.268422 @ LR=0.033333
	current validation error: 5.002915

EPOCH # 65
	total average error: 3.299012 @ LR=0.033333
	current validation error

	total average error: 7.602125 @ LR=0.100000
	current validation error: 20.000000

EPOCH # 110
	total average error: 7.715089 @ LR=0.100000
	current validation error: 20.000000

EPOCH # 111
	total average error: 7.826013 @ LR=0.100000
	current validation error: 20.000000

EPOCH # 112
	total average error: 7.934953 @ LR=0.100000
	current validation error: 20.000000

EPOCH # 113
	total average error: 8.041959 @ LR=0.100000
	current validation error: 20.000000

EPOCH # 114
	total average error: 8.147085 @ LR=0.100000
	current validation error: 20.000000

EPOCH # 115
	total average error: 8.250378 @ LR=0.100000
	current validation error: 20.000000

EPOCH # 116
	total average error: 8.351886 @ LR=0.100000
	current validation error: 20.000000

EPOCH # 117
	total average error: 8.451655 @ LR=0.100000
	current validation error: 20.000000

EPOCH # 118
	total average error: 8.549730 @ LR=0.100000
	current validation error: 20.000000

EPOCH # 119
	total average error: 8.646153 @ LR=0.100000
	curr

KeyboardInterrupt: 

In [None]:
average_batch_error, _, _, _, y_pred, _, _ = predict_batch(x_test, w_h1, b_h1, w_h2, b_h2, w_out, b_out, y_test)

In [None]:
average_batch_error

In [None]:
y_test[0], y_pred[0]

In [None]:
np.random.choice([0,1], size=(2,2), p=[1-0.45, 0.45])