In [1]:
import numpy as np
import tensorflow
from tensorflow.keras.datasets import mnist

np.set_printoptions(threshold=np.inf)

  from ._conv import register_converters as _register_converters


### Define some functions for converting from label to one hot encoding and vice versa:

In [2]:
def labels_to_onehotvector(labels):
    unique_labels, label_counts = np.unique(labels, return_counts=True)
    onehotvector = np.zeros((len(labels), len(unique_labels)))
    for index, label in enumerate(labels):
        onehotvector[int(index), int(label)] = int(1)
    return onehotvector

In [3]:
def onehotvectors_to_labels(onehotvectors):
    labels = np.zeros(onehotvectors.shape[0])
    labels = onehotvectors.argmax(axis=1)
    return labels

### Next we define some functions for better readability in the later parts. First is the ReLU function for the hidden layer activations
ReLU(z) = max(0, z)

In [4]:
def relu(Z):
    return np.maximum(0,Z)

### ... and a function for getting the gradient of the ReLU activations at the hidden layers
d ReLU(Z)/dz = 1 if Z>0 ; 0 otherwise

In [5]:
def delta_relu(Z):
    return (Z>0).astype(int)

### Next is a function for the output layer activation: Softmax:
softmax(Z) = exp(Z)/(SUM(exp(Z))

To avoid python returning NaN due to exceeding of the max float value, we multiply a constant C to both the numerator and denominator.
C*exp(Z)/C(SUM(exp(Z-max(Z)))

Setting the C to be the e^(-max(Z)), we have...

stable_softmax(Z) = exp(Z-max(Z))/(SUM(exp(Z-max(Z))

In [6]:
def softmax(Z):
    exps = np.exp(Z - np.max(Z))
    denominator = np.sum(exps,axis=1).reshape(-1,1)
    exps = exps / denominator
    return exps

### We define a function for computing the categorical cross-entropy Loss of the whole network: L
L = -SUM(y*log(y_pred))

where **y** is the one-hot encoded true labels and **y_pred** is the prediction of the network (i.e. output of the softmax output activation layer).

***To save on computation time:***
Since **y** are in one-hot encoded format, we can instead compute the -log(y_pred) only at the column indices that are '1' and then proceed to computing the summation. This is much faster compared to matrix multiplication

In [7]:
def cross_entropy(y_pred, y):
    
    batch_size = y.shape[0]
    
    #log_likelihood = -np.multiply(y,np.log(y_pred)) #too slow
    
    # Get the negative log likelihoods of only the column(class) arg of y where y=1
    log_likelihood = y_pred[range(batch_size), np.argmax(y, axis=1).reshape(1,-1)]
    log_likelihood = -np.log(log_likelihood)
    
    loss = np.sum(log_likelihood)/batch_size
    
    return loss

### ... and its corresponding gradient:
dJ/dz_out = d/dz_out(-ylog(softmax(z_out))) = y_pred - y

where once again, **y** is the one-hot encoded true labels and **y_pred** is the prediction of the network (i.e. output of the softmax output activation layer)

For computation speed, compute for "y_pred - 1" only at the index where y=1 (e.g. y.argmax()) and leave the rest of y_pred unchanged.

In [8]:
def delta_cross_entropy(y_pred, y):
    batch_size = y.shape[0]
    delta = y_pred-y
    
    return delta

### Next we define a method for initializing the network weights and biases:
For the initialization, we use the default setting of keras **Dense** class:

Weight initialization: Glorot Uniform: (-6/sqrt(m+n) , 6/sqrt(m+n))
where m=number of inputs, n=number of outputs

Bias initialization: all zeros

In [9]:
def init_neurons(num_input, num_hidden1_neurons, num_hidden2_neurons, num_output):   
    #HIDDEN LAYER 1: num_input inputs, num_hidden1_neurons outputs
    init = 6/np.sqrt(num_input+num_hidden1_neurons)
    w_h1 = np.random.uniform(low=-init, high=init, size=(num_hidden1_neurons, num_input))
    b_h1 = np.zeros([num_hidden1_neurons, 1])

    #HIDDEN LAYER 2: num_hidden1_neurons inputs, num_hidden2_neurons outputs
    init = 6/np.sqrt(num_hidden1_neurons+num_hidden2_neurons)
    w_h2 = np.random.uniform(low=-init, high=init, size=(num_hidden2_neurons, num_hidden1_neurons))
    b_h2 = np.zeros([num_hidden2_neurons, 1])

    #OUTPUT LAYER: num_hidden2_neurons inputs, num_output outputs
    init = 6/np.sqrt(num_hidden2_neurons+num_output)
    w_out = np.random.uniform(low=-init, high=init, size=(num_output, num_hidden2_neurons))
    b_out = np.zeros([num_output, 1])
    
    return  w_h1, b_h1, w_h2, b_h2, w_out, b_out

### We also define a method for generating data and their corresponding labels for getting a batch:

In [10]:
def batch(X, Y, batch_size):
    length = X.shape[0]
    for i in np.arange(0, length, batch_size):
        yield (X[i:min(i+batch_size, length)], Y[i:min(i+batch_size, length)])

### Define a method for calculating the classification accuracy

In [11]:
def compute_accuracy(y_pred, y):
    acc = (np.argmax(y_pred, axis=1) == np.argmax(y, axis=1)).astype(int)
    acc = np.average(acc)
    
    return acc

### Method for forward-pass yielding the prediction **y_pred**

NOTE: dropout value means probability that a neuron will be **included** in the network during training

In [12]:
def predict_batch(x_batch, w_h1, b_h1, w_h2, b_h2, w_out, b_out, y_batch, dropout=1.0):
    
    ##### FORWARD PASS #####
        ### LAYER 1 ###
    Z_h1 = np.dot(x_batch, np.transpose(w_h1)) + np.transpose(b_h1)
    A_h1 = relu(Z_h1)
        #DROP-OUT:
    A_h1 = np.multiply(A_h1, np.random.choice([0,1], size=A_h1.shape, p=[(1-dropout), dropout]))
    
        ### LAYER 2 ###
    Z_h2 = np.dot(A_h1, np.transpose(w_h2)) + np.transpose(b_h2)
    A_h2 = relu(Z_h2)
        #DROP-OUT
    A_h2 = np.multiply(A_h2, np.random.choice([0,1], size=A_h2.shape, p=[(1-dropout), dropout]))
    
        ### OUTPUT LAYER ###
    Z_out = np.dot(A_h2, np.transpose(w_out)) + np.transpose(b_out)
    y_pred = softmax(Z_out)
    
    ##### COMPUTE LOSS AND GRADIENT AT OUTPUT FOR BACKPROPAGATION USE #####
    batch_loss = cross_entropy(y_pred, y_batch)
    batch_delta_out = delta_cross_entropy(y_pred, y_batch) # dJ/Z_out
    
    acc = compute_accuracy(y_pred, y_batch)
    
    return batch_loss, acc, y_pred, A_h1, A_h2, Z_h1, Z_h2, batch_delta_out

### Create a method for learning rate scheduling:
* current_step: number of batches processed so far
* decay_steps: number of steps before multiplying initial_LR by decay_rate (higher decay_steps leads to slower decaying)

In [13]:
def get_LR(initial_LR, current_step, decay_rate=0.96, decay_steps=200):
    MIN_LR = 0.001
    LR = initial_LR * np.power(decay_rate, (current_step/decay_steps))
    LR = max(LR, MIN_LR)
    return LR

### Define a method for computing parameter updates given neuron gradients and neuron inputs

In [14]:
def get_param_update(LR, delta, neuron_input, batch_size):
    gradient_w = np.dot(np.transpose(delta), neuron_input) # Gradient of Loss wrt w
    gradient_w = gradient_w/batch_size                     # Averaged for the batch

    w_update = LR*gradient_w

    gradient_b = np.transpose(delta)                       # Gradient of Loss wrt b
    gradient_b = np.sum(gradient_b,axis=1)/batch_size      # Averaged for the batch
    gradient_b = gradient_b.reshape(-1,1)

    b_update = LR*gradient_b

    return w_update, b_update

### Define a method for train and validation set splitting

In [15]:
def train_validation_split(x, y, validation_ratio=0.2):
    num_data_points = y.shape[0]
    split_index = int(validation_ratio*num_data_points)
    
    randomized_indices = np.random.permutation(num_data_points)
    
    randomized_x = np.take(x, randomized_indices, axis=0);
    x_validation = randomized_x[0:split_index]
    x_train = randomized_x[split_index:]
    
    randomized_y = np.take(y, randomized_indices, axis=0);
    y_validation = randomized_y[0:split_index]
    y_train = randomized_y[split_index:]
    
    return x_train, y_train, x_validation, y_validation

### Declare the weights and biases as globally shared parameters for ease of testing

In [16]:
w_h1, b_h1, w_h2, b_h2, w_out, b_out = np.zeros(6)

### Define a method for the training the artificial neural network: ann_fit()

In [21]:
def ann_fit(train_data, train_labels, num_input, num_hidden1_neurons, num_hidden2_neurons, num_output,
            batch_size=128, LR=0.1, dropout=1.0, max_epoch=20):
    global w_h1, b_h1, w_h2, b_h2, w_out, b_out 
    
    # Initialize the input, output, and hidden layer neurons (i.e. their weights and biases matrices)
    w_h1, b_h1, w_h2, b_h2, w_out, b_out = init_neurons(num_input, num_hidden1_neurons,
                                                            num_hidden2_neurons, num_output)
    # Remember the Initial LR for LR scheduling purposes:
    initial_LR = LR
    
    # Generate a vector for the epoch numbers
    epochs = range(0, max_epoch)
    
    # Stop training if the current total training error goes below this value
    ERR_TERMINATION_COND = -np.log(0.999) # i.e. cross entropy when y_pred = 0.999 and y = 1
    
    total_error = 0.0
    total_training_acc = 0.0
    num_batches_processed = 0.0
    
    x_train, y_train, x_validation, y_validation = train_validation_split(train_data, train_labels)
    print("x_train.shape:", x_train.shape, "y_train.shape:", y_train.shape, "x_validation.shape", x_validation.shape, "y_validation.shape:", y_validation.shape)
    
    for epoch_index in epochs:
        print('\n============================================================================\nEPOCH # %d' % (epoch_index+1))
        randomized_train_indices = np.random.permutation(x_train.shape[0])
        randomized_x_train = np.take(x_train, randomized_train_indices, axis=0);
        randomized_y_train = np.take(y_train, randomized_train_indices, axis=0);
        for x_batch, y_batch in batch(randomized_x_train, randomized_y_train, batch_size):
            this_batch_size = y_batch.shape[0]
            
            ##### FORWARD PASS for TRAINING DATA SET
            batch_loss, acc, y_pred, A_h1, A_h2, Z_h1, Z_h2, batch_delta_out = predict_batch(x_batch,
                                                                                        w_h1, b_h1, w_h2,
                                                                                        b_h2, w_out, b_out,
                                                                                        y_batch, dropout=dropout)

            total_error = total_error + batch_loss
            total_training_acc = total_training_acc + acc
            LR = get_LR(initial_LR, current_step=num_batches_processed)
            num_batches_processed += 1
            total_average_error = total_error/num_batches_processed            
            print("\ttotal average error: %f train_acc:%f @LR=%f" % (total_average_error, acc, LR), end="\r")
            
            ##### BACK PROPAGATION (PERFORMED AT AFTER EACH BATCH PROCESSING) #####
            
            ### Compute batch gradients at each layer (Don't forget to divide by this_batch_size!)
            delta_h2 = delta_relu(Z_h2)*(np.dot(batch_delta_out, w_out))
            delta_h1 = delta_relu(Z_h1)*(np.dot(delta_h2, w_h2))

            ### Update the weights and biases
            w_out_update, b_out_update = get_param_update(LR, batch_delta_out, A_h2, this_batch_size)
            w_out = w_out - w_out_update
            b_out = b_out - b_out_update

            w_h2_update, b_h2_update = get_param_update(LR, delta_h2, A_h1, this_batch_size)
            w_h2 = w_h2 - w_h2_update
            b_h2 = b_h2 - b_h2_update

            w_h1_update, b_h1_update = get_param_update(LR, delta_h1, x_batch, this_batch_size)
            w_h1 = w_h1 - w_h1_update
            b_h1 = b_h1 - b_h1_update
            
        # Get validation accuracy at end of epoch
        _, validation_acc, _, _, _, _, _, _ = predict_batch(x_validation, w_h1, b_h1, w_h2, b_h2, w_out, b_out,
                                                                y_validation, dropout=1.0)
        print("\n\tvalidation_acc:%f" % (validation_acc))
        
        if( total_average_error < ERR_TERMINATION_COND):
            print('TRAINING ERROR TARGET REACHED! STOPPING TRAINING...')
            break
    print('\n========================= END OF TRAINING =========================\n\n')
                
    

# Start here for Code Review!
### Load the data and reshape the training and test data to 1x(28*28), then normalize:

In [22]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
print('x_train shape: ', x_train.shape)
print('y_train shape: ', y_train.shape)
print('x_test shape: ', x_test.shape)
print('y_test shape: ', y_test.shape)

x_train = np.reshape(x_train, [-1, x_train.shape[1]*x_train.shape[2]])
x_train = x_train.astype('float64')/np.max(x_train)
x_test = np.reshape(x_test, [-1, x_test.shape[1]*x_test.shape[2]])
x_test = x_test.astype('float64')/np.max(x_test)

print('new x_train shape after reshaping: ', x_train.shape)
print('new x_test shape after reshaping: ', x_test.shape)

num_labels = len(np.unique(y_train))
y_train = labels_to_onehotvector(y_train)
y_test = labels_to_onehotvector(y_test)
print('new y_train shape after onehot vector encoding: ', y_train.shape)
print('new y_test shape after onehot vector encoding: ', y_test.shape)

x_train shape:  (60000, 28, 28)
y_train shape:  (60000,)
x_test shape:  (10000, 28, 28)
y_test shape:  (10000,)
new x_train shape after reshaping:  (60000, 784)
new x_test shape after reshaping:  (10000, 784)
new y_train shape after onehot vector encoding:  (60000, 10)
new y_test shape after onehot vector encoding:  (10000, 10)


# Main training loop
LAYER1:       256 neurons: w_h1(256x784), b_h1(256,1), activation:ReLU

LAYER2:       256 neurons: w_h2(256x256), b_h2(256,1), activation:ReLU

OUTPUT LAYER: 10 neurons: w_out(10x256), b_out(10,1), activation: Softmax

COST FUNCTION: Categorical Cross-Entropy

In [23]:
# Define number of neurons per layer
NUM_INPUT = x_train.shape[1]
NUM_HIDDEN1_NEURONS = 256
NUM_HIDDEN2_NEURONS = 256
NUM_OUTPUT = y_train.shape[1]

# Initial Learning Rate of Keras SGD Optimizer: https://keras.io/api/optimizers/sgd/
LR = 0.3

# Define the maximum number of epochs to run before stopping even if the stopping criteria is not met
MAX_EPOCH = 25

# Define Dropout probability, i.e. probability that a hidden neuron will be INCLUDED in the training:
DROPOUT = 0.45

# INITIALIZE NETWORK PARAMETERS (to verify that previously trained parameters is reset)
w_h1, b_h1, w_h2, b_h2, w_out, b_out = np.zeros(6)

# TRAIN NETWORK
ann_fit(x_train, y_train, num_input=NUM_INPUT, num_hidden1_neurons=NUM_HIDDEN1_NEURONS,
            num_hidden2_neurons=NUM_HIDDEN2_NEURONS, num_output=NUM_OUTPUT,
            batch_size=128, LR=LR, dropout=DROPOUT, max_epoch=MAX_EPOCH)

# TEST NETWORK
_, test_acc, _, _, _, _, _, _ = predict_batch(x_test, w_h1, b_h1, w_h2, b_h2, w_out, b_out, y_test)
print("Test Accuracy = %f" % (test_acc))

x_train.shape: (48000, 784) y_train.shape: (48000, 10) x_validation.shape (12000, 784) y_validation.shape: (12000, 10)

EPOCH # 1
	total average error: 1.152408 train_acc:0.757812 @LR=0.277951
	validation_acc:0.899000

EPOCH # 2
	total average error: 1.052963 train_acc:0.812500 @LR=0.257470
	validation_acc:0.918417

EPOCH # 3
	total average error: 1.032176 train_acc:0.828125 @LR=0.238498
	validation_acc:0.928250

EPOCH # 4
	total average error: 1.027768 train_acc:0.820312 @LR=0.220925
	validation_acc:0.928750

EPOCH # 5
	total average error: 1.038109 train_acc:0.820312 @LR=0.204646
	validation_acc:0.922750

EPOCH # 6
	total average error: 1.051249 train_acc:0.820312 @LR=0.189566
	validation_acc:0.933667

EPOCH # 7
	total average error: 1.063606 train_acc:0.890625 @LR=0.175598
	validation_acc:0.928750

EPOCH # 8
	total average error: 1.072253 train_acc:0.859375 @LR=0.162659
	validation_acc:0.934833

EPOCH # 9
	total average error: 1.074983 train_acc:0.882812 @LR=0.150674
	validation_acc

  after removing the cwd from sys.path.
  if __name__ == '__main__':



	validation_acc:0.934583

EPOCH # 16
	total average error: 1.073489 train_acc:0.867188 @LR=0.088175
	validation_acc:0.942750

EPOCH # 17
	total average error: 1.070189 train_acc:0.812500 @LR=0.081678
	validation_acc:0.940000

EPOCH # 18
	total average error: 1.063636 train_acc:0.820312 @LR=0.075660
	validation_acc:0.929667

EPOCH # 19
	total average error: 1.058289 train_acc:0.929688 @LR=0.070085
	validation_acc:0.946500

EPOCH # 20
	total average error: 1.050310 train_acc:0.882812 @LR=0.064920
	validation_acc:0.944333

EPOCH # 21
	total average error: 1.040807 train_acc:0.914062 @LR=0.060137
	validation_acc:0.943000

EPOCH # 22
	total average error: 1.029677 train_acc:0.898438 @LR=0.055706
	validation_acc:0.947667

EPOCH # 23
	total average error: 1.017531 train_acc:0.820312 @LR=0.051601
	validation_acc:0.947083

EPOCH # 24
	total average error: 1.006100 train_acc:0.867188 @LR=0.047799
	validation_acc:0.944833

EPOCH # 25
	total average error: 0.993825 train_acc:0.929688 @LR=0.044277

### For Testing:

In [20]:
_, test_acc, _, _, _, _, _, _ = predict_batch(x_test, w_h1, b_h1, w_h2, b_h2, w_out, b_out, y_test)
print("\tTest Accuracy = %f" % (test_acc))

	Test Accuracy = 0.945300
