In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.utils import to_categorical

mnist_data = pd.read_csv("assets/training60000.csv", header=None)
mnist_labels = pd.read_csv("assets/training60000_labels.csv", header=None)
mnist_testing_data = pd.read_csv("assets/testing10000.csv", header=None)
mnist_testing_labels = pd.read_csv("assets/testing10000_labels.csv", header=None)

In [2]:
def logistic(z):
    """
    The logistic activation function.
    """
    return 1.0 / (1.0 + np.exp(-z))


def delta_logistic(z):
    """
    Derivative of the logistic function used for computing deltas.
    """
    return logistic(z) * (1 - logistic(z))


def softmax(z):
    """
    Softmax function used for converting inputs into values that sum to 1.
    Used as the activation function for the output layer.
    """
    return np.exp(z) / np.sum(np.exp(z))


def encoder(labels):
    return to_categorical(labels)

In [3]:
def old_dnn():
    # Algorithm 5: Backpropagation for a feedforward network with L layers
    # create mini batches
    batches = np.array_split(mnist_data, 600)
    label_batches = np.array_split(mnist_labels, 600)
    batch_size = 100
    i_neurons = 784
    h_neurons = 30
    o_neurons = 10
    num_epochs = 10
    learning_rate = 0.0001

    # initialize the weight matrix for each layer
#     Wh = np.random.randn(h_neurons, i_neurons + 1)
    Wh = np.random.uniform(-0.5, 0.5, size=(h_neurons, i_neurons+1))
#     print("Hidden weights shape:", Wh.shape)
#     Wo = np.random.randn(o_neurons, h_neurons + 1)
    Wo = np.random.uniform(-0.5, 0.5, size=(o_neurons, h_neurons+1))
#     print("Output weights shape:", Wo.shape)

    #     print(Wh)
    #     print(Wo)

    # FOR # EPOCHS
    # Each loop of lines 3-33 represents one epoch of training
    for epoch in range(10):

        # FOR EACH MINI BATCH
        for batch, labels in zip(batches, label_batches):
            # Each iteration of for loop lines 4-31 involves
            # the processing of a single mini-batch, including both
            # forward and backward pass of the algorithm
            # and a single set of weight updates.
            # In line 5, the matrix of descriptive features for the examples
            # in the mini batch is fed into the input layer.
            
            """
            FORWARD PASS
            """
            # FOR EACH LAYER - forward pass (MATRIX MULTIPLICATION FIGURE 8.6)
            # Lines 6-11 = forward pass.
            # This pass follows the set of operations illustrated in Figure 8.6
            # each iteration of this for loop propagates the activations for the mini
            # batch forward through the next layer of the network
            # transpose input row of batch size 100 for matrix multiplication
            input_layer = batch.transpose()
#             print("Input layer matrix shape:", input_layer.shape)

            # The vector v created on line 7 is the vector of bias inputs (as wide as the number
            # of neurons in the layer).
            # Create a bias vector of 1s
            input_bias = [1] * batch_size

            # line 8 the bias inputs vector and the matrix of activations
            # from the previous layer are vertically concatenated so that
            # the bias inputs are now stored in the first row of the activation matrix
            input_layer = np.vstack([input_bias, input_layer])
#             print("Input layer + bias shape:", input_layer.shape)

            # Line 9 is the matrix multiplication of the layer's weights
            # by the activations from the preceding layer.
            zh = np.dot(Wh, input_layer)
#             print("Hidden z's shape:", zh.shape)

            # Line 10, the activation function is applied to each element of the previous result.
            # This generates the activations for each neuron in the layer for each example in the batch.
            activations = logistic(zh)
#             print("Hidden Layer Activations Matrix Shape: ", activations.shape)

            # Repeat for each layer:
            hidden_layer = activations
            hidden_layer = np.vstack([input_bias, hidden_layer])
            zo = np.dot(Wo, hidden_layer)
#             print("Output z's shape:", zo.shape)
            output_activations = softmax(zo)
#             print(output_activations)
#             print("Output activations shape:", output_activations.shape)
            #             print(output_activations)
            """
            END FORWARD PASS
            """
            # END FOR - forward pass - result is a matrix that stores all activations of output layer
            """
            Begin Backward pass
            """
            # FOR EACH WEIGHT IN THE NETWORK
            # INITIALIZE ERROR GRADIENTS TO 0
            # a vector of delta_weights for each weight
            #             delta_hidden_weights = np.zeros((h_neurons, i_neurons))
            #             delta_output_weights = np.zeros((o_neurons, h_neurons))
            # THIS GOT SKIPPED BECAUSE WE USED THE ZEROS THAT ARE IN THE ONE-HOT-ENCODED VECTOR

            # Create an array of one-hot-encoded vectors representing each label
            enc_labels = encoder(labels)
            enc_labels = enc_labels.transpose()

            
#             print("Output Activations Matrix Shape:", output_activations.shape)
            #             print("Delta_hidden_weights Shape:", delta_hidden_weights.shape)
            #             print("Delta_output_weights Shape:", delta_output_weights.shape)
#             print("Encoded labels transposed shape:", enc_labels.shape)

            # END FOR

            # BACK PROPAGATION
            # FOR EACH EXAMPLE IN MINI BATCH - BACKPROP
            # LINES 15 - 27 backpropagation of deltas and summation of error gradients across
            # examples in the mini batch.

            # Lines 16-18
            # FOR EACH NEURON IN OUTPUT LAYER
            # calculate deltas for neurons in output layer using SOFTMAX
            delta_outputs = enc_labels - output_activations
#             print("Output layer delta matrix shape:", delta_outputs.shape)
            #             print(delta_output_test)
            # END FOR

            # Lines 19-23
            # FOR EACH HIDDEN LAYER IN NETWORK
            # FOR EACH NEURON IN HIDDEN LAYER
            # calculate deltas for neurons in hidden layers
            # dk = logistic(z) x (1 - logistic(z)) x (sum_weights x delta_i) - vectors not for loops

            # Logistic derivative
            delta_activations = delta_logistic(activations)
#             print("Hidden layer delta logistic shape:", delta_activations.shape)
            delta_activations = np.vstack([input_bias, delta_activations])
            # Weight^T x Deltas
#             make weight matrix 3D by copying to have depth of batch size?
            weight_t_deltas = np.dot(Wo.transpose(), delta_outputs)
#             print("Weight transposed dot delta_activations shape:", weight_t_deltas.shape)

            
            delta_hidden = np.multiply(delta_activations, weight_t_deltas)
            
            # Do another dot product
            # what you should get out should be as many deltas as you have nodes in the hidden layer
#             print("Hidden layer deltas shape:", delta_hidden.shape)
#             print(delta_hidden)
            delta_hidden_bias_deleted = np.delete(delta_hidden, 1, 0)
#             print(delta_hidden_bias_deleted.shape)

            # END FOR
            # END FOR

            # Lines 24-26
            # FOR EACH WEIGHT IN NETWORK
            # error gradients are accumulated
            # DELTA_weight = weight + delta & activation
            # dot product of deltas and activations
#             output_activations_transpose = output_activations.transpose()
#             hidden_delta_weights = np.dot(delta_hidden, output_activations_transpose)
            
            output_delta_weights = np.dot(hidden_layer, delta_outputs.transpose())
#             print(output_delta_weights.shape)
            
            output_delta_weights = output_delta_weights.transpose()
#             print(output_delta_weights.shape)
            
            hidden_delta_weights = np.dot(input_layer, delta_hidden_bias_deleted.transpose())
#             print(hidden_delta_weights.shape)
            
            hidden_delta_weights = hidden_delta_weights.transpose()
#             print(hidden_delta_weights.shape)
            
            
            
            
            # END FOR
            # END FOR - BACKPROP

            # Lines 28-30
            # FOR EACH WEIGHT IN NETWORK
            # Update the weights after summing error gradients over a complete pass
            # weight = weight - learning_rate * delta_weight
            Wh = Wh - learning_rate * hidden_delta_weights
            Wo = Wo - learning_rate * output_delta_weights
            
            # END FOR

            # END FOR - MINI BATCH (line 31 in algo)

            # Mini batch sequence is shuffled between epochs
            
#     # END FOR # EPOCHS
    print("Done training model.\n")

Done training model.



In [107]:
def forward_pass(input_layer, Wh, Wo, input_bias):
    # line 8 the bias inputs vector and the matrix of activations
    # from the previous layer are vertically concatenated so that
    # the bias inputs are now stored in the first row of the activation matrix


    # Line 9 is the matrix multiplication of the layer's weights
    # by the activations from the preceding layer.
    zh = np.dot(Wh, input_layer)
#             print("Hidden z's shape:", zh.shape)

    # Line 10, the activation function is applied to each element of the previous result.
    # This generates the activations for each neuron in the layer for each example in the batch.
    activations = logistic(zh)
#             print("Hidden Layer Activations Matrix Shape: ", activations.shape)

    # Repeat for each layer:
    hidden_layer = activations
    hidden_layer = np.vstack([input_bias, hidden_layer])
    zo = np.dot(Wo, hidden_layer)
#             print("Output z's shape:", zo.shape)
    output_activations = softmax(zo)
    return output_activations, activations, hidden_layer


def dnn():
    # create mini batches
    batches = np.array_split(mnist_data, 600)
    label_batches = np.array_split(mnist_labels, 600)
    test_batches = np.array_split(mnist_testing_data, 100)
    test_labels = np.array_split(mnist_testing_labels, 100)
    batch_size = 100
    i_neurons = 784
    h_neurons = 64
    o_neurons = 10
    num_epochs = 10
    learning_rate = 0.0001

    # initialize the weight matrix for each layer
#     Wh = np.random.randn(h_neurons, i_neurons + 1)
    Wh = np.random.uniform(-0.5, 0.5, size=(h_neurons, i_neurons+1))
#     print("Hidden weights shape:", Wh.shape)
#     Wo = np.random.randn(o_neurons, h_neurons + 1)
    Wo = np.random.uniform(-0.5, 0.5, size=(o_neurons, h_neurons+1))
#     print("Output weights shape:", Wo.shape)

    #     print(Wh)
    #     print(Wo)

    for epoch in range(10):
        for batch, labels in zip(batches, label_batches):
            
            """
            FORWARD PASS
            """
            input_layer = batch.transpose()
            input_bias = [1] * batch_size
            input_layer = np.vstack([input_bias, input_layer])
            
            # This pass follows the set of operations illustrated in Figure 8.6
            # each iteration of this for loop propagates the activations for the mini
            # batch forward through the next layer of the network
            # transpose input row of batch size 100 for matrix multiplication
            output_activations, activations, hidden_layer = forward_pass(input_layer, Wh, Wo, input_bias)
            """
            END FORWARD PASS
            Result is a matrix that stores all activations of output layer
            """
            
            """
            Begin Backward pass
            """
            # Create an array of one-hot-encoded vectors representing each label
            enc_labels = encoder(labels)
            enc_labels = enc_labels.transpose()
            # BACK PROPAGATION
            # FOR EACH EXAMPLE IN MINI BATCH - BACKPROP
            # LINES 15 - 27 backpropagation of deltas and summation of error gradients across
            # examples in the mini batch.

            # FOR EACH NEURON IN OUTPUT LAYER
            # calculate deltas for neurons in output layer using SOFTMAX
            delta_outputs = output_activations - enc_labels
#             print(delta_outputs)
#             print("Output layer delta matrix shape:", delta_outputs.shape)
            #             print(delta_output_test)

            # calculate deltas for neurons in HIDDEN LAYER
            # dk = logistic(z) x (1 - logistic(z)) x (sum_weights x delta_i) - vectors not for loops
            # Logistic derivative
            delta_activations = delta_logistic(activations)
#             print("Hidden layer delta logistic shape:", delta_activations.shape)
            delta_activations = np.vstack([input_bias, delta_activations])
            # Weight^T x Deltas
            weight_t_deltas = np.dot(Wo.transpose(), delta_outputs)
#             print("Weight transposed dot delta_activations shape:", weight_t_deltas.shape)

            
            delta_hidden = np.multiply(weight_t_deltas, delta_activations)
            
#             print("Hidden layer deltas shape:", delta_hidden.shape)
#             print(delta_hidden)
            delta_hidden_bias_deleted = np.delete(delta_hidden, 1, 0)
#             print(delta_hidden_bias_deleted.shape)

            # DELTA_weight = weight + delta & activation
            # dot product of deltas and activations
#             output_activations_transpose = output_activations.transpose()
#             hidden_delta_weights = np.dot(delta_hidden, output_activations_transpose)
            
            output_delta_weights = np.dot(hidden_layer, delta_outputs.transpose())
#             print(output_delta_weights.shape)
            
            output_delta_weights = output_delta_weights.transpose()
#             print(output_delta_weights.shape)
            
            hidden_delta_weights = np.dot(input_layer, delta_hidden_bias_deleted.transpose())
#             print(hidden_delta_weights.shape)
            
            hidden_delta_weights = hidden_delta_weights.transpose()
#             print(hidden_delta_weights.shape)

            # FOR EACH WEIGHT IN NETWORK
            # Update the weights after summing error gradients over a complete pass
            # weight = weight - learning_rate * delta_weight
            Wh = Wh - learning_rate * hidden_delta_weights
            Wo = Wo - learning_rate * output_delta_weights

    print("Done training model.\n")
    
    print("Testing model.\n")

    for batch, label in zip(test_batches, test_labels):
        input_layer = batch.transpose()
        input_bias = [1] * batch_size
        input_layer = np.vstack([input_bias, input_layer])
#         input_layer = np.vstack([input_bias, row])
        
        output, throw_1, throw_2 = forward_pass(input_layer, Wh, Wo, input_bias)
        print(output.shape)
        print(output.transpose().shape)
        for i in range(100):
            print(np.argmax(output.transpose()[i]))
            print(output.transpose()[i])
        break
        
        

In [108]:
dnn()

Done training model.

Testing model.

(10, 100)
(100, 10)
1
[2.08980214e-12 9.28988267e-03 2.90908789e-10 7.49318655e-07
 2.11299425e-13 1.40653842e-24 3.66249998e-12 7.09368305e-04
 5.20377937e-12 1.78329423e-10]
1
[2.08980214e-12 9.28988267e-03 2.90908789e-10 7.49318655e-07
 2.11299425e-13 1.40653842e-24 3.66249998e-12 7.09368305e-04
 5.20377937e-12 1.78329423e-10]
1
[2.08980214e-12 9.28988267e-03 2.90908789e-10 7.49318655e-07
 2.11299425e-13 1.40653842e-24 3.66249998e-12 7.09368305e-04
 5.20377937e-12 1.78329423e-10]
1
[2.08980214e-12 9.28988267e-03 2.90908789e-10 7.49318655e-07
 2.11299425e-13 1.40653842e-24 3.66249998e-12 7.09368305e-04
 5.20377937e-12 1.78329423e-10]
1
[2.08980214e-12 9.28988267e-03 2.90908789e-10 7.49318655e-07
 2.11299425e-13 1.40653842e-24 3.66249998e-12 7.09368305e-04
 5.20377937e-12 1.78329423e-10]
1
[2.08980214e-12 9.28988267e-03 2.90908789e-10 7.49318655e-07
 2.11299425e-13 1.40653842e-24 3.66249998e-12 7.09368305e-04
 5.20377937e-12 1.78329423e-10]
1
[2.0