# Exploring the MNIST Dataset

In [5]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# The dataset for Assignment 3 is the MNIST dataset. The four files given are:

# 60000 images of hand-written digits
# 60000 labels
# 10000 images of hand-written testing digits
# 10000 labels

# Use training60000.csv and training60000_labels.csv to build/train the neural network model.
# Use testing10000.csv and testing10000_labels.csv to test the neural network model.

# Read the training images and their labels into DataFrames.
# header=None: the first line of each file is read as data, not as column names.
mnist_data, mnist_labels = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("assets/training60000.csv", "assets/training60000_labels.csv")
)

print(type(mnist_data), "\n")

# Report the shape of each DataFrame
print("Training sizes:\n")
print(mnist_data.shape)
print(mnist_labels.shape, "\n")

# The mnist_data training dataset has 784 columns because each 28 x 28 gray scale
# picture was flattened from 2D into 1D: 784 = 28 x 28.

<class 'pandas.core.frame.DataFrame'> 

Training sizes:

(60000, 784)
(60000, 1) 



In [6]:
# Read the testing images and their labels into DataFrames.
# header=None: the first line of each file is read as data, not as column names.
mnist_testing_data, mnist_testing_labels = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("assets/testing10000.csv", "assets/testing10000_labels.csv")
)

print(type(mnist_testing_data), "\n")

# Report the shape of each DataFrame
print("Testing sizes:\n")
print(mnist_testing_data.shape)
print(mnist_testing_labels.shape, "\n")

<class 'pandas.core.frame.DataFrame'> 

Testing sizes:

(10000, 784)
(10000, 1) 



In [None]:
# Look at the raw data of the first image.
# The printed values show a gray scale image with pixel values from 0 to 255.
# Row 0 of mnist_data is one flattened picture; reshape it back to 28 x 28.
first_image = np.array(mnist_data.iloc[0], dtype='float').reshape(28,28)
print("First image raw data:\n\n", first_image)

# Draw the image (the array built above is reused instead of being rebuilt)
print("\nFirst image:\n\n")
plt.imshow(first_image, cmap=matplotlib.cm.binary, interpolation='nearest')
# plt.show must be *called*; the bare name only leaves a function repr as the cell output.
plt.show()

In [None]:
# Display the labels that belong to the first 20 images
print("\nLabels of the first 20 images:\n", mnist_labels.iloc[:20])

In [None]:
# Show the first 20 images in a grid of 4 rows and 5 columns

# No plt.clf() here: it would clear (or, with no figure open, first create) a
# separate current figure before plt.subplots makes its own, which leaves a
# stray empty figure behind.
fig, axes = plt.subplots(4, 5, figsize=(10,10), sharex=True, sharey=True)

# axes.flat walks the 4 x 5 grid row by row, so image i lands in the same cell
# as with manual row/column counters.
for i, ax in enumerate(axes.flat):
    image = np.array(mnist_data.iloc[i], dtype='float').reshape(28,28)
    ax.imshow(image, cmap=matplotlib.cm.binary, interpolation='nearest')

    # .iloc[i, 0] picks the scalar label, so the title reads "Label: 5"
    # instead of the array repr "Label: [5]".
    ax.set_title('Label: ' + str(mnist_labels.iloc[i, 0]))
    ax.set_xbound(0, 28)

#plt.tight_layout()
plt.show()

In [14]:
from numpy import exp

# The softmax is e^z / sum(e^z) (see eq 8.65 on page 463 in Kelleher et al)

def softmax(data):
    """
    Normalize a sequence of scores into values that sum to 1.

    Computes e^z / sum(e^z) along the first axis. The largest score is
    subtracted before exponentiating: mathematically this leaves the result
    unchanged, but it keeps exp() from overflowing to inf (which would turn
    the result into nan) when the scores are large.

    NOTE: a later cell of this notebook defines another ``softmax``; once that
    cell has run, it replaces this definition.

    Parameters
    ----------
    data : array-like of numbers
        The scores (Z values) to normalize.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``data`` whose entries sum to 1 along axis 0.
    """
    scores = np.asarray(data, dtype=float)
    if scores.size == 0:
        # Nothing to normalize; also avoids np.max raising on an empty array.
        return scores
    exp_scores = exp(scores - np.max(scores, axis=0))
    return exp_scores / np.sum(exp_scores, axis=0)

Zs = [3, 2, 2, 1]
output = softmax(Zs)
print(output)

# The output shows that the Z scores were normalized: the values total ~ 1.
output_total = sum(output)
print("Sum of the values in the output =", output_total)

# The values in output can be interpreted as probabilities.

[0.53444665 0.19661193 0.19661193 0.07232949]
Sum of the values in the output = 0.9999999999999999


In [None]:
# dnn = Network([784, 30, 10])
# dnn.SGD(mnist_data, 30, 10, 3.0, test_data=mnist_testing_data)

![image.png](attachment:43543035-97eb-4734-974c-c1ef202a0823.png)

In [48]:
# import random
import numpy as np
from tensorflow.keras.utils import to_categorical

def logistic(z):
    """
    Logistic (sigmoid) activation: 1 / (1 + e^-z), applied element-wise.
    """
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def delta_logistic(z):
    """
    Derivative of the logistic function, logistic(z) * (1 - logistic(z)),
    used when computing deltas.
    """
    activation = logistic(z)
    return activation * (1 - activation)

def softmax(z):
    """
    Softmax function used for converting inputs into values that sum to 1.
    Used as the activation function for the output layer.

    The normalization runs along axis 0, so for a 2-D input every column is
    normalized on its own. The per-column maximum is subtracted before
    exponentiating: mathematically this does not change the result, but it
    keeps np.exp from overflowing to inf (which would turn the output into
    nan) when the inputs are large.
    """
    if np.size(z) == 0:
        # Nothing to normalize; also avoids np.max raising on an empty array.
        return np.exp(z)
    exp_z = np.exp(z - np.max(z, axis=0))
    return exp_z / np.sum(exp_z, axis=0)

def encoder(labels, num_classes=10):
    """
    One-hot encode integer class labels with ``to_categorical``.

    Parameters
    ----------
    labels : array-like of int
        Class indices; passed straight to ``to_categorical``.
    num_classes : int or None, default 10
        Width of each one-hot vector. It is fixed at 10 (one column per digit)
        so that every mini-batch is encoded to the same width. When it is not
        given, ``to_categorical`` sizes the result from the largest label
        present, so a batch that happens to contain no 9 would come back
        narrower than the 10-neuron output layer. Pass ``None`` to get that
        inferred width.

    Returns
    -------
    The array produced by ``to_categorical``: one row per label, one column
    per class.
    """
    return to_categorical(labels, num_classes=num_classes)
#     for label in labels:
#         enc = np.zeros(10)
#         enc[label] = 1
#         encoded_labels = encoded_labels.vstack([encoded_labels, enc])
#     return np.array(encoded_labels)

# class DNN():
    
#     def __init__(self, num_layers, epochs = 10, learning_rate = 0.001):
#         self.sizes = sizes
#         self.epochs = epochs
        
#     def train(self, ):
        
# dnn = DNN()

# First try:
# Input layer: 784 neurons
# Hidden layer: 30 neurons
# Output layer: 10 neurons

def dnn():
    """
    Work-in-progress mini-batch training pass for a 784-30-10 feedforward network.

    The structure follows "Algorithm 5: Backpropagation for a feedforward
    network with L layers". So far only the forward pass and the output-layer
    deltas exist: the function processes the first mini-batch of one epoch,
    prints the shapes of the intermediate matrices, and returns None.
    Hidden-layer deltas, gradient accumulation and the weight update are
    still TODO (see the notes at the end of the loop body).

    Reads the module-level ``mnist_data`` / ``mnist_labels`` DataFrames and
    calls ``logistic``, ``softmax`` and ``encoder``.
    """
    # First try:
    # Input layer: 784 neurons, hidden layer: 30 neurons, output layer: 10 neurons
    batch_size = 100
    i_neurons = 784
    h_neurons = 30
    o_neurons = 10
    # Not used yet: reserved for the full epoch loop and the weight update.
    num_epochs = 10
    learning_rate = 0.001

    # Create the mini-batches.
    # The DataFrames are converted to NumPy arrays first: calling
    # np.array_split directly on a DataFrame goes through the deprecated
    # DataFrame.swapaxes and emits a FutureWarning on recent pandas versions.
    # The number of batches is derived from batch_size (60000 rows -> 600 batches)
    # instead of being a second hard-coded number that has to agree with it.
    num_batches = max(1, len(mnist_data) // batch_size)
    batches = np.array_split(mnist_data.to_numpy(), num_batches)
    label_batches = np.array_split(mnist_labels.to_numpy(), num_batches)

    # Initialize the weight matrix for each layer.
    # The extra column (+1) multiplies the bias input that is stacked on as the
    # first row of each layer's input below.
    Wh = np.random.randn(h_neurons, i_neurons+1)
    Wo = np.random.randn(o_neurons, h_neurons+1)

    # FOR # EPOCHS
    # Each loop of lines 3-33 represents one epoch of training.
    # NOTE: limited to a single epoch while the backward pass is unfinished.
    for epoch in range(1):

        # FOR EACH MINI BATCH
        # Each iteration of lines 4-31 processes a single mini-batch: forward
        # pass, backward pass and a single set of weight updates.
        for batch, labels in zip(batches, label_batches):

            # FORWARD PASS (lines 6-11, the matrix multiplication of Figure 8.6)
            # Each step propagates the activations for the mini-batch forward
            # through the next layer of the network.

            # Line 5: feed the mini-batch into the input layer, transposed so
            # that every column is one example.
            input_layer = batch.transpose()

            # Line 7: the vector of bias inputs, one 1 per example. It is sized
            # from the batch itself so it always matches the number of columns.
            input_bias = [1] * input_layer.shape[1]

            # Line 8: vertically concatenate the bias inputs and the activations
            # of the previous layer; the bias inputs become the first row.
            input_layer = np.vstack([input_bias, input_layer])

            # Line 9: multiply the layer's weights by the activations from the
            # preceding layer.
            zh = np.dot(Wh, input_layer)

            # Line 10: apply the activation function to each element, giving the
            # activation of each hidden neuron for each example in the batch.
            activations = logistic(zh)

            # Repeat for the output layer, with softmax as the activation function.
            hidden_layer = np.vstack([input_bias, activations])
            zo = np.dot(Wo, hidden_layer)
            output_activations = softmax(zo)

            # END FOR - forward pass - output_activations stores all activations
            # of the output layer.

            # Initializing a per-weight error gradient to 0 is skipped for now.

            # One-hot encode the labels and transpose so that, like the
            # activations, every column belongs to one example.
            enc_labels = encoder(labels).transpose()

            print("Hidden Layer Activations Matrix Shape: ", activations.shape)
            print("Output Activations Matrix Shape:", output_activations.shape)
            print("Encoded labels shape:", enc_labels.shape)

            # BACK PROPAGATION (lines 15-27): backpropagation of deltas and
            # summation of error gradients across the examples in the mini-batch.

            # Lines 16-18: deltas for the neurons in the output layer (softmax),
            # computed for the whole batch at once as target - activation.
            output_deltas = enc_labels - output_activations
            print("Delta_i weights:", output_deltas.shape)
            print("Activations shape:", activations.shape)

            # TODO Lines 19-23: deltas for the neurons in each hidden layer:
            #   dk = logistic(z) x (1 - logistic(z)) x (sum_weights x delta_i)
            #   (use vectors, not for loops)

            # TODO Lines 24-26: accumulate the error gradient of each weight:
            #   DELTA_weight = DELTA_weight + delta * activation

            # TODO Lines 28-30: update each weight after summing the error
            # gradients over the complete mini-batch:
            #   weight = weight - alpha * DELTA_weight

            # END FOR - MINI BATCH (line 31 in algo)
            # The mini-batch sequence is shuffled between epochs.

            # Stop after the first mini-batch while the backward pass is unfinished.
            break
    # END FOR # EPOCHS

In [49]:
dnn()

Hidden Layer Activations Matrix Shape:  (30, 100)
Output Activations Matrix Shape: (10, 100)
Encoded labels shape: (10, 100)
Delta_i weights: (10, 100)
Activations shape: (30, 100)


In [28]:
# delta_hidden_weights = np.zeros((30, 785))
# delta_output_weights = np.zeros((10, 31))

# print(delta_output_weights)