In [52]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

data = pd.read_csv('/kaggle/input/mnist-in-csv/mnist_train.csv')

In [53]:
data.head

<bound method NDFrame.head of        label  1x1  1x2  1x3  1x4  1x5  1x6  1x7  1x8  1x9  ...  28x19  28x20  \
0          5    0    0    0    0    0    0    0    0    0  ...      0      0   
1          0    0    0    0    0    0    0    0    0    0  ...      0      0   
2          4    0    0    0    0    0    0    0    0    0  ...      0      0   
3          1    0    0    0    0    0    0    0    0    0  ...      0      0   
4          9    0    0    0    0    0    0    0    0    0  ...      0      0   
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...    ...    ...   
59995      8    0    0    0    0    0    0    0    0    0  ...      0      0   
59996      3    0    0    0    0    0    0    0    0    0  ...      0      0   
59997      5    0    0    0    0    0    0    0    0    0  ...      0      0   
59998      6    0    0    0    0    0    0    0    0    0  ...      0      0   
59999      8    0    0    0    0    0    0    0    0    0  ...      0      0   

       28

In [54]:
# Turn data into a numpy array of data
data = np.array(data)
m,n = data.shape
print(m,n)

60000 785


There are 60000 rows of training data and 785 cols because of (28 by 28) pixels + 1 label

In [55]:
# Shuffle data before training sets
np.random.shuffle(data)


data_dev = data[0:1000].T
# First col of labels -> 1 * 60000
Y_dev = data_dev[0]
# Rest of the rows and cols -> 784 * 60000
X_dev = data_dev[1:n]
# Normalize images because they are grascale (0 to 255)
X_dev = X_dev / 255


data_train = data[1000:11000].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train/255

pixel_im,training_ex = X_train.shape
print(pixel_im,training_ex)

784 10000


In [56]:
def ReLU(Z):
    return np.maximum(Z,0)

def init_params():
    # Initialize random numpy arrays for the specific dimensions between
    sub = 0.5
    W1 = np.random.rand(10,784) - sub
    b1 = np.random.rand(10,1) - sub
    W2 = np.random.rand(10,10) - sub
    b2 = np.random.rand(10,1) - sub
    return W1,b1,W2,b2

In [57]:
def softmax(Z):
    # Function to get max of 1 spread between final hidden layer to output layer
    A = np.exp(Z) / sum(np.exp(Z))
    return A

def ReLU_deriv(Z):
    # Get derivative of ReLU which is one if its positive (straight line) or zero if its negative (flat line)
    return Z > 0

# one_hot takes in the label array of 1 * 60000
def one_hot(Y):
    # Create numpy array with all zeroes of dimensions Y.size for rows and 10 for cols because 0-9 digits require 10 cols
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    
    # Run through and set according values using arange and Y
    # np.arange makes a np array from 0 to Y.size sequentially [0,1,2,3,4,etc...] and Y is the original label array [2,5,1,0,etc...]
    # For each of these intersections it maps that exact 2D location to equal 1
    # Looking at the two arrays, because 0 and 2 are the first nums of both then [0,2] = 1, then [1,5] = 1, and so on...
    # This works out because the digits are from 0-9 so the label array, 'Y', accounts for the index starting from one ->
    # For the first example, 2 will actually be inserted into the third row, while 5 will be inserted in the sixth, 1 into the second, and 0 into the first
    one_hot_Y[np.arange(Y.size), Y] = 1
    
    # Transpose it to match the rest of the data which has been transposed
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

Input layer will contain the 784 units from the image, which are passed to a hidden layer that has 10 units using the ReLU activation (numbers from 0-1), which are finally output by the output layer with 10 units that represent the 0-9 digits using softmax so that the sum of the activations is 1 amongst all 0-9 digits, and we take the highest activation to be the most accurate digit chosen.

**Forward propagation**

$$Z^{[1]} = W^{[1]} X + b^{[1]}$$
$$A^{[1]} = g_{\text{ReLU}}(Z^{[1]}))$$
$$Z^{[2]} = W^{[2]} A^{[1]} + b^{[2]}$$
$$A^{[2]} = g_{\text{softmax}}(Z^{[2]})$$

In [58]:
def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

**Backward propagation**

$$dZ^{[2]} = A^{[2]} - Y$$
$$dW^{[2]} = \frac{1}{m} dZ^{[2]} A^{[1]T}$$
$$dB^{[2]} = \frac{1}{m} \Sigma {dZ^{[2]}}$$
$$dZ^{[1]} = W^{[2]T} dZ^{[2]} .* g^{[1]\prime} (z^{[1]})$$
$$dW^{[1]} = \frac{1}{m} dZ^{[1]} A^{[0]T}$$
$$dB^{[1]} = \frac{1}{m} \Sigma {dZ^{[1]}}$$

In [59]:
def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    m = Y.size
    
    # Creates one hot Y matrix
    one_hot_Y = one_hot(Y)
    
    # Finds total difference between final output layer with softmax activation and actual Y
    dZ2 = A2 - one_hot_Y
    
    # Average difference of (total difference between second layers times softmax activation layer)
    dW2 = 1 / m * dZ2.dot(A1.T)
    
    # Finds average difference between second layers to determine how much bias should be
    db2 = 1 / m * np.sum(dZ2)
    
    # Weights of second layer times difference between layers times ReLU derivative
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    
    # Average difference of (difference in first layer times original layer)
    dW1 = 1 / m * dZ1.dot(X.T)
    
    # Average differene of (sum of difference in first layer)
    db1 = 1 / m * np.sum(dZ1)
    return dW1, db1, dW2, db2

In [60]:
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    # alpha is learning rate which we determine
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1    
    W2 = W2 - alpha * dW2  
    b2 = b2 - alpha * db2    
    return W1, b1, W2, b2

In [61]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    # Boolean comparison where if predictions is equal to Y, it adds one, then find average
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, alpha, iterations):
    W1, b1, W2, b2 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(A2)
            print(get_accuracy(predictions, Y))
    return W1, b1, W2, b2

In [62]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 0.11, 1000)

Iteration:  0
[6 6 6 ... 6 0 0] [4 4 5 ... 8 0 0]
0.1204
Iteration:  10
[6 6 6 ... 6 0 0] [4 4 5 ... 8 0 0]
0.1799
Iteration:  20
[7 6 6 ... 6 0 0] [4 4 5 ... 8 0 0]
0.2301
Iteration:  30
[7 6 6 ... 6 0 0] [4 4 5 ... 8 0 0]
0.2886
Iteration:  40
[4 6 6 ... 6 0 0] [4 4 5 ... 8 0 0]
0.3681
Iteration:  50
[4 6 6 ... 3 0 0] [4 4 5 ... 8 0 0]
0.4317
Iteration:  60
[4 9 6 ... 3 0 0] [4 4 5 ... 8 0 0]
0.4722
Iteration:  70
[4 4 6 ... 3 0 0] [4 4 5 ... 8 0 0]
0.5062
Iteration:  80
[4 4 6 ... 6 0 0] [4 4 5 ... 8 0 0]
0.5346
Iteration:  90
[4 4 6 ... 6 0 0] [4 4 5 ... 8 0 0]
0.5659
Iteration:  100
[4 4 8 ... 6 0 0] [4 4 5 ... 8 0 0]
0.5974
Iteration:  110
[4 4 5 ... 6 0 0] [4 4 5 ... 8 0 0]
0.6262
Iteration:  120
[4 4 5 ... 8 0 0] [4 4 5 ... 8 0 0]
0.6519
Iteration:  130
[4 4 5 ... 8 0 0] [4 4 5 ... 8 0 0]
0.6759
Iteration:  140
[4 4 5 ... 8 0 0] [4 4 5 ... 8 0 0]
0.6963
Iteration:  150
[4 4 5 ... 8 0 0] [4 4 5 ... 8 0 0]
0.7161
Iteration:  160
[4 4 5 ... 8 0 0] [4 4 5 ... 8 0 0]
0.7307
Iteratio

In [65]:
def make_predictions(X, W1, b1, W2, b2):
    _, _, _, A2 = forward_prop(W1, b1, W2, b2, X)
    predictions = get_predictions(A2)
    return predictions

In [66]:
dev_predictions = make_predictions(X_dev, W1, b1, W2, b2)
get_accuracy(dev_predictions, Y_dev)

[9 6 4 9 1 7 9 9 5 3 9 2 9 9 5 2 4 4 1 1 1 7 4 2 6 4 1 2 6 3 5 2 5 9 8 2 3
 9 1 2 9 6 4 6 5 4 7 0 6 5 2 0 6 6 9 2 8 4 2 9 3 6 9 2 4 5 3 9 7 9 6 1 8 4
 3 6 4 7 5 4 6 3 1 3 3 2 2 2 6 8 8 4 2 5 9 7 4 9 2 8 6 7 2 1 4 7 7 2 5 6 8
 0 1 8 9 0 1 3 3 5 4 0 8 5 8 8 5 0 0 7 6 9 8 9 7 1 5 2 5 5 0 0 9 9 3 0 9 9
 3 0 7 8 6 1 6 9 7 0 6 0 3 3 6 3 4 8 2 6 7 0 0 0 4 2 0 9 0 6 3 5 1 9 6 9 9
 7 8 5 1 0 5 8 3 5 2 4 4 2 7 9 3 6 1 4 8 3 3 2 0 2 7 0 1 4 2 2 6 1 4 6 6 3
 9 0 7 0 9 0 8 1 2 9 1 8 6 9 7 5 7 5 2 6 9 3 2 4 5 5 4 8 4 9 4 3 7 8 9 2 9
 4 4 1 9 1 7 4 8 8 5 7 9 6 7 2 6 2 6 0 3 8 7 2 5 0 3 3 8 2 2 1 9 0 5 4 3 1
 6 1 8 4 1 9 8 2 9 0 6 4 5 0 3 1 8 9 8 7 8 7 4 6 6 7 4 0 1 8 6 2 4 8 4 1 8
 6 0 7 4 1 4 9 4 4 1 3 0 1 6 0 3 4 0 9 2 3 9 3 5 4 2 3 0 7 6 6 0 9 2 5 7 5
 1 8 7 6 3 1 4 6 8 7 6 1 7 0 7 3 3 0 1 1 0 4 1 5 1 6 6 2 2 2 6 2 7 4 0 9 2
 5 8 5 1 0 4 0 0 7 5 3 0 0 8 3 9 7 1 5 4 7 0 2 2 4 7 8 8 6 9 9 8 2 0 0 3 2
 3 5 0 0 9 3 1 5 2 6 5 4 5 4 6 3 0 5 5 6 0 9 2 8 2 9 8 8 3 9 3 1 0 4 0 7 6
 5 1 6 6 2 5 1 9 1 9 9 1 

0.864