Import the MNIST dataset

In [1]:
from sklearn.datasets import fetch_openml
from keras.utils.np_utils import to_categorical
import numpy as np
#from sklearn.model_selection import train_test_split
import time

mnist = fetch_openml('mnist_784')
x,y=mnist['data'],mnist['target']
x = (x/255).astype('float32')
y = to_categorical(y)

# train-test split
x_train,x_val=x[:60000], x[60000:]
y_train, y_val = y[:60000], y[60000:]
#shuffling the data
shuffle_index=np.random.permutation(60000)
x_train,y_train= x_train[shuffle_index] , y_train[shuffle_index]

Initialize the Matrices

In [2]:

def initialization():
    # number of nodes in each layer

    W1 = np.random.randn(128, 784) * np.sqrt(1. / 128)
    W2 = np.random.randn(10, 128) * np.sqrt(1. / 10)
    B1 = np.random.randn(128,) * np.sqrt(1. / 128)
    B2 = np.random.randn(10,) * np.sqrt(1. / 128)     

    return W1,B1,W2,B2


Activation Functions: 
(i) Sigmoid for Inner layer
(ii) Softmax for Output layer

In [4]:
def tanh(x):
    t=(np.exp(x)-np.exp(-x))/(np.exp(x)+np.exp(-x))
    return t

def tanh_derivative(x):
    t=(np.exp(x)-np.exp(-x))/(np.exp(x)+np.exp(-x))
    dt=1-t**2
    return dt


def softmax(x):
    # Numerically stable with large exponentials
    exps = np.exp(x - x.max())
    return exps / np.sum(exps, axis=0)

def softmax_derivative(x):
    exps = np.exp(x - x.max())
    return exps / np.sum(exps, axis=0) * (1 - exps / np.sum(exps, axis=0))

Forward Propagation 

In [6]:
def forward_pass(x_train, W1,B1,W2,B2):
    

    # input layer activations becomes sample
    A0 = x_train

    # input layer to hidden layer 1
    Z1 = np.dot(W1, A0) + B1
    A1 = tanh(Z1)


    # hidden layer 1 to hidden layer 2
    Z2 = np.dot(W2, A1)
    A2 = softmax(Z2) + B2

    return A1,A2,Z1,Z2


Backward Propagation

In [7]:
def backward_pass(x_train, y_train,A1, output, W1,B1,W2,B2,Z1,Z2):
    
    # Calculate W3 update
    
    error = 2 * (output - y_train) / output[0] * softmax_derivative(Z2)
    change_W2 = np.outer(error, A1)
    change_B2 = (1 / 70000) * np.sum(error)

    # Calculate W2 update
    error = np.dot(W2.T, error) * tanh_derivative(Z1)
    change_W1 = np.outer(error, x_train)
    change_B1 = (1 / 70000) * np.sum(error)

    return change_W2,change_B2,change_W1,change_B1

Update Vectors/Matrices according to update rule from
        Stochastic Gradient Descent.

In [8]:

def update_network_parameters(W1,B1,W2,B2, change_W2,change_B2,change_W1,change_B1,l_rate):
    '''
       
    '''
    W2 = W2 - l_rate * change_W2
    B2 = B2 - l_rate * change_B2
    W1 = W1 - l_rate * change_W1
    B1 = B1 - l_rate * change_B1
  
    return W1,B1,W2,B2

Calculate accuracy by matching each result with respective label

In [9]:

def compute_accuracy(x_val, y_val,W1,B1,W2,B2):
    
    predictions = []

    for x, y in zip(x_val, y_val):
        A1,output,Z1,Z2 = forward_pass(x,W1,B1,W2,B2)
        pred = np.argmax(output)
        predictions.append(pred == np.argmax(y))
    
    return np.mean(predictions)

Gradient Descent: training function

In [10]:

def train(x_train, y_train, x_val, y_val,epochs,l_rate):
    start_time = time.time()
    W1,B1,W2,B2 = initialization()
    for iteration in range(epochs):
        for x,y in zip(x_train, y_train):

            A1,output,Z1,Z2 = forward_pass(x,W1,B1,W2,B2)
      
            change_W2,change_B2,change_W1,change_B1 = backward_pass(x,y,A1,output,  W1,B1,W2,B2,Z1,Z2)
            W1,B1,W2,B2 = update_network_parameters(W1,B1,W2,B2,change_W2,change_B2,change_W1,change_B1, l_rate)

        accuracy = compute_accuracy(x_val, y_val,W1,B1,W2,B2)
        print('Epoch: {0}, Time Spent: {1:.2f}s, Accuracy: {2:.2f}%'.format(
            iteration+1, time.time() - start_time, accuracy * 100
        ))
      
    return W1,B1,W2,B2
          
W1,B1,W2,B2 = train(x_train, y_train, x_val, y_val,10,0.001)

  

Epoch: 1, Time Spent: 59.34s, Accuracy: 91.55%
Epoch: 2, Time Spent: 118.63s, Accuracy: 91.23%
Epoch: 3, Time Spent: 177.98s, Accuracy: 91.95%
Epoch: 4, Time Spent: 237.25s, Accuracy: 91.31%
Epoch: 5, Time Spent: 296.47s, Accuracy: 90.60%
Epoch: 6, Time Spent: 355.31s, Accuracy: 91.52%
Epoch: 7, Time Spent: 414.60s, Accuracy: 90.36%
Epoch: 8, Time Spent: 473.80s, Accuracy: 90.60%
Epoch: 9, Time Spent: 532.90s, Accuracy: 91.74%
Epoch: 10, Time Spent: 592.07s, Accuracy: 91.58%


Testing Portion:
Checking the train model againt inputs

In [11]:
def predictions(x,W1,B1,W2,B2):
    A1,output,Z1,Z2 = forward_pass(x,W1,B1,W2,B2)
    pred = np.argmax(output)
    return pred

In [12]:

def test_prediction(index, W1, B1, W2, B2):
    prediction = predictions(x_train[ index,:],W1,B1,W2,B2)
    label = y_train[index]
    print("Prediction: ", prediction)
    print("Label: ", label)
  

Testing by providing input

In [15]:
test_prediction(23, W1, B1, W2, B2)
test_prediction(3799, W1, B1, W2, B2)
test_prediction(123, W1, B1, W2, B2)
test_prediction(45, W1, B1, W2, B2)

Prediction:  8
Label:  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
Prediction:  8
Label:  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
Prediction:  0
Label:  [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Prediction:  2
Label:  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
