In [2]:
import numpy as np  

In [3]:
from tensorflow import keras

# Fetch the MNIST train/test splits through Keras; each split is an (images, labels) pair.
train_split, test_split = keras.datasets.mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

In [4]:
# Keep the sample axis and collapse every remaining axis, so each image becomes one row.
train_images = train_images.reshape(len(train_images), -1)
train_images.shape

(60000, 784)

In [5]:
# Same flattening for the test images: one row per image.
test_images = test_images.reshape(len(test_images), -1)
test_images.shape

(10000, 784)

In [6]:
# Conventional X (inputs) / Y (integer labels) names for the two splits.
X_train, Y_train = train_images, train_labels
X_test, Y_test = test_images, test_labels

In [7]:
# Scale the inputs by 1/255 (assumes 8-bit pixel values in 0..255 -- confirm for other data)
# so the first layer's pre-activations stay small and the sigmoid/softmax do not saturate.
# The scaled arrays are derived from the flattened image arrays rather than from
# X_train / X_test themselves, so re-running this cell never divides a second time.
X_train = train_images / 255.0
X_test = test_images / 255.0

In [None]:
# One-hot encode the integer labels: row k of the identity matrix is the one-hot
# vector for class k, so indexing the identity by a label array yields one row per sample.
num_classes = np.max(Y_train) + 1
class_identity = np.eye(num_classes)
y_one_hot_train = class_identity[Y_train]
y_one_hot_test = class_identity[Y_test]

In [9]:
# Sanity check: display the shape of the scaled training matrix (samples, features).
X_train.shape

(60000, 784)

In [None]:
# activation function for hidden layer
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    The naive form computes exp(-x), which overflows for large negative x.
    Here only exp(-|x|) is ever evaluated, which lies in (0, 1].

    Parameters
    ----------
    x : array_like
        Real-valued input of any shape.

    Returns
    -------
    numpy.ndarray
        Values in [0, 1] with the same shape as ``x`` (0-d for a scalar input).
    """
    x = np.asarray(x)
    # exp(-|x|) can underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x).  x < 0: e^x / (1 + e^x).  The two forms are algebraically equal.
    return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

In [None]:
def softmax(X):
    """Row-wise softmax of a 2-D array of scores.

    Each row is shifted by its own maximum before exponentiating, so the
    largest exponent is exp(0) = 1 and large scores cannot overflow. The
    shift cancels in the ratio, leaving the result unchanged.

    Returns an array of the same shape whose rows each sum to 1.
    """
    row_peak = np.max(X, axis=1, keepdims=True)
    unnormalised = np.exp(X - row_peak)
    row_total = np.sum(unnormalised, axis=1, keepdims=True)
    return unnormalised / row_total

In [12]:
# Layer sizes: one hidden layer of HIDDEN_UNITS units and OUTPUT_UNITS output scores.
HIDDEN_UNITS = 256
OUTPUT_UNITS = 10

# Fixed seed so a fresh "Restart & Run All" always starts from the same weights.
SEED = 42
rng = np.random.default_rng(SEED)

# Small random weights (scale 0.01) so units start out different from one another;
# biases start at zero and are stored as row vectors so they broadcast over the batch.
W1 = rng.standard_normal((X_train.shape[1], HIDDEN_UNITS)) * 0.01
B1 = np.zeros((1, HIDDEN_UNITS))

W2 = rng.standard_normal((HIDDEN_UNITS, OUTPUT_UNITS)) * 0.01
B2 = np.zeros((1, OUTPUT_UNITS))

In [13]:
# Warm start: if parameters from an earlier session were saved to
# 'trained_params.npz', resume from them; otherwise keep the fresh initialisation.
# The file is only written by the np.savez call at the end of the notebook, so on a
# first run it does not exist yet and an unconditional np.load would stop the run.
try:
    # The context manager closes the underlying .npz archive once the arrays are read.
    with np.load('trained_params.npz') as data:
        W1 = data["W1"]
        B1 = data["B1"]

        W2 = data["W2"]
        B2 = data["B2"]
except FileNotFoundError:
    print("trained_params.npz not found; starting from freshly initialised parameters")

In [14]:
# Shape check for the hidden pre-activation: B1 must broadcast across every sample row.
(X_train @ W1 + B1).shape

(60000, 256)

In [15]:
def forwardPass(activation1,activation2,X_train,W1,B1,W2,B2):
    """Run the two-layer network forward on a batch.

    Parameters
    ----------
    activation1, activation2 : callable
        Activations applied to the hidden and output pre-activations.
    X_train : array
        Input batch, one sample per row (despite the name, any batch works).
    W1, B1 : array
        Hidden-layer weights and bias.
    W2, B2 : array
        Output-layer weights and bias.

    Returns
    -------
    tuple
        ``(A1, A2)``: hidden activations and output activations.
    """
    A1 = activation1(X_train @ W1 + B1)
    A2 = activation2(A1 @ W2 + B2)
    return A1, A2

In [16]:
def TestNN(X_train):
    """Return the network's output activations for a batch of inputs.

    Uses sigmoid for the hidden layer and softmax for the output layer, and
    reads the module-level W1, B1, W2, B2 at call time, so it always reflects
    the current parameters. The argument is named X_train but any batch with
    one sample per row can be passed.
    """
    _, output_probs = forwardPass(sigmoid, softmax, X_train, W1, B1, W2, B2)
    return output_probs

In [17]:
def BinaryBackwardPass(Y_actual,A1,W2,A2,X_train):
    """Batch-averaged gradients for the two-layer network.

    The output-layer error is taken as ``A2 - Y_actual``, the form the gradient
    takes for a sigmoid output paired with binary cross-entropy. The hidden
    layer is assumed to use the logistic sigmoid, whose derivative in terms of
    its output is ``A1 * (1 - A1)``.

    Parameters
    ----------
    Y_actual : array, same shape as A2
        Target values.
    A1 : array, (n, h)
        Hidden-layer activations.
    W2 : array, (h, k)
        Output-layer weights.
    A2 : array, (n, k)
        Output-layer activations (predictions).
    X_train : array, (n, d)
        Input batch that produced A1 and A2.

    Returns
    -------
    tuple
        ``(dL_dW2, dL_dB2, dL_dW1, dL_dB1)`` with shapes
        (h, k), (1, k), (d, h) and (1, h).
    """
    m = X_train.shape[0]

    # (n, k) output-layer error signal
    dL_dZ2 = A2 - Y_actual

    # (h, k)
    dL_dW2 = (A1.T @ dL_dZ2) / m
    # (1, k)
    dL_dB2 = np.sum(dL_dZ2, axis=0, keepdims=True) / m

    # (n, h) error pushed back through the output weights
    dL_dA1 = dL_dZ2 @ W2.T
    # (n, h) scaled by the sigmoid derivative expressed through its output
    dL_dZ1 = dL_dA1 * (A1 * (1 - A1))

    # (d, h)
    dL_dW1 = (X_train.T @ dL_dZ1) / m
    # (1, h)
    dL_dB1 = np.sum(dL_dZ1, axis=0, keepdims=True) / m

    return dL_dW2,dL_dB2,dL_dW1,dL_dB1

In [18]:
def backwardPass(Y_actual,A1,W2,A2,X_train):
    """Batch-averaged gradients of the loss w.r.t. both layers' parameters.

    Shapes below use n = samples, d = input features, h = hidden units,
    k = output units.

    Parameters
    ----------
    Y_actual : array, (n, k)
        Targets (one-hot rows in this notebook).
    A1 : array, (n, h)
        Hidden activations; the ``A1 * (1 - A1)`` factor assumes they come
        from the logistic sigmoid.
    W2 : array, (h, k)
        Output-layer weights.
    A2 : array, (n, k)
        Output activations (softmax probabilities in this notebook).
    X_train : array, (n, d)
        Input batch that produced A1 and A2.

    Returns
    -------
    tuple
        ``(dL_dW2, dL_dB2, dL_dW1, dL_dB1)`` with shapes
        (h, k), (1, k), (d, h) and (1, h). Each is divided by n, so it is the
        mean gradient over the batch.
    """
    batch_size = X_train.shape[0]

    # Output-layer error signal, (n, k).
    delta_out = A2 - Y_actual
    # Hidden-layer error, (n, h): push delta_out back through W2, then apply the
    # sigmoid derivative written in terms of the sigmoid's output.
    delta_hidden = (delta_out @ W2.T) * (A1 * (1 - A1))

    grad_W2 = (A1.T @ delta_out) / batch_size
    grad_B2 = np.sum(delta_out, axis=0, keepdims=True) / batch_size
    grad_W1 = (X_train.T @ delta_hidden) / batch_size
    grad_B1 = np.sum(delta_hidden, axis=0, keepdims=True) / batch_size

    return grad_W2, grad_B2, grad_W1, grad_B1
    


In [19]:
def updateParams(dL_dW2,dL_dB2,dL_dW1,dL_dB1, W1,W2,B1,B2,learning_rate):
    """Apply one plain gradient-descent step to all four parameter arrays.

    Note the argument order: gradients come as (W2, B2, W1, B1) and the
    parameters as (W1, W2, B1, B2), while the result is returned as
    ``(W1, B1, W2, B2)``. New arrays are returned; the inputs are not
    modified in place.
    """
    def descend(param, grad):
        # Move against the gradient, scaled by the learning rate.
        return param - (learning_rate * grad)

    return (
        descend(W1, dL_dW1),
        descend(B1, dL_dB1),
        descend(W2, dL_dW2),
        descend(B2, dL_dB2),
    )


In [20]:
def binaryCrossEntropy(Y_pred,Y_actual):
    """Mean binary cross-entropy, computed per output column.

    Predictions are clipped to [eps, 1 - eps] before taking logs, matching the
    clipping in ``softmax_loss_onehot``. Without it, a prediction of exactly
    0 or 1 produces ``0 * log(0)``, which evaluates to NaN.

    Parameters
    ----------
    Y_pred : array_like, (m, ...)
        Predicted probabilities; converted to float64 so that ``1 - eps`` is
        representable.
    Y_actual : array, same shape as Y_pred
        Targets in {0, 1}.

    Returns
    -------
    numpy.ndarray
        Loss summed over axis 0 and divided by m (one value per column).
    """
    eps = 1e-12
    m = Y_actual.shape[0]
    probs = np.clip(np.asarray(Y_pred, dtype=np.float64), eps, 1.0 - eps)
    first_part = Y_actual * np.log(probs)
    second_part = (1 - Y_actual) * np.log(1.0 - probs)
    return (-1) * (1/m) * np.sum((first_part + second_part), axis=0)

In [21]:
def softmax_loss_onehot(y_pred, y_true):
    """Mean categorical cross-entropy for one-hot targets.

    Predicted probabilities are clipped to [1e-12, 1.0] so log(0) cannot occur.
    The summed ``-y_true * log(y_pred)`` is divided by the number of rows
    (samples) in ``y_pred`` and returned as a scalar.
    """
    safe_probs = np.clip(y_pred, 1e-12, 1.0)
    n_samples = safe_probs.shape[0]
    total_log_likelihood = np.sum(y_true * np.log(safe_probs))
    return -total_log_likelihood / n_samples

In [22]:
# Full-batch gradient descent: every epoch runs the whole of X_train through the
# network once and applies a single parameter update.
epochs = 1000
learning_rate = 0.1
for i in range(epochs):
    # Forward pass with the current parameters.
    A1, A2 = forwardPass(sigmoid, softmax, X_train, W1, B1, W2, B2)

    # Batch-averaged gradients, then one descent step.
    dL_dW2, dL_dB2, dL_dW1, dL_dB1 = backwardPass(y_one_hot_train, A1, W2, A2, X_train)
    W1, B1, W2, B2 = updateParams(dL_dW2, dL_dB2, dL_dW1, dL_dB1, W1, W2, B1, B2, learning_rate)

    if i % 100 != 0:
        continue
    # Progress report every 100th epoch. A2 was computed before this epoch's
    # update, so the printed loss belongs to the parameters the epoch started with.
    print(softmax_loss_onehot(A2, y_one_hot_train))
    print("nice one")
     
        

0.44644836492613055
nice one
0.4257611656894478
nice one
0.4091316864423833
nice one
0.3954814078680738
nice one
0.3840591523883693
nice one
0.3743340247204024
nice one
0.3659258435025068
nice one
0.35855845739549713
nice one
0.3520281299198976
nice one
0.3461820356681281
nice one


In [24]:
# Run the current network on the held-out test images; each row of Y_pred holds
# the softmax outputs for one image.
Y_pred = TestNN(X_test)

In [25]:
# The predicted digit is the column index holding the largest output in each row.
predicted_classes = Y_pred.argmax(axis=1)

In [26]:
# Fraction of test images whose predicted class equals the true label.
accuracy = np.equal(predicted_classes, Y_test).mean()

In [37]:
# Express the accuracy fraction as a percentage rounded to two decimals.
acc_pct = "%s%%" % round(accuracy * 100, 2)
print("your trained model is", acc_pct, "accurate on unseen test data")

your trained model is 90.6% accurate on unseen test data


In [28]:
# Distribution of predictions: the distinct predicted classes and how many test
# images were assigned to each. A class that never appears here is never predicted.
np.unique(predicted_classes, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([1033, 1153, 1001, 1013, 1006,  857,  966, 1008,  956, 1007]))

In [29]:
# Persist the current parameters under the keys W1, B1, W2, B2 in
# 'trained_params.npz', the same file an earlier cell reads back with np.load.
np.savez('trained_params.npz', W1=W1, B1=B1, W2=W2, B2=B2)