<a href="https://colab.research.google.com/github/kb107/MNIST-Handwritten-Digit-Recogniser/blob/main/MNISTfromscratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [77]:
import numpy as np 
import sklearn 
import keras
import pickle

In [37]:
from sklearn.datasets import fetch_openml
from keras.utils.np_utils import to_categorical

X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
y = to_categorical(y)
X = (X/255).astype(np.float32)

In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
size = [784, 128, 64, 10]

In [40]:
from numpy.random import randn

params = { 'w1' : randn(size[1], size[0]) / np.sqrt(size[0]),
           'b1' : randn(size[1], 1) / np.sqrt(size[0]),
           'w2' : randn(size[2], size[1]) / np.sqrt(size[1]),
           'b2' : randn(size[2], 1) / np.sqrt(size[1]),
           'w3' : randn(size[3], size[2]) / np.sqrt(size[2]),
           'b3' : randn(size[3], 1) / np.sqrt(size[2])}
cache = {}
gradient = {}
alpha = 0.001
epochs = 10

In [41]:
def sigmoid(z, derivative=False):
    sig = 1 / (1 + np.exp(-z))
    if derivative:
        return sig * (1 - sig)
    else: 
        return sig

In [42]:
def softmax(z, derivative=False):
    exps = np.exp(z - z.max())
    exps = exps / np.sum(exps)
    if derivative:
        return exps * (1 - exps)
    else:
        return exps

In [43]:
def forward_prop(x):
    global params, cache
    cache['a0'] = x
    cache['z1'] = np.dot(params['w1'], cache['a0']) + params['b1']
    cache['a1'] = sigmoid(cache['z1'])
    cache['z2'] = np.dot(params['w2'], cache['a1']) + params['b2']
    cache['a2'] = sigmoid(cache['z2'])
    cache['z3'] = np.dot(params['w3'], cache['a2']) + params['b3']
    cache['o'] = softmax(cache['z3'])

In [44]:
def backward_prop(y):
    global params, cache, gradient
    cache['del3'] = (cache['o']-y) * softmax(cache['z3'], derivative=True)
    cache['del2'] = np.dot(params['w3'].T, cache['del3']) * sigmoid(cache['z2'], derivative=True)
    cache['del1'] = np.dot(params['w2'].T, cache['del2']) * sigmoid(cache['z1'], derivative=True)

    gradient['w3'] = np.dot(cache['del3'], cache['a2'].T)
    gradient['b3'] = cache['del3']
    gradient['w2'] = np.dot(cache['del2'], cache['a1'].T)
    gradient['b2'] = cache['del2']
    gradient['w1'] = np.dot(cache['del1'], cache['a0'].T)
    gradient['b1'] = cache['del1']

In [45]:
def gradient_descent(alpha):
  global params, gradient
  params['w3'] -= alpha * gradient['w3']
  params['w2'] -= alpha * gradient['w2']
  params['w1'] -= alpha * gradient['w1']
  params['b3'] -= alpha * gradient['b3']
  params['b2'] -= alpha * gradient['b2']
  params['b1'] -= alpha * gradient['b1']

In [59]:
def train(X_train, y_train, epochs, alpha):
  n = len(X_train)
  for i in range(epochs):
    for j in range(n):
      if j == 0 or j == 27999 or j == 55999:
        print(i+1, ':', j+1) 
      x = np.array([X_train[j]])
      x = x.reshape(784,1)
      y = np.array([y_train[j]])
      y = y.reshape(10,1)
      forward_prop(x)
      backward_prop(y)
      gradient_descent(alpha)

In [72]:
train(X_train, y_train, epochs=1, alpha=0.01)

1 : 1
1 : 28000
1 : 56000


In [65]:
def pred_accuracy(X_train, y_train):
  pred_acc = []
  n = len(X_train)
  for j in range(n):
    if j == 0 or j == 5000 or j == 13999:
      print(j+1)
    x = np.array([X_train[j]])
    x = x.reshape(784,1)
    y = np.array([y_train[j]])
    y = y.reshape(10,1)
    forward_prop(x)
    pred = np.argmax(cache['o'])
    pred_acc.append(pred == np.argmax(y))
  return np.mean(pred_acc)


In [73]:
acc = pred_accuracy(X_test, y_test)
print('Accuracy on test set: ', acc)

1
5001
14000
Accuracy on test set:  0.9487857142857142


In [74]:
print(params)

{'w1': array([[-0.02167367,  0.00756273,  0.05073333, ..., -0.01016585,
        -0.07478265,  0.07854969],
       [ 0.00466357, -0.07668132,  0.02168547, ..., -0.03664479,
        -0.01869971,  0.03244802],
       [ 0.04384679,  0.00626101,  0.05553747, ..., -0.09110255,
         0.01037669, -0.03279341],
       ...,
       [-0.02173494,  0.04301976,  0.01278058, ..., -0.01614712,
        -0.00127388, -0.00106792],
       [-0.01479284, -0.03824029,  0.03688875, ..., -0.01194906,
         0.01501774,  0.06259521],
       [-0.02097972, -0.04208916,  0.00515604, ..., -0.0090167 ,
         0.04542296,  0.00038703]]), 'b1': array([[ 0.10249761],
       [ 0.17497143],
       [-0.05625298],
       [-0.00822584],
       [ 0.21414113],
       [ 0.00924411],
       [ 0.04733501],
       [-0.24582726],
       [ 0.21428819],
       [ 0.03406768],
       [-0.04952715],
       [ 0.09564579],
       [ 0.02726371],
       [-0.08369489],
       [ 0.46504353],
       [-0.30520455],
       [-0.0204573 ],

In [79]:
file = open('mnist_model.pkl', 'wb')
pickle.dump(params, file)
file.close()

In [80]:
file = open('mnist_model.pkl', 'rb')
out = pickle.load(file)

print(out)
file.close()

{'w1': array([[-0.02167367,  0.00756273,  0.05073333, ..., -0.01016585,
        -0.07478265,  0.07854969],
       [ 0.00466357, -0.07668132,  0.02168547, ..., -0.03664479,
        -0.01869971,  0.03244802],
       [ 0.04384679,  0.00626101,  0.05553747, ..., -0.09110255,
         0.01037669, -0.03279341],
       ...,
       [-0.02173494,  0.04301976,  0.01278058, ..., -0.01614712,
        -0.00127388, -0.00106792],
       [-0.01479284, -0.03824029,  0.03688875, ..., -0.01194906,
         0.01501774,  0.06259521],
       [-0.02097972, -0.04208916,  0.00515604, ..., -0.0090167 ,
         0.04542296,  0.00038703]]), 'b1': array([[ 0.10249761],
       [ 0.17497143],
       [-0.05625298],
       [-0.00822584],
       [ 0.21414113],
       [ 0.00924411],
       [ 0.04733501],
       [-0.24582726],
       [ 0.21428819],
       [ 0.03406768],
       [-0.04952715],
       [ 0.09564579],
       [ 0.02726371],
       [-0.08369489],
       [ 0.46504353],
       [-0.30520455],
       [-0.0204573 ],