#MNIST
Our objective is to build a neural network for the classification of the MNIST dataset. This neural network will comprise two layers, each with 10 nodes, and an input layer with 784 nodes corresponding to the image pixels. The specific structure of the neural network is outlined below, where $X$ represents the input, $A^{[0]}$ denotes the first layer, $Z^{[1]}$ signifies the unactivated layer 1, $A^{[1]}$ stands for the activated layer 1, and so forth. The weights and biases are represented by $W$ and $b$ respectively:


<div align="center">

$A^{[0]}=X$

$Z^{[1]}=W^{[1]}A^{[0]}+b^{[1]}$

$A^{[1]}=\text{ReLU}(Z^{[1]})$

$Z^{[2]}=W^{[2]}A^{[1]}+b^{[2]}$

$A^{[2]}=\text{softmax}(Z^{[2]})$
</div>




You have the flexibility to create any function within or outside the class, allowing you to modify parameters as needed

In [8]:
#importing libraries
import pandas as pd
import numpy as np
from keras.datasets import mnist
import matplotlib.pyplot as plt
from sklearn import preprocessing

### Required functions

In [9]:
x=np.array([[[1], [2], [3]], [[4], [5], [6]]])
print(x.shape)
sumarr=np.sum(x, axis=1)
print(sumarr.shape)
print(sumarr)
print(x/sumarr.reshape((2,1,1)))

(2, 3, 1)
(2, 1)
[[ 6]
 [15]]
[[[0.16666667]
  [0.33333333]
  [0.5       ]]

 [[0.26666667]
  [0.33333333]
  [0.4       ]]]


In [10]:
# activation and loss functions
n=10000

def ReLU(x):
  return np.maximum(x, 0)

def derivative_ReLU(x):
  return np.where(x>=0, 1, 0)

def softmax(arr):
  return (np.exp(arr)).astype(np.float64)/(np.sum(np.exp(arr), axis=1).reshape(n,1,1)).astype(np.float64)

def CrossEntropy(pred, actual):
  sum=0.0
  for i in range(10):
    sum=sum+actual[i]*np.log(pred[i])
  return -1*sum

def AvgError(pred, actual):
  n_samples=len(pred)
  sum=0
  for i in range(n_samples):
    sum=sum-CrossEntropy(pred[i], actual[i])
  return -1*sum/float(n_samples)

In [11]:
#complete the class of neural network

class NN:
  def __init__(self):
      self.w1=np.zeros((n, 10, 784), dtype="float64")
      self.b1=np.zeros((n, 10, 1), dtype="float64")
      self.w2=np.zeros((n, 10, 10), dtype="float64")
      self.b2=np.zeros((n, 10, 1), dtype="float64")
      self.a0=np.zeros((n, 784, 1), dtype="float64")
      self.a1=np.zeros((n, 10,1), dtype="float64")
      self.a2=np.zeros((n, 10,1), dtype="float64")
      self.z1=np.zeros((n, 10,1), dtype="float64")
      self.z2=np.zeros((n, 10,1), dtype="float64")
      self.der_loss_w2=None
      self.der_loss_b2=None
      self.der_loss_w1=None
      self.der_loss_b1=None

  def forward_propagation(self, input):
      self.a0=input.reshape(n, 784, 1).astype(np.float64)
      self.z1=(np.matmul(self.w1, self.a0) + self.b1).astype(np.float64)
      self.a1=(ReLU(self.z1)).astype(np.float64)
      self.z2=(np.matmul(self.w2, self.a1) + self.b2).astype(np.float64)
      self.a2=softmax(self.z2).astype(np.float64)
      # print(type(self.a0[0][0][0]))

  def one_hot(self, y): #return a 0 vector with 1 only in the position corresponding to the value in test target
      return np.eye(10)[y].astype(np.float64)

  # def backward_propagation(self, t):
  #     t=t.reshape(n, 10, 1).astype(np.float64)
  #     sum=np.sum(np.exp(self.z2), axis=1).astype(np.float64)
  #     sum=sum.reshape(n,1,1)
  #     self.der_loss_b2=(np.exp(self.z2)/sum - t)
  #     self.der_loss_w2=(np.matmul(np.exp(self.z2), np.transpose(self.a1, (0,2,1)))/sum - np.matmul(t, np.transpose(self.a1, (0,2,1))))

  #     self.der_loss_b1=(derivative_ReLU(self.z1) * ((1/sum)*np.matmul(np.transpose(np.exp(self.z2), (0,2,1)), self.w2) - np.matmul(np.transpose(t, (0,2,1)), self.w2)).reshape(n, 10,1))
  #     self.der_loss_w1=(np.matmul(derivative_ReLU(self.z1), np.transpose(self.a0, (0,2,1))) * ((1/sum)*np.matmul(np.transpose(np.exp(self.z2), (0,2,1)), self.w2) - np.matmul(np.transpose(t, (0,2,1)), self.w2)).reshape(n, 10,1))

  def backward_propagation(self, t):
    """
    Performs backpropagation to calculate gradients of loss with respect to weights and biases.

    Args:
      t: Target output (one-hot encoded).
    """

    t = t.reshape(-1, 10, 1)  # Reshape target for compatibility

    # Output layer (softmax)
    sum_exp_z2 = np.sum(np.exp(self.z2), axis=1, keepdims=True)
    dz2 = (np.exp(self.z2) / sum_exp_z2) - t  # Derivative of softmax loss
    a1transpose=np.transpose(self.a1, (0,2,1))
    dw2 = [np.matmul(slice1, slice2) for slice1, slice2 in zip(dz2, a1transpose)]
    dw2=np.array(dw2)
    # dw2 = np.einsum(dz2, self.a1.T)  # Gradient of loss w.r.t. w2
    db2=(np.exp(self.z2)/sum_exp_z2 - t)
    # db2 = np.sum(dz2, axis=(0, 2), keepdims=True)  # Gradient of loss w.r.t. b2

    # Hidden layer (ReLU)
    w2transpose=np.transpose(self.w2, (0,2,1))
    dz1 = [np.matmul(slice1, slice2) for slice1, slice2 in zip(w2transpose, dz2)]
    dz1 = np.array(dz1)
    # dz1 = np.matmul(self.w2.T, dz2) * derivative_ReLU(self.z1)  # Backpropagate through ReLU
    a0transpose = np.transpose(self.a0, (0,2,1))
    print(dz1.shape)
    print(a0transpose.shape)
    dw1 = [np.matmul(slice1, slice2) for slice1, slice2 in zip(dz1, a0transpose)]
    dw1 = np.array(dw1)
    # dw1 = np.matmul(dz1, self.a0.T)  # Gradient of loss w.r.t. w1
    db1=(derivative_ReLU(self.z1) * ((1/sum_exp_z2)*np.matmul(np.transpose(np.exp(self.z2), (0,2,1)), self.w2) - np.matmul(np.transpose(t, (0,2,1)), self.w2)).reshape(n, 10,1))

    # db1 = np.sum(dz1, axis=(0, 2), keepdims=True)  # Gradient of loss w.r.t. b1
    self.der_loss_w2 = dw2
    self.der_loss_b2 = db2
    self.der_loss_w1 = dw1
    self.der_loss_b1 = db1


  def update_params(self, xtrain, ytrain):
      reqdy=self.one_hot(ytrain)
      self.forward_propagation(xtrain)
      self.backward_propagation(reqdy)
      self.gradient_descent()

  def get_accuracy(self, y_pred, y_actual):
      n_samples=len(y_pred)
      sum=0.0
      for i in range(n_samples):
        if y_pred[i]==y_actual[i]:
          sum+=1
      return float(sum)/n_samples

  def gradient_descent(self, lr=0.05, niter=1000):
    der_loss_w1avg=np.mean(self.der_loss_w1, axis=0).astype(np.float64)
    der_loss_b1avg=np.mean(self.der_loss_b1, axis=0).astype(np.float64)
    der_loss_w2avg=np.mean(self.der_loss_w2, axis=0).astype(np.float64)
    der_loss_b2avg=np.mean(self.der_loss_b2, axis=0).astype(np.float64)
    for _ in range(niter):
        self.w1=self.w1-lr*der_loss_w1avg
        self.b1=self.b1-lr*der_loss_b1avg
        self.w2=self.w2-lr*der_loss_w2avg
        self.b2=self.b2-lr*der_loss_b2avg

  def make_predictions(self, xtest):
      self.forward_propagation(xtest)
      print(self.a2)
      y_pred=np.argmax(self.a2, axis=1).reshape(-1)
      return y_pred



## main

In [12]:
nn=NN()
print(nn.one_hot([1,2,3]))


[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]


In [13]:
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
X_train=X_train.reshape(60000,784)/255.0
X_test=X_test.reshape(10000,784)/255.0


###preprocessing the data


###Model Training

In [14]:
 #training model using gradient descent
MyNetwork=NN()

for i in range(6):
  print("training iteration", i)
  MyNetwork.update_params(X_train[i*n:(i+1)*n], Y_train[i*n:(i+1)*n])


training iteration 0
(10000, 10, 1)
(10000, 1, 784)
training iteration 1
(10000, 10, 1)
(10000, 1, 784)
training iteration 2
(10000, 10, 1)
(10000, 1, 784)
training iteration 3
(10000, 10, 1)
(10000, 1, 784)
training iteration 4
(10000, 10, 1)
(10000, 1, 784)
training iteration 5
(10000, 10, 1)
(10000, 1, 784)


### Viewing Results


In [16]:
#viewing prediction for 10 random images in dataset
predictions=np.array([])
for i in range(1):
  print("test iteration", i)
  predictions=np.append(predictions, MyNetwork.make_predictions(X_test[i*n:(i+1)*n]))

predictions.reshape(-1)
acc=MyNetwork.get_accuracy(predictions, Y_test)
print(predictions)
print(acc)


test iteration 0
[[[4.73452336e-03]
  [1.37132472e-01]
  [4.59357628e-01]
  ...
  [1.00727118e-01]
  [4.65260808e-06]
  [7.96252256e-02]]

 [[4.73452336e-03]
  [1.37132472e-01]
  [4.59357628e-01]
  ...
  [1.00727118e-01]
  [4.65260808e-06]
  [7.96252256e-02]]

 [[4.73452336e-03]
  [1.37132472e-01]
  [4.59357628e-01]
  ...
  [1.00727118e-01]
  [4.65260808e-06]
  [7.96252256e-02]]

 ...

 [[4.73452336e-03]
  [1.37132472e-01]
  [4.59357628e-01]
  ...
  [1.00727118e-01]
  [4.65260808e-06]
  [7.96252256e-02]]

 [[4.73452336e-03]
  [1.37132472e-01]
  [4.59357628e-01]
  ...
  [1.00727118e-01]
  [4.65260808e-06]
  [7.96252256e-02]]

 [[4.73452336e-03]
  [1.37132472e-01]
  [4.59357628e-01]
  ...
  [1.00727118e-01]
  [4.65260808e-06]
  [7.96252256e-02]]]
[2. 2. 2. ... 2. 2. 2.]
0.1032
