In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
## Download the MNIST digits via Keras and rescale the pixel intensities.
## Dividing by 255.0 converts the raw integer images (uint8 per the Keras docs)
## into floating-point arrays; assuming the usual 0-255 range this maps them to [0, 1].

mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [3]:
import keras
from keras import utils

## Number of target classes (one per digit label passed to to_categorical below).
num_classes = 10

## One-hot encode the integer labels of both splits.
y_train_enc, y_test_enc = (
    utils.to_categorical(labels, num_classes) for labels in (y_train, y_test)
)

In [4]:
## Flatten every image into a single row of 784 features
## (the network's input layer size used further below).

x_train, x_test = (images.reshape(-1, 784) for images in (x_train, x_test))

In [5]:
class FF_Neural_Net():
    """Feed-forward network with one sigmoid hidden layer and a softmax output.

    The network is trained with mini-batch gradient descent. Public methods
    take inputs as ``(num_samples, num_features)`` arrays and one-hot targets
    as ``(num_samples, num_classes)`` arrays; internally every activation is
    stored with one column per sample, i.e. ``(num_units, num_samples)``.
    """

    def __init__(self, units, activation='sigmoid'):
        """Create the network and randomly initialise its parameters.

        Args:
            units: Sequence ``[n_input, n_hidden, n_output]`` giving the number
                of nodes per layer, e.g. ``[784, 100, 10]``.
            activation: Accepted for interface compatibility only. The hidden
                layer always uses the sigmoid, whatever value is passed.
        """
        self.units = units

        # Only the sigmoid is implemented so far; `activation` is not consulted.
        self.activation = self.sig

        # Dict with keys "W1", "b1", "W2", "b2".
        self.weights_and_biases = self.initialise()

        # Intermediate values of the latest forward pass ("X", "Z1", "A1", "Z2", "A2").
        self.cache = {}

    def sig(self, x, forward=True):
        """Sigmoid activation, or its derivative.

        Args:
            x: Pre-activation values (scalar or array).
            forward: If True return ``1 / (1 + exp(-x))``; otherwise return the
                derivative of the sigmoid evaluated at ``x``.

        Returns:
            Array (or scalar) of the same shape as ``x``.
        """
        s = 1/(1 + np.exp(-x))
        if forward:
            return s

        # s * (1 - s) is the same derivative as exp(-x) / (1 + exp(-x))**2 but
        # does not evaluate inf / inf (NaN) when exp(-x) overflows.
        return s * (1 - s)

    def softmax(self, x):
        """Column-wise softmax: ``exp(x) / sum(exp(x))`` over axis 0.

        Args:
            x: Logits with one column per sample.

        Returns:
            Array of the same shape as ``x`` whose columns each sum to 1.
        """
        # Shift every column by its own maximum. Softmax is invariant to a
        # per-column shift, and this keeps the largest exponent at exp(0) == 1
        # for each sample, so no column can underflow to 0 / 0.
        numerator = np.exp(x - x.max(axis=0, keepdims=True))
        denominator = np.sum(numerator, axis=0, keepdims=True)
        return numerator/denominator

    def initialise(self):
        """Build the initial parameter dictionary.

        Weights are drawn from a standard normal scaled by ``sqrt(1 / fan_in)``;
        biases start at zero.

        Returns:
            Dict with "W1" ``(n_hidden, n_x)``, "b1" ``(n_hidden, 1)``,
            "W2" ``(n_y, n_hidden)`` and "b2" ``(n_y, 1)``.
        """
        n_x = self.units[0]       ## input layer (our case: 784)
        n_hidden = self.units[1]  ## hidden layer (our case: 100)
        n_y = self.units[2]       ## output layer (our case: 10)

        weights_and_biases = {
            "W1": np.random.randn(n_hidden, n_x) * np.sqrt(1./n_x),       ## (n_hidden, n_x)
            "b1": np.zeros((n_hidden, 1)),                                ## (n_hidden, 1)
            "W2": np.random.randn(n_y, n_hidden) * np.sqrt(1./n_hidden),  ## (n_y, n_hidden)
            "b2": np.zeros((n_y, 1)),                                     ## (n_y, 1)
        }
        return weights_and_biases

    def forward(self, x):
        """Run a forward pass and cache every intermediate value.

        Args:
            x: Input batch of shape ``(num_samples, n_x)``.

        Returns:
            Softmax output of shape ``(n_y, num_samples)``.
        """
        params = self.weights_and_biases

        self.cache["X"] = x

        ## Z = W . X^T + b  (samples become columns from here on)
        self.cache["Z1"] = np.dot(params["W1"], self.cache["X"].T) + params["b1"]
        self.cache["A1"] = self.activation(self.cache["Z1"])

        self.cache["Z2"] = np.dot(params["W2"], self.cache["A1"]) + params["b2"]
        self.cache["A2"] = self.softmax(self.cache["Z2"])  ## softmax on the output layer

        return self.cache["A2"]

    def backward(self, y, output):
        """Compute batch-averaged gradients for the latest forward pass.

        Must be called after ``forward`` on the same batch, because it reads
        "X", "Z1" and "A1" from ``self.cache``.

        Args:
            y: One-hot targets of shape ``(num_samples, n_y)``.
            output: Network output of shape ``(n_y, num_samples)``.

        Returns:
            Dict of gradients keyed like ``self.weights_and_biases``; also
            stored on ``self.gradients``.
        """
        current_batch_size = y.shape[0]

        ## output layer: (softmax output - target), one column per sample
        dZ2 = output - y.T
        dW2 = (1./current_batch_size) * np.dot(dZ2, self.cache["A1"].T)
        db2 = (1./current_batch_size) * np.sum(dZ2, axis=1, keepdims=True)

        ## hidden layer: propagate through W2, then through the sigmoid derivative
        dA1 = np.dot(self.weights_and_biases["W2"].T, dZ2)
        dZ1 = dA1 * self.activation(self.cache["Z1"], forward=False)
        dW1 = (1./current_batch_size) * np.dot(dZ1, self.cache["X"])
        db1 = (1./current_batch_size) * np.sum(dZ1, axis=1, keepdims=True)

        self.gradients = {"W1": dW1, "b1": db1, "W2": dW2, "b2": db2}

        return self.gradients

    def loss(self, y, output):
        """Average cross-entropy: ``-sum(y * log(y_pred)) / num_samples``.

        Args:
            y: One-hot targets of shape ``(num_samples, n_y)``.
            output: Predicted probabilities of shape ``(n_y, num_samples)``.

        Returns:
            The mean loss as a float.
        """
        # Clip away exact zeros: log(0) = -inf, and 0 * -inf would turn the
        # whole sum into NaN even for entries where the target is 0.
        safe_output = np.clip(output, 1e-12, None)
        l_sum = np.sum(np.multiply(y.T, np.log(safe_output)))

        m = y.shape[0]

        return -(1./m) * l_sum

    def optimise(self, learning_rate=0.1, beta=.9):
        """Apply one plain gradient-descent step using ``self.gradients``.

        Args:
            learning_rate: Multiplier applied to every gradient.
            beta: Accepted for interface compatibility; currently unused
                (no momentum term is implemented).
        """
        for key in self.weights_and_biases:
            self.weights_and_biases[key] = (
                self.weights_and_biases[key] - learning_rate * self.gradients[key]
            )

    def accuracy(self, y, output):
        """Fraction of samples whose arg-max prediction matches the target.

        Args:
            y: One-hot targets of shape ``(num_samples, n_y)``.
            output: Network output of shape ``(n_y, num_samples)``.
        """
        return np.mean(np.argmax(y, axis=-1) == np.argmax(output.T, axis=-1))

    def train(self, x_train, y_train, x_test, y_test, num_epochs=30,
              batch_size=64, learning_rate=0.1, beta=.9):
        """Train with mini-batch gradient descent, printing metrics each epoch.

        Batches are taken in order (no shuffling). After every epoch the
        accuracy and loss on the full training and test sets are printed.

        Args:
            x_train: Training inputs, ``(num_samples, n_x)``.
            y_train: One-hot training targets, ``(num_samples, n_y)``.
            x_test: Test inputs, ``(num_test_samples, n_x)``.
            y_test: One-hot test targets, ``(num_test_samples, n_y)``.
            num_epochs: Number of passes over the training data.
            batch_size: Samples per mini-batch; the final batch may be smaller.
            learning_rate: Forwarded to ``optimise``.
            beta: Forwarded to ``optimise`` (which currently ignores it).
        """
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        num_samples = x_train.shape[0]

        for i in range(self.num_epochs):
            for begin in range(0, num_samples, self.batch_size):
                # The slice end is exclusive, so it is capped at num_samples
                # (not num_samples - 1): every sample is used and the last
                # batch can never be empty.
                end = min(begin + self.batch_size, num_samples)

                x = x_train[begin:end]
                y = y_train[begin:end]

                # Forward pass, backward pass, parameter update.
                output = self.forward(x)
                self.backward(y, output)
                self.optimise(learning_rate=learning_rate, beta=beta)

            # Evaluate performance on the full training data ...
            output = self.forward(x_train)
            train_acc = self.accuracy(y_train, output)
            train_loss = self.loss(y_train, output)

            # ... and on the test data.
            output = self.forward(x_test)
            test_acc = self.accuracy(y_test, output)
            test_loss = self.loss(y_test, output)

            print(f"Epoch {i+1}:, train acc={train_acc}, train loss={train_loss}, test acc={test_acc}, test loss={test_loss}")


In [6]:
## Build the 784-100-10 network and train it on the flattened, one-hot encoded data.
model_ff_nn = FF_Neural_Net(
    units=[784, 100, 10],
    activation='sigmoid',
)
model_ff_nn.train(
    x_train,
    y_train_enc,
    x_test,
    y_test_enc,
    batch_size=128,
    learning_rate=0.4,
    beta=.9,
)

Epoch 1:, train acc=0.8944666666666666, train loss=0.34758745741179775, test acc=0.903, test loss=0.333584849757094
Epoch 2:, train acc=0.9162833333333333, train loss=0.2796990552539154, test acc=0.9186, test loss=0.271458833549335
Epoch 3:, train acc=0.92825, train loss=0.24094238145658634, test acc=0.9294, test loss=0.23640597139890115
Epoch 4:, train acc=0.9379, train loss=0.2126613734462774, test acc=0.9381, test loss=0.21086504388541666
Epoch 5:, train acc=0.9444166666666667, train loss=0.19065557311904396, test acc=0.9438, test loss=0.19123255880834092
Epoch 6:, train acc=0.94925, train loss=0.1729626963517057, test acc=0.9469, test loss=0.17576393213629002
Epoch 7:, train acc=0.9537833333333333, train loss=0.1584312965496865, test acc=0.9503, test loss=0.16333495454778663
Epoch 8:, train acc=0.9576666666666667, train loss=0.14627829112967689, test acc=0.9531, test loss=0.15315266479947612
Epoch 9:, train acc=0.9610166666666666, train loss=0.13593551045726904, test acc=0.9561, te