## Neural Network
Implementation of fully connected neural network with a static architecture. 

- Input layer
- Dense hidden layer with 512 neurons, using relu as the activation function
- Dropout with a value of 0.2
- Dense hidden layer with 512 neurons, using relu as the activation function
- Dropout with a value of 0.2
- Output layer, using softmax as the activation function


The model uses categorical crossentropy as its loss function. 
Optimized the gradient descent using RMSProp, with a learning rate of 0.001 and a rho value of 0.9.
The evaluation of the model is using accuracy.
This was in an attempt to reproduce from scratch [example from the Keras documentation](https://keras.io/examples/mnist_mlp/).

In [2]:
import numpy as np
import random
import math
     


class NeuralNetwork(object):
        
    def __init__(self, epochs, learning_rate,X, Y,batch_size,iterations):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.iterations = iterations 
        self.X = X
        self.Y = Y
        self.output = np.zeros((self.batch_size,self.Y.shape[1]))
        self.dropout = 0.2
        self.w1 = np.random.randn(self.X.shape[1],512) * np.sqrt(1/(512+784))
        self.bias1 = np.random.randn(1,512)
        self.a1 = np.random.randn(self.batch_size,512)
        self.x_d1 = np.zeros((1,512)) #dropout vector 
        self.z1 = np.random.randn(self.batch_size,512)
        self.vdw1 = 0
        self.vdb1 = 0
        
        
        self.w2 = np.random.randn(self.w1.shape[1],512) * np.sqrt(1/(512+self.w1.shape[1]))
        self.bias2 = np.random.randn(1,512)
        self.z2 =  np.random.randn(self.batch_size,512)
        self.a2 = np.random.randn(self.batch_size,512)
        self.x_d2 = np.zeros((1,512)) #dropout vector
        self.vdw2 = 0
        self.vdb2 = 0
        
        self.w3 = np.random.randn(self.w2.shape[1],10) * np.sqrt(1/(10+self.w2.shape[1]))
        self.bias3 = np.random.randn(1,10)
        self.z3 = np.random.randn(self.batch_size,10)
        self.vdw3 = 0
        self.vdb3 = 0
        self.loss = []
        
        
    
    def fit(self,x_train,y_train):
        
        for e in range(self.epochs):
                self.X = x_train[0: (1)*self.batch_size,:].reshape(self.batch_size,x_train.shape[1])
                self.Y = y_train[0: (1)*self.batch_size,:].reshape(self.batch_size,y_train.shape[1])
      
                self.z1 = self.X.dot(self.w1) + self.bias1   
                self.a1 = np.maximum(self.z1,0)

                #dropout after layer 1
                active_nodes = int((1-self.dropout)*(self.a1).shape[1])
                active_node_list = sorted(random.sample(range(0,self.a1.shape[1]), active_nodes))
                self.x_d1[:,active_node_list] = 1

                self.z2 = ((self.a1* self.x_d1) @ (self.w2)) + self.bias2
                self.a2 = np.maximum(self.z2,0)

                #droput after layer2
                active_nodes = int((1-self.dropout)*(self.a2).shape[1])
                active_node_list = sorted(random.sample(range(0,(self.a2).shape[1]), active_nodes)) 
                self.x_d2[:,active_node_list] = 1 

                self.z3 = ((self.a2*self.x_d2) @ self.w3) +  self.bias3
                self.output = NeuralNetwork.softmax(self.z3)
                loss = -np.sum(self.Y*np.log(self.output))/self.X.shape[0]
                self.loss.append(loss)
        for itr in range(1,self.iterations):
            NeuralNetwork.backPropagation(self)
            self.X = x_train[(itr*self.batch_size): (itr+1)*self.batch_size,:].reshape(self.batch_size,x_train.shape[1])
            self.Y = y_train[(itr*self.batch_size): (itr+1)*self.batch_size,:].reshape(self.batch_size,y_train.shape[1])
            NeuralNetwork.feedForward(self)

    def feedForward(self):
        self.z1 = self.X.dot(self.w1) + self.bias1
        self.a1 = np.maximum(self.z1,0)
       
        #dropout after layer 1
        active_nodes = int((1-self.dropout)*(self.a1).shape[1])
        active_node_list = sorted(random.sample(range(0,self.a1.shape[1]), active_nodes))
        self.x_d1[:,active_node_list] = 1
        self.z2 = ((self.a1*self.x_d1) @ self.w2) + self.bias2
        self.a2 = np.maximum(self.z2,0)
        
        #droput after layer2
        active_nodes = int((1-self.dropout)*(self.a2).shape[1])
        active_node_list = sorted(random.sample(range(0,(self.a2).shape[1]), active_nodes))
        self.x_d2[:,active_node_list] = 1 
        self.z3 = ((self.a2*self.x_d2) @ self.w3) +  self.bias3
        self.output = NeuralNetwork.softmax(self.z3)

    def predict(self,X):
        z1 = X.dot(self.w1) + self.bias1
        a1 = np.maximum(z1,0)
        z2 = (a1 @ self.w2) + self.bias2
        a2 = np.maximum(z2,0)
        z3 = (a2 @ self.w3) +  self.bias3
        return NeuralNetwork.softmax(z3)

        
    def backPropagation(self):
        eps = 1e-8
        de_do3 = self.output - self.Y     
        din_w3 = self.a2
        delta3 = (de_do3)
        de_w3 = ((delta3.T @ din_w3).T)/self.X.shape[0]
        de_b3 = (np.sum((de_do3),axis=0,keepdims=True))
        self.vdw3 = self.vdw3 * 0.9 + (de_w3**2)*0.1
        self.vdb3 = self.vdb3 * 0.9 + (de_b3**2)*0.1

        de_do2 = delta3
        do2_din =  NeuralNetwork.reluDerivative(self.z2)
        din_w2 = self.a1
        delta2 =  (self.w3 @ de_do2.T).T * (do2_din)
        de_w2 =  (delta2.T @ din_w2)/self.X.shape[0]
        de_b2 =  (np.sum((self.w3 @ (de_do2.T @ do2_din)),axis=0,keepdims=True))
        self.vdw2 = self.vdw2 * 0.9 + (de_w2**2)*0.1
        self.vdb2 = self.vdb2 * 0.9 + (de_b2**2)*0.1
       
        de_do1 = delta2 @ self.w2
        do1_din =  NeuralNetwork.reluDerivative(self.z1)
        din_w1 = self.X
        de_w1 = (din_w1.T @ (do1_din * de_do1))/self.X.shape[0]
        de_b1 = np.sum((de_do1.T @ do1_din),axis=0,keepdims=True) 
        #(np.ones((1,512)) @ (de_do1.T @ do1_din))
        self.vdw1 = self.vdw1 * 0.9 + (de_w1**2)*0.1 
        self.vdb1 = self.vdb1 * 0.9 + (de_b1**2)*0.1
        
#update w3       
        self.w3 = self.w3 - self.learning_rate * de_w3/np.sqrt(self.vdw3+eps)
        self.bias3 = self.bias3 - self.learning_rate * de_b3/np.sqrt(self.vdb3+eps)

        
#update w2
        self.w2 = self.w2 - self.learning_rate * de_w2/ np.sqrt(self.vdw2+eps)
        self.bias2 = self.bias2 - self.learning_rate * de_b2/ np.sqrt(self.vdb2+eps)

    
#update w1       
        self.w1 = self.w1 - self.learning_rate * de_w1/np.sqrt(self.vdw1+eps)
        self.bias1 = self.bias1 - self.learning_rate * de_b1/np.sqrt(self.vdb1+eps)

       
    
    def reluDerivative(z):
        z[z<=0] = 0
        z[z>0] = 1
        return z

    def softmax(z):
        expo = np.exp(z - np.max(z, axis=1, keepdims=True))
        return expo/expo.sum(axis=1, keepdims=True)
     
    def evaluate(Y_pred,Y_actual):
        correct_predictions = 0
        for i in range(Y_pred.shape[0]):
            if np.argmax(np.asarray(Y_pred[i,:])) == np.argmax(np.asarray(Y_actual[i,:])):
                correct_predictions += 1
        return float(correct_predictions/Y_pred.shape[0])*100

Train your fully-connected neural network on the Fashion-MNIST dataset using 5-fold cross validation. Reporting accuracy on the folds, as well as on the test set.

In [4]:
import numpy as np
import keras
from keras.datasets import fashion_mnist
from sklearn.model_selection import KFold

# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

# convert class vectors to binary class matrices
num_classes = 10
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

X = np.concatenate((np.array(x_train),np.array(x_test)),axis = 0)
Y = np.concatenate((np.array(y_train),np.array(y_test)),axis = 0)

folds = KFold(n_splits=5, random_state=None, shuffle=False)
i=0
for train_index, test_index in folds.split(X):
    i = i+1
    X_train, X_test, Y_train, Y_test = X[train_index], X[test_index], Y[train_index], Y[test_index]
    n = NeuralNetwork(epochs=22, learning_rate=0.001,X=X_train, Y=Y_train,batch_size = 50 , iterations = 1120)
    n.fit(X_train,Y_train)
    Y_pred_train = n.predict(X_train)
    Y_pred_test = n.predict(X_test)
    print("Training accuracy for fold ",i," ",NeuralNetwork.evaluate(Y_pred_train,Y_train))
    print("Testing accuracy for fold ",i, " ",NeuralNetwork.evaluate(Y_pred_test,Y_test))

Training accuracy for fold  1   65.44821428571429
Testing accuracy for fold  1   65.69285714285714
Training accuracy for fold  2   69.28214285714286
Testing accuracy for fold  2   68.88571428571429
Training accuracy for fold  3   75.22142857142858
Testing accuracy for fold  3   75.12857142857143
Training accuracy for fold  4   65.89821428571429
Testing accuracy for fold  4   65.69285714285714
Training accuracy for fold  5   65.54642857142858
Testing accuracy for fold  5   65.39285714285714


Links referred : 
https://medium.com/@14prakash/back-propagation-is-very-simple-who-made-it-complicated-97b794c97e5c
https://towardsdatascience.com/a-look-at-gradient-descent-and-rmsprop-optimizers-f77d483ef08b


Dropout code inspired from : 
https://www.python-course.eu/neural_networks_with_dropout.php