In [None]:
## All Imports
import numpy as np
from tqdm import tqdm
from keras.datasets import fashion_mnist

In [None]:
## Activation functions and their derivatives

class Activation:
    """Hidden-layer and output-layer activation functions.

    ``activation`` selects the hidden-layer non-linearity ("sigmoid", "tanh"
    or "relu"); any other value is treated as "sigmoid".
    ``output_activation`` selects the output non-linearity; softmax is the
    only one implemented, so every value resolves to softmax.
    """

    def __init__(self,activation="sigmoid",output_activation="softmax") -> None:
        self.activation = activation
        self.output_activation = output_activation

    def get_activation(self, a):
        """Apply the hidden-layer activation element-wise to ``a``."""
        if self.activation == "relu":
            return np.maximum(a, 0)

        # Clip before exponentiating so np.exp cannot overflow.
        a = np.clip(a, -200, 200)
        if self.activation == "tanh":
            return np.tanh(a)

        # "sigmoid" and every unrecognised name fall back to the sigmoid.
        return 1 / (1 + np.exp(-a))

    def get_derivative(self, a):
        """Return the element-wise derivative of the hidden activation at ``a``.

        The argument is never modified, so callers can safely pass arrays
        (or views of arrays) that they still need afterwards.
        """
        if self.activation == "relu":
            # Build a fresh 0/1 mask instead of overwriting the input in place.
            return (np.asarray(a) > 0).astype(float)

        a = np.clip(a, -200, 200)
        if self.activation == "tanh":
            return 1 - np.tanh(a) ** 2

        # "sigmoid" and every unrecognised name: s * (1 - s), computing s once.
        sigmoid = 1 / (1 + np.exp(-a))
        return sigmoid * (1 - sigmoid)

    def get_output_activation(self, a):
        """Return the softmax of ``a``, normalised over every element of ``a``.

        The sum runs over the whole array, so ``a`` is expected to hold the
        scores of a single sample.
        """
        # Clip before exponentiating so np.exp cannot overflow.
        exponentials = np.exp(np.clip(a, -200, 200))
        return exponentials / np.sum(exponentials)
    

In [None]:
## Initializer class

class Initializer:
    """Creates the initial weights and biases of a fully connected network.

    ``number_of_layers`` counts the input and output layers, so a network
    with ``k`` hidden layers has ``k + 2`` layers and ``k + 1`` weight
    matrices. ``Weights[i]`` has shape (units in layer i+1, units in layer i)
    and ``Biases[i]`` has one entry per unit of layer i+1. All hidden layers
    share the same width.
    """

    def __init__(self, init_type="random") -> None:
        self.init_type = init_type

    def random_initialize(self, number_of_layers, nodes_per_hidden_layer, nodes_in_output_layer, input_layer_size=784):
        """Draw every weight and bias from the standard normal distribution.

        Returns a ``(Weights, Biases)`` pair of lists of arrays. With two or
        fewer layers there is no hidden layer: a single matrix maps the input
        directly to the output.
        """
        if number_of_layers <= 2:
            return [np.random.randn(nodes_in_output_layer, input_layer_size)], [np.random.randn(nodes_in_output_layer)]

        if number_of_layers == 3:
            Weights = [
                np.random.randn(nodes_per_hidden_layer, input_layer_size),
                np.random.randn(nodes_in_output_layer, nodes_per_hidden_layer),
            ]
            Biases = [np.random.randn(nodes_per_hidden_layer), np.random.randn(nodes_in_output_layer)]
            return Weights, Biases

        # Hidden-to-hidden matrices and hidden biases are drawn as stacked
        # arrays and then split into one array per layer.
        first_weights = np.random.randn(nodes_per_hidden_layer, input_layer_size)
        hidden_weights = np.random.randn(number_of_layers - 3, nodes_per_hidden_layer, nodes_per_hidden_layer)
        hidden_biases = np.random.randn(number_of_layers - 2, nodes_per_hidden_layer)
        last_weights = np.random.randn(nodes_in_output_layer, nodes_per_hidden_layer)
        last_biases = np.random.randn(nodes_in_output_layer)

        Weights = [first_weights] + list(hidden_weights) + [last_weights]
        Biases = list(hidden_biases) + [last_biases]
        return Weights, Biases

    def xavier_initialize(self,number_of_layers, nodes_per_hidden_layer, nodes_in_output_layer, input_layer_size=784):
        """Xavier/Glorot uniform initialisation.

        Each weight matrix is drawn from U(-limit, limit) with
        ``limit = sqrt(6 / (fan_in + fan_out))`` and every bias starts at
        zero. This also holds for a network without hidden layers
        (``number_of_layers <= 2``).

        Returns a ``(Weights, Biases)`` pair of lists of arrays.
        """
        if number_of_layers <= 2:
            layer_sizes = [input_layer_size, nodes_in_output_layer]
        else:
            hidden_sizes = [nodes_per_hidden_layer] * (number_of_layers - 2)
            layer_sizes = [input_layer_size] + hidden_sizes + [nodes_in_output_layer]

        Weights = []
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            limit = np.sqrt(6 / (fan_in + fan_out))
            Weights.append(np.random.uniform(-limit, limit, (fan_out, fan_in)))
        Biases = [np.zeros(size) for size in layer_sizes[1:]]
        return Weights, Biases

    def initialize(self,number_of_layers, nodes_per_hidden_layer, nodes_in_output_layer, input_layer_size=784):
        """Dispatch on ``init_type``; anything other than "xavier" uses random initialisation."""
        if self.init_type == "xavier":
            return self.xavier_initialize(number_of_layers, nodes_per_hidden_layer, nodes_in_output_layer, input_layer_size)
        # "random" and every unrecognised name use the random initialiser.
        return self.random_initialize(number_of_layers, nodes_per_hidden_layer, nodes_in_output_layer, input_layer_size)
    

In [None]:
## Preactivation class

class Preactivation:
    """Computes a layer's pre-activation from its weights, biases and input.

    Only the affine ("linear") map is implemented, so every value of
    ``pre_activation`` resolves to it.
    """

    def __init__(self, pre_activation = "linear") -> None:
        self.pre_activation = pre_activation

    def get_pre_activation(self, Weights, Biases, H):
        """Return ``Weights @ H + Biases``.

        Note the argument order: weights first, then biases, then the layer
        input ``H``.
        """
        weighted_input = np.matmul(Weights, H)
        return weighted_input + Biases
        

In [None]:
## Accuracy class

class Accuracy:
    """Classification accuracy for one-hot (or score-vector) labels."""

    def __init__(self) -> None:
        pass

    def get_accuracy(self, y_actual, y_predicted):
        """Return the percentage of samples whose arg-max classes agree.

        Sample ``i`` counts as correct when the position of the largest
        entry of ``y_actual[i]`` equals that of ``y_predicted[i]``.
        """
        sample_count = len(y_actual)
        matches = sum(
            1
            for idx in range(sample_count)
            if np.argmax(y_actual[idx]) == np.argmax(y_predicted[idx])
        )
        return (matches / sample_count) * 100

In [None]:
## Loss class

class Loss:
    """Loss functions over a batch of samples stored row-wise.

    ``loss_type`` is one of "cross_entropy", "mean_squared" or
    "root_mean_squared"; any other value is treated as "cross_entropy".
    """

    # Lower bound applied to a probability before taking its logarithm, so a
    # predicted probability of exactly 0 yields a large finite loss, not inf.
    _MIN_PROBABILITY = 1e-12

    def __init__(self, loss_type="cross_entropy") -> None:
        self.loss_type = loss_type

    def get_loss(self, y_actual, y_pred):
        """Compute the loss selected by ``loss_type``."""
        if self.loss_type == "mean_squared":
            return self.mean_squared_error(y_actual, y_pred)
        if self.loss_type == "root_mean_squared":
            return self.root_mean_squared(y_actual, y_pred)
        # "cross_entropy" and every unrecognised name.
        return self.cross_entropy_loss(y_actual, y_pred)

    def cross_entropy_loss(self, y_actual, y_pred):
        """Return the summed (not averaged) cross-entropy over all samples.

        For sample ``i`` the true class is the arg-max of ``y_actual[i]`` and
        the contribution is ``-log(y_pred[i][true_class])``.
        """
        true_class_probabilities = np.array(
            [y_pred[i][np.argmax(y_actual[i])] for i in range(len(y_actual))],
            dtype=float,
        )
        safe_probabilities = np.clip(true_class_probabilities, self._MIN_PROBABILITY, None)
        return -np.sum(np.log(safe_probabilities))

    def mean_squared_error(self, y_actual, y_pred):
        """Return the mean of the squared element-wise differences.

        ``y_actual`` and ``y_pred`` must have the same shape.
        """
        differences = np.asarray(y_actual, dtype=float) - np.asarray(y_pred, dtype=float)
        return np.mean(differences ** 2)

    def root_mean_squared(self, y_actual,y_pred):
        """Return the square root of :meth:`mean_squared_error`."""
        return np.sqrt(self.mean_squared_error(y_actual, y_pred))

In [None]:
## gradient_descent class
from tqdm import tqdm
class GradientDescent:
    """Mini-batch trainers for a fully connected feed-forward network.

    Conventions shared by all methods:

    * ``Weights[i]`` maps layer ``i`` to layer ``i + 1`` and is multiplied
      from the left (``Weights[i] @ H[i]``); ``Biases[i]`` holds one value
      per unit of layer ``i + 1``.
    * Inside forward/backward propagation a batch is stored column-wise
      (one column per sample). The ``gradient_descent_*`` trainers receive
      ``X`` and ``Y`` row-wise and transpose each batch slice themselves.
    * ``pre_activation`` is called as ``pre_activation(Weights, Biases, H)``,
      the parameter order of ``Preactivation.get_pre_activation``.
    * One call to a trainer performs a single pass over the data using
      ``X.shape[0] // batch_size`` full batches; a trailing partial batch
      is skipped.
    * Gradients are summed (not averaged) over the samples of a batch.

    ``optimizer`` is one of "sgd", "mini_batch", "mbgd" (momentum),
    "rmsprop" or "nagd" (Nesterov). "adam" and "nadam" are recognised but
    not implemented.
    """

    def __init__(self, optimizer) -> None:
        self.optimizer = optimizer

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _combine(prev, current, prev_factor=1, current_factor=1):
        """Return ``prev[i] * prev_factor + current[i] * current_factor`` for every layer."""
        return [p * prev_factor + c * current_factor for p, c in zip(prev, current)]

    def _batches(self, X, Y, batch_size):
        """Yield column-wise ``(X_batch, Y_batch)`` pairs for every full batch."""
        for itr in tqdm(range(X.shape[0] // batch_size)):
            rows = slice(itr * batch_size, (itr + 1) * batch_size)
            yield X[rows].transpose(), Y[rows].transpose()

    def _batch_gradients(self, X_batch, Y_batch, Weights, Biases, number_of_layers, batch_size, activation_function, output_activation, pre_activation, derivative):
        """Run one forward and one backward pass and return ``(delta_W, delta_B)``."""
        H, A, y_pred = self.forward_propagation(X_batch, Weights, Biases, number_of_layers, activation_function, output_activation, batch_size, pre_activation)
        return self.backward_propagation(H, A, Weights, Y_batch, y_pred, number_of_layers, derivative)

    # ------------------------------------------------------------------
    # Core building blocks
    # ------------------------------------------------------------------
    def update_weights_and_biases(self, learning_rate, Weights, Biases, delta_Weights, delta_Biases, l2_regularization_constant):
        """Apply one gradient step and return the updated ``(Weights, Biases)``.

        Weights receive L2 weight decay:
        ``W <- W - lr * (dW + l2 * W)``. Biases are not regularised:
        ``b <- b - lr * db``. A falsy ``l2_regularization_constant``
        (``0`` or ``None``) disables the decay. The entries of the passed
        lists are replaced, so the lists themselves are updated in place.
        """
        l2 = l2_regularization_constant or 0
        for i in range(len(Weights)):
            Weights[i] = Weights[i] - learning_rate * (delta_Weights[i] + l2 * Weights[i])
            Biases[i] = Biases[i] - learning_rate * delta_Biases[i]

        return Weights, Biases

    def forward_propagation(self, X, Weights, Biases, number_of_layers, activation, output_activation, batch_size, pre_activation, **params):
        """Propagate a column-wise batch ``X`` through the network.

        Returns ``(H, A, y_pred)`` where ``H[i]`` is the input to layer
        ``i`` (``H[0]`` is ``X``), ``A[i]`` is the pre-activation produced by
        ``Weights[i]``, and ``y_pred`` holds one output-activation column per
        sample.

        ``batch_size`` is accepted for interface compatibility; the number of
        samples is taken from the columns of ``X``. ``output_activation`` is
        applied to one sample (column) at a time.
        """
        X = np.asarray(X)
        if X.ndim == 1:
            # A single sample: treat it as a one-column batch.
            X = X.reshape(-1, 1)

        A = []
        H = [X]

        for i in range(number_of_layers - 2):
            # A column vector broadcasts the bias across every sample.
            bias_column = np.asarray(Biases[i]).reshape(-1, 1)
            A.append(pre_activation(Weights[i], bias_column, H[i]))
            H.append(activation(A[i]))

        bias_column = np.asarray(Biases[-1]).reshape(-1, 1)
        A.append(pre_activation(Weights[-1], bias_column, H[-1]))

        y_pred = np.array([output_activation(scores) for scores in A[-1].transpose()]).transpose()

        return H, A, y_pred

    def backward_propagation(self, H, A, W, y_actual, y_pred, number_of_layers, derivative, **params):
        """Back-propagate the output error and return ``(delta_W, delta_B)``.

        The output-layer gradient is ``y_pred - y_actual``. ``derivative`` is
        the element-wise derivative of the hidden activation; it is given a
        copy of each pre-activation, so ``A`` is left untouched even if the
        callable modifies its argument.
        """
        parameter_layers = number_of_layers - 1
        delta_W = [0] * parameter_layers
        delta_B = [0] * parameter_layers

        delta_A = y_pred - y_actual

        for k in reversed(range(parameter_layers)):
            # Gradient with respect to the parameters of layer k.
            delta_W[k] = np.matmul(delta_A, H[k].transpose())
            delta_B[k] = np.sum(delta_A, axis=1)

            if k == 0:
                break

            # Gradient with respect to the activations of the layer below ...
            delta_H = np.matmul(W[k].transpose(), delta_A)
            # ... and with respect to its pre-activations.
            delta_A = np.multiply(delta_H, derivative(np.array(A[k - 1], copy=True)))

        return delta_W, delta_B

    def run_gradient_descent(self, X, Y, learning_rate, num_of_layers, batch_size, Weights, Biases, activation, l2_reg_constant, beta, epsilon, output_activation, pre_activation, derivative):
        """Run one pass over ``(X, Y)`` with the configured optimizer.

        Returns the updated ``(Weights, Biases)``.

        Raises:
            NotImplementedError: for the "adam" and "nadam" optimizers.
            ValueError: for any other unrecognised optimizer name.
        """
        common = dict(X=X, Y=Y, learning_rate=learning_rate, number_of_layers=num_of_layers, batch_size=batch_size, Weights=Weights, Biases=Biases, activation_function=activation, l2_regularization_constant=l2_reg_constant, output_activation=output_activation, pre_activation=pre_activation, derivative=derivative)

        if self.optimizer in ("sgd", "mini_batch"):
            return self.gradient_descent_mini_batch(**common)
        if self.optimizer == "mbgd":
            return self.gradient_descent_mbgd(beta=beta, **common)
        if self.optimizer == "rmsprop":
            return self.gradient_descent_rmsprop(beta=beta, epsilon=epsilon, **common)
        if self.optimizer == "nagd":
            return self.gradient_descent_nagd(beta=beta, **common)
        if self.optimizer in ("adam", "nadam"):
            raise NotImplementedError("optimizer '{}' is not implemented".format(self.optimizer))
        raise ValueError("unknown optimizer: {!r}".format(self.optimizer))

    # ------------------------------------------------------------------
    # Optimizers
    # ------------------------------------------------------------------
    def gradient_descent_mini_batch(self,X, Y, learning_rate, number_of_layers,  batch_size, Weights, Biases, activation_function, l2_regularization_constant, output_activation, pre_activation,derivative):
        """Plain mini-batch gradient descent: ``W <- W - lr * dW``."""
        for X_batch, Y_batch in self._batches(X, Y, batch_size):
            delta_Weights, delta_Biases = self._batch_gradients(X_batch, Y_batch, Weights, Biases, number_of_layers, batch_size, activation_function, output_activation, pre_activation, derivative)
            Weights, Biases = self.update_weights_and_biases(learning_rate, Weights, Biases, delta_Weights, delta_Biases, l2_regularization_constant)

        return Weights, Biases

    def gradient_descent_mbgd(self, X, Y, learning_rate, number_of_layers,  batch_size, Weights, Biases, activation_function, l2_regularization_constant, beta, output_activation, pre_activation,derivative):
        """Momentum gradient descent: ``u <- beta * u + dW``, ``W <- W - lr * u``."""
        u_t_weights = [np.zeros_like(weight) for weight in Weights]
        u_t_biases = [np.zeros_like(bias) for bias in Biases]

        for X_batch, Y_batch in self._batches(X, Y, batch_size):
            delta_Weights, delta_Biases = self._batch_gradients(X_batch, Y_batch, Weights, Biases, number_of_layers, batch_size, activation_function, output_activation, pre_activation, derivative)

            u_t_weights = self._combine(u_t_weights, delta_Weights, prev_factor=beta)
            u_t_biases = self._combine(u_t_biases, delta_Biases, prev_factor=beta)

            Weights, Biases = self.update_weights_and_biases(learning_rate, Weights, Biases, u_t_weights, u_t_biases, l2_regularization_constant)

        return Weights, Biases

    def gradient_descent_rmsprop(self, X, Y, learning_rate, number_of_layers,  batch_size, Weights, Biases, activation_function, l2_regularization_constant, beta, epsilon, output_activation, pre_activation,derivative):
        """RMSProp: ``v <- beta * v + (1 - beta) * dW**2``, ``W <- W - lr * dW / (sqrt(v) + epsilon)``."""
        v_t_weights = [np.zeros_like(weight) for weight in Weights]
        v_t_biases = [np.zeros_like(bias) for bias in Biases]

        for X_batch, Y_batch in self._batches(X, Y, batch_size):
            delta_Weights, delta_Biases = self._batch_gradients(X_batch, Y_batch, Weights, Biases, number_of_layers, batch_size, activation_function, output_activation, pre_activation, derivative)

            v_t_weights = self._combine(v_t_weights, [np.square(g) for g in delta_Weights], prev_factor=beta, current_factor=1 - beta)
            v_t_biases = self._combine(v_t_biases, [np.square(g) for g in delta_Biases], prev_factor=beta, current_factor=1 - beta)

            scaled_Weights = [g / (np.sqrt(v) + epsilon) for g, v in zip(delta_Weights, v_t_weights)]
            scaled_Biases = [g / (np.sqrt(v) + epsilon) for g, v in zip(delta_Biases, v_t_biases)]

            Weights, Biases = self.update_weights_and_biases(learning_rate, Weights, Biases, scaled_Weights, scaled_Biases, l2_regularization_constant)

        return Weights, Biases

    def gradient_descent_nagd(self,X, Y, learning_rate, number_of_layers,  batch_size, Weights, Biases, activation_function, l2_regularization_constant, beta, output_activation, pre_activation,derivative):
        """Nesterov accelerated gradient descent.

        The gradient is evaluated at the look-ahead point
        ``W - lr * beta * u``; then ``u <- beta * u + dW`` and
        ``W <- W - lr * u``. The look-ahead is scaled by the learning rate
        because the history ``u`` is stored without it, so the momentum part
        of the real step is ``lr * beta * u``.
        """
        u_t_weights = [np.zeros_like(weight) for weight in Weights]
        u_t_biases = [np.zeros_like(bias) for bias in Biases]
        lookahead_factor = -learning_rate * beta

        for X_batch, Y_batch in self._batches(X, Y, batch_size):
            lookahead_Weights = self._combine(Weights, u_t_weights, current_factor=lookahead_factor)
            lookahead_Biases = self._combine(Biases, u_t_biases, current_factor=lookahead_factor)

            delta_Weights, delta_Biases = self._batch_gradients(X_batch, Y_batch, lookahead_Weights, lookahead_Biases, number_of_layers, batch_size, activation_function, output_activation, pre_activation, derivative)

            u_t_weights = self._combine(u_t_weights, delta_Weights, prev_factor=beta)
            u_t_biases = self._combine(u_t_biases, delta_Biases, prev_factor=beta)

            Weights, Biases = self.update_weights_and_biases(learning_rate, Weights, Biases, u_t_weights, u_t_biases, l2_regularization_constant)

        return Weights, Biases

    def gradient_descent_adam(self, *args, **kwargs):
        """Placeholder for the Adam optimizer; always raises NotImplementedError."""
        raise NotImplementedError("optimizer 'adam' is not implemented")

    def gradient_descent_nadam(self, *args, **kwargs):
        """Placeholder for the Nadam optimizer; always raises NotImplementedError."""
        raise NotImplementedError("optimizer 'nadam' is not implemented")

In [None]:
## FFNN model

class FeedForwardNeuralNetwork:
    """Fully connected classifier trained with ``GradientDescent``.

    The constructor splits ``(X, Y)`` into a training part (the first
    ``num_train`` samples) and a validation part (the rest), flattens every
    sample to a vector, divides the inputs by 255, one-hot encodes the
    integer labels into ``num_classes`` columns and initialises the weights.

    Parameters:
        X, Y: input samples (first axis indexes samples) and integer labels.
        epochs: number of passes over the training set.
        num_of_hidden_layer, size_of_layer: count and width of hidden layers.
        learning_rate, optimizer, batch_size, l2_reg_constant, beta, epsilon:
            forwarded to ``GradientDescent``.
        weight_init_type: forwarded to ``Initializer``.
        activation_function: hidden activation name for ``Activation``.
        loss_type: stored on the instance.
        num_train: number of leading samples used for training.
        num_classes: number of label classes.
    """

    def __init__(self, X, Y, epochs, num_of_hidden_layer, size_of_layer, learning_rate, optimizer, batch_size, l2_reg_constant, weight_init_type, activation_function, beta, epsilon, loss_type, num_train=54000, num_classes=10) -> None:
        self.X = X
        self.Y = Y
        self.epochs = epochs
        self.num_of_hidden_layer = num_of_hidden_layer
        self.num_of_layers = num_of_hidden_layer+2
        self.size_of_layer = size_of_layer
        self.learning_rate = learning_rate
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.weight_init_type = weight_init_type
        self.l2_reg_constant = l2_reg_constant
        self.activation_function = activation_function
        self.beta = beta
        self.epsilon = epsilon
        self.loss_type = loss_type

        self.X_train = self._flatten_and_scale(X[:num_train])
        self.X_validation = self._flatten_and_scale(X[num_train:])
        self.Y_train = self._one_hot(Y[:num_train], num_classes)
        self.Y_validation = self._one_hot(Y[num_train:], num_classes)

        initializer = Initializer(self.weight_init_type)
        self.Weights, self.Biases = initializer.initialize(self.num_of_layers,self.size_of_layer,self.Y_train.shape[1], self.X_train.shape[1])

    @staticmethod
    def _flatten_and_scale(samples):
        """Flatten each sample to a row vector and divide by 255."""
        samples = np.asarray(samples)
        # An explicit width (instead of -1) also works for an empty split.
        features = int(np.prod(samples.shape[1:]))
        return samples.reshape(samples.shape[0], features) / 255

    @staticmethod
    def _one_hot(labels, num_classes):
        """Return a (len(labels), num_classes) array with a 1 at each label index."""
        labels = np.asarray(labels, dtype=int).ravel()
        encoded = np.zeros((labels.shape[0], num_classes))
        encoded[np.arange(labels.shape[0]), labels] = 1
        return encoded

    def _predict_flat(self, X_flat):
        """Predict for already flattened and scaled samples stored row-wise.

        Returns an array with one row of class probabilities per sample.
        """
        gradient_descent = GradientDescent(self.optimizer)
        act = Activation(self.activation_function, "softmax")
        pre = Preactivation("linear")
        # forward_propagation works on column-wise batches, so transpose on
        # the way in and on the way out.
        _, _, Y_predict = gradient_descent.forward_propagation(X_flat.transpose(), self.Weights, self.Biases, self.num_of_layers, act.get_activation, act.get_output_activation, X_flat.shape[0], pre.get_pre_activation)
        return Y_predict.transpose()

    def train_model(self):
        """Train for ``epochs`` passes and return the final ``(Weights, Biases)``.

        Training uses the preprocessed training split. After every epoch the
        accuracy on the validation split is printed, when that split is not
        empty.
        """
        print("number of training datapoints:",self.X_train.shape[0])
        print("number of validation datapoints:",self.X_validation.shape[0])
        print("number of epochs:", self.epochs)
        print("number of hidden layers:", self.num_of_hidden_layer)
        print("size of hidden layers:", self.size_of_layer)
        print("learning rate:", self.learning_rate)
        print("optimizer:", self.optimizer)
        print("batch_size:", self.batch_size)
        print("l2 regularization constant:", self.l2_reg_constant)
        print("weights and biases initialization type:", self.weight_init_type)
        print("activation function:", self.activation_function)
        print("beta:", self.beta)
        print("epsilon:", self.epsilon)

        activation_class = Activation(self.activation_function,"softmax")
        derivative = activation_class.get_derivative
        activation = activation_class.get_activation
        output_activation = activation_class.get_output_activation

        pre_activation = Preactivation("linear").get_pre_activation

        accuracy = Accuracy()
        gradient_descent = GradientDescent(self.optimizer)

        for epoch in range(self.epochs):
            print("Epoch number", epoch+1, " started")
            # Train on the flattened, scaled inputs and one-hot labels, not
            # on the raw arrays handed to the constructor.
            self.Weights, self.Biases = gradient_descent.run_gradient_descent(self.X_train, self.Y_train, self.learning_rate, self.num_of_layers, self.batch_size, self.Weights, self.Biases, activation, self.l2_reg_constant, self.beta, self.epsilon, output_activation, pre_activation, derivative)
            print("Epoch ",epoch+1, " finished.")

            if self.X_validation.shape[0] > 0:
                Y_validation_predict = self._predict_flat(self.X_validation)
                print("validation accuracy after epoch", epoch+1, accuracy.get_accuracy(self.Y_validation, Y_validation_predict))

        return self.Weights, self.Biases

    def predict(self, X_test):
        """Return class probabilities for ``X_test``, one row per sample.

        Input whose second dimension is 28 is treated as raw image data: it
        is flattened and divided by 255 as in training. Any other input is
        assumed to be flattened and scaled already.
        """
        if X_test.shape[1]==28:
            X_test = X_test.reshape(X_test.shape[0],-1)/255
        return self._predict_flat(X_test)
