In [None]:
# imports
import numpy as np
from keras.datasets import fashion_mnist


In [None]:
# All functions required

# Activation functions
# 1 Relu
def relu(a):
    """Scalar ReLU: return 0 when ``a`` is negative, otherwise ``a`` itself."""
    return 0 if a < 0 else a

def relu_vector(a):
    """Apply the scalar ``relu`` to each element of ``a``; the result is a Python list."""
    return [relu(value) for value in a]

# 2 Sigmoid
def sigmoid(a):
    """Logistic function 1 / (1 + e^(-a)); no clipping is applied to ``a``."""
    denominator = 1 + np.exp(-a)
    return 1 / denominator

def sigmoid_vector_old(a):
    """Element-by-element sigmoid via the scalar ``sigmoid``; returns a Python list."""
    return [sigmoid(value) for value in a]

def sigmoid_vector(a):
    """Vectorized sigmoid.

    The input is clipped to [-200, 200] before exponentiation, then
    1 / (1 + e^(-a)) is evaluated element-wise.
    """
    clipped = np.clip(a, -200, 200)
    return 1 / (1 + np.exp(-clipped))

# 3 Tanh
def tanh(a):
    """Hyperbolic tangent computed from exponentials.

    ``a`` is clipped to [-200, 200] first; the result is
    (e^a - e^-a) / (e^a + e^-a).
    """
    a = np.clip(a, -200, 200)
    exp_pos = np.exp(a)
    exp_neg = np.exp(-a)
    return (exp_pos - exp_neg) / (exp_pos + exp_neg)

def tanh_vector_old(a):
    """Element-by-element tanh via the scalar-style ``tanh``; returns a Python list."""
    return [tanh(value) for value in a]

def tanh_vector(a):
    """Vectorized tanh: clip to [-200, 200], then (e^a - e^-a) / (e^a + e^-a)."""
    clipped = np.clip(a, -200, 200)
    growth, decay = np.exp(clipped), np.exp(-clipped)
    return (growth - decay) / (growth + decay)

# Output Activation Function

# Softmax
def softmax(a):
    """Softmax over all elements of ``a``.

    Uses the max-subtraction trick instead of clipping: subtracting the
    largest logit leaves the result mathematically unchanged but keeps every
    exponent <= 0, so ``np.exp`` cannot overflow. Clipping to a fixed range
    made all logits beyond the bound tie with each other, which flattened the
    predicted distribution for confident outputs.

    Parameters
    ----------
    a : array-like of logits.

    Returns
    -------
    numpy.ndarray of the same shape whose entries sum to 1 (an empty input is
    returned as an empty array).
    """
    logits = np.asarray(a, dtype=float)
    if logits.size == 0:
        return logits
    exponentials = np.exp(logits - np.max(logits))
    return exponentials / np.sum(exponentials)

#-------------------------------------------------------
# utility functions

def hadamard_product(A,B):
    """Element-wise product of ``A`` and ``B`` as a Python list.

    Iterates over the length of ``A`` and indexes ``B`` at the same positions.
    """
    return [A[idx] * B[idx] for idx in range(len(A))]

def random_initialize(number_of_layers, nodes_per_hidden_layer, nodes_in_output_layer, input_layer_size=784):
    """Draw all weights and biases from a standard normal distribution.

    ``number_of_layers`` counts the input and output layers, so there are
    ``number_of_layers - 2`` hidden layers of ``nodes_per_hidden_layer`` nodes.

    Returns
    -------
    (Weights, Biases) : two lists with one entry per non-input layer. Each
    weight matrix has shape (nodes_out, nodes_in); each bias has shape
    (nodes_out,).
    """
    randn = np.random.randn
    hidden, out, inp = nodes_per_hidden_layer, nodes_in_output_layer, input_layer_size

    # No hidden layer: a single input -> output mapping.
    if number_of_layers <= 2:
        return [randn(out, inp)], [randn(out)]

    # Exactly one hidden layer.
    if number_of_layers == 3:
        Weights = [randn(hidden, inp), randn(out, hidden)]
        Biases = [randn(hidden), randn(out)]
        return Weights, Biases

    # Two or more hidden layers: the hidden-to-hidden matrices and all hidden
    # biases are drawn as stacked arrays and then split along the first axis.
    first_weights = randn(hidden, inp)
    hidden_weights = randn(number_of_layers - 3, hidden, hidden)
    hidden_biases = randn(number_of_layers - 2, hidden)
    last_weights = randn(out, hidden)
    last_bias = randn(out)

    Weights = [first_weights, *hidden_weights, last_weights]
    Biases = [*hidden_biases, last_bias]
    return Weights, Biases

def xavier_initialize(number_of_layers, nodes_per_hidden_layer, nodes_in_output_layer, input_layer_size=784):
    """Xavier/Glorot-uniform initialization.

    Each weight matrix is sampled uniformly from [-limit, limit] with
    ``limit = sqrt(6 / (fan_in + fan_out))``; biases start at zero.

    ``number_of_layers`` counts the input and output layers, so there are
    ``max(number_of_layers - 2, 0)`` hidden layers of
    ``nodes_per_hidden_layer`` nodes.

    Returns
    -------
    (Weights, Biases) : two lists with one entry per non-input layer. Each
    weight matrix has shape (fan_out, fan_in); each bias has shape (fan_out,).
    """
    hidden_count = max(number_of_layers - 2, 0)
    layer_sizes = [input_layer_size] + [nodes_per_hidden_layer] * hidden_count + [nodes_in_output_layer]

    Weights, Biases = [], []
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        limit = np.sqrt(6.0 / (fan_in + fan_out))
        Weights.append(np.random.uniform(-limit, limit, size=(fan_out, fan_in)))
        Biases.append(np.zeros(fan_out))
    return Weights, Biases




def get_accuracy(Y_actual, Y_predicted):
    """Percentage of rows whose arg-max agrees between ``Y_actual`` and ``Y_predicted``.

    The row count is taken from ``Y_actual``; the result is in the range 0-100.
    """
    total = len(Y_actual)
    hits = sum(
        1
        for row in range(total)
        if np.argmax(Y_actual[row]) == np.argmax(Y_predicted[row])
    )
    return (hits / total) * 100


def get_average_delta_WandB(delta_W_acc, delta_B_acc):
    """Combine per-sample gradients into one gradient set per layer.

    NOTE: despite the name, the gradients are *summed*, not averaged; no
    division by the number of samples takes place.

    The running totals are written into the first sample's lists
    (``delta_W_acc[0]`` and ``delta_B_acc[0]``), and those same list objects
    are returned.
    """
    running_W = delta_W_acc[0]
    running_B = delta_B_acc[0]

    for sample in range(1, len(delta_W_acc)):
        for layer in range(len(running_W)):
            running_W[layer] = np.add(running_W[layer], delta_W_acc[sample][layer])

        for layer in range(len(running_B)):
            running_B[layer] = np.add(running_B[layer], delta_B_acc[sample][layer])

    return running_W, running_B


#---------------------------------------------------

# Derivative functions

def derivative_sigmoid(a):
    """Derivative of the sigmoid at ``a``: sigmoid(a) * (1 - sigmoid(a))."""
    activated = sigmoid(a)
    return activated * (1 - activated)

def derivative_tanh(a):
    """Derivative of tanh at ``a``: 1 - tanh(a)^2."""
    squared = tanh(a) ** 2
    return 1 - squared

def derivative_relu(a):
    """Scalar ReLU derivative: 0 for ``a <= 0``, otherwise 1."""
    return 0 if a <= 0 else 1


#---------------------------------------------------
# Pre-activation function


def pre_activation(W, h, b):
    """Affine pre-activation of a layer: the matrix product ``W @ h`` plus bias ``b``."""
    weighted_input = np.matmul(W, h)
    return np.add(weighted_input, b)


#------------------------------------------------------

# forward propagation 


def forward_propagation(X, Weights, Biases, number_of_layers, activation_function):
    """Run one sample through the network.

    Parameters
    ----------
    X : input vector for a single sample.
    Weights, Biases : per-layer parameter lists.
    number_of_layers : layer count including input and output, so
        ``number_of_layers - 2`` hidden layers are evaluated.
    activation_function : "relu" or "tanh"; any other value selects sigmoid.

    Returns
    -------
    H : list of layer inputs, starting with ``X`` followed by each hidden
        activation.
    A : list of pre-activations, one per non-input layer.
    y_pred : softmax of the last pre-activation.
    """
    activation = (
        relu_vector if activation_function == "relu"
        else tanh_vector if activation_function == "tanh"
        else sigmoid_vector
    )

    A = []
    H = [X]
    for layer in range(number_of_layers - 2):
        layer_pre = pre_activation(Weights[layer], H[layer], Biases[layer])
        A.append(layer_pre)
        H.append(activation(layer_pre))

    # The output layer always uses the last weight/bias pair and softmax.
    A.append(pre_activation(Weights[-1], H[-1], Biases[-1]))
    y_pred = softmax(A[-1])

    return H, A, y_pred

#---------------------------------------------------------------------

# Backward Propagation

def backward_propagation(H, A, W, y_actual, y_pred, number_of_layers, activation_function):
    """Back-propagate the output error of one sample through the network.

    Parameters
    ----------
    H, A : layer inputs and pre-activations as returned by
        ``forward_propagation``.
    W : per-layer weight matrices.
    y_actual, y_pred : target and predicted output vectors; the output-layer
        gradient is taken as ``-(y_actual - y_pred)``.
    number_of_layers : layer count including input and output.
    activation_function : "relu" or "tanh"; any other value selects sigmoid.

    Returns
    -------
    (delta_W, delta_B) : lists with one gradient per non-input layer.
    """
    derivative = (
        derivative_relu if activation_function == "relu"
        else derivative_tanh if activation_function == "tanh"
        else derivative_sigmoid
    )

    layer_count = number_of_layers - 1
    delta_W = [0] * layer_count
    delta_B = [0] * layer_count

    # Gradient with respect to the output pre-activation.
    delta_A = -(y_actual - y_pred)

    for k in range(layer_count - 1, -1, -1):
        # Gradients with respect to this layer's parameters.
        delta_W[k] = np.outer(delta_A, H[k])
        delta_B[k] = delta_A

        if k == 0:
            break

        # Gradient with respect to the activation of the layer below ...
        delta_H = np.matmul(W[k].transpose(), delta_A)

        # ... and then with respect to that layer's pre-activation.
        slopes = [derivative(z) for z in A[k - 1]]
        delta_A = hadamard_product(delta_H, slopes)

    return delta_W, delta_B


#---------------------------------------------------------------


# gradient descent algorithms


def update_weights_and_biases(learning_rate, Weights, Biases, delta_Weights, delta_Biases, l2_regularization_constant):
    """Apply one gradient step with L2 decay to every layer.

    Each weight row and each bias entry ``p`` becomes
    ``p - learning_rate * grad - learning_rate * l2_regularization_constant * p``.
    The per-layer arrays are modified in place, and the same ``Weights`` and
    ``Biases`` lists are returned.
    """
    decay = learning_rate * l2_regularization_constant

    for layer in range(len(Weights)):
        layer_W = Weights[layer]
        for row in range(len(layer_W)):
            layer_W[row] = layer_W[row] - learning_rate * delta_Weights[layer][row] - (decay * layer_W[row])

        layer_B = Biases[layer]
        for node in range(len(layer_B)):
            layer_B[node] = layer_B[node] - learning_rate * delta_Biases[layer][node] - (decay * layer_B[node])

    return Weights, Biases


def gradient_descent_stochastic(X, Y, learning_rate, number_of_layers, batch_size, Weights, Biases, activation_function, l2_regularization_constant, beta=0, epsilon=0):
    """One pass of per-sample (stochastic) gradient descent over ``X``.

    The parameters are updated after every single sample. ``batch_size``,
    ``beta`` and ``epsilon`` are accepted but unused here.

    Returns the updated ``(Weights, Biases)``.
    """
    for sample in range(X.shape[0]):
        H, A, y_pred = forward_propagation(X[sample], Weights, Biases, number_of_layers, activation_function)
        delta_Weights, delta_Biases = backward_propagation(H, A, Weights, Y[sample], y_pred, number_of_layers, activation_function)
        Weights, Biases = update_weights_and_biases(learning_rate, Weights, Biases, delta_Weights, delta_Biases, l2_regularization_constant)
    return Weights, Biases

def gradient_descent_mini_batch(X, Y, learning_rate, number_of_layers,  batch_size, Weights, Biases, activation_function, l2_regularization_constant, beta=0, epsilon=0):
    """One pass of mini-batch gradient descent over ``X``.

    Per-sample gradients are collected; after every ``batch_size`` samples
    they are combined with ``get_average_delta_WandB`` and applied in a single
    update. Samples left over at the end form one final, smaller batch.
    ``beta`` and ``epsilon`` are accepted but unused here.

    Returns the updated ``(Weights, Biases)``.
    """
    pending_W = []
    pending_B = []

    for sample in range(X.shape[0]):
        H, A, y_pred = forward_propagation(X[sample], Weights, Biases, number_of_layers, activation_function)
        delta_Weights, delta_Biases = backward_propagation(H, A, Weights, Y[sample], y_pred, number_of_layers, activation_function)

        pending_W.append(delta_Weights)
        pending_B.append(delta_Biases)

        if (sample + 1) % batch_size == 0:
            batch_W, batch_B = get_average_delta_WandB(pending_W, pending_B)
            Weights, Biases = update_weights_and_biases(learning_rate, Weights, Biases, batch_W, batch_B, l2_regularization_constant)
            pending_W = []
            pending_B = []

    # Flush the trailing partial batch, if any.
    if pending_B and pending_W:
        batch_W, batch_B = get_average_delta_WandB(pending_W, pending_B)
        Weights, Biases = update_weights_and_biases(learning_rate, Weights, Biases, batch_W, batch_B, l2_regularization_constant)

    return Weights, Biases

def accumulate_history(prev, current, prev_factor=1, current_factor=1):
    """Blend ``current`` into ``prev`` layer by layer.

    For every index ``i`` this stores
    ``prev[i] * prev_factor + current[i] * current_factor`` back into
    ``prev[i]`` and finally returns ``prev`` (the same object, updated in
    place).

    Scaling goes through ``np.multiply`` so that entries given as plain Python
    lists are scaled numerically. With the ``*`` operator a list would be
    repeated (integer factor) or raise ``TypeError`` (float factor).
    """
    for layer in range(len(prev)):
        scaled_prev = np.multiply(prev[layer], prev_factor)
        scaled_current = np.multiply(current[layer], current_factor)
        prev[layer] = np.add(scaled_prev, scaled_current)

    return prev

def gradient_descent_momentum_based(X, Y, learning_rate, number_of_layers,  batch_size, Weights, Biases, activation_function, l2_regularization_constant, beta, epsilon=0):
    """One pass of momentum-based gradient descent over ``X``.

    A velocity ``u_t = beta * u_t + gradient`` is updated after every sample.
    The parameters are stepped along the current velocity after every
    ``batch_size`` samples, plus once more at the end when the sample count is
    not a multiple of ``batch_size``. ``epsilon`` is accepted but unused.

    Returns the updated ``(Weights, Biases)``.
    """
    # One zero buffer per layer. The layers have different shapes, so calling
    # np.zeros_like on the whole list would try to build a ragged array and
    # fail on current NumPy versions.
    u_t_weights = [np.zeros_like(layer) for layer in Weights]
    u_t_biases = [np.zeros_like(layer) for layer in Biases]

    sample_count = X.shape[0]
    for itr in range(sample_count):
        H, A, y_pred = forward_propagation(X[itr], Weights, Biases, number_of_layers, activation_function)
        delta_Weights, delta_Biases = backward_propagation(H, A, Weights, Y[itr], y_pred, number_of_layers, activation_function)

        # u_t = beta * u_t + gradient
        u_t_weights = accumulate_history(u_t_weights, delta_Weights, prev_factor=beta)
        u_t_biases = accumulate_history(u_t_biases, delta_Biases, prev_factor=beta)

        if (itr + 1) % batch_size == 0:
            Weights, Biases = update_weights_and_biases(learning_rate, Weights, Biases, u_t_weights, u_t_biases, l2_regularization_constant)

    # Apply the velocity once more for a trailing partial batch.
    if sample_count % batch_size != 0:
        Weights, Biases = update_weights_and_biases(learning_rate, Weights, Biases, u_t_weights, u_t_biases, l2_regularization_constant)

    return Weights, Biases


def square_each_term(a):
    """Return a list holding the element-wise square of every entry of ``a``."""
    return [np.multiply(term, term) for term in a]

def modify_deltas_RMSProp(v_t, w_t, epsilon):
    """Scale each entry of ``w_t`` by ``1 / (sqrt(v_t) + epsilon)``, layer by layer.

    Iterates over the length of ``v_t`` and returns the scaled entries as a list.
    """
    return [
        np.divide(w_t[layer], np.sqrt(v_t[layer]) + epsilon)
        for layer in range(len(v_t))
    ]

def gradient_descent_RMSProp(X, Y, learning_rate, number_of_layers,  batch_size, Weights, Biases, activation_function, l2_regularization_constant, beta, epsilon):
    """One pass of RMSProp-style gradient descent over ``X``.

    A running average of squared gradients,
    ``v_t = beta * v_t + (1 - beta) * gradient^2``, is updated after every
    sample. After every ``batch_size`` samples (and once more for a trailing
    partial batch) the most recent sample's gradient is divided by
    ``sqrt(v_t) + epsilon`` and applied.

    Returns the updated ``(Weights, Biases)``.
    """
    # One zero buffer per layer. The layers have different shapes, so calling
    # np.zeros_like on the whole list would try to build a ragged array and
    # fail on current NumPy versions.
    v_t_weights = [np.zeros_like(layer) for layer in Weights]
    v_t_biases = [np.zeros_like(layer) for layer in Biases]

    sample_count = X.shape[0]
    for itr in range(sample_count):
        H, A, y_pred = forward_propagation(X[itr], Weights, Biases, number_of_layers, activation_function)
        delta_Weights, delta_Biases = backward_propagation(H, A, Weights, Y[itr], y_pred, number_of_layers, activation_function)

        v_t_weights = accumulate_history(v_t_weights, square_each_term(delta_Weights), prev_factor=beta, current_factor=1-beta)
        v_t_biases = accumulate_history(v_t_biases, square_each_term(delta_Biases), prev_factor=beta, current_factor=1-beta)

        if (itr + 1) % batch_size == 0:
            Weights, Biases = update_weights_and_biases(
                learning_rate, Weights, Biases,
                modify_deltas_RMSProp(v_t_weights, delta_Weights, epsilon),
                modify_deltas_RMSProp(v_t_biases, delta_Biases, epsilon),
                l2_regularization_constant,
            )

    # Trailing partial batch: only reachable when at least one sample was
    # processed, so delta_Weights / delta_Biases are defined here.
    if sample_count % batch_size != 0:
        Weights, Biases = update_weights_and_biases(
            learning_rate, Weights, Biases,
            modify_deltas_RMSProp(v_t_weights, delta_Weights, epsilon),
            modify_deltas_RMSProp(v_t_biases, delta_Biases, epsilon),
            l2_regularization_constant,
        )

    return Weights, Biases


def modify_W_B_NAGD(u_t, w_t, beta):
    """Return the look-ahead parameters ``w_t[i] - beta * u_t[i]`` for each layer as a list.

    Iterates over the length of ``u_t``; neither input list is modified.
    """
    return [
        np.subtract(w_t[layer], beta * u_t[layer])
        for layer in range(len(u_t))
    ]


def gradient_descent_nesterov_accelarated(X, Y, learning_rate, number_of_layers,  batch_size, Weights, Biases, activation_function, l2_regularization_constant, beta, epsilon=0):
    """One pass of Nesterov-accelerated gradient descent over ``X``.

    For every sample the gradient is evaluated at the look-ahead point
    ``parameters - beta * velocity``, then folded into the velocity
    ``u_t = beta * u_t + gradient``. The parameters are stepped along the
    velocity after every ``batch_size`` samples, plus once more for a trailing
    partial batch. ``epsilon`` is accepted but unused.

    Returns the updated ``(Weights, Biases)``.
    """
    # One zero buffer per layer. The layers have different shapes, so calling
    # np.zeros_like on the whole list would try to build a ragged array and
    # fail on current NumPy versions.
    u_t_weights = [np.zeros_like(layer) for layer in Weights]
    u_t_biases = [np.zeros_like(layer) for layer in Biases]

    sample_count = X.shape[0]
    for itr in range(sample_count):
        # NOTE(review): the look-ahead shift is beta * u_t, while the actual
        # step is learning_rate * u_t — confirm the intended scaling.
        lookahead_weights = modify_W_B_NAGD(u_t_weights, Weights, beta)
        lookahead_biases = modify_W_B_NAGD(u_t_biases, Biases, beta)

        H, A, y_pred = forward_propagation(X[itr], lookahead_weights, lookahead_biases, number_of_layers, activation_function)
        # Back-propagate through the same look-ahead weights that produced
        # H and A, so the gradient belongs to the look-ahead point.
        delta_Weights, delta_Biases = backward_propagation(H, A, lookahead_weights, Y[itr], y_pred, number_of_layers, activation_function)

        # u_t = beta * u_t + gradient
        u_t_weights = accumulate_history(u_t_weights, delta_Weights, prev_factor=beta)
        u_t_biases = accumulate_history(u_t_biases, delta_Biases, prev_factor=beta)

        if (itr + 1) % batch_size == 0:
            Weights, Biases = update_weights_and_biases(learning_rate, Weights, Biases, u_t_weights, u_t_biases, l2_regularization_constant)

    # Apply the velocity once more for a trailing partial batch.
    if sample_count % batch_size != 0:
        Weights, Biases = update_weights_and_biases(learning_rate, Weights, Biases, u_t_weights, u_t_biases, l2_regularization_constant)

    return Weights, Biases

#-------------------------------------------------------------------------------------------

#training



def train_model(X, Y, epochs=1, num_of_hidden_layers=1, size_of_layers=4, learning_rate=0.1, optimizer="sgd", batch_size=4, l2_regularization_constant=0.001, weight_init_type="random", activation_function="sigmoid", beta=0, epsilon=1e-10):
    """Initialize a network and train it for ``epochs`` passes over ``(X, Y)``.

    Parameters
    ----------
    X, Y : 2-D arrays; ``X.shape[1]`` sets the input size and ``Y.shape[1]``
        the output size.
    epochs : number of full passes over the data.
    num_of_hidden_layers, size_of_layers : hidden-layer count and width.
    learning_rate, batch_size, l2_regularization_constant : passed through to
        the optimizer.
    optimizer : "mini_batch", "mbgd" (momentum), "rmsprop" or "nagd"; any
        other value selects per-sample stochastic gradient descent.
    weight_init_type : "random" selects ``random_initialize``; any other value
        selects ``xavier_initialize``.
    activation_function : hidden-layer activation name, passed through.
    beta, epsilon : optimizer hyperparameters, forwarded to the optimizer.

    Returns
    -------
    (Weights, Biases) after the final epoch.
    """
    initialize = random_initialize if weight_init_type == "random" else xavier_initialize

    if optimizer == "mini_batch":
        gradient = gradient_descent_mini_batch
    elif optimizer == "mbgd":
        gradient = gradient_descent_momentum_based
    elif optimizer == "rmsprop":
        gradient = gradient_descent_RMSProp
    elif optimizer == "nagd":
        gradient = gradient_descent_nesterov_accelarated
    else:
        gradient = gradient_descent_stochastic

    # The layer count includes the input and output layers.
    number_of_layers = num_of_hidden_layers + 2
    Weights, Biases = initialize(number_of_layers, size_of_layers, Y.shape[1], X.shape[1])

    for epoch in range(epochs):
        print("Epoch number", epoch+1, " started")
        # Forward the caller's beta/epsilon. Hard-coding zeros here disabled
        # momentum and left RMSProp dividing by sqrt(v) with no epsilon.
        Weights, Biases = gradient(X, Y, learning_rate, number_of_layers, batch_size, Weights, Biases, activation_function, l2_regularization_constant, beta=beta, epsilon=epsilon)
        print("Epoch ",epoch+1, " finished.")
    return Weights, Biases




#----------------------------------------------------------------------------------------

# testing 


def validate(X, Weights, Biases, activation_function="sigmoid"):
    """Predict the output distribution for a single input vector ``X``.

    Runs ``forward_propagation`` with the layer count derived from
    ``len(Weights) + 1`` and returns only the prediction.

    ``activation_function`` defaults to "sigmoid" (the same default as
    ``train_model``), so existing three-argument calls work. Pass the
    activation the model was trained with when it differs.
    """
    _, _, Y_pred = forward_propagation(
        X,
        Weights=Weights,
        Biases=Biases,
        number_of_layers=len(Weights) + 1,
        activation_function=activation_function,
    )
    return Y_pred

#-------------------------------------------------------------------------




In [None]:
# Taking input data and normalizing data

(trainX, trainY), (testX, testY) = fashion_mnist.load_data()

# Hold out everything from index 54000 onward as the validation split.
validateX = trainX[54000:]
validateY = trainY[54000:]
trainX = trainX[:54000]
trainY = trainY[:54000]

# Flatten each image into one row and scale the pixel values by 1/255.
trainx = trainX.reshape(trainX.shape[0], -1) / 255
validatex = validateX.reshape(validateX.shape[0], -1) / 255
testx = testX.reshape(testX.shape[0], -1) / 255

# One-hot encode the labels: row k of the 10x10 identity matrix is the
# encoding of class k, so indexing it with the label array encodes every
# sample at once. All three splits end up as 2-D float arrays.
one_hot_rows = np.eye(10)
trainy = one_hot_rows[trainY]
validatey = one_hot_rows[validateY]
testy = one_hot_rows[testY]


In [None]:
# Display one sample image for each class label

import matplotlib.pyplot as plt

cnt = 0
label_display_name = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]
label = []

# Scan from the very first sample (index 0) until every class has been shown.
for i in range(trainX.shape[0]):

    if trainY[i] not in label:
        cnt = cnt + 1
        label.append(trainY[i])
        plt.subplot(2, 5, cnt)
        # Insert the i-th image with the color map 'gray'
        plt.imshow(trainX[i], cmap=plt.get_cmap('gray'))
        plt.title(label_display_name[trainY[i]])

    if cnt == len(label_display_name):
        break

# Display the entire plot
plt.show()

In [None]:
## Training the model

# Hyperparameters. These must be plain values: a trailing comma after an
# assignment would wrap the value in a one-element tuple.
X = trainx
Y = trainy
epochs = 1
num_of_hidden_layers = 1
size_of_layers = 4
learning_rate = 0.1
optimizer = "sgd"
batch_size = 4
l2_regularization_constant = 0.001
weight_init_type = "random"
activation_function = "sigmoid"
beta = 0
epsilon = 1e-10


Weights, Biases = train_model(
                        X=X,
                        Y=Y,
                        epochs=epochs,
                        num_of_hidden_layers=num_of_hidden_layers,
                        size_of_layers=size_of_layers,
                        learning_rate=learning_rate,
                        optimizer=optimizer,
                        batch_size=batch_size,
                        l2_regularization_constant=l2_regularization_constant,
                        weight_init_type=weight_init_type,
                        activation_function=activation_function,
                        beta=beta,
                        epsilon=epsilon
                    )

In [None]:
# validation
# Predict on the flattened, normalized validation inputs (validatex), which
# match the format of the training inputs; validateX holds the raw images.
Y_predict = [
    validate(sample, Weights, Biases, activation_function)
    for sample in validatex
]

print("validation accuracy",get_accuracy(validatey,Y_predict))