In [1]:
import copy

import numpy as np
import pandas as pd
import seaborn as sb
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from torchvision import datasets,transforms
# Seed NumPy's global RNG: the np.random calls in this notebook (dense weight
# initialisation, per-epoch shuffling, dropout masks) all draw from it.
np.random.seed(42)

# Loading dataset

In [2]:


def load_data():
    """Build the FashionMNIST training and test datasets through torchvision.

    Both datasets are rooted at ``./data``, are created with ``download=True``
    and share a ``transforms.ToTensor()`` transform.

    Returns:
        tuple: ``(train_dataset, test_dataset)``.
    """
    to_tensor = transforms.ToTensor()
    shared_options = dict(root='./data', download=True, transform=to_tensor)

    # The two splits differ only in the ``train`` flag.
    train_split = datasets.FashionMNIST(train=True, **shared_options)
    test_split = datasets.FashionMNIST(train=False, **shared_options)
    return train_split, test_split



# preprocessing

In [3]:
def preprocess(train_dataset,test_dataset):
    """Flatten the images of both datasets into 2-D NumPy arrays.

    Each dataset's ``.data`` is unpacked as a three-dimensional
    ``(n, sizex, sizey)`` shape, reshaped to ``(n, sizex * sizey)`` and
    converted with ``.numpy()``.

    Args:
        train_dataset: object exposing the training images as ``.data``.
        test_dataset: object exposing the test images as ``.data``.

    Returns:
        tuple: ``(train_data, test_data)`` flattened arrays.
    """
    def _flatten(dataset):
        # One row per image: collapse the two spatial axes into one.
        count, height, width = dataset.data.shape
        return dataset.data.reshape(count, height * width).numpy()

    flat_train = _flatten(train_dataset)
    flat_test = _flatten(test_dataset)
    return flat_train, flat_test



In [4]:
#########    splitting   ##########
from sklearn.model_selection import train_test_split
def split(train_data,train_target):
    """Hold out 20% of the samples as a validation set.

    Delegates to ``train_test_split`` with ``test_size=0.2`` and
    ``random_state=42``.

    Args:
        train_data: feature rows.
        train_target: labels aligned with ``train_data``.

    Returns:
        tuple: ``(train_data, val_data, train_target, val_target)``.
    """
    parts = train_test_split(train_data, train_target, test_size=0.2, random_state=42)
    kept_data, held_out_data, kept_target, held_out_target = parts
    return kept_data, held_out_data, kept_target, held_out_target



In [5]:
########   one hot encoding  ########
def one_hot_encoding(y, num_classes=None):
    """Convert integer class labels into one-hot rows.

    Args:
        y: integer labels (array-like).
        num_classes: width of the encoding. When omitted it is inferred as
            ``max(y) + 1``. Pass it explicitly when several label sets must
            share one width: a subset that happens to lack the highest class
            would otherwise be encoded with fewer columns.

    Returns:
        numpy.ndarray: float array of shape ``y.shape + (num_classes,)``.

    Raises:
        ValueError: if ``y`` is empty and ``num_classes`` is not given, or if
            a label is not smaller than ``num_classes``.
    """
    labels = np.asarray(y)

    if labels.size == 0:
        if num_classes is None:
            raise ValueError("cannot infer the number of classes from empty labels")
        # Nothing to encode, but keep the trailing class axis for callers.
        return np.zeros(labels.shape + (num_classes,))

    highest = np.max(labels)
    if num_classes is None:
        num_classes = highest + 1
    elif highest >= num_classes:
        raise ValueError(
            "label %s does not fit into %s classes" % (highest, num_classes)
        )

    # Row k of the identity matrix is the one-hot vector for class k.
    return np.eye(num_classes)[labels]
    

In [6]:
########### cross entropy loss ##########
def CategoricalCrossEntropy(y_true, y_pred):
    """Cross-entropy between one-hot targets and predicted probabilities.

    Predictions are clipped to ``[1e-11, 1 - 1e-11]`` before the logarithm so
    that ``log(0)`` never occurs; the summed loss is divided by
    ``len(y_true)``.

    Args:
        y_true: target array (one row per sample).
        y_pred: predicted probabilities, same shape as ``y_true``.

    Returns:
        The loss averaged over ``len(y_true)``.
    """
    tiny = 1e-11
    safe_pred = np.clip(y_pred, tiny, 1. - tiny)
    summed_log_likelihood = np.sum(y_true * np.log(safe_pred))
    return -summed_log_likelihood / len(y_true)
    

In [7]:
#################   Adam optimizer   ################
def adam_optimizer(output_grad,m_t,v_t,t,w,learning_rate,beta1,beta2,epsilon):
    """Apply one Adam update to a parameter.

    Args:
        output_grad: gradient of the loss with respect to ``w``.
        m_t: previous first-moment estimate.
        v_t: previous second-moment estimate.
        t: step counter used for bias correction (callers start it at 1).
        w: current parameter value.
        learning_rate: step size.
        beta1: decay rate of the first moment.
        beta2: decay rate of the second moment.
        epsilon: constant added to the denominator.

    Returns:
        tuple: ``(updated_w, new_first_moment, new_second_moment)``.
    """
    first_moment = beta1 * m_t + (1 - beta1) * output_grad
    second_moment = beta2 * v_t + (1 - beta2) * output_grad**2

    # Bias-corrected estimates.
    first_corrected = first_moment / (1 - beta1**t)
    second_corrected = second_moment / (1 - beta2**t)

    step = learning_rate * first_corrected / (np.sqrt(second_corrected) + epsilon)
    return w - step, first_moment, second_moment


In [8]:
################  Dropout layer  ################
class Dropout:
    """Inverted-dropout layer.

    In training mode each activation is kept with probability
    ``1 - dropout_rate`` and the survivors are scaled by
    ``1 / (1 - dropout_rate)``; in evaluation mode the input passes through
    unchanged.
    """
    def __init__(self,dropout_rate):
        """
        Args:
            dropout_rate: probability of zeroing an activation, in ``[0, 1)``.

        Raises:
            ValueError: if ``dropout_rate`` is outside ``[0, 1)``. A rate of
                1 would make the rescaling factor divide by zero.
        """
        if not 0 <= dropout_rate < 1:
            raise ValueError("dropout_rate must be in [0, 1), got %s" % (dropout_rate,))
        self.dropout_rate=dropout_rate
        self.mask=None

    def forward(self,X,trainig=True):
        """Apply dropout to ``X``.

        Args:
            X: input activations.
            trainig: (sic; the keyword is kept for existing callers) when
                true a fresh mask is sampled, otherwise ``X`` is returned
                as is.

        Returns:
            The masked and rescaled activations, or ``X`` in evaluation mode.
        """
        if trainig:
            keep_probability = 1 - self.dropout_rate
            self.mask = np.random.binomial(1, keep_probability, X.shape) / keep_probability
            return X * self.mask
        # Forget any earlier training mask so that backward() matches the
        # identity mapping that was just applied.
        self.mask = None
        return X

    def backward(self,grad,learning_rate):
        """Propagate ``grad`` through the mask used by the last forward pass.

        When no mask is stored (no training-mode forward yet, an
        evaluation-mode forward, or after ``clear()``) the layer acted as the
        identity, so the gradient is returned unchanged instead of failing on
        ``grad * None``. ``learning_rate`` is unused; it keeps the signature
        uniform across layers.
        """
        if self.mask is None:
            return grad
        return grad*self.mask

    def clear(self):
        """Drop the cached mask."""
        self.mask=None

In [9]:
class batchNormalization:
    """Batch-normalisation layer whose scale and shift are trained with Adam.

    Training-mode forward passes normalise with the statistics of the current
    batch (axis 0) and update exponential running averages; other forward
    passes normalise with those running averages.
    """
    def __init__(self,input_size):
        # Learnable scale and shift.
        self.gamma = np.ones(input_size)
        self.beta = np.zeros(input_size)

        # Adam moment estimates for gamma and beta.
        self.gamma_m = np.zeros(input_size)
        self.gamma_v = np.zeros(input_size)
        self.beta_m = np.zeros(input_size)
        self.beta_v = np.zeros(input_size)

        # Adam hyper-parameters; epsilon also stabilises the normalisation.
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.epsilon = 1e-8
        self.t = 1

        # Statistics used outside of training, and their decay factor.
        self.running_mean = np.zeros(input_size)
        self.running_var = np.ones(input_size)
        self.momentum = 0.9

    def forward(self,x,training=True):
        """Normalise ``x`` and apply the learned scale and shift.

        Args:
            x: input batch; statistics are taken over axis 0.
            training: when equal to ``True`` batch statistics are used and the
                running averages are updated; otherwise the running averages
                are used.

        Returns:
            ``gamma * x_hat + beta``.
        """
        use_batch_statistics = (training == True)
        if use_batch_statistics:
            self.input = x
            self.mean = np.mean(x, axis=0)
            self.var = np.var(x, axis=0)
            self.x_hat = (x - self.mean) / np.sqrt(self.var + self.epsilon)

            # Blend the batch statistics into the running averages.
            keep = self.momentum
            self.running_mean = (keep * self.running_mean +
                                 (1 - keep) * self.mean)
            self.running_var = (keep * self.running_var +
                                (1 - keep) * self.var)
        else:
            self.x_hat = (x - self.running_mean) / np.sqrt(self.running_var + self.epsilon)

        self.y = self.gamma * self.x_hat + self.beta
        return self.y

    def _adam_step(self, name, gradient, learning_rate):
        """Update parameter ``name`` and its two Adam moments in place."""
        new_value, new_m, new_v = adam_optimizer(
            gradient,
            getattr(self, name + "_m"),
            getattr(self, name + "_v"),
            self.t,
            getattr(self, name),
            learning_rate,
            self.beta1,
            self.beta2,
            self.epsilon,
        )
        setattr(self, name, new_value)
        setattr(self, name + "_m", new_m)
        setattr(self, name + "_v", new_v)

    def backward(self,grad,learning_rate):
        """Back-propagate through the layer and update ``gamma`` and ``beta``.

        Uses the values cached by the last training-mode forward pass.

        Args:
            grad: gradient of the loss with respect to the layer output.
            learning_rate: step size handed to ``adam_optimizer``.

        Returns:
            Gradient of the loss with respect to the layer input.
        """
        m = grad.shape[0]
        centered = self.input - self.mean
        std = np.sqrt(self.var + self.epsilon)

        self.dgamma = np.sum(grad * self.x_hat, axis=0)
        self.dbeta = np.sum(grad, axis=0)

        # The input gradient is computed with the pre-update gamma.
        dx_hat = grad * self.gamma
        dvar = np.sum(dx_hat * centered * -0.5 * (self.var + self.epsilon)**-1.5, axis=0)
        dmean = np.sum(dx_hat * -1 / std, axis=0) + dvar * np.sum(-2 * centered, axis=0) / m
        dx = dx_hat / std + dvar * 2 * centered / m + dmean / m

        self._adam_step("gamma", self.dgamma, learning_rate)
        self._adam_step("beta", self.dbeta, learning_rate)
        self.t += 1
        return dx

    def clear(self):
        """Reset cached activations and optimiser state to ``None``.

        ``gamma``, ``beta``, ``epsilon`` and the running statistics are kept,
        which is everything a non-training forward pass reads.
        """
        for attribute in (
            "input", "mean", "var", "dgamma", "dbeta", "x_hat", "y",
            "gamma_m", "beta_m", "gamma_v", "beta_v",
            "beta1", "beta2", "t", "momentum",
        ):
            setattr(self, attribute, None)

     

In [10]:
class SoftMax:
    """Row-wise softmax output layer."""
    def forward(self, x):
        """Return softmax probabilities for each row of ``x``.

        The row maximum is subtracted before exponentiating so large logits
        do not overflow.
        """
        shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        self.activation = exps / np.sum(exps, axis=1, keepdims=True)
        return self.activation

    def backward(self,y_actual,learning_rate):
        """Return ``activation - y_actual``.

        Note that the argument is the target matrix rather than an upstream
        gradient; ``learning_rate`` is unused.
        """
        return self.activation - y_actual

    def clear(self):
        """Drop the cached activations."""
        self.activation = None
        

In [11]:
##########  ReLU  ##########
class ReLU:
    """Rectified linear unit activation."""
    def forward(self, x):
        """Cache ``x`` and return ``max(0, x)`` element-wise."""
        self.input = x
        rectified = np.maximum(0, x)
        return rectified

    def backward(self,output_gradient,learning_rate):
        """Pass the gradient only where the cached input was positive.

        ``learning_rate`` is unused; it keeps the signature uniform across
        layers.
        """
        gate = np.where(self.input > 0, 1, 0)
        return np.multiply(output_gradient, gate)

    def clear(self):
        """Drop the cached input."""
        self.input = None
        
   

In [12]:
####### dense layer ########
class dense:
    """Fully connected layer (``input @ weights + bias``) trained with Adam."""
    def __init__(self, input_size, output_size):
        """
        Args:
            input_size: number of input features.
            output_size: number of output units.
        """
        # Adam first- and second-moment estimates for weights and bias.
        self.m_weights = np.zeros((input_size, output_size))
        self.m_bias = np.zeros(output_size)
        self.v_weights = np.zeros((input_size, output_size))
        self.v_bias = np.zeros(output_size)
        self.beta1=0.9
        self.beta2=0.999
        self.epsilon=1e-8
        self.t=1
        # Xavier initialisation: normal draws scaled by sqrt(2 / (fan_in + fan_out)).
        self.weights = np.random.randn(input_size, output_size) * np.sqrt(2. / (input_size + output_size))
        self.bias = np.random.randn(output_size)

    def forward(self, input):
        """Cache ``input`` and return ``input @ weights + bias``."""
        self.input = input
        self.output = np.dot(input, self.weights) + self.bias
        return self.output

    def backward(self ,output_gradient, learning_rate):
        """Back-propagate through the layer and apply one Adam step.

        Args:
            output_gradient: per-sample gradient of the loss with respect to
                this layer's output, one row per sample.
            learning_rate: step size handed to ``adam_optimizer``.

        Returns:
            Gradient with respect to the layer input,
            ``output_gradient @ weights.T``, computed with the weights as
            they were before this update.
        """
        batch_size = output_gradient.shape[0]

        # Chain rule for y = xW + b: dL/dx = dL/dy . W^T. There is no factor
        # depending on the number of output units; dividing by it would
        # shrink the gradient layer after layer on its way to the input.
        backward_gradient = np.dot(output_gradient, self.weights.T)

        # Parameter gradients are averaged over the batch.
        weights_gradient = np.dot(self.input.T, output_gradient) / batch_size
        bias_gradient = np.mean(output_gradient, axis=0)

        self.weights, self.m_weights, self.v_weights = adam_optimizer(
            weights_gradient, self.m_weights, self.v_weights, self.t,
            self.weights, learning_rate, self.beta1, self.beta2, self.epsilon)
        self.bias, self.m_bias, self.v_bias = adam_optimizer(
            bias_gradient, self.m_bias, self.v_bias, self.t,
            self.bias, learning_rate, self.beta1, self.beta2, self.epsilon)
        self.t+=1

        return backward_gradient

    def clear(self):
        """Reset cached activations and optimiser state to ``None``.

        ``weights`` and ``bias`` are kept, which is all ``forward`` reads.
        """
        self.input=None
        self.output=None
        self.m_weights=None
        self.m_bias=None
        self.v_weights=None
        self.v_bias=None
        self.beta1=None
        self.beta2=None
        self.epsilon=None
        self.t=None

    

# Fnn model

In [13]:
class FNN:
    """Mini-batch trainer for a network expressed as a list of layer objects.

    Every layer must provide ``forward`` and ``backward(grad, learning_rate)``;
    layers update their own parameters inside ``backward``.
    """
    def __init__(self,train_X,train_Y,val_X,val_Y):
        """
        Args:
            train_X: training features, one row per sample.
            train_Y: one-hot training targets.
            val_X: validation features.
            val_Y: one-hot validation targets.
        """
        self.train_X=train_X
        self.train_Y=train_Y
        self.val_X=val_X
        self.val_Y=val_Y

    @staticmethod
    def _forward_pass(model, inputs, training):
        """Run ``inputs`` through every layer in the requested mode."""
        output = inputs
        for layer in model:
            if isinstance(layer, batchNormalization):
                output = layer.forward(output, training=training)
            elif isinstance(layer, Dropout):
                # Dropout spells its mode keyword "trainig".
                output = layer.forward(output, trainig=training)
            else:
                output = layer.forward(output)
        return output

    @staticmethod
    def _plot_curves(curves, ylabel, anchor, learning_rate):
        """Plot per-epoch curves and annotate them with the learning rate.

        ``curves`` is a list of ``(values, label)`` pairs; ``anchor`` is the
        series whose mid-range fixes the height of the annotation.
        """
        for values, label in curves:
            plt.plot(values, label=label)
        plt.xlabel('Epochs')
        plt.ylabel(ylabel)
        plt.text(15, (max(anchor) + min(anchor)) / 2, 'learning rate: %s' %(learning_rate))
        plt.legend()
        plt.show()

    def _plot_confusion_matrix(self, model):
        """Draw the validation confusion matrix of ``model`` as a heat map."""
        class_names = ["T-shirt/top","Trouser","Pullover","Dress","Coat","Sandal","Shirt","Sneaker","Bag","Ankle boot"]
        output = self._forward_pass(model, np.array(self.val_X), training=False)
        y_pred = np.argmax(output, axis=1)
        y_true = np.argmax(self.val_Y, axis=1)
        con_m = confusion_matrix(y_true, y_pred)
        con_m = pd.DataFrame(con_m, index=class_names, columns=class_names)
        plt.figure(figsize=(10,7))
        sb.heatmap(con_m, annot=True, fmt='d')
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()

    def train(self,model,epochs,batch_size,learning_rate):
        """Train ``model`` and return the snapshot with the best validation F1.

        After each epoch the model is evaluated on the validation set (batch
        normalisation and dropout in inference mode). Loss, accuracy and
        macro-F1 curves are plotted at the end, followed by the confusion
        matrix of the returned snapshot.

        Args:
            model: list of layers; it is trained in place.
            epochs: number of passes over the training data.
            batch_size: mini-batch size.
            learning_rate: step size forwarded to every layer's ``backward``.

        Returns:
            A deep copy of the layer list taken at the epoch with the highest
            validation macro-F1. If no epoch scores above 0 the (fully
            trained) ``model`` itself is returned.
        """
        training_loss=[]
        validation_loss=[]
        training_accuracy=[]
        validation_accuracy=[]
        validation_f1_score=[]
        best_f1_score=0
        best_model=model

        for _ in range(epochs):
            train_loss=0
            train_accuracy=0
            total=0
            batch_count=0

            # Reshuffle the training set every epoch.
            indices=np.random.permutation(self.train_X.shape[0])
            self.train_X=self.train_X[indices]
            self.train_Y=self.train_Y[indices]

            for start in range(0,self.train_X.shape[0],batch_size):
                batch_X=np.array(self.train_X[start:start+batch_size])
                batch_Y=np.array(self.train_Y[start:start+batch_size])
                output=self._forward_pass(model,batch_X,training=True)

                train_loss+=CategoricalCrossEntropy(batch_Y,output)
                batch_count+=1
                train_accuracy+=np.sum(np.argmax(output,axis=1)==np.argmax(batch_Y,axis=1))
                total+=len(batch_Y)

                # The last layer's backward receives the targets themselves.
                output_grad=batch_Y
                for layer in reversed(model):
                    output_grad=layer.backward(output_grad,learning_rate)

            training_loss.append(train_loss/batch_count)
            training_accuracy.append(train_accuracy/total)

            output=self._forward_pass(model,np.array(self.val_X),training=False)
            validation_loss.append(CategoricalCrossEntropy(self.val_Y,output))
            val_correct=np.sum(np.argmax(output,axis=1)==np.argmax(self.val_Y,axis=1))
            validation_accuracy.append(val_correct/len(self.val_Y))
            validation_f1=f1_score(np.argmax(self.val_Y,axis=1),np.argmax(output,axis=1),average='macro')
            validation_f1_score.append(validation_f1)

            if validation_f1 > best_f1_score:
                best_f1_score=validation_f1
                # The layers keep being updated in place by later epochs, so a
                # plain reference would always end up as the last-epoch model.
                # Snapshot the current state instead.
                best_model=copy.deepcopy(model)

        self._plot_curves(
            [(training_loss,'Training Loss'),(validation_loss,'Validation Loss')],
            'Loss',training_loss,learning_rate)
        self._plot_curves(
            [(training_accuracy,'Training Accuracy'),(validation_accuracy,'Validation Accuracy')],
            'accuracy',training_accuracy,learning_rate)
        self._plot_curves(
            [(validation_f1_score,'Validation F1 Score')],
            'F1 Score',validation_f1_score,learning_rate)

        self._plot_confusion_matrix(best_model)

        return best_model


            

            
          
        

## model Architecture

In [14]:
def get_model(type):
    """Build one of the predefined network architectures as a list of layers.

    Every hidden stage is ``dense -> batchNormalization -> ReLU`` optionally
    followed by ``Dropout``; the network ends with ``dense(..., 10)`` and
    ``SoftMax`` and takes 784 input features.

    Args:
        type: architecture id.
            0: hidden widths 128, 64 without dropout.
            1: hidden widths 128, 64 with dropout 0.2 after each stage.
            2: hidden widths 256, 128, 64 with dropout 0.3, 0.2, 0.2.

    Returns:
        list: the layers in forward order, or an empty list for any other id.
    """
    # (hidden width, dropout rate or None) for each hidden stage.
    architectures = (
        (0, ((128, None), (64, None))),
        (1, ((128, 0.2), (64, 0.2))),
        (2, ((256, 0.3), (128, 0.2), (64, 0.2))),
    )
    stages = next((spec for key, spec in architectures if type == key), None)

    model = []
    if stages is None:
        return model

    width = 784
    for units, drop_rate in stages:
        model.append(dense(width, units))
        model.append(batchNormalization(units))
        model.append(ReLU())
        if drop_rate is not None:
            model.append(Dropout(drop_rate))
        width = units

    model.append(dense(width, 10))
    model.append(SoftMax())
    return model

## Training

In [15]:
def FashionMNIST():
    """Train every architecture/learning-rate combination and pick the best.

    Loads and flattens the data, holds out a validation split, one-hot
    encodes the labels and divides the pixel values by 255. Each of the three
    ``get_model`` architectures is trained for 30 epochs with batch size 256
    at four learning rates; the trained models are then scored on the
    validation set by macro-F1.

    Returns:
        tuple: ``(best_model, test_data, y_test)`` where ``best_model`` is the
        first model with the highest validation macro-F1 (``None`` if no
        model scores above 0), ``test_data`` are the scaled test features and
        ``y_test`` the one-hot test labels.
    """
    train_dataset, test_dataset = load_data()
    train_pixels, test_pixels = preprocess(train_dataset, test_dataset)
    train_labels = train_dataset.targets.numpy()
    test_labels = test_dataset.targets.numpy()

    X_train, X_val, y_train, y_val = split(train_pixels, train_labels)
    y_train, y_val, y_test = (
        one_hot_encoding(labels) for labels in (y_train, y_val, test_labels)
    )
    X_train, X_val, test_data = (
        pixels / 255 for pixels in (X_train, X_val, test_pixels)
    )

    trainer = FNN(X_train, y_train, X_val, y_val)
    rates = [0.005, 0.001, 0.0005, 0.0001]
    architecture_count = 3
    candidates = [
        trainer.train(get_model(architecture), 30, 256, rate)
        for architecture in range(architecture_count)
        for rate in rates
    ]

    def _validation_f1(layers):
        # Inference-mode forward pass over the validation set.
        output = np.array(X_val)
        for layer in layers:
            if isinstance(layer, batchNormalization):
                output = layer.forward(output, training=False)
            elif isinstance(layer, Dropout):
                output = layer.forward(output, trainig=False)
            else:
                output = layer.forward(output)
        return f1_score(np.argmax(y_val, axis=1), np.argmax(output, axis=1), average='macro')

    winner, top_f1 = None, 0
    for candidate in candidates:
        score = _validation_f1(candidate)
        if score > top_f1:
            winner, top_f1 = candidate, score

    return winner, test_data, y_test
    
            


########## uncomment this to train the model ###########

# best_model,test_data,y_test=FashionMNIST()
# print("Best Model: ",best_model)

#### save in pickle file
# for layer in best_model:
#     layer.clear()
# import pickle
# with open('1905062_model.pkl','wb') as f:
#     pickle.dump(best_model,f)










## Testing

In [16]:
# ###### Read pickle file ######

# print("Testing")
# with open('1905062_model.pkl','rb') as f:
#     model=pickle.load(f)

# output=np.array(test_data)
# for layer in model:
    
#     if isinstance(layer,batchNormalization):
#         output=layer.forward(output,training=False)
#     elif isinstance(layer,Dropout):
#         output=layer.forward(output,trainig=False)
#     else:
#         output=layer.forward(output)
# test_accuracy=np.sum(np.argmax(output,axis=1)==np.argmax(y_test,axis=1))
# test_f1=f1_score(np.argmax(y_test,axis=1),np.argmax(output,axis=1),average='macro')
# print("Testing Accuracy: ",test_accuracy/len(y_test),"Testing_f1_Score: ",test_f1)
# con_m=confusion_matrix(np.argmax(y_test,axis=1),np.argmax(output,axis=1))
# con_m=pd.DataFrame(con_m,index=["T-shirt/top","Trouser","Pullover","Dress","Coat","Sandal","Shirt","Sneaker","Bag","Ankle boot"],
#                            columns=["T-shirt/top","Trouser","Pullover","Dress","Coat","Sandal","Shirt","Sneaker","Bag","Ankle boot"])
# plt.figure(figsize=(10,7))
# sb.heatmap(con_m,annot=True,fmt='d')
# plt.title('Confusion Matrix')
# plt.xlabel('Predicted')
# plt.ylabel('True')
    