# INM 702 Coursework Code: Task 3
## Implementation of Softmax classifier
### By: Jasveen Kaur and Nikhil Vallakati

Importing the necessary packages for matrix computation

In [63]:
import numpy as np
import matplotlib.pyplot as plt

Defining a softmax classifier function:

In [64]:
def softmax_basic(z):
        exps = np.exp(z)
        sums = np.sum(exps)
        return np.divide(exps, sums)

    
def softmax_grad(s): 
    #a = np.diag(np.diag(s))
    S_vector = np.diag(s)
    S_matrix = np.transpose(S_vector)
    return np.diag(s) - (S_matrix * np.transpose(S_matrix))

Defining a sample input array (6x3) along with its label (6x1) (same as task 1 and task 2) to implement forward and backward pass:
2 more input sets were defined, one with higher input values and the other with high negative input value, to examine the numerical computation problem that softmax classifier undergoes.

In [66]:
input_set = np.array([[0,1,0],
                      [0,0,1],
                      [1,0,0],
                      [1,1,0],
                      [1,1,1],
                      [0,1,1],
                     ])#Dependent variable
input_set2 = np.array([[-2000,-6000,-2000],
                      [-2000,-2000,-6000],
                      [-6000,-2000,-2000],
                      [-6000,-6000,-2000],
                      [-6000,-6000,-6000],
                      [-2000,-6000,-6000],
                     ])#Dependent variable
input_set3 = np.array([[2000,6000,2000],
                      [2000,2000,6000],
                      [6000,2000,2000],
                      [6000,6000,2000],
                      [6000,6000,6000],
                      [2000,6000,6000],
                     ])#Dependent variable
labels = np.array([[1,
                    0,
                    0,
                    1,
                    1,
                    0,]])
labels = labels.reshape(6,1)

Forward and backward pass on the above defined array, with Relu on input layer and softmax classifier on the output layer to calculate probabilities and backpropagate through the model.

In [107]:
class neural_network(object):

    def __init__(self, n_hidden, epochs, lr, seed):

        self.random = np.random.RandomState(seed)
        self.n_hidden = n_hidden
        self.epochs = epochs
        self.lr = lr
        self.seed = seed
    
    #relu activation function
    def relu(self, x):
        return np.maximum(0, x) 
    
    #derivative of relu activation function(element)
    def relu_d_element(self, x):
        if x > 0:
            return 1
        elif x <= 0:
            return 0
    
    #derivative of relu activation function(array)
    def relu_d_array(self, x):
        x[x<=0] = 0
        x[x>0] = 1
        return x
        
    #softmax function definition   
    def softmax_basic(self, x):
        exp_scores = np.exp(x)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        return probs

    #softmax to calculate gradient
    def softmax_grad(self, s): 
        #a = np.diag(np.diag(s))
        S_vector = np.diag(s)
        S_matrix = np.transpose(S_vector)
        return np.diag(s) - (S_matrix * np.transpose(S_matrix))
    
    #forward pass
    def forward_pass(self, X):
      
        z1 = np.dot(X, self.w1) + self.b1
        a1 = self.relu(z1)
        
        z_out = np.dot(a1, self.w_out) + self.b_out
        a_out = self.softmax_basic(z_out)

        return z1, a1, z_out, a_out   

    #computing the loss term
    def compute_cost(self, y_enc, a_out):
        
        #Normal loss function   
        #term1 = a_out - y_enc 
        #cost = term1.sum()
        #return cost
    
        #Cross Entropy Loss
        y_enc = y_enc.argmax(axis=1)
        m = y_enc.shape[0]
        log_probs = -np.log(a_out[range(m),y_enc])
        loss = np.sum(log_probs)/m
        return loss
    
    #predicting the output
    def predict_out(self, X):
        z1, a1,z_out, a_out = self.forward_pass(X)
        y_pred = np.argmax(a_out, axis=1)
        return y_pred
    
    #calculating the accuracy 
    def accuracy(self, y, y_pred, X):
        return ((np.sum(y.T == y_pred)).astype(np.float) / X.shape[0])
    
    #Training the network
    def train(self, X_train, y_train):
        
        n_output = np.unique(y_train).shape[0]
        n_features = X_train.shape[1]

        #Initializing the weights
        
        #hidden layer
        self.b1 = np.zeros(self.n_hidden)
        self.w1 = self.random.normal(loc=0.0, scale=0.1,size=(n_features, self.n_hidden))
        
        #output layer
        self.b_out = np.zeros(n_output)
        self.w_out = self.random.normal(loc=0.0, scale=0.1, size=(self.n_hidden, n_output))
        
        self.results = {'cost': [], 'train_acc': []}  
        
        #training epochs
        for i in range(self.epochs):
            
            #forward propagation
            z1, a1, z_out, a_out = self.forward_pass(X_train)
            #data_cost = self.compute_cost(y_train, a_out)
            
            # Backpropagation
            
            sigma_out = a_out - labels #sigma_out = error
            relu_derivative = self.relu_d_array(a1)
            sigma_h1 = (np.dot(sigma_out, self.w_out.T) * relu_derivative)
            
            grad_w1 = np.dot(X_train.T, sigma_h1)
            grad_b1 = np.sum(sigma_h1, axis=0)

            grad_w_out = np.dot(a1.T, sigma_out)
            grad_b_out = np.sum(sigma_out, axis=0)

            delta_w1 = grad_w1
            delta_w_out = grad_w_out  

            delta_b1 = grad_b1
            delta_b_out = grad_b_out
                
             #updating the weights
            self.w1 -= self.lr * delta_w1
            self.w_out -= self.lr * delta_w_out

            self.b1 -= self.lr * delta_b1           
            self.b_out -= self.lr * delta_b_out
            
            
            #evaluating the trained model with updated weights
            z1, a1, z_out, a_out = self.forward_pass(X_train)
            
            cost = self.compute_cost(y_enc=labels, a_out=a_out)
            y_train_pred = self.predict_out(X_train)

            train_acc = self.accuracy(y_train, y_train_pred, X_train)
            
            print("epoch:", i+1)
            print("Accuracy:","{:.2f}".format(train_acc*100),"% ||","loss:","{:.3f}".format(cost))
            
            self.results['cost'].append(cost)
            self.results['train_acc'].append(train_acc)
            
        return self

Initializing the parameters and propagating through one layer network for input_set

In [111]:
Model = neural_network(n_hidden=7, epochs=20, lr=0.0001, seed=1)
Model.train(X_train=input_set, y_train=labels)

epoch: 1
Accuracy: 66.67 % || loss: 0.693
epoch: 2
Accuracy: 66.67 % || loss: 0.693
epoch: 3
Accuracy: 66.67 % || loss: 0.693
epoch: 4
Accuracy: 66.67 % || loss: 0.693
epoch: 5
Accuracy: 66.67 % || loss: 0.693
epoch: 6
Accuracy: 66.67 % || loss: 0.693
epoch: 7
Accuracy: 66.67 % || loss: 0.693
epoch: 8
Accuracy: 66.67 % || loss: 0.693
epoch: 9
Accuracy: 66.67 % || loss: 0.693
epoch: 10
Accuracy: 66.67 % || loss: 0.693
epoch: 11
Accuracy: 66.67 % || loss: 0.693
epoch: 12
Accuracy: 66.67 % || loss: 0.693
epoch: 13
Accuracy: 66.67 % || loss: 0.693
epoch: 14
Accuracy: 66.67 % || loss: 0.693
epoch: 15
Accuracy: 66.67 % || loss: 0.693
epoch: 16
Accuracy: 66.67 % || loss: 0.693
epoch: 17
Accuracy: 66.67 % || loss: 0.693
epoch: 18
Accuracy: 66.67 % || loss: 0.693
epoch: 19
Accuracy: 66.67 % || loss: 0.693
epoch: 20
Accuracy: 66.67 % || loss: 0.693


<__main__.neural_network at 0x21c89585860>

Initializing the parameters and propagating through one layer network for input_set2 (High negative values)

In [114]:
Model = neural_network(n_hidden=7, epochs=80, lr=0.0001, seed=1)
Model.train(X_train=input_set2, y_train=labels)

epoch: 1
Accuracy: 50.00 % || loss: 95.002
epoch: 2
Accuracy: 50.00 % || loss: 27.516
epoch: 3
Accuracy: 50.00 % || loss: 14.339
epoch: 4
Accuracy: 50.00 % || loss: 29.510
epoch: 5
Accuracy: 66.67 % || loss: 14.899
epoch: 6
Accuracy: 50.00 % || loss: 36.413
epoch: 7
Accuracy: 50.00 % || loss: 22.890
epoch: 8
Accuracy: 50.00 % || loss: 33.285
epoch: 9
Accuracy: 50.00 % || loss: 37.685
epoch: 10
Accuracy: 50.00 % || loss: 27.232
epoch: 11
Accuracy: 50.00 % || loss: 32.071
epoch: 12
Accuracy: 33.33 % || loss: 25.222
epoch: 13
Accuracy: 50.00 % || loss: 21.154
epoch: 14
Accuracy: 50.00 % || loss: 34.921
epoch: 15
Accuracy: 66.67 % || loss: 19.297
epoch: 16
Accuracy: 50.00 % || loss: 54.204
epoch: 17
Accuracy: 50.00 % || loss: 27.830
epoch: 18
Accuracy: 50.00 % || loss: 54.412
epoch: 19
Accuracy: 50.00 % || loss: 28.096
epoch: 20
Accuracy: 66.67 % || loss: 45.938
epoch: 21
Accuracy: 50.00 % || loss: 35.901
epoch: 22
Accuracy: 66.67 % || loss: 50.895
epoch: 23
Accuracy: 50.00 % || loss: 37.1



<__main__.neural_network at 0x21c896559e8>

Initializing the parameters and propagating through one layer network for input_set3 (High positive values)

In [115]:
Model = neural_network(n_hidden=7, epochs=80, lr=0.0001, seed=1)
Model.train(X_train=input_set3, y_train=labels)

epoch: 1
Accuracy: 66.67 % || loss: 3.117
epoch: 2
Accuracy: 50.00 % || loss: 2.768
epoch: 3
Accuracy: 50.00 % || loss: 0.000
epoch: 4
Accuracy: 33.33 % || loss: 5.725
epoch: 5
Accuracy: 50.00 % || loss: 0.074
epoch: 6
Accuracy: 16.67 % || loss: 9.015
epoch: 7
Accuracy: 16.67 % || loss: 6.125
epoch: 8
Accuracy: 16.67 % || loss: 5.635
epoch: 9
Accuracy: 16.67 % || loss: 5.149
epoch: 10
Accuracy: 33.33 % || loss: 3.982
epoch: 11
Accuracy: 16.67 % || loss: 12.453
epoch: 12
Accuracy: 16.67 % || loss: 12.000
epoch: 13
Accuracy: 16.67 % || loss: nan
epoch: 14
Accuracy: 50.00 % || loss: nan
epoch: 15
Accuracy: 50.00 % || loss: nan
epoch: 16
Accuracy: 50.00 % || loss: nan
epoch: 17
Accuracy: 50.00 % || loss: nan
epoch: 18
Accuracy: 50.00 % || loss: nan
epoch: 19
Accuracy: 50.00 % || loss: nan
epoch: 20
Accuracy: 50.00 % || loss: nan
epoch: 21
Accuracy: 50.00 % || loss: nan
epoch: 22
Accuracy: 50.00 % || loss: nan
epoch: 23
Accuracy: 50.00 % || loss: nan
epoch: 24
Accuracy: 50.00 % || loss: nan



<__main__.neural_network at 0x21c89655da0>

The accuracy obtained by softmax classifier: It gave reasonable results with the first input matrix (input_set). However, the other two matrices show that if we have too high and too low input values, then the loss function faces issues with numerical computation (as described in the report) and produces a "nan" loss value.