<a href="https://colab.research.google.com/github/kartik177/alexnet-pytorch/blob/master/pranit1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math
import random
from sklearn import datasets

In [2]:
class Neural_Network():
    def __init__(self, neurons, Activations): 
        # arguments: an array "neurons" consist of number of neurons for each layer, 
        # an array "activations" consisting of activation functions used for the hidden layers and output layer
        self.inputSize = neurons[0] # Number of neurons in input layer
        self.outputSize = neurons[-1] # Number of neurons in output layer
        self.layers = len(neurons)
        self.weights = [] #weights for each layer
        self.biases = [] #biases in each layer 
        self.layer_activations = [] #activations in each layer
        for i in range(len(neurons)-1): 
            self.weights.append(np.random.rand(neurons[i+1],neurons[i])) #weight matrix between layer i and layer i+1
            self.biases.append(np.random.rand(neurons[i+1],1))
            self.layer_activations.append(Activations[i]) #activations for each layer
        
            
    def sigmoid(self, z): # sigmoid activation function
        #Fill in the details to compute and return the sigmoid activation function 
        return (1.0/(1.0+np.exp(-z)))
      
    
    def sigmoidPrime(self,z): # derivative of sigmoid activation function
        #Fill in the details to compute and return the derivative of sigmoid activation function
        #print(type(z))
        return self.sigmoid(z)*(1-self.sigmoid(z))

                          
    def tanh(self, z): # hyperbolic tan activation function
        #Fill in the details to compute and return the tanh activation function    
        c = (np.exp(z)- np.exp(-z))/(np.exp(z)+np.exp(-z))
        return c
    
    def tanhPrime(self,x): # derivative of hyperbolic tan activation function
        #Fill in the details to compute and return the derivative of tanh activation function
        return (1 - (self.tanh(x))**2)
                          
    def linear(self, z): # Linear activation function
        #Fill in the details to compute and return the linear activation function 
        return z
    
    def linearPrime(self,x): # derivative of linear activation function
        #Fill in the details to compute and return the derivative of activation function    
        for y in x:
            y = 1
        return x

    def ReLU(self,z): # ReLU activation function
        #Fill in the details to compute and return the ReLU activation function  
        
        for y in z:
            if y<0:
                y=0
        
        return z
    
    def ReLUPrime(self,z): # derivative of ReLU activation function
        #Fill in the details to compute and return the derivative of ReLU activation function
        v = []
        for y in z:
            if y > 0 :
                v.append([1])
            else:
                v.append([0])
        f = np.array(v)
        
        return f
    
    def softmax(self, a):
        q = []
        sum_prob= 0
        for j in range (0,len(a),1):
            sum_prob = sum_prob + np.exp(a[j])
            
            
        for i in range (0,len(a),1):
            q.append(np.exp(a[i])/sum_prob)
            
            
        return np.array(q)
    
    def forward(self, a): # function of forward pass which will receive input and give the output of final layer
        # Write the forward pass using the weights and biases to find the predicted value and return them.
        layer_activations_a = [a] #store the input as the input layer activations
        layer_dot_prod_z = []
        for i, param in enumerate(zip(self.biases, self.weights)):
            b, w = param[0], param[1]
            
            if self.layer_activations[i].lower()=='sigmoid':
                z = np.dot(w, a)+b
               
                a = self.sigmoid(z)
                a = self.softmax(a)
            elif self.layer_activations[i].lower()=='relu':
                z = np.dot(w, a)+b
               
                a = self.ReLU(z)
            elif self.layer_activations[i].lower()=='tanh':   
                z = np.dot(w, a)+b
               
                a = self.tanh(z)
            elif self.layer_activations[i].lower()=='linear':
                z = np.dot(w, a)+b
                a = self.linear(z) 
            
            layer_dot_prod_z.append(z)    
            layer_activations_a.append(a)
            
        
        return a, layer_dot_prod_z, layer_activations_a
                          
            
    
    def backward(self, x, y, zs, activations): # find the loss and return derivative of loss w.r.t every parameter
        # Write the backpropagation algorithm here to find the gradients of weights and biases and return them.
        grad_b = [np.zeros(b.shape) for b in self.biases]
        grad_w = [np.zeros(w.shape) for w in self.weights]
        # b, w 3*1
        # backward pass
        if self.layer_activations[-1].lower()=='sigmoid':
            delt = []
            
            for i in range (0,len(activations[-1]),1):
                delt.append(self.softmax(activations[-1][i])* (1- self.softmax(activations[-1][i])) * -y[i]/activations[-1][i])
            
           
        elif self.layer_activations[-1].lower()=='relu':
            delta = (activations[-1] - y) *  self.ReLUPrime(zs[-1])
        elif self.layer_activations[-1].lower()=='tanh':   
            delta = (activations[-1] - y) *  self.tanhPrime(zs[-1])

        elif self.layer_activations[-1].lower()=='linear':
            delta = (activations[-1] - y) *  self.linearPrime(zs[-1])        
        
        
        
    
        # fill in the appropriate details for gradients of w and b
        delta = np.array(delt)
        grad_b[-1] = delta
        
        grad_w[-1] = delta*(activations[-2].transpose())
        delta = np.matmul(((self.weights[-1]).transpose()) ,delta)
        
        for l in range(2, self.layers): # Here l is in backward sense i.e. last l th layer
            z = zs[-l]
            
            if self.layer_activations[-l].lower()=='sigmoid':
                prime = self.sigmoidPrime(z)
            elif self.layer_activations[-l].lower()=='relu':
                
                prime = self.ReLUPrime(z)
            elif self.layer_activations[-l].lower()=='tanh':   
                prime = self.tanhPrime(z)
            elif self.layer_activations[-l].lower()=='linear':
                prime = self.linearPrime(z)
               
        
            p = np.array(prime[:,0])
            #Compute delta, gradients of b and w 
            
           
            k =np.matmul((self.weights[-l].transpose()),np.diag(p))
            grad_b[-l] = delta
            grad_w[-l] = np.matmul((np.diag(p)),delta)
            grad_w[-l] = np.matmul(grad_w[-l],(activations[-l-1].transpose()))
            delta = np.matmul(k,delta)
        
        return (grad_b, grad_w)                 

    def update_parameters(self, grads, learning_rate): # update the parameters using the gradients
        # update weights and biases using the gradients and the learning rate
        
        grad_b, grad_w = grads[0], grads[1]  
        grad_w = np.array(grad_w)
        grad_b = np.array(grad_b)
        
        
        
        
        for j in range(0,len(grad_w),1):
            max_element = grad_w[j].max()
            if max_element != 0:
                grad_w[j] = grad_w[j]/(max_element)
            else:
                pass
            
        for j in range(0,len(grad_b),1):
            max_element = grad_b[j].max()
            if max_element != 0:
                grad_b[j] = grad_b[j]/(max_element)
            else:
                pass   
        
       
        #Implement the update rule for weights  and biases
        self.weights = self.weights - np.multiply(learning_rate,grad_w)
        self.biases = self.biases - np.multiply(learning_rate,grad_b)
        
    def loss(self, predicted, actual):
        #Implement the loss function
        l= 0.5*(predicted-actual)**2
     
        return sum(l)
    
    def cross_entropy(self, predicted, actual):
        CE = -actual*np.log(predicted)
        return sum(CE)     
    
    
    def train(self, X, Y, minibatch=False): # receive the full training data set
        
        lr = 0.01        # learning rate
        epochs = 25 # number of epochs
        loss_list = []
        CE_loss_list = []
        if minibatch==False:
            for e in range(epochs): 
                losses = []
                CE_losses = []
                for q in range(len(X)):
                    
                    train_x = np.resize(X[q],(X[q].shape[0],1)) 
                    if not onehotencoded: 
                        train_y = np.resize(Y[q],(1,1)) 
                    else:
                        train_y = np.resize(Y[q],(Y[q].shape[0],1))
                    
                    
                    out, dot_prod_z, activations_a = self.forward(train_x)
                    
                    loss = self.cross_entropy(out, train_y)
                    
                    CE_loss = self.cross_entropy(out,train_y)
                    grads = self.backward(train_x, train_y, dot_prod_z, activations_a) # find the gradients using backward pass
                    self.update_parameters(grads, lr)
                 
                    losses.append(loss)
                    CE_losses.append(CE_loss)
                loss_list.append(np.mean(np.array(losses)))
                CE_loss_list.append(np.mean(np.array(CE_losses)))
                
                print(f'Epoch: {e} Loss: {np.mean(np.array(losses))} ')
        else:
            minibatchsize = 20
            create_minibatches(X,Y,minibatchsize)
            
            for e in range(epochs):
                #Complete the training code with minibatches 
                losses = []
                CE_losses = []
                minibatch = create_minibatches(X,Y,minibatchsize)
                for minibatch in minibatch:
                    x_mini,y_mini = minibatch
                    x_train = np.resize(x_mini,(minibatchsize, x_mini.shape[1]))
                    y_train = []
                    for i in range(len(y_mini)):
                        if onehotencoded:
                            train_y = np.resize(np.argmax(y_mini[i]),(1,1)) 
                            y_train.append(train_y)
                        else:
                            y_train.append(y_mini[i])
                        
                    output, dot_prod_z ,activations_a = self.forward(x_train)
                    
                    loss = self.cross_entropy(output,y_train)
                   
                    CE_loss = self.cross_entropy(out,y_train)
                    grads = self.backward(x_train,y_train, dot_prod_z , activations_a)
                    self.update_parameters(grads, lr)
                    losses.append(loss)
                    CE_losses.append(CE_loss)
                loss_list.append(np.mean(np.array(losses)))
                CE_loss_list.append(np.mean(np.array(CE_losses)))
                
                print(f'Epoch: {e} Loss: {np.mean(np.array(losses))} ')
        return loss_list,CE_loss_list
        
    def predict(self, x):
        print ("Input : \n" + str(x))
        prediction,_,_ = self.forward(x)
        print ("Output: \n" + str(prediction))


In [3]:
# a method for creating one hot encoded labels 
def onehotencoding(Y_data, num_classes=10):
    """Return one-hot encoded rows for the integer labels in ``Y_data``.

    Args:
        Y_data: sequence of integer class labels.
        num_classes: width of every encoded row (10 by default).

    Returns:
        Integer array of shape ``(len(Y_data), num_classes)`` with a single 1
        per row at the label's index.

    Raises:
        IndexError: if a label does not fit into ``num_classes`` columns.
    """
    labels = np.asarray(Y_data, dtype=int).ravel()
    encoded = np.zeros((labels.size, num_classes), dtype=int)
    # one fancy-indexed assignment sets the hot entry of every row
    encoded[np.arange(labels.size), labels] = 1
    return encoded
    
    
    

#a method to create minibatches 
def create_minibatches(X,Y,minibatchsize):
    """Shuffle the sample indices and cut X and Y into aligned mini-batches.

    Returns a pair of lists ``(X_minibatches, Y_minibatches)``; every batch
    holds ``minibatchsize`` samples except possibly the last one.
    """
    numbatches = int(np.ceil(len(X)/minibatchsize))
    order = np.arange(len(X))
    np.random.shuffle(order)
    # consecutive windows over the shuffled order; slicing clips the last one
    windows = [order[k*minibatchsize:(k+1)*minibatchsize] for k in range(numbatches)]
    X_minibatches = [np.take(X, window, axis=0) for window in windows]
    Y_minibatches = [np.take(Y, window, axis=0) for window in windows]
    return X_minibatches, Y_minibatches

def test_create_minibatches():
    """Smoke-test create_minibatches on 50 synthetic samples and print the result.

    Even-indexed samples draw their features from [1, 10) and get label 1,
    odd-indexed samples draw from [-10, 1) and get label 0.
    """
    inputsize = 3
    X = []
    Y = []
    for i in range(50):
        if i % 2 == 0:
            X.append([np.random.randint(1, 10) for _ in range(inputsize)])
            Y.append(1)
        else:
            X.append([np.random.randint(-10, 1) for _ in range(inputsize)])
            Y.append(0)
    X = np.array(X)
    Y = np.array(Y)

    X_mb, Y_mb = create_minibatches(X, Y, 6)
    print(X_mb, Y_mb)

#test_create_minibatches()

# Generating some training data

In [4]:
# Synthetic two-class data: even-indexed samples draw every feature from
# [1, 10] (label 1), odd-indexed samples from [-10, 1] (label 0).
X = []
Y = []
inputsize = 64
minibatch = False
onehotencoded = True
n_batch = 20
batch_size = 5
for i in range(500):
    if i % 2 == 0:
        X.append([random.randint(1, 10) for i1 in range(inputsize)])
        Y.append(1)
    else:
        X.append([random.randint(-10, 1) for i1 in range(inputsize)])
        Y.append(0)
X = np.array(X)
Y = np.array(Y)
if onehotencoded:
    Y = onehotencoding(Y)

if not minibatch:
    train_X = X
    train_Y = Y
else:
    # create_minibatches shuffles once and returns every batch; keep the
    # first n_batch of them and stack them as (batch, sample, feature).
    X_batches, Y_batches = create_minibatches(X, Y, batch_size)
    train_X = np.concatenate(X_batches[:n_batch], axis=0).reshape((n_batch, batch_size, inputsize))
    train_Y = np.concatenate(Y_batches[:n_batch], axis=0).reshape((n_batch, batch_size, -1))

In [6]:
# Load the digits data set as an (X, y) pair; the arguments are passed by
# keyword because current scikit-learn rejects them positionally.
data = datasets.load_digits(n_class=10, return_X_y=True)
train_X = np.array(data[0])
train_Y = onehotencoding(data[1])

# Standardise every feature column to zero mean and unit sample variance.
mean = np.sum(train_X, axis=0) / np.size(train_X, 0)
sd = np.std(train_X, axis=0, ddof=1)

# Constant columns (sd == 0) are left untouched to avoid dividing by zero.
varying = sd != 0
train_X[:, varying] = (train_X[:, varying] - mean[varying]) / sd[varying]
  
    

In [None]:



# Sanity check of the data dimensions: (samples, features) and (samples, classes).
print(train_X.shape)
print(train_Y.shape)


(1797, 64)
(1797, 10)


# Defining the network

In [7]:
# D_in is the input dimension, read from the training data so the network
# always matches the features it is fed.
# H1 is the dimension of the first hidden layer.
# H2 is the dimension of the second hidden layer.
# D_out is the output dimension (the one-hot labels are 10 wide).
D_in, H1, H2, D_out = train_X.shape[-1], 256, 128, 10  # You can add more layers if you wish to

neurons = [D_in, H1, H2, D_out]  # number of neurons in the layers, in order
# activations in each layer (Note: the input layer does not have any activation)
activation_functions = ['sigmoid', 'sigmoid', 'sigmoid']
my_neuralnet = Neural_Network(neurons, activation_functions)


# Training the network

In [9]:

# Train on the prepared data; train() returns its two per-epoch loss histories.
loss,CE_loss = my_neuralnet.train(train_X,train_Y,minibatch=minibatch)

Epoch: 0 Loss: 2.3039369755266805 


KeyboardInterrupt: ignored

# Prediction for a data point after the training

In [None]:
# Print the prediction for the first training sample, reshaped to a (64, 1) column.
my_neuralnet.predict(train_X[0].reshape(64,1))

In [None]:
# Plot both loss histories returned by train(); the labels and legend make
# the two curves distinguishable.
plt.plot(loss, label='loss')
plt.plot(CE_loss, label='CE loss')
plt.xlabel('Epochs')
plt.ylabel('loss')
plt.legend()
plt.show()