In [126]:
reset -f

In [127]:
import numpy as np
import matplotlib.pyplot as plt
import os
import h5py
import time

def load_data():
    os.chdir('C:\\Users\\s106961\\Desktop\\Deep Learning\\Data')
    with h5py.File('train_128.h5','r') as H:
        data = np.copy(H['data'])
    with h5py.File('train_label.h5','r') as H:
        label = np.copy(H['label'])
    return data, label

data, label = load_data()

In [128]:
#Activation functions and their derivatives

def sigmoid(inp):
    return (1/(1 + np.exp(-inp)))

def tanh(inp):
    return (np.exp(inp) - np.exp(-inp))/(np.exp(inp) + np.exp(-inp))

def relu(inp):
    return np.maximum(inp,0)

def softmax(inp):
    inp_max = inp.max()
    inp_norm = inp - inp_max
    return np.exp(inp_norm) / np.sum(np.exp(inp_norm), axis=1) [:,None]

def leaky_relu(inp):
    return np.maximum(inp,0.1*inp)  
   
def gradient_sigmoid(inp):
    return sigmoid(inp) * (1 - sigmoid(inp))

def gradient_tanh(inp):
    return (1 -tanh(inp)) * (1 + tanh(inp))

def gradient_relu(inp):
    inp[inp>0] = 1
    inp[inp<0] = 0
    return inp

def gradient_leaky_relu(inp):
    inp[inp<0] = 0.1
    inp[inp>=0] = 1
    return inp

In [139]:
class MLP:
    
    def __init__(self,data,label,hidden_units=[50,25],activation='relu',descent_method='momentum_gradient_descent',
                 learning_rate = 0.01,batch_size = 100,shuffle = False, regularization = 0.001,
                 dropout_input_layer_ratio = 0.8, dropout_hidden_layer_ratio = 0.5):
        
        self.data = data
        self.label = label
        self.mu = data.mean()
        self.sigma = data.std()
        self.X = self.batch_normalize(data)
        self.y = label
        self.hidden_units = hidden_units
        self.activation = activation
        self.descent_method = descent_method
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.no_hidden = len(hidden_units)
        self.no_features = data.shape[1]
        self.no_label = len(np.unique(label))
        self.dim = len(label)
        self.shuffle = shuffle
        self.Weights, self.Bias = self.weight_init()
        self.regularization = regularization
        self.dropout_input_layer_ratio = dropout_input_layer_ratio
        self.dropout_hidden_layer_ratio = dropout_hidden_layer_ratio
        self.activation_fn = {'sigmoid': sigmoid, 'tanh': tanh, 'relu': relu, 'leaky_relu': leaky_relu}
        self.gradient_fn = {'sigmoid': gradient_sigmoid, 'tanh': gradient_tanh, 'relu': gradient_relu, 'leaky_relu':gradient_leaky_relu}
        self.error = []
    
    def batch_normalize(self, data):
        data = (data - self.mu) / self.sigma
        return data
    
    def weight_init(self):
    
        Weights = {}
        Bias = {}
        no_hidden = self.no_hidden
        no_features = self.no_features
        labels = self.no_label
        
        for n, units in enumerate(self.hidden_units):
            Weights['layer_{}'.format(n+1)] = np.random.uniform(low=-np.sqrt(2. / (no_features)),high=np.sqrt(2. / (no_features)),size=(no_features, units))
            Bias['layer_{}'.format(n+1)] = np.random.uniform(low=-np.sqrt(2. / (no_features)),high=np.sqrt(2. / (no_features)),size=(units))
            no_features = units

        Weights['layer_{}'.format(no_hidden+1)] = np.random.uniform(low=-np.sqrt(2. / (no_features)), high=np.sqrt(2. / (no_features)),size=(no_features, labels))
        Bias['layer_{}'.format(no_hidden+1)] = np.random.uniform(low=-np.sqrt(2. / (no_features)),high=np.sqrt(2. / (no_features)),size=(labels))

        return Weights, Bias 
    
    def mini_batch(self,iteration):
        
        idx = np.array(range(0,self.dim)).astype(np.int)
        if self.shuffle == True: 
            np.random.shuffle(idx)
            start = 0
            end = batch_size
        else:
            start = (iteration * self.batch_size) % self.dim
            end = start + self.batch_size

        X = self.X[idx[start:end]]
        y = self.y[idx[start:end]]
        return X, y
    
    def output_label_hot_encode(self,inp,batch_size):
        
        one_hot_array = np.zeros((batch_size,self.no_label))
        one_hot_array[np.arange(batch_size),inp] = 1
        return one_hot_array
        
    def accuracy(self,y_hat, y_act):
        
        return round((np.sum((y_hat.argmax(axis=1) == y_act.argmax(axis=1)))/y_act.shape[0] * 100),2)
    
    def cross_entropy_error(self, y_act, y_hat):
        
        Weights = self.Weights
        regularization = self.regularization
        
        regularized_loss = 0
        for layer in Weights:
            regularized_loss += np.sum(np.square(Weights[layer]))
        return np.sum(np.multiply(y_act, np.log(y_hat)) * -1) + (0.5 * regularization  * regularized_loss)
    
    def forward_propagation(self, X, y, dropout_units, predict = False):
        
        Weights = self.Weights
        Bias = self.Bias
        no_hidden = self.no_hidden
        activation = self.activation
        
        layers = {}
        layers['layer_{}'.format(0)] = X #input layer
        if predict == False:
            layers['layer_{}'.format(0)][:,dropout_units['layer_{}'.format(0)]] = 0

        for l in range(no_hidden):
            layers['layer_wx_{}'.format(l+1)] = np.matmul(layers['layer_{}'.format(l)], Weights['layer_{}'.format(l+1)]) + Bias['layer_{}'.format(l+1)]
            layers['layer_{}'.format(l+1)] = self.activation_fn[activation](layers['layer_wx_{}'.format(l+1)])

            if predict == False:
                layers['layer_wx_{}'.format(l+1)][:,dropout_units['layer_{}'.format(l+1)]] = 0
                layers['layer_{}'.format(l+1)][:,dropout_units['layer_{}'.format(l+1)]] = 0

        layers['layer_wx_{}'.format(no_hidden+1)] = np.matmul(layers['layer_{}'.format(no_hidden)], Weights['layer_{}'.format(no_hidden+1)])
        layers['layer_{}'.format(no_hidden+1)] = softmax(layers['layer_wx_{}'.format(no_hidden+1)])

        y_hat = layers['layer_{}'.format(no_hidden+1)]

        return (layers, y_hat)
    
    def backward_propagation(self, layers, y_act, y_hat):
    
        Weights = self.Weights
        Bias = self.Bias
        no_hidden = self.no_hidden
        activation = self.activation
        regularization = self.regularization
        
        gradients = {}
        dim = y_act.shape[0]

        gradients['dL/dA_layer_{}'.format(no_hidden+1)]  = ((y_act - y_hat) * -1)/dim

        for i in range(no_hidden,-1,-1):

            gradients['dL/dW_layer{}'.format(i+1)] = np.zeros(Weights['layer_{}'.format(i+1)].shape)
            for j in range(Weights['layer_{}'.format(i+1)].shape[0]):
                for k in range(Weights['layer_{}'.format(i+1)].shape[1]):
                    gradients['dL/dW_layer{}'.format(i+1)][j][k] = np.sum((np.multiply(gradients['dL/dA_layer_{}'.format(i+1)][:,k], layers['layer_{}'.format(i)][:,j]))) + (regularization * Weights['layer_{}'.format(i+1)][j][k])

            gradients['dL/dB_layer{}'.format(i+1)] = np.sum(gradients['dL/dA_layer_{}'.format(i+1)],axis=0)
            if i != 0:
                gradients['dL/dH_layer{}'.format(i)] = np.matmul(gradients['dL/dA_layer_{}'.format(i+1)],np.transpose(Weights['layer_{}'.format(i+1)]))
                gradients['dL/dA_layer_{}'.format(i)] = self.gradient_fn[activation](layers['layer_wx_{}'.format(i)]) * gradients['dL/dH_layer{}'.format(i)]

        return gradients

    def predict(self, data):
        
        data = self.batch_normalize(data)
        layers, y_hat = self.forward_propagation(self.X, self.y, dropout_units = None, predict =True)
        y_act = self.output_label_hot_encode(self.y, self.dim)
        return y_hat, y_act
    
    def gradient_descent(self, gradients):
        
        Weights = self.Weights
        Bias = self.Bias
        learning_rate = self.learning_rate
        no_hidden = self.no_hidden
        
        for i in range(no_hidden+1,0,-1):
            Weights['layer_{}'.format(i)] = Weights['layer_{}'.format(i)] - (learning_rate * gradients['dL/dW_layer{}'.format(i)])
            Bias['layer_{}'.format(i)] = Bias['layer_{}'.format(i)] - (learning_rate * gradients['dL/dB_layer{}'.format(i)])

        return Weights, Bias
    
    def momentum_gradient_descent(self, gradients, prev_gradients, rho = 0.9 ):
        
        Weights = self.Weights
        Bias = self.Bias
        learning_rate = self.learning_rate
        no_hidden = self.no_hidden
        
        momentum_gradient = {}
        for i in range(no_hidden+1,0,-1):
            momentum_gradient['dL/dW_layer{}'.format(i)] = (rho * prev_gradients['dL/dW_layer{}'.format(i)]) + (learning_rate * gradients['dL/dW_layer{}'.format(i)])
            momentum_gradient['dL/dB_layer{}'.format(i)] = (rho * prev_gradients['dL/dB_layer{}'.format(i)]) + (learning_rate * gradients['dL/dB_layer{}'.format(i)])
            Weights['layer_{}'.format(i)] = Weights['layer_{}'.format(i)] - momentum_gradient['dL/dW_layer{}'.format(i)]
            Bias['layer_{}'.format(i)] = Bias['layer_{}'.format(i)] - momentum_gradient['dL/dB_layer{}'.format(i)]

        return Weights, Bias, momentum_gradient
    
    def adam(self, gradients, prev_gradients, prev_velocity, beta1, beta2, time):
        
        momentum_gradient = {}
        velocity = {}
        ephsilon = 1e-5
        
        Weights = self.Weights
        Bias = self.Bias
        learning_rate = self.learning_rate
        no_hidden = self.no_hidden
        
        for i in range(no_hidden+1,0,-1):
            momentum_gradient['dL/dW_layer{}'.format(i)] = ((beta1 * prev_gradients['dL/dW_layer{}'.format(i)]) + ((1-beta1) * gradients['dL/dW_layer{}'.format(i)]))#/(1-beta1**time)
            momentum_gradient['dL/dB_layer{}'.format(i)] = ((beta1 * prev_gradients['dL/dB_layer{}'.format(i)]) + ((1-beta1) * gradients['dL/dB_layer{}'.format(i)]))#/(1-beta1**time)
            velocity['dL/dW_layer{}'.format(i)] = (beta2 * prev_velocity['dL/dW_layer{}'.format(i)]) + ((1-beta2) * np.square(gradients['dL/dW_layer{}'.format(i)]))#/(1-beta2**time)
            velocity['dL/dB_layer{}'.format(i)] = (beta2 * prev_velocity['dL/dB_layer{}'.format(i)]) + ((1-beta2) * np.square(gradients['dL/dB_layer{}'.format(i)]))#/(1-beta2**time)

            Weights['layer_{}'.format(i)] = Weights['layer_{}'.format(i)] - ((learning_rate * momentum_gradient['dL/dW_layer{}'.format(i)])/(np.sqrt(velocity['dL/dW_layer{}'.format(i)]) + ephsilon))
            Bias['layer_{}'.format(i)] = Bias['layer_{}'.format(i)] - ((learning_rate * momentum_gradient['dL/dB_layer{}'.format(i)])/(np.sqrt(velocity['dL/dB_layer{}'.format(i)]) + ephsilon))

        return Weights, Bias, momentum_gradient, velocity
    
    def dropout(self):
        
        dropout_units = {}
        drop_units = np.arange(self.no_features)
        np.random.shuffle(drop_units)
        dropout_units['layer_{}'.format(0)] = drop_units[:int(self.no_features * (1-self.dropout_input_layer_ratio))]

        for (i, units) in enumerate(self.hidden_units,1):
            drop_units = np.arange(units)
            np.random.shuffle(drop_units)
            dropout_units['layer_{}'.format(i)] = drop_units[:int(units * (1-self.dropout_hidden_layer_ratio))]
        return dropout_units
    
    def train(self, epochs):
        
        start = time.time()             

        for i in range(epochs):
            dropout_units = self.dropout()
            X, y = self.mini_batch(iteration = i)
            y_act = self.output_label_hot_encode(y, self.batch_size)
            layers, y_hat = self.forward_propagation(X, y, dropout_units)
            gradients = self.backward_propagation(layers, y_act, y_hat)

            if self.descent_method == 'gradient_descent':
                self.Weights, self.Bias = self.gradient_descent(gradients)
            elif self.descent_method == 'momentum_gradient_descent':
                if i == 0:
                    prev_gradients = {layer:gradients[layer] * 0 for layer in gradients}
                self.Weights, self.Bias, prev_gradients = self.momentum_gradient_descent(gradients, prev_gradients, rho = 0.9)
            elif self.descent_method == 'adam':
                if i == 0:
                    prev_gradients = {layer:gradients[layer] * 0 for layer in gradients}
                    prev_velocity = {layer:gradients[layer] * 0 for layer in gradients}
                self.Weights, self.Bias, prev_gradients, prev_velocity = self.adam(gradients, prev_gradients, prev_velocity, beta1 = 0.9, beta2 = 0.999, time = i+1)

            ce_error = self.cross_entropy_error(y_act, y_hat)

            pred_y_hat, pred_y_act = self.predict(self.data)
            curr_accuracy = self.accuracy(pred_y_hat, pred_y_act)
            self.error.append(ce_error)
            
            if i%100 == 0: 
                print('Epoch:' + str(i) + ' Error:' + str(round(ce_error,2)) + ' Accuracy:' + str(curr_accuracy))

        end = time.time()
        print('Training time: ' + str(round((end - start)/60)) + ' mins')

In [144]:
model_1 = MLP(data,label,hidden_units=[50,25],activation='relu',descent_method='adam',
              learning_rate = 0.01,batch_size = 100,shuffle = False,regularization = 0.001,
              dropout_input_layer_ratio = 0.8, dropout_hidden_layer_ratio = 0.5)
model_1.train(epochs = 1000)

Epoch:0 Error:233.23 Accuracy:14.93
Epoch:100 Error:142.33 Accuracy:69.8
Epoch:200 Error:121.34 Accuracy:73.24
Epoch:300 Error:125.86 Accuracy:77.33
Epoch:400 Error:164.13 Accuracy:78.74
Epoch:500 Error:113.12 Accuracy:78.77
Epoch:600 Error:116.77 Accuracy:78.98
Epoch:700 Error:112.44 Accuracy:79.99
Epoch:800 Error:89.55 Accuracy:80.43
Epoch:900 Error:112.79 Accuracy:79.58
Training time: 6 mins


In [118]:
model_2 = MLP(data,label,hidden_units=[50,10],activation='sigmoid',descent_method='adam',
                 learning_rate = 0.01,batch_size = 100,shuffle = False,regularization = 0.001)
model_2.train(epochs = 1000)

Epoch:0 Error:234.14 Accuracy:10.0
Epoch:100 Error:220.37 Accuracy:44.31
Epoch:200 Error:201.15 Accuracy:44.67
Epoch:300 Error:181.52 Accuracy:49.02
Epoch:400 Error:180.77 Accuracy:50.18
Epoch:500 Error:253.93 Accuracy:51.22
Epoch:600 Error:211.91 Accuracy:51.94
Epoch:700 Error:192.96 Accuracy:47.28
Epoch:800 Error:226.83 Accuracy:51.62
Epoch:900 Error:235.54 Accuracy:48.89
Training time: 4 mins
