# Assignment 1
## Still need to implement momentum and mini-batch SGD

In [13]:
import math, random
import numpy as np
from datetime import datetime

In [14]:
# Load in the training set
X_train = np.load('Assignment1-Dataset/train_data.npy')
y_train = np.load('Assignment1-Dataset/train_label.npy')

# # Load in the test set
X_test = np.load('Assignment1-Dataset/test_data.npy')
y_test = np.load('Assignment1-Dataset/test_label.npy')

print(X_train.shape)
print(X_test.shape)

# Normalize (optional)
def normalize(X, X2):
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    X2 = (X2 - X.mean(axis=0)) / X.std(axis=0)
    return X, X2
X_train, X_test = normalize(X_train, X_test)
# X_test = normalize(X_test)

(50000, 128)
(10000, 128)


## Coding the Neural Network

In [23]:
class NN:
    
    def __init__(self, input_dim, output_dim, n_hidden_layer, batch_size):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.batch_size = batch_size
        self.n_hidden_layer = n_hidden_layer
        self.network = self._build_network()

    # < ---- Basic numpy functions ---- > #
    # Sigmoid activation function
    def _sigmoid(self, x):
        return 1.0/(1.0+math.exp(-x))

    # Sigmoid derivative
    def _sigmoid_derivative(self, sigmoid):
        return sigmoid*(1.0-sigmoid)

    # One-hot encoding
    def _one_hot_encoding(self, idx, output_dim):
        x = np.zeros(output_dim, dtype=np.int)
        x[idx] = 1
        return x

    # ReLu activation function
    def _relu(self, x):
        return max(0, x)

    # ReLu derivative
    def _relu_derivative(self, x):
        if x < 0: return 0
        else: return 1
    
    # < ---- Construct the Batch Normalization Layer ---- >
    def batch_normalize(self, x):
        norm = np.linalg.norm(x)
        if norm == 0: return x
        else: return x / norm
        
    # Define the drop out layer
    def drop_out(self, x, rate):
        # Create a random list of index that the node input will be set to 0
        set_0 = round(len(x) * rate)
        
        #Set the input to 0
        for i in random.sample(range(len(x)), set_0):
            x[i] = 0
        return x
            
    def accuracy(self, x, y):
        count = 0
        assert len(x) == len(y)
        for i in range(len(x)):
            if x[i] == y[i]: count += 1
        acc = count / len(x)
        return acc

    # < ---- Building the network architecture ---- > #
    def _build_network(self):
        
        # Create a single fuly connected layer
        def fc_layers(input_dim, output_dim):
            layer = []
            for i in range(output_dim): # Add a weight between each node and unit
                weights= [random.random() for _ in range(input_dim)] # Determine FC layer with randomised/normalized w initialization
                node = {"weight" : weights, 
                         "output": None,
                         "delta": None}
                layer.append(node) # Create the layer
            return layer

        network = [] # Build the network layer by layer
        # If there is no hidden layer, we only have one output layer
        if len(self.n_hidden_layer) == 0:
            network.append(fc_layers(self.input_dim, self.output_dim))
            
        # Otherwise, we will add hidden layers
        else:
            # First add a fully connected layer with n hidden units in the layer
            # input dim is the input from the previous layer
            # n_hidden_layer is the output of this layer aka the hidden units
            network.append(fc_layers(self.input_dim, self.n_hidden_layer[0]))
            
            # Add layers with each layer output a n-1 dimension
#             network.append(fc_layers(self.n_hidden_layer[0], self.n_hidden_layer[0]))
#             network.append(fc_layers(64, 32))
            network.append(fc_layers(self.n_hidden_layer[0], self.n_hidden_layer[0]))
            # Add the output layer with output_dim == n_classes
            network.append(fc_layers(self.n_hidden_layer[0], self.output_dim))

        return network

    # < ---- Training the model ---- > #

    # Training the network
    def train(self, X, y, n_epochs=100, lr=0.005, batch_size=256):
        print("------------------------------------------------------")
        print("Training model......")
        acc = []
        for epoch in range(n_epochs):
            print("------------------------------------------------------")
            start = datetime.now()
            print("Epoch {}:".format(epoch+1))
            # Mini-batch training
            n_batches = round(len(y)/batch_size)
            for i in range(n_batches):
                for _X, _y, in zip(X[i*batch_size:(i+1)*batch_size], y[i*batch_size:(i+1)*batch_size]):
                    y_label = self._one_hot_encoding(_y, self.output_dim)
                    self._forward_pass(_X)
                    self._back_propagation(y_label)
                    self._update_weights(_X, lr)
                    
            end = datetime.now()
            # Make predictions for training and test data
            ypred_train = self.predict(X)
            acc_train = self.accuracy(y, ypred_train)
            
            ypred_test = self.predict(X_test)
            acc_test = self.accuracy(y_test, ypred_test)
            
            print("Train: {}".format(acc_train), "Test: {}".format(acc_test), "Duration: {}".format(end-start))
            acc.append(acc_train)
            
        return acc

    # Forward-pass function
    def _forward_pass(self, x):
        # Set function for relu and sigmoid activation
        relu = self._relu
        transfer = self._sigmoid
        # Set input
        x_in = x
        
        # Starting from the first layer, for each layer we forward pass the input
        for layer in self.network[:-1]:
            # Create a list to save the output vector
            x_out = []
            # For each node in the layer, we collect the dot product of the weights and the product
            for node in layer:
                # Get inner product of the input and weights and use the ReLu activation
                node['output'] = transfer(np.dot(node['weight'], x_in))
                x_out.append(node['output'])
#             x_in = self.drop_out(x_out, 0.2)
            # Add a batch normalization layer
#             x_in = self.batch_normalize(x_out) # Pass the output of this layer as the input to the next layer
            x_in = x_out
        # Similar handling for the last output later
        x_out = []
        for node in self.network[-1]:
            # Get inner product of the input and weights and use the Sigmoid activation
            node['output'] = transfer(np.dot(node['weight'], x_in))
            x_out.append(node['output'])
        x_in = x_out # Pass the output of this layer as the input to the next layer\

        return x_in

    # Back propagation function
    def _back_propagation(self, y_label):
        relu_derivative = self._relu_derivative #for relu activations
        transfer_derivative = self._sigmoid_derivative # for sigmoid activations
        n_layers = len(self.network) # 7
        for i in reversed(range(n_layers)): # i = range(7)
            # Backpropagate from the output later
            if i == n_layers - 1: # if i == 6 (the last layer)
                for j, node in enumerate(self.network[i]):
                    err = node['output'] - y_label[j]
                    node['delta'] = err * transfer_derivative(node['output'])
            else:
                # Weighted sum of gradient from upper layer
                for j, node in enumerate(self.network[i]):
                    err = sum([node_['weight'][j] * node_['delta'] for node_ in self.network[i+1]])
                    node['delta'] = err * transfer_derivative(node['output'])

    def _update_weights(self, x, lr):
        for i, layer in enumerate(self.network):
            if i == 0:
                inputs = x
            else:
                inputs = [node_['output'] for node_ in self.network[i-1]]

                # Update weights
                for node in layer:
                    for j, inpt in enumerate(inputs):
                        node['weight'][j] -= node['delta'] * lr * inpt

    # < ---- Making predictions ---- #
    def predict(self, x):
        pred = np.array([np.argmax(self._forward_pass(_x)) for _x in x], dtype=np.int)
        return pred

## Evaluating the model

In [26]:
def accuracy(x, y):
    count = 0
    assert len(x) == len(y)
    for i in range(len(x)):
        if x[i] == y[i]: count += 1
    acc = count / len(x)
    return acc
# Set up
hidden_layers = [16] # number of nodes in hidden layers i.e. [layer1, layer2, ...]
lr = 0.05 # learning rate
n_epochs = 50 # number of training epoch
batch_size = 128
N, d = X_train.shape
n_classes = len(np.unique(y_train))

print(" Data description --->  X.shape = {}, y.shape = {}, n_classes = {}\n".format(X_train.shape, y_train.shape, n_classes))
print("Model details:")
print(" input_dim = {}".format(d))
print(" hidden_layers = {}".format(hidden_layers))
print(" output_dim = {}".format(n_classes))
print(" batch_size = {}".format(batch_size))
print(" learning rate = {}".format(lr))
print(" n_epochs = {}".format(n_epochs))


# Build neural network classifier model and train
model = NN(input_dim=d, output_dim=n_classes, n_hidden_layer=hidden_layers, batch_size=batch_size) #, seed=seed_weights

print("Number of layers:", len(model.network))

 Data description --->  X.shape = (50000, 128), y.shape = (50000, 1), n_classes = 10

Model details:
 input_dim = 128
 hidden_layers = [16]
 output_dim = 10
 batch_size = 128
 learning rate = 0.05
 n_epochs = 50
Number of layers: 3


In [27]:
model1 = model.train(X_train, y_train, lr=lr, n_epochs=n_epochs, batch_size=batch_size)

------------------------------------------------------
Training model......
------------------------------------------------------
Epoch 1:
Train: 0.10686 Test: 0.1106 Duration: 0:00:26.969067
------------------------------------------------------
Epoch 2:
Train: 0.11186 Test: 0.1238 Duration: 0:00:26.813000
------------------------------------------------------
Epoch 3:
Train: 0.1159 Test: 0.1281 Duration: 0:00:26.787624
------------------------------------------------------
Epoch 4:
Train: 0.11896 Test: 0.144 Duration: 0:00:26.814501
------------------------------------------------------
Epoch 5:
Train: 0.12152 Test: 0.1468 Duration: 0:00:27.119508
------------------------------------------------------
Epoch 6:
Train: 0.12384 Test: 0.1497 Duration: 0:00:27.118000
------------------------------------------------------
Epoch 7:
Train: 0.12518 Test: 0.1521 Duration: 0:00:27.053495
------------------------------------------------------
Epoch 8:
Train: 0.12836 Test: 0.1566 Duration: 0:00:

In [None]:
# print("------------------------------------------------------")
# print("Training model......")

# # Make predictions for training and test data
# ypred_train = model.predict(X_train)
# ypred_test = model.predict(X_test)
# print()
# print("------------------------------------------------------")
# print("Calculating accuracies.....")
# acc_train = accuracy(y_train, ypred_train)
# acc_test = accuracy(y_test, ypred_test)
# print(acc_train)
# print(acc_test)