In [69]:
import struct
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from datetime import datetime
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import numpy as np
import time
from jupyterthemes import jtplot
from torch.utils.tensorboard import SummaryWriter
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)

wandb.init(project="ReLu_test", sync_tensorboard=True)


In [70]:
tf.keras.datasets.mnist.load_data(path="mnist.npz");
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

In [71]:
x_train.shape

(60000, 28, 28)

In [72]:
y_train[2]

np.uint8(4)

In [73]:
y_train

array([5, 0, 4, ..., 5, 6, 8], dtype=uint8)

In [74]:
sample = np.zeros(10)
sample[1] = 1
display(sample)

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [75]:
y_train.shape

(60000,)

In [76]:
x_train = (x_train.reshape(-1, 28 * 28) / 255).astype('float32')
x_test = (x_test.reshape(-1, 28 * 28) / 255).astype('float32')
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [77]:
y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [78]:
y_train.shape

(60000, 10)

In [79]:
x_train.shape

(60000, 784)

**Model Procedures**

*Forward Pass:*

\\(Z_i = W_i \bullet x^T + b_i \\)

\\(A_i = \sigma(Z_i)\\)

\\(\hat{y} = A_i\\)

where \\(\sigma\\) is a nonlinear transformation

*Loss Function* with regularization

\\(L(y,\hat{y}) = -\frac{1}{m} \Sigma_j \Sigma_i y_i log(\hat{y_i}) + \frac{\lambda}{2*m} * (
\Sigma_w w^2)\\)

*Back propagation: here we use differental equations and use the chain rule first starting with the cost function and work backwards until we get to weights since we want to learn the weights that give a better fit*

\\(\frac{\delta L}{\delta w_i} = \frac{\delta L}{\delta \hat{y}} * \frac{\delta \hat{y}}{\delta z} * \frac{\delta z}{\delta w_i}\\)

*Update weights*

\\(w_i = w_i * \delta w_i - \frac {(w_i * \lambda * \eta)}{m}\\)

where \\(\eta\\) is the learning rate

Sigmoid - Activation function

In [80]:
class MultiLayerPerceptron():
    def __init__(self, sizes, epochs=10, l_rate=0.001):
        self.sizes = sizes
        self.epochs = epochs
        self.l_rate = l_rate
        self.losses = []
        self.accuracy = []

        # we save all parameters in the neural network in this dictionary
        self.params = self.initialization()

    def sigmoid(self, x, derivative=False):
        if derivative:
            return (np.exp(-x))/((np.exp(-x)+1)**2)
        return 1/(1 + np.exp(-x))

    def relu(self, x, derivative=False):
        if derivative:
            return np.where(x > 0, 1, 0)
        return np.maximum(0, x)

    def softmax(self, x, derivative=False):
        # Numerically stable with large exponentials
        exps = np.exp(x - x.max())
        if derivative:
            return exps / np.sum(exps, axis=0) * (1 - exps / np.sum(exps, axis=0))
        return exps / np.sum(exps, axis=0)

    def initialization(self):
        # number of nodes in each layer
        input_layer=self.sizes[0]
        hidden_1=self.sizes[1]
        hidden_2=self.sizes[2]
        hidden_3=self.sizes[3]
        hidden_4=self.sizes[4]
        output_layer=self.sizes[5]

        params = {
            'W1':np.random.randn(hidden_1, input_layer) * np.sqrt(1. / hidden_1),
            'W2':np.random.randn(hidden_2, hidden_1) * np.sqrt(1. / hidden_2),
            'W3':np.random.randn(hidden_3, hidden_2) * np.sqrt(1. / hidden_3),
            'W4':np.random.randn(hidden_4, hidden_3) * np.sqrt(1. / hidden_4),
            'W5':np.random.randn(output_layer, hidden_4) * np.sqrt(1. / output_layer),
        }

        return params

    def forward_pass(self, x_train):
        params = self.params

        # input layer activations becomes sample
        params['A0'] = x_train

        # input layer to hidden layer 1
        params['Z1'] = np.dot(params["W1"], params['A0'])
        params['A1'] = self.relu(params['Z1'])

        # hidden layer 1 to hidden layer 2
        params['Z2'] = np.dot(params["W2"], params['A1'])
        params['A2'] = self.relu(params['Z2'])

        # hidden layer 2 to hidden 3
        params['Z3'] = np.dot(params["W3"], params['A2'])
        params['A3'] = self.relu(params['Z3'])

        #Hidden layer 3 to 4
        params['Z4'] = np.dot(params["W4"], params['A3'])
        params['A4'] = self.relu(params['Z4'])

        #Hidden layer 4 to ouput
        params['Z5'] = np.dot(params["W5"], params['A4'])
        params['A5'] = self.softmax(params['Z5'])

        return params['A5']

    def backward_pass(self, y_train, output):
        '''
            This is the backpropagation algorithm, for calculating the updates
            of the neural network's parameters.

            Note: There is a stability issue that causes warnings. This is 
                  caused  by the dot and multiply operations on the huge arrays.
                  
                  RuntimeWarning: invalid value encountered in true_divide
                  RuntimeWarning: overflow encountered in exp
                  RuntimeWarning: overflow encountered in square
        '''
        params = self.params
        change_w = {}

        #Calculate W5 update
        error = 2 * (output - y_train) / output.shape[0] * self.softmax(params['Z5'], derivative=True)
        change_w['W5'] = np.outer(error, params['A4'])

        #Calculate W4 Update
        error = np.dot(params['W5'].T, error) * self.relu(params['Z4'], derivative=True)
        change_w['W4'] = np.outer(error, params['A3'])

        # Calculate W3 update
        error = np.dot(params['W4'].T, error) * self.relu(params['Z3'], derivative=True)
        change_w['W3'] = np.outer(error, params['A2'])

        # Calculate W2 update
        error = np.dot(params['W3'].T, error) * self.relu(params['Z2'], derivative=True)
        change_w['W2'] = np.outer(error, params['A1'])

        # Calculate W1 update
        error = np.dot(params['W2'].T, error) * self.relu(params['Z1'], derivative=True)
        change_w['W1'] = np.outer(error, params['A0'])

        return change_w
    
    def loss_function(self, output, y_train, epsilon=1e-12):
        '''
            Computes cross entropy between targets (encoded as one-hot vectors)
            and predictions. 
            Input: predictions (N, k) ndarray
                   targets (N, k) ndarray        
            Returns: scalar
        '''
        output = np.clip(output, epsilon, 1. - epsilon)
        N = output.shape[0]
        ce = -np.sum(y_train*np.log(output+1e-9))/N
        return ce

    def update_network_parameters(self, changes_to_w):
        '''
            Update network parameters according to update rule from
            Stochastic Gradient Descent.

            θ = θ - η * ∇J(x, y), 
                theta θ:            a network parameter (e.g. a weight w)
                eta η:              the learning rate
                gradient ∇J(x, y):  the gradient of the objective function,
                                    i.e. the change for a specific theta θ
        '''
        
        for key, value in changes_to_w.items():
            self.params[key] -= self.l_rate * value

    def compute_accuracy(self, x_val, y_val):
        '''
            This function does a forward pass of x, then checks if the indices
            of the maximum value in the output equals the indices in the label
            y. Then it sums over each prediction and calculates the accuracy.
        '''
        predictions = []

        for x, y in zip(x_val, y_val):
            output = (self.forward_pass(x))
            pred = np.argmax(output)
            predictions.append(pred == np.argmax(y))
        
        return np.mean(predictions)

    def train(self, x_train, y_train, x_test, y_test):
        writer = SummaryWriter("runs/experiment_name")

        start_time = time.time()
        for iteration in tqdm(range(1, self.epochs + 1, 1)):
            loss = 0.0
            for x, y in zip(x_train, y_train):
                output = self.forward_pass(x)
                loss += self.loss_function(output, y)
                
                changes_to_w = self.backward_pass(y, output)
                self.update_network_parameters(changes_to_w)

            self.losses.append(loss / x_train.shape[0])
            acc = self.compute_accuracy(x_test, y_test)
            self.accuracy.append(acc)

            writer.add_scalar("Loss/train", self.losses[-1], iteration)
            writer.add_scalar("Accuracy/test", self.accuracy[-1], iteration)

            wandb.log({"Loss/train": self.losses[-1], "Accuracy/test": self.accuracy[-1]})

            print('Epoch: {0}, Time Spent: {1:.2f}s, Accuracy: {2:.2f}%'.format(
                iteration+1, time.time() - start_time, acc * 100
            ))
        writer.close()
        wandb.finish()


In [81]:
MLP = MultiLayerPerceptron(sizes=[28 * 28, 128, 64, 32, 16, 10], epochs=25, l_rate=0.01)
MLP.train(x_train, y_train, x_test, y_test)

  4%|▍         | 1/25 [00:08<03:31,  8.83s/it]

Epoch: 2, Time Spent: 8.83s, Accuracy: 90.22%


  8%|▊         | 2/25 [00:17<03:22,  8.83s/it]

Epoch: 3, Time Spent: 17.65s, Accuracy: 92.55%


 12%|█▏        | 3/25 [00:26<03:13,  8.81s/it]

Epoch: 4, Time Spent: 26.45s, Accuracy: 93.67%


 16%|█▌        | 4/25 [00:35<03:04,  8.78s/it]

Epoch: 5, Time Spent: 35.18s, Accuracy: 94.36%


 20%|██        | 5/25 [00:43<02:55,  8.75s/it]

Epoch: 6, Time Spent: 43.89s, Accuracy: 94.91%


 24%|██▍       | 6/25 [00:52<02:46,  8.75s/it]

Epoch: 7, Time Spent: 52.62s, Accuracy: 95.21%


 28%|██▊       | 7/25 [01:01<02:36,  8.70s/it]

Epoch: 8, Time Spent: 61.21s, Accuracy: 95.53%


 32%|███▏      | 8/25 [01:09<02:27,  8.67s/it]

Epoch: 9, Time Spent: 69.82s, Accuracy: 95.47%


 36%|███▌      | 9/25 [01:18<02:18,  8.68s/it]

Epoch: 10, Time Spent: 78.53s, Accuracy: 95.54%


 40%|████      | 10/25 [01:27<02:09,  8.66s/it]

Epoch: 11, Time Spent: 87.14s, Accuracy: 95.68%


 44%|████▍     | 11/25 [01:35<02:01,  8.66s/it]

Epoch: 12, Time Spent: 95.79s, Accuracy: 95.81%


 48%|████▊     | 12/25 [01:44<01:52,  8.67s/it]

Epoch: 13, Time Spent: 104.48s, Accuracy: 95.77%


 52%|█████▏    | 13/25 [01:53<01:43,  8.66s/it]

Epoch: 14, Time Spent: 113.12s, Accuracy: 95.92%


 56%|█████▌    | 14/25 [02:01<01:34,  8.63s/it]

Epoch: 15, Time Spent: 121.67s, Accuracy: 95.93%


 60%|██████    | 15/25 [02:10<01:26,  8.64s/it]

Epoch: 16, Time Spent: 130.33s, Accuracy: 96.24%


 64%|██████▍   | 16/25 [02:18<01:17,  8.63s/it]

Epoch: 17, Time Spent: 138.94s, Accuracy: 96.26%


 68%|██████▊   | 17/25 [02:27<01:09,  8.63s/it]

Epoch: 18, Time Spent: 147.58s, Accuracy: 96.30%


 72%|███████▏  | 18/25 [02:36<01:00,  8.62s/it]

Epoch: 19, Time Spent: 156.18s, Accuracy: 96.41%


 76%|███████▌  | 19/25 [02:44<00:51,  8.66s/it]

Epoch: 20, Time Spent: 164.93s, Accuracy: 96.36%


 80%|████████  | 20/25 [02:53<00:43,  8.65s/it]

Epoch: 21, Time Spent: 173.56s, Accuracy: 96.45%


 84%|████████▍ | 21/25 [03:02<00:34,  8.64s/it]

Epoch: 22, Time Spent: 182.18s, Accuracy: 96.62%


 88%|████████▊ | 22/25 [03:11<00:26,  8.70s/it]

Epoch: 23, Time Spent: 191.03s, Accuracy: 96.61%


 92%|█████████▏| 23/25 [03:19<00:17,  8.78s/it]

Epoch: 24, Time Spent: 199.99s, Accuracy: 96.46%


 96%|█████████▌| 24/25 [03:29<00:08,  8.87s/it]

Epoch: 25, Time Spent: 209.08s, Accuracy: 10.09%


100%|██████████| 25/25 [03:37<00:00,  8.72s/it]

Epoch: 26, Time Spent: 217.97s, Accuracy: 10.09%





0,1
Accuracy/test,▇██████████▇█████████████████▁█████████▁
Loss/train,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▁▁▁▁▁▁▁▁▁█
global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███

0,1
Accuracy/test,0.1009
Loss/train,1.86676
global_step,25.0
