### elementary implementation of a simple neural network with one hidden layer (two layers of weights)

In [1]:
# imports
import tensorflow as tf
import numpy as np
import time
from matplotlib import pyplot as plt

# settings
# IPython magic (not plain Python): render matplotlib figures inline in the notebook
%matplotlib inline
# seed NumPy's global random state so that the np.random calls below
# (weight initialisation, sample shuffling) are repeatable between runs
np.random.seed(1515)

In [2]:
# load data
mnist = tf.keras.datasets.mnist
(x_train, y_train),  (x_test, y_test) = mnist.load_data()
# rescale the raw pixel values by 1/255
x_train, x_test = x_train/255.0, x_test/255.0

# convert mnist data to 2d: one flattened image per row
# (sample counts are read from the arrays instead of being hard-coded,
# and -1 lets numpy infer the flattened length for any image shape)
x_train_nn = np.reshape(x_train, (x_train.shape[0], -1))
x_test_nn = np.reshape(x_test, (x_test.shape[0], -1))
empty_array = np.zeros(shape = (x_train_nn.shape[0], 10))
empty_array2 = np.zeros(shape = (x_test_nn.shape[0], 10))

# vectorised one hot encoding: row i gets a 1 in the column given by label i
empty_array[np.arange(x_train_nn.shape[0]), y_train] = 1
empty_array2[np.arange(x_test_nn.shape[0]), y_test] = 1

# keep the original label vector reachable before y_train is rebound
# to the one hot matrix
y_train_2 = y_train[:]
y_train = empty_array
y_test = empty_array2

In [6]:
def create_parameters(no_hidden_units, no_inputs=784, no_outputs=10):
    """
    create randomly initialised parameters for a network with one hidden
    layer and one output layer, each with its own bias vector

    every value is drawn from a standard normal distribution using
    numpy's global random state

    no_hidden_units: number of units in the hidden layer
    no_inputs: length of one input vector (default 784, the value that was
        previously hard-coded)
    no_outputs: number of output units (default 10, the value that was
        previously hard-coded)

    returns (w1, w2, bw1, bw2) with shapes
        w1:  (no_inputs, no_hidden_units)
        w2:  (no_hidden_units, no_outputs)
        bw1: (no_hidden_units,)
        bw2: (no_outputs,)
    """

    # the draw order (bw1, bw2, w1, w2) is kept on purpose: with a fixed
    # seed it determines which random numbers end up in which parameter
    bw1 = np.random.normal(size=no_hidden_units)
    bw2 = np.random.normal(size=no_outputs)
    w1 = np.random.normal(size=(no_inputs, no_hidden_units))
    w2 = np.random.normal(size=(no_hidden_units, no_outputs))
    return (w1, w2, bw1, bw2)

# initial parameters for a network with 50 hidden units
w_1, w_2, bw1, bw2 = create_parameters(no_hidden_units=50)

def sigmoid(w_1, bw1, x_train):
    """
    hidden layer activation: affine map of the input followed by the
    logistic sigmoid

    w_1: weight matrix applied to the (transposed) input
    bw1: bias added to the pre-activation
    x_train: input array, transposed before the product

    returns the transposed sigmoid activations
    """

    pre_activation = x_train.T.dot(w_1) + bw1
    activations = (1 + np.exp(-pre_activation)) ** -1
    return activations.T

def propagate(w_1, w_2, bw1, bw2, x_train):
    """
    forward pass: hidden activations followed by a softmax output layer

    w_1, bw1: weights and bias handed to sigmoid() for the hidden layer
    w_2, bw2: weights and bias of the output layer
    x_train: input handed to sigmoid()

    returns (hidden activations, softmax outputs with 10**-7 added)
    """
    hidden = sigmoid(w_1, bw1, x_train)
    logits = np.dot(hidden.T, w_2) + bw2
    # subtracting the maximum does not change the softmax result but keeps
    # the exponentials from overflowing
    exp_scores = np.exp(logits - np.max(logits))
    probabilities = exp_scores / np.sum(exp_scores, axis = 0)
    # small offset keeps every output strictly positive, so taking the log
    # of an output (as cross_entropy_loss does) never hits log(0)
    probabilities += 10**-7
    return hidden, probabilities

def backpropagate(w_1, w_2, bw1, bw2, x_train, y_train, no_hidden_units):
    """
    gradients of the loss for one sample, obtained by backpropagation

    w_1, w_2, bw1, bw2: current parameters, forwarded to propagate()
    x_train: one input sample
    y_train: target for that sample, subtracted from the network outputs
    no_hidden_units: not used inside this function

    returns (w1_grad, w2_grad, bw1_grad, bw2_grad); the two bias gradients
    are column vectors of shape (n, 1)
    """
    hidden, probabilities = propagate(w_1, w_2, bw1, bw2, x_train)

    # output layer error, as a column vector
    output_error = probabilities - y_train
    output_error = output_error.reshape((output_error.shape[0], 1))
    hidden_col = hidden.reshape((hidden.shape[0], 1))

    # column times row broadcasts to the outer product
    w2_grad = hidden_col * output_error.T
    bw2_grad = output_error

    # push the error back through w_2 and the sigmoid slope h * (1 - h)
    sigmoid_slope = hidden_col * (1 - hidden_col)
    hidden_error = sigmoid_slope * np.dot(w_2, output_error)
    hidden_error = hidden_error.reshape((hidden_error.shape[0], 1))

    input_col = np.reshape(x_train, (x_train.shape[0], 1))
    w1_grad = input_col * hidden_error.T
    bw1_grad = hidden_error
    return (w1_grad, w2_grad, bw1_grad, bw2_grad)

def update_weights(w_1, w_2, bw1, bw2, x_train, y_train, no_hidden_units, learning_rate=0.02):
    """
    run one epoch of stochastic gradient descent (one update per sample)

    w_1, w_2, bw1, bw2: current parameters; the passed arrays are not
        modified, updated copies are returned
    x_train: 2d array with one sample per row
    y_train: 2d array with one target per row, aligned with x_train
    no_hidden_units: forwarded unchanged to backpropagate()
    learning_rate: step size of every update (default 0.02, the value
        that was previously hard-coded)

    returns the updated (w_1, w_2, bw1, bw2)
    """
    # visit every sample exactly once in random order; the number of
    # samples comes from the data instead of a hard-coded 60000
    order = np.arange(x_train.shape[0])
    np.random.shuffle(order)
    # indexing row by row through the shuffled order avoids building a
    # shuffled copy of the whole data set on every epoch
    for idx in order:
        w1_grad, w2_grad, bw1_grad, bw2_grad = backpropagate(
            w_1, w_2, bw1, bw2, x_train[idx, :], y_train[idx, :], no_hidden_units
        )
        w_1 = w_1 - learning_rate*w1_grad
        w_2 = w_2 - learning_rate*w2_grad
        # bias gradients come back as column vectors; flatten them so the
        # biases keep their 1d shape
        bw1 = bw1 - learning_rate*bw1_grad.flatten()
        bw2 = bw2 - learning_rate*bw2_grad.flatten()
    return (w_1, w_2, bw1, bw2)

def cross_entropy_loss(w_1, w_2, bw1, bw2, x_train, y_train, no_hidden_units):
    """
    summed cross entropy loss and accuracy, evaluated sample by sample

    w_1, w_2, bw1, bw2: parameters forwarded to propagate()
    x_train: 2d array with one sample per row
    y_train: 2d array with one target row per sample; the position of the
        largest entry is taken as the true class
    no_hidden_units: not used inside this function

    returns (summed loss, number of correct predictions, fraction correct)
    """
    n_samples = x_train.shape[0]
    total_loss = 0
    n_correct = 0
    for row in range(n_samples):
        target = np.argmax(y_train[row, :])
        _, probabilities = propagate(w_1, w_2, bw1, bw2, x_train[row, :])
        # loss of one sample: negative log of the output at the true class
        total_loss -= np.log(probabilities[target])
        # a prediction is correct when the largest output sits at the true class
        if np.argmax(probabilities) == target:
            n_correct += 1
    accuracy = n_correct / n_samples
    return (total_loss, n_correct, accuracy)
        
def main(w_1, w_2, bw1, bw2, x_train, y_train, no_hidden_units):
    """
    train the network for 10 epochs, printing the training loss and
    accuracy after every epoch

    w_1, w_2, bw1, bw2: initial parameters
    x_train, y_train: training inputs and targets, forwarded to
        update_weights() and cross_entropy_loss()
    no_hidden_units: forwarded unchanged to both helpers

    returns the parameters after the last epoch
    """
    n_epochs = 10
    for epoch in range(n_epochs):
        w_1, w_2, bw1, bw2 = update_weights(
            w_1, w_2, bw1, bw2, x_train, y_train, no_hidden_units
        )
        loss, _, accuracy = cross_entropy_loss(
            w_1, w_2, bw1, bw2, x_train, y_train, no_hidden_units
        )
        print("loss: %f, accuracy: %f" % (loss, accuracy))
    return (w_1, w_2, bw1, bw2)

# train on the flattened training images and keep the final weights
w_1, w_2, bw1, bw2 = main(
    w_1, w_2, bw1, bw2,
    x_train=x_train_nn,
    y_train=y_train,
    no_hidden_units=50,
)

loss: 19429.436377, accuract: 0.902233
loss: 14959.790563, accuract: 0.924600
loss: 12832.460343, accuract: 0.935467
loss: 11114.862115, accuract: 0.944167
loss: 10057.775235, accuract: 0.949033
loss: 9361.336589, accuract: 0.952667
loss: 8121.620933, accuract: 0.958550
loss: 7677.409774, accuract: 0.961633
loss: 7375.470176, accuract: 0.962917
loss: 6802.592994, accuract: 0.965933


In [181]:
# test accuracy
# result tuple: summed loss, number of correct samples, and lastly accuracy
cross_entropy_loss(
    w_1, w_2, bw1, bw2,
    x_test_nn, y_test,
    50,
)

(1769.120161146334, 9467, 0.9467)