<a href="https://colab.research.google.com/github/harperd/machine-learning/blob/master/notebooks/neural-network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Network

Create a neural network to recognize handwritten digits (0 to 9).

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.style as style
import google.colab as colab

from sklearn.preprocessing import OneHotEncoder
from scipy.io import loadmat

# Allow saving our graphs in the notebook
%matplotlib inline

style.use('dark_background')

In [0]:
# Upload the data file from the local machine into the Colab runtime; the
# value returned by the upload call is kept in `mat_file`.
mat_file = colab.files.upload()
# List the working directory to confirm which files are now present.
!ls -l

Saving ex3data1.mat to ex3data1.mat
total 14676
-rw-r--r-- 1 root root 7511764 Jan  3 20:29 ex3data1.mat
-rw-r--r-- 1 root root 7511764 Jan  3 19:50 ex4data1.mat
drwxr-xr-x 1 root root    4096 Dec 18 16:52 sample_data


In [4]:
# Read the MATLAB file into a dict; the displayed result shows the arrays stored
# under 'X' and 'y' next to the MAT-file metadata entries.
# NOTE(review): the upload cell's recorded output saved ex3data1.mat while this
# cell reads ex4data1.mat (both appear in the directory listing) - confirm which
# file is intended.
mat_data = loadmat('ex4data1.mat')
mat_data

{'X': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 '__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011',
 '__version__': '1.0',
 'y': array([[10],
        [10],
        [10],
        ...,
        [ 9],
        [ 9],
        [ 9]], dtype=uint8)}

In [5]:
# Pull the feature matrix and the label column out of the loaded MAT dictionary.
X, y = mat_data['X'], mat_data['y']

# Report the shapes first, then the (NumPy-abbreviated) contents; the empty
# strings produce the blank separator lines.
print(
    f'X Shape: {X.shape}',
    f'y Shape: {y.shape}',
    '',
    f'X = {X}',
    '',
    f'y = {y}',
    sep='\n',
)

X Shape: (5000, 400)
y Shape: (5000, 1)

X = [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

y = [[10]
 [10]
 [10]
 ...
 [ 9]
 [ 9]
 [ 9]]


![Hand written numbers](https://github.com/harperd/machine-learning/blob/master/images/ex3-1.png?raw=1)

In [23]:
# Create a scikit-learn one-hot encoder that returns a dense array rather than
# a sparse matrix (a matrix in which most of the elements are zero).
# The keyword that controls this was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2 and the old name was later removed, so try the current
# spelling first and fall back to the old one on older releases.
try:
  encoder = OneHotEncoder(
      sparse_output=False,
      # 'auto' determines the categories automatically from the fitted data.
      categories='auto')
except TypeError:
  encoder = OneHotEncoder(
      sparse=False,
      categories='auto')

# Fit then transform the data.
# Fitting learns the distinct label values that occur in y (one output column
# per distinct value). Transform then replaces every label with its one-hot row.
y_onehot = encoder.fit_transform(y)

# One row per distinct one-hot vector that occurs in the encoded labels.
categories = np.unique(y_onehot, axis=0)

# NOTE: `i` is the one-hot column index, not the raw label value; the label a
# column stands for can be looked up in encoder.categories_[0][i].
for i in range(len(categories)):
  # argmax gives the first row whose entry in column i is the 1.
  onehot_category = categories[np.argmax(categories[:, i])]

  print(f'Category: {i} \tEncoded: {onehot_category}')

Category: 0 	Encoded: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Category: 1 	Encoded: [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
Category: 2 	Encoded: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
Category: 3 	Encoded: [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
Category: 4 	Encoded: [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
Category: 5 	Encoded: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
Category: 6 	Encoded: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
Category: 7 	Encoded: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
Category: 8 	Encoded: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
Category: 9 	Encoded: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


## Setup

In [0]:
# NOTE: despite its name, the cells shown in this notebook only use
# `learning_rate` as the L2 regularization strength (the multiplier on the
# squared-weight penalty in the cost and on the weight term of the gradient).
learning_rate = 1
# Number of units in the hidden layer.
hidden_layer_size = 25
# One input unit per feature column of X.
input_layer_size = X.shape[1]
# One output unit per one-hot column.
num_labels = y_onehot.shape[1]

## Weight Initialization

In [16]:
def initialize_weights(X, y, input_layer_size, hidden_layer_size, num_labels):
  """Return one flat vector of small random weights covering both layers.

  The vector holds hidden_layer_size * (input_layer_size + 1) values for the
  hidden layer followed by num_labels * (hidden_layer_size + 1) values for the
  output layer (the "+ 1" is the bias column of each layer). Every value is
  drawn from NumPy's global random generator and lies in [-0.125, 0.125).

  X and y are accepted but not used.
  """
  hidden_weight_count = hidden_layer_size * (input_layer_size + 1)
  output_weight_count = num_labels * (hidden_layer_size + 1)
  total_count = hidden_weight_count + output_weight_count

  # Draws from [0, 1), shifted to be centered on zero and then shrunk.
  uniform_draws = np.random.random(size=total_count)
  centered = uniform_draws - 0.5
  return centered * 0.25

# Seed NumPy's global generator so that re-running this cell (or the whole
# notebook) reproduces the same starting weights. The value itself is arbitrary.
np.random.seed(42)

W = initialize_weights(X, y, input_layer_size, hidden_layer_size, num_labels)

# Unravel to get each weight matrix: the first
# hidden_layer_size * (input_layer_size + 1) entries form W1, the rest form W2.
# Both are wrapped in np.matrix, for which `*` means matrix multiplication.
W1 = np.matrix(np.reshape(
    W[:hidden_layer_size * (input_layer_size + 1)],
    (hidden_layer_size, (input_layer_size + 1))))
W2 = np.matrix(np.reshape(
    W[hidden_layer_size * (input_layer_size + 1):],
    (num_labels, (hidden_layer_size + 1))))

W1.shape, W2.shape

((25, 401), (10, 26))

## Forward Propagation

![](https://github.com/harperd/machine-learning/blob/master/images/nn-forward-propagation.png?raw=1)

In [17]:
def sigmoid(z):
  """Logistic function 1 / (1 + e^(-z)); applied element-wise to array input."""
  denominator = 1 + np.exp(-z)
  return 1 / denominator

def forward_propagate(X, y, W1, W2, hidden_size, num_labels):
  """Run one forward pass through the two-layer network.

  Parameters:
    X: 2-D array of inputs, one example per row.
    W1: hidden-layer weights; its first column multiplies the bias input.
    W2: output-layer weights; its first column multiplies the bias unit.
    y, hidden_size, num_labels: accepted but not used by this function.

  Returns:
    (a1, z2, a2, z3, h) where a1 is X with a leading column of ones, z2 is the
    hidden-layer pre-activation, a2 is sigmoid(z2) with a leading column of
    ones, z3 is the output-layer pre-activation and h is sigmoid(z3).

  The layer products are written with np.dot instead of `*`. `*` is a matrix
  product only when an operand is an np.matrix; np.dot gives the same result
  for np.matrix weights and also works when plain ndarrays are passed.
  """
  # Get the number of examples.
  m = X.shape[0]
  bias_column = np.ones(m)

  #
  # Hidden layer
  #

  # Add a new first column to our inputs, X, with all ones (bias).
  a1 = np.insert(X, 0, values=bias_column, axis=1)

  # Weighted sum of the inputs (plus bias) for every hidden unit.
  z2 = np.dot(a1, W1.T)

  # Squash to (0, 1) with the sigmoid, then prepend the bias unit for the
  # next layer.
  a2 = np.insert(sigmoid(z2), 0, values=bias_column, axis=1)

  #
  # Output layer
  #

  # Weighted sum of the hidden activations (plus bias) for every output unit.
  z3 = np.dot(a2, W2.T)

  # Squash to (0, 1) to obtain the network's hypothesis.
  h = sigmoid(z3)

  return a1, z2, a2, z3, h

# Run one forward pass over all of X with the current W1 and W2.
a1, z2, a2, z3, h = forward_propagate(X, y, W1, W2, hidden_layer_size, num_labels)

# Display the shape of every intermediate result.
a1.shape, z2.shape, a2.shape, z3.shape, h.shape

((5000, 401), (5000, 25), (5000, 26), (5000, 10), (5000, 10))

## Cost Function and Regularization

![](https://github.com/harperd/machine-learning/blob/master/images/nn-cost.png?raw=1)

The above cost function shows the first, second and regularization terms.

In [32]:
# Compare the output against the true labels.
def compute_cost(X, y, h, W1, W2, learning_rate):
  """Return the L2-regularized cross-entropy cost of the predictions h.

  Parameters:
    X: inputs; only the number of rows (examples) is used.
    y: one-hot labels, same shape as h.
    h: network outputs in (0, 1), one row per example.
    W1, W2: layer weights, passed on to regularize_l2.
    learning_rate: despite its name, this is passed to regularize_l2 as the
      multiplier of the weight penalty (the regularization strength).

  Returns:
    The unregularized cost averaged over the examples, plus the penalty added
    by regularize_l2.
  """
  m = X.shape[0]

  # Element-wise log-likelihood of every label/output pair, computed for the
  # whole data set at once instead of one row at a time in a Python loop.
  log_likelihood = np.multiply(y, np.log(h)) + np.multiply(1 - y, np.log(1 - h))
  J = -np.sum(log_likelihood) / m

  return regularize_l2(J, X, W1, W2, learning_rate)
  
def regularize_l2(J, X, W1, W2, learning_rate):
  """Add the L2 weight penalty to the cost J.

  The penalty is learning_rate / (2 * m) times the sum of the squared entries
  of W1 and W2, where m is the number of rows of X. The first column of each
  weight matrix (the bias weights) is left out of the sum.
  """
  example_count = X.shape[0]
  scale = float(learning_rate) / (2 * example_count)

  # Skip column 0 so the bias weights are not penalized.
  squared_weight_sum = np.sum(np.power(W1[:,1:], 2)) + np.sum(np.power(W2[:,1:], 2))

  return J + scale * squared_weight_sum

# Cost of the forward-pass output h measured against the one-hot labels.
J = compute_cost(X, y_onehot, h, W1, W2, learning_rate)

print(f'Cost = {J}')

Cost = 7.121222150880251


## Back Propagation

![](https://github.com/harperd/machine-learning/blob/master/images/nn-back-propagation.png?raw=1)

In [36]:
# To check that our back propagation code works as expected, we can compare its
# result with a numerical estimate of the gradient. This is also called
# numerical differentiation (gradient checking).
def sigmoid_gradient(z):
  """Derivative of the sigmoid at z: s * (1 - s) with s = sigmoid(z), element-wise."""
  s = sigmoid(z)
  return np.multiply(s, 1 - s)

def back_propogation(X, y, h, W1, W2, a1, z2, a2, learning_rate):
  """Return the regularized gradient of the cost as one flat array.

  Parameters:
    X: inputs; only the number of rows (examples) is used.
    y: one-hot labels, same shape as h.
    h: network outputs from the forward pass.
    W1, W2: hidden-layer and output-layer weights (bias weights in column 0).
    a1, z2, a2: intermediate results of the forward pass that produced h.
    learning_rate: despite its name, used here as the multiplier of the weight
      term added to the gradient (the regularization strength).

  Returns:
    1-D array holding the gradient for W1 followed by the gradient for W2,
    each flattened row by row (W1.size + W2.size entries in total).
  """
  # Number of training examples is the row count of X. (X.shape[1] is the
  # number of features; using it made the gradient cover only the first
  # X.shape[1] examples and normalize by the wrong count.)
  m = X.shape[0]

  # Work on plain ndarrays so that `*` is element-wise and np.dot is the matrix
  # product regardless of whether the caller passed np.matrix or ndarray.
  y, h, W1, W2, a1, z2, a2 = (np.asarray(v) for v in (y, h, W1, W2, a1, z2, a2))

  # Output-layer error, one row per example.
  d3 = h - y

  # Hidden-layer error. The bias column of W2 is skipped because the bias unit
  # has no incoming weights to propagate an error to.
  d2 = np.dot(d3, W2[:, 1:]) * sigmoid_gradient(z2)

  # Sum the per-example outer products with one matrix product each, then
  # average over the examples.
  delta1 = np.dot(d2.T, a1) / m
  delta2 = np.dot(d3.T, a2) / m

  # add the gradient regularization term (bias column excluded)
  delta1[:, 1:] += (W1[:, 1:] * learning_rate) / m
  delta2[:, 1:] += (W2[:, 1:] * learning_rate) / m

  # unravel the gradient matrices into a single array
  grad = np.concatenate((np.ravel(delta1), np.ravel(delta2)))

  return grad

# Pass the one-hot labels (same shape as h), exactly as for compute_cost; the
# raw label column `y` would be broadcast-subtracted from every output unit.
grad = back_propogation(X, y_onehot, h, W1, W2, a1, z2, a2, learning_rate)

print(f'Cost = {J}, Gradient = {grad.shape}')

Cost = 7.121222150880251, Gradient = (10285,)
