<a href="https://colab.research.google.com/github/harperd/machine-learning/blob/master/notebooks/neural-network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Network

Create a neural network to recognize handwritten digits (0 to 9).

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as style
import pandas as pd
import google.colab as colab
import scipy.optimize as opt
import io

from scipy.io import loadmat

# Render matplotlib figures inline so they are saved with the notebook
%matplotlib inline

# Apply matplotlib's 'dark_background' style sheet to the figures
style.use('dark_background')

In [3]:
# Upload the data file into the Colab runtime, then list the working
# directory to confirm what is there.
mat_file = colab.files.upload()
!ls -l

Saving ex4data1.mat to ex4data1.mat
total 7340
-rw-r--r-- 1 root root 7511764 Sep 16 00:40 ex4data1.mat
drwxr-xr-x 1 root root    4096 Aug 27 16:17 sample_data


In [4]:
# Read the uploaded MATLAB file; the bare expression on the last line
# displays everything that was loaded.
mat_data = loadmat('ex4data1.mat')
mat_data

{'X': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 '__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011',
 '__version__': '1.0',
 'y': array([[10],
        [10],
        [10],
        ...,
        [ 9],
        [ 9],
        [ 9]], dtype=uint8)}

In [5]:
# Pull the 'X' and 'y' entries out of the loaded file and report their shapes.
X = mat_data['X']
y = mat_data['y']

print(f'X Shape: {X.shape}')
print(f'y Shape: {y.shape}')

X Shape: (5000, 400)
y Shape: (5000, 1)


![Hand written numbers](https://github.com/harperd/machine-learning/blob/master/images/ex3-1.png?raw=1)

In [22]:
def add_bias(m):
  """Prepend a bias term of 1 to a vector, or to every row of a matrix.

  Args:
    m: 1-D array-like (a single vector) or 2-D array-like (one row per
      example).

  Returns:
    For 2-D input of shape (rows, cols): a new (rows, cols + 1) array whose
    first column is all ones.
    For 1-D input of length n: a new array of length n + 1 whose first
    element is 1.

  Raises:
    ValueError: if m is neither 1-D nor 2-D. Without this check the function
      would fall off the end and hand None back to the caller.
  """
  dimensions = np.ndim(m)

  if dimensions == 2:
    ones_column = np.ones((np.shape(m)[0], 1))
    return np.concatenate((ones_column, m), axis = 1)

  if dimensions == 1:
    return np.insert(m, 0, 1, axis = 0)

  raise ValueError(
    f'add_bias expects a 1-D or 2-D array, got {dimensions} dimension(s)')
  
def describe_network(network):
  """Print a one-line diagram of the network's layers and weight matrices.

  Args:
    network: dict with 'layers_dims' (layer sizes, input layer first) and
      'parameters' (dict holding a 'W<n>' array for every layer that feeds
      another layer).

  Layer 1 is reported as the input layer, the last layer as the output
  layer and everything in between as hidden. Each non-output layer is
  followed by the shape of the weight matrix leaving it.
  """
  sizes = network['layers_dims']
  weights = network['parameters']
  last_layer = len(sizes)

  print('The following describes the configured network architecture:')
  print()

  for number, units in enumerate(sizes, start = 1):
    is_input = number == 1

    # The input check wins, so a one-layer network is still treated as input.
    if not is_input and number == last_layer:
      print(f'Layer{number}:Output[x{units}]')
      continue

    role = 'Input' if is_input else 'Hidden'
    shape = weights[f'W{number}'].shape
    print(f'Layer{number}:{role}[x{units}] --> ', end = '')
    print(f'W{number}[{shape[0]}x{shape[1]}] --> ', end = '')

# Xavier initialization: because the network uses sigmoid activations, this
# scale is meant to avoid vanishing or exploding gradients in a deep network.
def xavier(prev_layer_size):
    """Return the scale factor sqrt(1 / prev_layer_size).

    Args:
        prev_layer_size: number of units in the layer feeding the weights.
    """
    reciprocal_size = 1 / prev_layer_size
    return np.sqrt(reciprocal_size)
  
def initialize_parameters(layers_dims):
    """Create one small random weight matrix per pair of adjacent layers.

    NumPy's global random generator is re-seeded with 1 on every call, so
    repeated calls with the same sizes return the same values.

    Args:
        layers_dims: sequence of layer sizes, input layer first.

    Returns:
        dict mapping 'W1', 'W2', ... to arrays. 'Wi' connects layer i to
        layer i + 1 and has one row per unit in layer i + 1 and one column
        per unit in layer i, plus one extra column for the bias.
    """
    np.random.seed(1)

    weights = {}
    adjacent_layers = zip(layers_dims, layers_dims[1:])

    for i, (units_in, units_out) in enumerate(adjacent_layers, start = 1):
        expected_shape = (units_out, units_in + 1)

        # Standard-normal draws scaled down by a fixed 0.01.
        # TODO (from the original author): initialize between -1 and 1, and
        # replace the fixed scale with xavier(units_in) once that is fixed.
        matrix = np.random.randn(*expected_shape) * .01

        assert matrix.shape == expected_shape

        weights[f'W{i}'] = matrix

    return weights
  
def to_vec(y, K):
  """One-hot encode every label in y.

  Labels below 10 are used directly as both the digit and the index of the
  1 in the vector; any other label maps to digit/index 0 (the loaded data
  contains label 10, which this mapping treats as digit 0).

  Args:
    y: labels, either an (m, 1) column or a flat length-m sequence.
    K: number of classes, i.e. the length of each one-hot vector.

  Returns:
    dict keyed by example number (starting at 1). Each value is a dict with
    'vector' (a (K, 1) array of zeros with a single 1) and 'digit' (the
    index of that 1, as a plain int).

  Raises:
    IndexError: if a label below 10 is not a valid index into a length-K
      vector.
  """
  y_vec = {}

  # Flatten so a column vector and a flat vector are handled alike.
  labels = np.asarray(y).ravel()

  # Iterate over the label values themselves. Using a label to index back
  # into y would read some other example's label instead of this one's.
  for example, label in enumerate(labels, start = 1):
    index = int(label) if label < 10 else 0

    vec = np.zeros((K, 1))
    vec[index][0] = 1

    y_vec[example] = {
        'vector': vec,
        'digit': index
    }

  return y_vec
  
def initialize_network(layers_dims, y):
    """Assemble the network dictionary used by the rest of the notebook.

    Args:
        layers_dims: sequence of layer sizes, input layer first.
        y: training labels; the number of distinct values becomes the
            number of classes.

    Returns:
        dict with 'y' (labels converted by to_vec), 'classes' (number of
        distinct labels), 'layers_dims' (as passed in) and 'parameters'
        (weights from initialize_parameters).
    """
    class_count = len(np.unique(y))

    return {
        'y': to_vec(y, class_count),
        'classes': class_count,
        'layers_dims': layers_dims,
        'parameters': initialize_parameters(layers_dims),
    }

# Build a three-layer network: one input unit per column of X, 25 hidden
# units and 10 output units, then print its layout.
network = initialize_network([X.shape[1], 25, 10], y)
print()
describe_network(network)


The following describes the configured network architecture:

Layer1:Input[x400] --> W1[25x401] --> Layer2:Hidden[x25] --> W2[10x26] --> Layer3:Output[x10]


In [24]:
def sigmoid(z):
  """Logistic function 1 / (1 + e^(-z)); NumPy applies it element-wise to arrays."""
  denominator = 1 + np.exp(-z)
  return 1 / denominator

def g(z):
  """Activation step: apply sigmoid to z, then prepend the bias unit."""
  activations = sigmoid(z)
  return add_bias(activations)

def convert_to_output_vec(h, K):
    """Turn output-layer activations into a one-hot prediction vector.

    Args:
        h: output-layer activations with the bias unit as the first element.
        K: number of classes, i.e. the length of the returned vector.

    Returns:
        A (K, 1) array of zeros with a single 1 at the position of the
        largest output unit (the first such position if several tie).

    Raises:
        ValueError: if h holds nothing besides the bias unit.
        IndexError: if the winning position does not fit in a length-K vector.
    """
    # Drop the bias unit (np.delete without an axis also flattens h).
    output_units = np.delete(h, 0)

    # One-vs-all: the unit with the maximum probability wins. np.argmax finds
    # the first index of the maximum in a single pass.
    winner = np.argmax(output_units)

    # Start from a K-dimensional vector of zeros and mark the winner.
    v = np.zeros((K, 1))
    v[winner] = 1

    return v
  
def feed_forward(network, X):
  """Run every example in X through the network, one example at a time.

  Args:
    network: dict with 'layers_dims', 'parameters' ('W1', 'W2', ...) and
      'classes'.
    X: 2-D array with one example per row; a bias column is added here.

  Returns:
    dict keyed by example number (starting at 1). Each value holds the
    activations 'a1' ... 'a<L>' for that example (each with a leading bias
    unit, the output layer included) and 'output_vec', the one-hot
    prediction. Nothing is recorded when the network has fewer than two
    layers.
  """
  print('Propagating forward...')

  layer_count = len(network['layers_dims'])
  weights = network['parameters']
  class_count = network['classes']

  passes = {}

  for example, x in enumerate(add_bias(X), start = 1):
    # The biased input row is the first layer's activation.
    compute = {'a1': x}

    # Each later layer applies the weights leaving the previous layer.
    for layer in range(2, layer_count + 1):
      previous = compute[f'a{layer - 1}']
      z = weights[f'W{layer - 1}'] @ previous
      compute[f'a{layer}'] = g(z)

    # A pass is only recorded once a real output layer has been computed.
    if layer_count >= 2:
      compute['output_vec'] = convert_to_output_vec(
        compute[f'a{layer_count}'], class_count)
      passes[example] = compute

  return passes
  
# Run (and time) a forward pass over every example, keeping the per-example
# results on the network under 'forward_passes'.
%time network['forward_passes'] = feed_forward(network, X)
examples = len(network['forward_passes'])
print(f'Processed {examples} examples.')

Propagating forward...
CPU times: user 749 ms, sys: 550 ms, total: 1.3 s
Wall time: 665 ms
Processed 5000 examples.


In [27]:
def compute_cost(network, X):
  """Unfinished cost function: for now it only prints one example's data.

  For example number 2 (when X has at least two rows) it prints the
  predicted one-hot vector, the target one-hot vector and the target
  digit, then stops. No cost is computed and nothing is returned.

  Args:
    network: dict with 'forward_passes' and 'y', both keyed by example
      number.
    X: array whose first dimension is the number of examples.

  NOTE(review): the loop starts at example 2 and breaks after its first
  iteration, so example 1 is never read -- confirm whether it should start
  at 1 once the cost is actually accumulated.
  """
  example_count = X.shape[0]
  label_count = 10  # number of labels (digits 0 - 9); not used yet

  # Intended to visit every training example once the cost is implemented.
  for example in range(2, example_count + 1):
    forward_pass = network['forward_passes'][example]
    target = network['y'][example]
    report = (forward_pass['output_vec'], target['vector'], target['digit'])

    for item in report:
      print(item)

    # TODO (from the original author): sum the cost over the output units
    # k = 1..K here. Bias units are generally left out of regularization,
    # which is why that limit is meant to start at 1.

    break

# Exercise the cost function on the forward passes stored on the network.
compute_cost(network, X)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]]
[[1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]
0
