# Programing exercise - Neural Networks

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

# load MATLAB files
from scipy.io import loadmat

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_seq_items', None)
 
#%config InlineBackend.figure_formats = {'pdf',}
%matplotlib inline

import seaborn as sns
sns.set_context('notebook')
sns.set_style('darkgrid')

### load data

In [3]:
data = loadmat('data/ex4data1.mat')
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [4]:
y = data['y']
X = np.c_[np.ones((data['X'].shape[0],1)), data['X']]

print('X: ', X.shape, " (with intercept)")
print('y: ', y.shape)

X:  (5000, 401)  (with intercept)
y:  (5000, 1)


In [5]:
weights = loadmat('data/ex3weights.mat')
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [6]:
theta1, theta2 = weights['Theta1'], weights['Theta2']
print('theta1: ', theta1.shape)
print('theta2: ', theta2.shape)
params = np.r_[theta1.ravel(), theta2.ravel()]
print('params: ', params.shape)

theta1:  (25, 401)
theta2:  (10, 26)
params:  (10285,)


#### Neural Networks

## Neural networks - feed forword and cost function

In [8]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

In [9]:
def sigmoid_gradient(z):
    return sigmoid(z)*(1-sigmoid(z))

In [21]:
def nn_cost_function(nn_params, input_layer_size, hidden_layer_size, num_labels, features, classes, reg):
    theta1 = nn_params[0:(hidden_layer_size*(input_layer_size+1))].reshape(hidden_layer_size, (input_layer_size+1))
    theta2 = nn_params[(hidden_layer_size*(input_layer_size+1)):].reshape(num_labels, (hidden_layer_size+1))
    
    m = features.shape[0]
    y_matrix = pd.get_dummies(classes.ravel()).as_matrix()
    
    #cost
    a1 = features
    
    z2 = theta1.dot(a1.T) # 25x401 * 401x5000 = 25x5000
    a2 = np.c_[np.ones((features.shape[0],1)),sigmoid(z2.T)] # 5000x26 
    
    z3 = theta2.dot(a2.T) # 10x26 * 26x5000 = 10x5000
    a3 = sigmoid(z3)
    
    J = -1*(1/m)*np.sum((np.log(a3.T)*(y_matrix)+np.log(1-a3).T*(1-y_matrix))) + (reg/(2*m))*(np.sum(np.square(theta1[:,1:])) + np.sum(np.square(theta2[:,1:])))
    
    # gradients
    d3 = a3.T - y_matrix
    d2 = theta2[:, 1:].T.dot(d3.T) * sigmoid_gradient(z2)
    
    delta1 = d2.dot(a1)
    delta2 = d3.T.dot(a2)
    theta1_ = np.c_[np.ones((theta1.shape[0], 1)), theta1[:,1:]]
    theta2_ = np.c_[np.ones((theta2.shape[0], 1)), theta2[:,1:]]
    
    theta1_grad = delta1/m + (theta1_*reg)/m
    theta2_grad = delta2/m + (theta2_*reg)/m
    
    return J, theta1_grad, theta2_grad

In [23]:
# rugularized parameter = 0
nn_cost_function(params, 400, 25, 10, X, y, 0)[0]

0.28762916516131887

In [24]:
# rugularized parameter = 1
nn_cost_function(params, 400, 25, 10, X, y, 1)[0]

0.38376985909092359

In [25]:
[sigmoid_gradient(i) for i in [-1, -0.5, 0, 0.5, 1]]

[0.19661193324148185,
 0.23500371220159449,
 0.25,
 0.23500371220159449,
 0.19661193324148185]