# Session 6 : Multiple Hidden Layers, Target binary

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

<p style="font-family: Arial; font-size:1.2em;color:black;">
By Pramod Sharma : pramod.sharma@prasami.com
</p>

In [None]:
# Lets import some libraries
import os

import numpy as np

import pandas as pd

import tensorflow as tf

import matplotlib.pyplot as plt

from sklearn import datasets, linear_model

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.metrics import plot_confusion_matrix

from sklearn.preprocessing import MinMaxScaler

import shutil

%matplotlib inline

In [None]:
# Notebook-wide settings

inpDir = '../input'    # directory the input data is read from
outDir = '../output'   # directory outputs are written to
RANDOM_STATE = 24      # fixed seed for initialization ----- REMEMBER: remove before promotion to production
EPOCHS = 20000         # number of training cycles to run

# Matplotlib defaults applied to every figure in this notebook
params = dict([
    ('legend.fontsize', 'large'),
    ('figure.figsize', (9, 6)),
    ('axes.labelsize', 'x-large'),
    ('axes.titlesize', 'x-large'),
    ('xtick.labelsize', 'large'),
    ('ytick.labelsize', 'large'),
])

# push the defaults into matplotlib's runtime configuration
plt.rcParams.update(params)

## Generate Data Set
<p style="font-family: Arial; font-size:1.2em;color:black;">
Use Sklearn's <a href="http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html">make_moons</a> dataset generator.
</p>

In [None]:
# Draw 1280 noisy, shuffled samples from scikit-learn's make_moons generator (seeded for repeatability)
X, y = datasets.make_moons(
    n_samples=1280,
    noise=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

<p style="font-family: Arial; font-size:1.1em;color:brown;">
<strong>Note</strong>: All two-dimensional matrices are represented by upper-case names and all arrays (vectors) by lower-case names.
</p>

In [None]:
# Scatter the two features against each other, colouring each sample by its label
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(first_feature, second_feature, c=y, s=30, cmap=plt.cm.Spectral)

plt.grid()

In [None]:
def fn_plot_decision_boundary(pred_func, X, y):
    '''
        Draw the regions predicted by a classifier together with the samples.

        Attrib:
           pred_func : callable that takes an (n, 2) array of points and
                       returns one prediction per point
           X : feature matrix; columns 0 and 1 become the two plot axes
           y : targets, used to colour the scattered samples
        Return:
           None
    '''

    margin = .05     # padding added on every side of the data range
    spacing = 0.01   # distance between neighbouring grid points

    col0, col1 = X[:, 0], X[:, 1]

    # Evenly spaced coordinates covering the padded range of each feature
    axis0 = np.arange(col0.min() - margin, col0.max() + margin, spacing)
    axis1 = np.arange(col1.min() - margin, col1.max() + margin, spacing)
    grid0, grid1 = np.meshgrid(axis0, axis1)

    # One row per grid point, predicted in a single call,
    # then folded back into the shape of the grid
    mesh_points = np.column_stack((grid0.ravel(), grid1.ravel()))
    Z = pred_func(mesh_points).reshape(grid0.shape)

    # Filled contours show the predicted regions ...
    plt.contourf(grid0, grid1, Z, cmap=plt.cm.Paired)

    # ... and the samples are drawn on top of them
    plt.scatter(col0, col1, c=y, cmap=plt.cm.Paired, edgecolors='black')

In [None]:
# Hold out 20% of the samples so the model can be scored on data it was not trained on
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=RANDOM_STATE,
    test_size=0.2,
)

# Shapes, in order: train features, train targets, test features, test targets
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

# Neural Network

## Recap
<img src='../Presentations/images/S6/nn_S6_f1.jpg' style='width: 800px'/>

## The math..
<img src='../Presentations/images/S6/nn_S6_f2.jpg' style='width: 800px'/>

## Multiple hidden layers

<p style="font-family: Arial; font-size:1.2em;color:black;"> 
    Moving over to multilayer network. Our data has <strong>two</strong> features. Hence size of input layer will also be two. The output is binary, we can code it as single column as well as double column output. The hidden layers will be as follows:</p>
<table style="font-family: Arial; font-size:1.2em;color:black;">
    <tr>
        <th>#</th>
        <th>Layer Number</th>
        <th>Nodes </th>
        <th>Activation </th>
    </tr>
    <tr>
        <td>1</td>
        <td>Input Layer</td>
        <td>2</td>
        <td>none</td>
    </tr>
    <tr>
        <td>2</td>
        <td>Hidden Layer 1</td>
        <td>5</td>
        <td>tanh</td>
    </tr>
    <tr>
        <td>3</td>
        <td>Hidden Layer 2</td>
        <td>5</td>
        <td>tanh</td>
    </tr>
    <tr>
        <td>4</td>
        <td>Hidden Layer 3</td>
        <td>4</td>
        <td>tanh</td>
    </tr>
    <tr>
        <td>5</td>
        <td>Hidden Layer 4</td>
        <td>3</td>
        <td>tanh</td>
    </tr>
    <tr>
        <td>6</td>
        <td>Output Layer</td>
        <td>2</td>
        <td>softmax</td>
    </tr>
</table>

<img src='../Presentations/images/S6/nn_S6_f3.jpg' style='width: 800px'/>

## Loss Function
<p style="font-family: Arial; font-size:1.2em;color:black;">
The loss for our prediction $\hat{y}$ with respect to the true labels $y$ is given by:
</p>
$$
\begin{aligned}
L(\hat{y},y) =  -y \cdot \log\hat{y} - (1-y) \cdot \log(1-\hat{y})
\end{aligned}
$$
<p style="font-family: Arial; font-size:1.2em;color:black;">
For all samples:
</p>
$$
\begin{aligned}
J(\hat{y}, y) =  -\frac{1}{m}\sum_{i=1}^{m}\left[ y_i \cdot \log\hat{y}_i + (1-y_i) \cdot \log(1-\hat{y}_i) \right]
\end{aligned}
$$

In [None]:
# Helper function to evaluate the total loss on the dataset

def calculate_loss(model, X, y, reg_lambda=None):
    '''
        Mean cross-entropy loss of the network on (X, y), plus an L2 penalty.

        Args:
            model : dict holding weights 'W1'..'Wn' and biases 'b1'..'bn';
                    every layer but the last uses tanh, the last uses softmax
            X : feature matrix, one row per sample
            y : integer class label per row of X (used as a column index
                into the class probabilities)
            reg_lambda : L2 regularization strength. When None, the
                    module-level `reg_lambda` is used if it exists, else 0.0
        Returns:
            (sum of per-sample cross-entropy
             + reg_lambda/2 * sum of squared weights of all layers) / number of rows in X
        Raises:
            ValueError : if X has no rows
    '''
    # Keep the notebook's original calling convention (global reg_lambda)
    # while still allowing the strength to be passed explicitly.
    if reg_lambda is None:
        reg_lambda = globals().get('reg_lambda', 0.0)

    # Size of *this* data set, so the loss is also correct for test/validation data
    num_examples = X.shape[0]
    if num_examples == 0:
        raise ValueError("calculate_loss needs at least one sample")

    n_layers = sum(1 for key in model if str(key).startswith('W'))

    #***********************************
    # Hidden layers : affine transform followed by tanh
    A = X
    for idx in range(1, n_layers):
        Z = A.dot(model['W%d' % idx]) + model['b%d' % idx]
        A = np.tanh(Z)
        assert (Z.shape == A.shape), "Shape of Z%d and A%d do not match" % (idx, idx)

    # Output layer : log-softmax. Subtracting the row maximum first keeps
    # np.exp from overflowing; the result is mathematically unchanged.
    Z_out = A.dot(model['W%d' % n_layers]) + model['b%d' % n_layers]
    shifted = Z_out - np.max(Z_out, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    #*************************************

    # Cross entropy: negative log-probability assigned to the true class of each sample
    correct_logprobs = -log_probs[np.arange(num_examples), y]
    data_loss = np.sum(correct_logprobs)

    # Add regularization term to loss (weights of every layer, biases excluded)
    squared_weights = sum(np.sum(np.square(model['W%d' % idx]))
                          for idx in range(1, n_layers + 1))
    data_loss += reg_lambda/2 * squared_weights

    return 1./num_examples * data_loss

## Forward Propagation

<img src='../Presentations/images/S6/nn_S6_f4.jpg' style='width: 800px'/>
$
\begin{aligned}
Z^{[l]} & = A^{[l-1]} . W^{[l]} + b^{[l]}\\
A^{[l]} & = \tanh(Z^{[l]}) \\
\end{aligned}
$
<hr>
<p style="font-family: Arial; font-size:1.2em;color:black;">
    And for last layer.</p>
$
\begin{aligned}
Z^{[L]} & = A^{[L-1]} . W^{[L]} + b^{[L]} \\
A^{[L]} & = \mathrm{softmax}(Z^{[L]}) \\
\end{aligned}
$

<p style="font-family: Arial; font-size:1.2em;color:black;">
    Where:
</p>
$
\begin{aligned}
\mathrm{softmax}(z_i) & =  \frac{e^{z_i}}{\sum_{j=1}^n {e^{z_j}}}\\
\end{aligned}
$

## Predict Function
<p style="font-family: Arial; font-size:1.2em;color:black;">
For predictions, we simply run forward propagation once with the learned weights. No iteration or backpropagation is needed at prediction time.
</p>

In [None]:
# Helper function to predict an output (0 or 1)

def predict(model, x):
    '''
     Predict a class index for every row of x with one forward pass.

     Args:
         model: dict holding weights 'W1'..'Wn' and biases 'b1'..'bn';
                every layer but the last uses tanh, the last uses softmax
         x: input features, one row per sample
     Returns:
         array with the index of the most probable class for each row
    '''
    n_layers = sum(1 for key in model if str(key).startswith('W'))

    #***********************************
    # Hidden layers : affine transform followed by tanh
    A = x
    for idx in range(1, n_layers):
        Z = A.dot(model['W%d' % idx]) + model['b%d' % idx]
        A = np.tanh(Z)
        assert (Z.shape == A.shape), "Shape of Z%d and A%d do not match" % (idx, idx)

    # Output layer
    Z_out = A.dot(model['W%d' % n_layers]) + model['b%d' % n_layers]

    # Softmax. The row maximum is subtracted before exponentiating: large
    # logits would otherwise overflow to inf and turn the probabilities
    # into NaN, which makes argmax return the wrong class.
    exp_scores = np.exp(Z_out - np.max(Z_out, axis=1, keepdims=True))
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    #*************************************
    return np.argmax(probs, axis=1) # pick the class with the highest probability

<p style="font-family: Arial; font-size:1.2em;color:black;">
We can use <em>gradient descent</em> to find the minimum of the loss. For the purpose of this exercise, we will use <em>batch gradient descent</em> with a fixed learning rate.
</p>
<p style="font-family: Arial; font-size:1.2em;color:black;">
As an input, gradient descent needs the gradients (vector of derivatives) of the loss function with respect to our parameters: $\frac{\partial{L}}{\partial{W_l}}(= \partial{W^{[l]}})$, $\frac{\partial{L}}{\partial{b_l}}(= \partial{b^{[l]}})$, etc. To calculate these gradients we use the *backpropagation algorithm*, which is a way to efficiently calculate the gradients starting from the output.
    </p>

## Backpropagation
<hr>
<p style="font-family: Arial; font-size:1.2em;color:black;">
    For last layer.</p>
$
\begin{aligned}
\partial{Z^{[L]}}  & = A^{[L]} - y \\
\partial{W^{[L]}}  & = \frac{1}{m} A^{[L-1]T} \cdot \partial{Z^{[L]}} \\
\partial{b^{[L]}}  & = \frac{1}{m} \mathrm{np.sum}(\partial{Z^{[L]}}, axis = 0, keepdims = True) \\
\end{aligned}
$
<hr>
<p style="font-family: Arial; font-size:1.2em;color:black;">
    For any other layer</p>
$
\begin{aligned}
\partial{A^{[l]}}  & = \partial{Z^{[l+1]}} \cdot W^{[l+1]T}\\
\partial{Z^{[l]}}  & = \partial{A^{[l]}} * ( 1-(A^{[l]})^2)\\
\partial{W^{[l]}}  & = \frac{1}{m} A^{[l-1]T} \cdot \partial{Z^{[l]}} \\
\partial{b^{[l]}}  & = \frac{1}{m} \mathrm{np.sum}(\partial{Z^{[l]}}, axis = 0, keepdims = True) \\
\\
\end{aligned}
$

In [None]:
# prepare the Model

def build_model(param, X, y, num_passes=20000,  print_loss=False):

    '''
        Train the five-layer network (four tanh layers + softmax output)
        with full-batch gradient descent.

        Args:
            param : dict; param['nn_hdim'][0:5] hold the input width followed
                    by the widths of the four hidden layers
            X : Features to train on, one row per sample
            y : Targets to train on, integer class label per row of X
            num_passes : Number of passes through the training data for gradient descent
            print_loss : If True, print the loss every 5000 iterations
        Returns:
            model : dict with the trained 'W1'..'W5' and 'b1'..'b5'
            probs : class probabilities from the forward pass of the last
                    iteration (computed before that iteration's update);
                    None when num_passes is 0

        Module-level names used: RANDOM_STATE (seed), nn_output_dim (width of
        the output layer), epsilon (learning rate) and calculate_loss.
        After every pass the loss and the pass index are appended to the
        module-level lists `loss` and `num_epoch`.

        NOTE: no L2 term is added to the gradients here, so `reg_lambda`
        only influences the loss reported by calculate_loss.
    '''
    # set Random Seed
    np.random.seed(RANDOM_STATE)

    # Size of the data actually passed in (not a notebook global)
    num_examples = X.shape[0]

    # Widths of consecutive layers: input, four hidden layers, output
    layer_dims = [param['nn_hdim'][k] for k in range(5)] + [nn_output_dim]
    n_layers = len(layer_dims) - 1

    # Initialize the parameters to random values. We need to learn these.
    # Weights are scaled by 1/sqrt(fan_in); biases start at zero.
    weights, biases = [], []
    for fan_in, fan_out in zip(layer_dims[:-1], layer_dims[1:]):
        weights.append(np.random.randn(fan_in, fan_out) / np.sqrt(fan_in))
        biases.append(np.zeros((1, fan_out)))

    assert (weights[0].shape == tuple(param['nn_hdim'][0:2])), \
        'Incorrect shape of W1 :{}'.format(weights[0].shape)

    # This is what we return at the end. The arrays are updated in place
    # below, so the dict always refers to the current parameters.
    model = {}
    for idx, (W, b) in enumerate(zip(weights, biases), start=1):
        model['W%d' % idx] = W
        model['b%d' % idx] = b

    probs = None

    # Gradient descent. For each batch...
    for i in range(0, num_passes):

        #############################
        #### Forward propagation ####
        #############################
        # activations[k] is the output of layer k; activations[0] is the input
        activations = [X]
        for W, b in zip(weights[:-1], biases[:-1]):
            Z = activations[-1].dot(W) + b
            A = np.tanh(Z)    # tanh activation
            assert (Z.shape == A.shape), "Shape of Z and A do not match"
            activations.append(A)

        # softmax for the final layer; the row maximum is subtracted first
        # so that np.exp cannot overflow
        Z_out = activations[-1].dot(weights[-1]) + biases[-1]
        exp_scores = np.exp(Z_out - np.max(Z_out, axis=1, keepdims=True))
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        #########################
        #### Backpropagation ####
        #########################

        # Output layer: dL/dZ = (a - y). As y is single dimension, subtract
        # one from the probability of its class. Work on a copy so that the
        # returned `probs` stays a matrix of probabilities.
        dZ = probs.copy()
        dZ[np.arange(num_examples), y] -= 1

        # Walk from the last layer to the first. All gradients are collected
        # before any parameter is touched.
        gradients = [None] * n_layers
        for k in range(n_layers - 1, -1, -1):
            dW = (activations[k].T).dot(dZ) / num_examples
            db = np.sum(dZ, axis=0, keepdims=True) / num_examples # vertical sum of dZ
            assert (dW.shape == weights[k].shape), \
                "Shape of dW{0} {1} and W{0} {2} do not match".format(k + 1, dW.shape, weights[k].shape)
            assert (db.shape == biases[k].shape), \
                "Shape of db{0} {1} and b{0} {2} do not match".format(k + 1, db.shape, biases[k].shape)
            gradients[k] = (dW, db)

            if k > 0:
                # Propagate the error to the previous layer; derivative of tanh is 1 - A**2
                dA = dZ.dot(weights[k].T)
                assert (dA.shape == activations[k].shape), \
                    "Shape of dA{0} {1} and A{0} {2} do not match".format(k, dA.shape, activations[k].shape)
                dZ = dA * (1 - np.power(activations[k], 2))

        # Gradient descent parameter update (in place)
        for (dW, db), W, b in zip(gradients, weights, biases):
            W -= epsilon * dW
            b -= epsilon * db

        # Record the loss reached after this update
        curr_loss = calculate_loss(model, X, y)
        loss.append(curr_loss)
        num_epoch.append(i)

        # Print the loss.
        if print_loss and i % 5000 == 0:
            print("Loss after iteration %i: %f" %(i, curr_loss))

    return model, probs

In [None]:
# Number of nodes in each layer of the network
layer_param = {'nn_hdim': [2, 5, 5, 4, 3, 2]}

# Histories collected during training, used later for plotting
loss, num_epoch = [], []

In [None]:
# Module-level settings used while training
num_examples = len(X_train) # training set size
#nn_input_dim = 2 # input layer dimensionality
nn_output_dim = 2 # output layer dimensionality

# Gradient descent parameters
epsilon = 0.1 # learning rate for gradient descent
reg_lambda = 0.0 # regularization strength

# Number of passes comes from the EPOCHS setting defined with the other basic parameters
num_passes = EPOCHS

# Start from empty histories so that re-running this cell does not append
# a second training run to the curves plotted later
loss.clear()
num_epoch.clear()

# Build the model using the layer sizes listed in layer_param
model, probs = build_model(layer_param, X_train, y_train, num_passes = num_passes, print_loss=True)

# Plot the decision boundary learned on the training data
fn_plot_decision_boundary(lambda x: predict(model, x), X_train, y_train)


In [None]:
def fn_make_predicitions(pred_func, X):
    '''
        Apply a prediction function to a feature matrix.

        Attrib:
           pred_func : callable that accepts X
           X : features handed to pred_func unchanged
        Return:
           whatever pred_func(X) returns
    '''
    return pred_func(X)

In [None]:
# Accuracy on the samples the model was trained on
y_pred = fn_make_predicitions(lambda x: predict(model, x), X_train)
print('Accuracy score on Train Data :', accuracy_score(y_train, y_pred))

In [None]:
# Accuracy on the held-out samples
y_pred = fn_make_predicitions(lambda x: predict(model, x), X_test)

print('Accuracy score on Test Data :', accuracy_score(y_test, y_pred))

In [None]:
# Loss curves: the whole run on the left, a zoom on the last epochs on the right
if not loss:
    raise RuntimeError("No loss history recorded - run the training cell first")

fig, axes = plt.subplots(1,2 , figsize = (15,6))

# Number of trailing epochs shown in the right-hand panel. Clamped to the
# recorded history so that runs shorter than 10000 epochs can still be plotted.
l_range = min(10000, len(loss))

# properties  matplotlib.patch.Patch
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

# One entry per panel:
#   (axes, first index plotted, index of the value reported as 'Start', title)
# The left panel skips the first 100 epochs for better scaling of the curve
# but still reports the very first loss in its text box.
panels = [
    (axes[0], 100, 0, 'Overall'),
    (axes[1], -l_range, -l_range, 'Last {} records'.format(l_range)),
]

for ax, first, start_idx, title in panels:
    ax.plot(num_epoch[first:], loss[first:])

    # little beautification
    txtstr = "Errors: \n  Start : {:7.4f}\n   End : {:7.4f}".format(loss[start_idx],loss[-1]) #text to plot

    # place a text box near the top of the panel, in axes coords
    ax.text(0.6, 0.95, txtstr, transform=ax.transAxes, fontsize=14,
            verticalalignment='top', bbox=props)

    ax.set_xlabel("Epochs")
    ax.set_ylabel("Error")
    ax.set_title(title)
    ax.grid()