# Artificial Neural Network


## 1. Data Preprocessing

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


### 1.1 Importing the dataset

In [0]:
#for some reasons, the data file on github has some problems when reading
#datafile = 'https://github.com/jchen8000/MachineLearning/blob/master/Classification/data/Churn_Modelling.csv'

#Found the same data file from internet
datafile = 'https://floobits.com/calvinlow18/ANN/raw/Churn_Modelling.csv'
dataset = pd.read_csv(datafile)


In [3]:
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values
print(X.shape)
print(y.shape)

(10000, 10)
(10000,)


### 1.2 Encoding categorical data

Encode the country name (string) to 0, 1, 2 etc. 
Encode female/male (string) to 0, 1

Also need One Hot Encoding, see [Label Encoder vs. One Hot Encoder](https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621)

In [0]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])



In [6]:
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


### 1.3 Splitting the dataset into the Training set and Test set


In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### 1.4 Feature Scaling

In [0]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [9]:
print( X_train.shape )
print( X_test.shape )
print( X_train[0:10,:])


(8000, 11)
(2000, 11)
[[-0.5698444   1.74309049  0.16958176 -1.09168714 -0.46460796  0.00666099
  -1.21571749  0.8095029   0.64259497 -1.03227043  1.10643166]
 [ 1.75486502 -0.57369368 -2.30455945  0.91601335  0.30102557 -1.37744033
  -0.00631193 -0.92159124  0.64259497  0.9687384  -0.74866447]
 [-0.5698444  -0.57369368 -1.19119591 -1.09168714 -0.94312892 -1.031415
   0.57993469 -0.92159124  0.64259497 -1.03227043  1.48533467]
 [-0.5698444   1.74309049  0.03556578  0.91601335  0.10961719  0.00666099
   0.47312769 -0.92159124  0.64259497 -1.03227043  1.27652776]
 [-0.5698444   1.74309049  2.05611444 -1.09168714  1.73658844  1.04473698
   0.8101927   0.8095029   0.64259497  0.9687384   0.55837842]
 [ 1.75486502 -0.57369368  1.29325423 -1.09168714 -0.17749539 -1.031415
   0.44253504  0.8095029   0.64259497 -1.03227043  1.63252134]
 [-0.5698444  -0.57369368  1.6128308   0.91601335  0.77954653 -1.37744033
   0.30432823 -0.92159124 -1.55619021 -1.03227043  0.48149647]
 [-0.5698444   1.743090

## 2. Neural Network Model

![Neural Network Model](https://cdn-images-1.medium.com/max/800/1*l78dvvJFf0cOJnXTJglR7A.png)

### 2.1 Neural Network Cost Function

> ## $ \min_\Theta J(\Theta)=-\frac{\mathrm{1} }{m} \sum_{i=1}^{m}  \sum_{k=1}^{K}\left[ y_k^{(i)} log((h_\Theta(x^{(i)}))_k) + (1 - y_k^{(i)}) log (1 - (h_\Theta(x^{(i)}))_k) \right]  + \frac{\mathrm{\lambda}}{2m}  \sum_{l=1}^{L-1} \sum_{i=1}^{S_l}\sum_{j=1}^{S_l+1}( \Theta_{ji}^{(l)})^2$

> Where $ h_\Theta(x)  \in  \mathbb{R}^K, (h_\Theta(x))_i = i^{th} output  $

> $ L = $ total no. of layers in neural network

> $ S_l = $ no. of units (not couning bias unit ) in layer $ l $

In [0]:
def sigmoid(z):
    return 1/(1+np.exp(-z))
  
def nnCostFunc(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lmbda):
    theta1 = np.reshape(nn_params[:hidden_layer_size*(input_layer_size+1)], (hidden_layer_size, input_layer_size+1), 'F')
    theta2 = np.reshape(nn_params[hidden_layer_size*(input_layer_size+1):], (num_labels, hidden_layer_size+1), 'F')
    print(theta2.shape)

    m = len(y)
    ones = np.ones((m,1))
    a1 = np.hstack((ones, X))
    a2 = sigmoid(a1 @ theta1.T)
    a2 = np.hstack((ones, a2))
    h = sigmoid(a2 @ theta2.T)
    
    y_d = pd.get_dummies(y.flatten())
    
    temp1 = np.multiply(y_d, np.log(h))
    temp2 = np.multiply(1-y_d, np.log(1-h))
    temp3 = np.sum(temp1 + temp2)
    
    sum1 = np.sum(np.sum(np.power(theta1[:,1:],2), axis = 1))
    sum2 = np.sum(np.sum(np.power(theta2[:,1:],2), axis = 1))
    
    return np.sum(temp3 / (-m)) + (sum1 + sum2) * lmbda / (2*m)


### 2.2 Sigmoid Gredient

> ## $\frac{\mathrm{d} }{\mathrm{d} z}g(z) = g(z)(1-g(z)) $

> where

> ## $ g(z) = sigmoid(z) = \frac{\mathrm{1} }{\mathrm{1} + e^{-z} }  $






In [0]:
def sigmoidGrad(z):
    return np.multiply(sigmoid(z), 1-sigmoid(z))

### 2.3 Backpropagation

> ## $  \delta^{(4)}_j = a_j^{(4)} - y_j $,  ( total number of layers $ L = 4 $ )

> ## $  \delta^{(3)} = ( \Theta^{(3)} )^T  \delta^{(4)} .* g'(z^{(3)}) $

> ## $  \delta^{(2)} = ( \Theta^{(2)} )^T  \delta^{(3)} .* g'(z^{(2)}) $

In [0]:
def nnGrad(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lmbda):
    
    initial_theta1 = np.reshape(nn_params[:hidden_layer_size*(input_layer_size+1)], (hidden_layer_size, input_layer_size+1), 'F')
    initial_theta2 = np.reshape(nn_params[hidden_layer_size*(input_layer_size+1):], (num_labels, hidden_layer_size+1), 'F')
    y_d = pd.get_dummies(y.flatten())
    delta1 = np.zeros(initial_theta1.shape)
    delta2 = np.zeros(initial_theta2.shape)
    m = len(y)
    
    for i in range(X.shape[0]):
        ones = np.ones(1)
        a1 = np.hstack((ones, X[i]))
        z2 = a1 @ initial_theta1.T
        a2 = np.hstack((ones, sigmoid(z2)))
        z3 = a2 @ initial_theta2.T
        a3 = sigmoid(z3)

        d3 = a3 - y_d.iloc[i,:][np.newaxis,:]
        z2 = np.hstack((ones, z2))
        d2 = np.multiply(initial_theta2.T @ d3.T, sigmoidGrad(z2).T[:,np.newaxis])
        delta1 = delta1 + d2[1:,:] @ a1[np.newaxis,:]
        delta2 = delta2 + d3.T @ a2[np.newaxis,:]
        
    delta1 /= m
    delta2 /= m
    #print(delta1.shape, delta2.shape)
    delta1[:,1:] = delta1[:,1:] + initial_theta1[:,1:] * lmbda / m
    delta2[:,1:] = delta2[:,1:] + initial_theta2[:,1:] * lmbda / m
        
    return np.hstack((delta1.ravel(order='F'), delta2.ravel(order='F')))  

### 2.4 Random initialization


In [0]:
def randInitializeWeights(L_in, L_out):
    epsilon = 0.12
    return np.random.rand(L_out, L_in+1) * 2 * epsilon - epsilon


# neural network hyperparameters
input_layer_size = 11
hidden_layer_size = 10
num_labels = 1
lmbda = 1  
  
initial_theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_theta2 = randInitializeWeights(hidden_layer_size, num_labels)

# unrolling parameters into a single column vector
nn_initial_params = np.hstack((initial_theta1.ravel(order='F'), initial_theta2.ravel(order='F')))

### 2.5 Run Backpropagation


In [21]:
nn_backprop_Params = nnGrad(nn_initial_params, input_layer_size, hidden_layer_size, num_labels, X_train, y_train, lmbda)


ValueError: ignored

### 2.5 Predicting a single new observation

Predict if the customer with the following informations will leave the bank:



## 3. Build a simple Neural Network



In [0]:
import numpy as np

# X = (hours studying, hours sleeping), y = score on test, xPredicted = 4 hours studying & 8 hours sleeping (input data for prediction)
X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
xPredicted = np.array(([4,8]), dtype=float)

# scale units
X = X/np.amax(X, axis=0) # maximum of X array
xPredicted = xPredicted/np.amax(xPredicted, axis=0) # maximum of xPredicted (our input data for the prediction)
y = y/100 # max test score is 100



In [24]:
X

array([[0.66666667, 1.        ],
       [0.33333333, 0.55555556],
       [1.        , 0.66666667]])

In [25]:
y

array([[0.92],
       [0.86],
       [0.89]])

In [0]:
class Neural_Network(object):
  def __init__(self):
  #parameters
    self.inputSize = 2
    self.outputSize = 1
    self.hiddenSize = 3

  #weights
    self.W1 = np.random.randn(self.inputSize, self.hiddenSize) # (3x2) weight matrix from input to hidden layer
    self.W2 = np.random.randn(self.hiddenSize, self.outputSize) # (3x1) weight matrix from hidden to output layer

  def forward(self, X):
    #forward propagation through our network
    self.z = np.dot(X, self.W1) # dot product of X (input) and first set of 3x2 weights
    self.z2 = self.sigmoid(self.z) # activation function
    self.z3 = np.dot(self.z2, self.W2) # dot product of hidden layer (z2) and second set of 3x1 weights
    o = self.sigmoid(self.z3) # final activation function
    return o

  def sigmoid(self, s):
    # activation function
    return 1/(1+np.exp(-s))

  def sigmoidPrime(self, s):
    #derivative of sigmoid
    return s * (1 - s)

  def backward(self, X, y, o):
    # backward propagate through the network
    self.o_error = y - o # error in output
    self.o_delta = self.o_error*self.sigmoidPrime(o) # applying derivative of sigmoid to error

    self.z2_error = self.o_delta.dot(self.W2.T) # z2 error: how much our hidden layer weights contributed to output error
    self.z2_delta = self.z2_error*self.sigmoidPrime(self.z2) # applying derivative of sigmoid to z2 error

    self.W1 += X.T.dot(self.z2_delta) # adjusting first set (input --> hidden) weights
    self.W2 += self.z2.T.dot(self.o_delta) # adjusting second set (hidden --> output) weights

  def train(self, X, y, epoch):
    for i in range(epoch):
      o = self.forward(X)
      self.backward(X, y, o)
      print("epoch:[", i, "], Loss: ", str(np.mean(np.square(y - self.forward(X))))  )

  def saveWeights(self):
    np.savetxt("w1.txt", self.W1, fmt="%s")
    np.savetxt("w2.txt", self.W2, fmt="%s")

  def predict(self):
    print( "Predicted data based on trained weights: ")
    print( "Input (scaled): \n" + str(xPredicted) )
    print( "Output: \n" + str(self.forward(xPredicted)) )



In [38]:
NN = Neural_Network()

NN.train(X, y, 100)

#NN.saveWeights()
NN.predict()

epoch:[ 0 ], Loss:  0.15940922113981854
epoch:[ 1 ], Loss:  0.1130085398199746
epoch:[ 2 ], Loss:  0.08137473531018784
epoch:[ 3 ], Loss:  0.059988404279404946
epoch:[ 4 ], Loss:  0.04534348679900465
epoch:[ 5 ], Loss:  0.035089158218525714
epoch:[ 6 ], Loss:  0.027727686440318682
epoch:[ 7 ], Loss:  0.022312586926029263
epoch:[ 8 ], Loss:  0.01823879298584623
epoch:[ 9 ], Loss:  0.015111716592849033
epoch:[ 10 ], Loss:  0.012668099283086307
epoch:[ 11 ], Loss:  0.010728212715562534
epoch:[ 12 ], Loss:  0.009166622551869055
epoch:[ 13 ], Loss:  0.007893970826052829
epoch:[ 14 ], Loss:  0.006845386176304102
epoch:[ 15 ], Loss:  0.005972953730313139
epoch:[ 16 ], Loss:  0.0052407215476725074
epoch:[ 17 ], Loss:  0.0046213242325411985
epoch:[ 18 ], Loss:  0.004093658028993398
epoch:[ 19 ], Loss:  0.0036412525418236144
epoch:[ 20 ], Loss:  0.003251112233821404
epoch:[ 21 ], Loss:  0.0029128800302744107
epoch:[ 22 ], Loss:  0.002618225237033695
epoch:[ 23 ], Loss:  0.0023603899498039643
epo