# Artificial Neural Networks (ANN)

--------

The problem that we are about to deal with is a classification problem.

We have several independent variables, like credit score, the balance, the number of products...

and based on these independent variables,

we are trying to predict which customers are leaving the bank.

ANN can do a terrific job at doing this,

and making that kind of predictions...

----------

# Theano Libray

Theano is an open source numerical computations library,

very efficient for fast numerical computations.

And that is based on numpy syntax.

------

# Tensorflow Library

Tensorflow is another numerical computations library

that runs very fast computations.

And that can run our CPU or GPU

CPU : Central Processing Unit

GPU : Graphical Processing Unit

----------

# Keras Library

The Keras library is an amazing library to build deep learning models,

in a few lines of code.

Keras is a library based on Theano and Tensorflow,

and exactly as we use scikit-learn to build very efficiently machine learning models.

--------------

# Part 1 : Data Preprocessing

In [1]:
# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset

dataset = pd.read_csv('Churn_Modelling.csv')

In [None]:
"""
the key thing to understand here is that
all these variables here are independent variables.
but the last columns is our dependen variable.
1 : exited, 0 : stayed.
"""

In [3]:
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
X = dataset.iloc[:, 3:13].values

y = dataset.iloc[:, 13:14].values

In [5]:
X[:10]

array([[619, 'France', 'Female', 42, 2, 0.0, 1, 1, 1, 101348.88],
       [608, 'Spain', 'Female', 41, 1, 83807.86, 1, 0, 1, 112542.58],
       [502, 'France', 'Female', 42, 8, 159660.8, 3, 1, 0, 113931.57],
       [699, 'France', 'Female', 39, 1, 0.0, 2, 0, 0, 93826.63],
       [850, 'Spain', 'Female', 43, 2, 125510.82, 1, 1, 1, 79084.1],
       [645, 'Spain', 'Male', 44, 8, 113755.78, 2, 1, 0, 149756.71],
       [822, 'France', 'Male', 50, 7, 0.0, 2, 1, 1, 10062.8],
       [376, 'Germany', 'Female', 29, 4, 115046.74, 4, 1, 0, 119346.88],
       [501, 'France', 'Male', 44, 4, 142051.07, 2, 0, 1, 74940.5],
       [684, 'France', 'Male', 27, 2, 134603.88, 1, 1, 1, 71725.73]],
      dtype=object)

In [6]:
y[:10]

array([[1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0]], dtype=int64)

In [7]:
# Encoding categorical data
# Encoding the Independent Variable

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [8]:
labelencoder_X1 = LabelEncoder()

X[:, 1]         = labelencoder_X1.fit_transform(X[:, 1])

X[:10]

array([[619, 0, 'Female', 42, 2, 0.0, 1, 1, 1, 101348.88],
       [608, 2, 'Female', 41, 1, 83807.86, 1, 0, 1, 112542.58],
       [502, 0, 'Female', 42, 8, 159660.8, 3, 1, 0, 113931.57],
       [699, 0, 'Female', 39, 1, 0.0, 2, 0, 0, 93826.63],
       [850, 2, 'Female', 43, 2, 125510.82, 1, 1, 1, 79084.1],
       [645, 2, 'Male', 44, 8, 113755.78, 2, 1, 0, 149756.71],
       [822, 0, 'Male', 50, 7, 0.0, 2, 1, 1, 10062.8],
       [376, 1, 'Female', 29, 4, 115046.74, 4, 1, 0, 119346.88],
       [501, 0, 'Male', 44, 4, 142051.07, 2, 0, 1, 74940.5],
       [684, 0, 'Male', 27, 2, 134603.88, 1, 1, 1, 71725.73]],
      dtype=object)

In [9]:
labelencoder_X2 = LabelEncoder()

X[:, 2]         = labelencoder_X2.fit_transform(X[:, 2])

X[:10]

array([[619, 0, 0, 42, 2, 0.0, 1, 1, 1, 101348.88],
       [608, 2, 0, 41, 1, 83807.86, 1, 0, 1, 112542.58],
       [502, 0, 0, 42, 8, 159660.8, 3, 1, 0, 113931.57],
       [699, 0, 0, 39, 1, 0.0, 2, 0, 0, 93826.63],
       [850, 2, 0, 43, 2, 125510.82, 1, 1, 1, 79084.1],
       [645, 2, 1, 44, 8, 113755.78, 2, 1, 0, 149756.71],
       [822, 0, 1, 50, 7, 0.0, 2, 1, 1, 10062.8],
       [376, 1, 0, 29, 4, 115046.74, 4, 1, 0, 119346.88],
       [501, 0, 1, 44, 4, 142051.07, 2, 0, 1, 74940.5],
       [684, 0, 1, 27, 2, 134603.88, 1, 1, 1, 71725.73]], dtype=object)

In [10]:
# to create dummy variables

onehotencoder   = OneHotEncoder(categorical_features = [1])

X               = onehotencoder.fit_transform(X).toarray()

X[:10]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 6.1900000e+02,
        0.0000000e+00, 4.2000000e+01, 2.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0134888e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 6.0800000e+02,
        0.0000000e+00, 4.1000000e+01, 1.0000000e+00, 8.3807860e+04,
        1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.1254258e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 5.0200000e+02,
        0.0000000e+00, 4.2000000e+01, 8.0000000e+00, 1.5966080e+05,
        3.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.1393157e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 6.9900000e+02,
        0.0000000e+00, 3.9000000e+01, 1.0000000e+00, 0.0000000e+00,
        2.0000000e+00, 0.0000000e+00, 0.0000000e+00, 9.3826630e+04],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 8.5000000e+02,
        0.0000000e+00, 4.3000000e+01, 2.0000000e+00, 1.2551082e+05,
        1.0000000e+00, 1.0000000e+00, 1.0000

In [11]:
# first 3 variables : dummy variables

X = X.astype('float64')

X[:10]

array([[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 6.1900000e+02,
        0.0000000e+00, 4.2000000e+01, 2.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0134888e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 6.0800000e+02,
        0.0000000e+00, 4.1000000e+01, 1.0000000e+00, 8.3807860e+04,
        1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.1254258e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 5.0200000e+02,
        0.0000000e+00, 4.2000000e+01, 8.0000000e+00, 1.5966080e+05,
        3.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.1393157e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 6.9900000e+02,
        0.0000000e+00, 3.9000000e+01, 1.0000000e+00, 0.0000000e+00,
        2.0000000e+00, 0.0000000e+00, 0.0000000e+00, 9.3826630e+04],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 8.5000000e+02,
        0.0000000e+00, 4.3000000e+01, 2.0000000e+00, 1.2551082e+05,
        1.0000000e+00, 1.0000000e+00, 1.0000

In [12]:
# to avoid dummy variable trap,
# we need to drop the first column.

X = X[:, 1:]

X[:10]

array([[0.0000000e+00, 0.0000000e+00, 6.1900000e+02, 0.0000000e+00,
        4.2000000e+01, 2.0000000e+00, 0.0000000e+00, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 1.0134888e+05],
       [0.0000000e+00, 1.0000000e+00, 6.0800000e+02, 0.0000000e+00,
        4.1000000e+01, 1.0000000e+00, 8.3807860e+04, 1.0000000e+00,
        0.0000000e+00, 1.0000000e+00, 1.1254258e+05],
       [0.0000000e+00, 0.0000000e+00, 5.0200000e+02, 0.0000000e+00,
        4.2000000e+01, 8.0000000e+00, 1.5966080e+05, 3.0000000e+00,
        1.0000000e+00, 0.0000000e+00, 1.1393157e+05],
       [0.0000000e+00, 0.0000000e+00, 6.9900000e+02, 0.0000000e+00,
        3.9000000e+01, 1.0000000e+00, 0.0000000e+00, 2.0000000e+00,
        0.0000000e+00, 0.0000000e+00, 9.3826630e+04],
       [0.0000000e+00, 1.0000000e+00, 8.5000000e+02, 0.0000000e+00,
        4.3000000e+01, 2.0000000e+00, 1.2551082e+05, 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00, 7.9084100e+04],
       [0.0000000e+00, 1.0000000e+00, 6.4500000e+0

In [13]:
# Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [14]:
# Feature Scaling

# we need to apply feature scaling
# to ease all these calculations.
# because, we don't want to have one independent variable
# dominating another one.

from sklearn.preprocessing import StandardScaler

sc      = StandardScaler()

X_train = sc.fit_transform(X_train)

X_test  = sc.transform(X_test)

In [15]:
X_train[:10]

array([[-0.5698444 ,  1.74309049,  0.16958176, -1.09168714, -0.46460796,
         0.00666099, -1.21571749,  0.8095029 ,  0.64259497, -1.03227043,
         1.10643166],
       [ 1.75486502, -0.57369368, -2.30455945,  0.91601335,  0.30102557,
        -1.37744033, -0.00631193, -0.92159124,  0.64259497,  0.9687384 ,
        -0.74866447],
       [-0.5698444 , -0.57369368, -1.19119591, -1.09168714, -0.94312892,
        -1.031415  ,  0.57993469, -0.92159124,  0.64259497, -1.03227043,
         1.48533467],
       [-0.5698444 ,  1.74309049,  0.03556578,  0.91601335,  0.10961719,
         0.00666099,  0.47312769, -0.92159124,  0.64259497, -1.03227043,
         1.27652776],
       [-0.5698444 ,  1.74309049,  2.05611444, -1.09168714,  1.73658844,
         1.04473698,  0.8101927 ,  0.8095029 ,  0.64259497,  0.9687384 ,
         0.55837842],
       [ 1.75486502, -0.57369368,  1.29325423, -1.09168714, -0.17749539,
        -1.031415  ,  0.44253504,  0.8095029 ,  0.64259497, -1.03227043,
         1.632

In [16]:
X_test[:10]

array([[ 1.75486502, -0.57369368, -0.55204276, -1.09168714, -0.36890377,
         1.04473698,  0.8793029 , -0.92159124,  0.64259497,  0.9687384 ,
         1.61085707],
       [-0.5698444 , -0.57369368, -1.31490297, -1.09168714,  0.10961719,
        -1.031415  ,  0.42972196, -0.92159124,  0.64259497, -1.03227043,
         0.49587037],
       [-0.5698444 ,  1.74309049,  0.57162971, -1.09168714,  0.30102557,
         1.04473698,  0.30858264, -0.92159124,  0.64259497,  0.9687384 ,
        -0.42478674],
       [-0.5698444 , -0.57369368,  1.41696129,  0.91601335, -0.65601634,
        -0.33936434,  0.57533623, -0.92159124, -1.55619021, -1.03227043,
        -0.18777657],
       [ 1.75486502, -0.57369368,  0.57162971,  0.91601335, -0.08179119,
         0.00666099,  1.38961097,  0.8095029 ,  0.64259497,  0.9687384 ,
         0.61684179],
       [-0.5698444 ,  1.74309049,  0.20050853, -1.09168714,  1.73658844,
        -0.68538967,  1.5900207 ,  0.8095029 ,  0.64259497, -1.03227043,
        -0.019

------------

# Part 2 : Now let's make the ANN!

## 1. Importing the Keras libraries and packages

In [17]:
from keras.models import Sequential # to initialize our neural network.

from keras.layers import Dense  # this is the model will use to create layers in our ANN.

Using TensorFlow backend.


## 2. Initializing the ANN

In [18]:
classifier = Sequential()

In [19]:
X

array([[0.0000000e+00, 0.0000000e+00, 6.1900000e+02, ..., 1.0000000e+00,
        1.0000000e+00, 1.0134888e+05],
       [0.0000000e+00, 1.0000000e+00, 6.0800000e+02, ..., 0.0000000e+00,
        1.0000000e+00, 1.1254258e+05],
       [0.0000000e+00, 0.0000000e+00, 5.0200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 1.1393157e+05],
       ...,
       [0.0000000e+00, 0.0000000e+00, 7.0900000e+02, ..., 0.0000000e+00,
        1.0000000e+00, 4.2085580e+04],
       [1.0000000e+00, 0.0000000e+00, 7.7200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 9.2888520e+04],
       [0.0000000e+00, 0.0000000e+00, 7.9200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 3.8190780e+04]])

## 3. Adding the input layer and the first hidden layer

In [20]:
classifier.add(Dense(1, input_shape = (11, )))

## 4. Adding the second hidden layer

In [21]:
classifier.add(Dense(6, activation = 'relu'))

## 5. Adding the output layer

In [22]:
# if we have 2 different categories for output,
# than we can use the sigmoid function.
# if we had more than 2 categories for output,
# we'd need to use the softmax function.

classifier.add(Dense(1, activation = 'sigmoid'))

## 6. Compiling the ANN

In [23]:
classifier.compile(optimizer = 'adam', # for stochastic gradient descent algorithm
                   loss = 'binary_crossentropy', # to find optimal weights(logarithmic loss)
                   metrics = ['accuracy']
                  )

# loss = 'categorical_crossentropy' for loss, if we have more than 2 categories in the output.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


## 7. Fitting the ANN to the Training set

In [None]:
classifier.fit(X_train, y_train, batch_size = 10, epochs = 100)

---------

# Part 3 : Making the predictions and evaluating the model

## 1. Predicting the Test set results

In [25]:
y_pred = classifier.predict(X_test)

y_pred = (y_pred > 0.5)

y_pred[:10]

array([[False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [ True]])

## 2. Making the Confusion Matrix

In [26]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

accuracy = (cm[0, 0] + cm[1, 1]) / np.sum(cm)

In [27]:
accuracy

0.801

----------------

# Part 4 : Predicting a single new observation

In [None]:
"""
Predict if the customer with the following information will leave the bank:
    Geography           : France = [0, 0] <-- corresponds to
    Credit Score        : 600
    Gender              : Male   = [1]    <-- corresponds to
    Age                 : 40
    Tenure              : 3
    Balance             : 60000
    Number of Products  : 2
    Has Credit Card     : Yes    = [1]    <-- corresponds to
    Is Active Member    : Yes    = [1]    <-- corresponds to
    Estimated Salary    : 50000
"""

In [28]:
my_array       = np.array([[0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])

normal_array   = sc.transform(my_array)  # to normalize  

new_prediction = classifier.predict(my_array)

new_y_pred     = (new_prediction > 0.5)

In [29]:
new_y_pred, new_prediction

(array([[ True]]), array([[0.5228143]], dtype=float32))

------------------------

# Part 4 : Evaluating, Improving and Tuning the ANN

## 1. Evaluating the ANN

In [None]:
'''
to fix this variance problem;
k-Fold Cross Validation fix it by splitting the training set
into 10 folds when K = 10, and most of the time K = 10
and we train our model on 9-folds and we test it on the
last remaining fold.
there we take 10 different combination of 9-folds to train
a model and 1-fold to test it.
that means we can train the model and test the model
on 10 combinations of training and test sets.
And that will give us a much better idea of the model
performance because, we take an average of different
accuracies of the 10 evaluations and also compute
the standart deviation to have a look at the variance.
So eventually, our analysis will be much more relevant.
'''

In [32]:
# Keras wrapper will wrap F-fold cross validation by Scikit-learn
# in to the keras model
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

def build_classifier():
    classifier = Sequential()
    classifier.add(Dense(6, activation = 'relu', input_shape = (11, )))
    classifier.add(Dense(6, activation = 'relu'))
    classifier.add(Dense(1, activation = 'sigmoid'))
    classifier.compile(optimizer = 'adam', 
                       loss      = 'binary_crossentropy',
                       metrics   = ['accuracy']
                      )
    return classifier

In [33]:
classifier_cv = KerasClassifier(build_fn   = build_classifier,
                                batch_size = 10, 
                                epochs     = 100, 
                                verbose    = 0
                               )

In [34]:
accuracies = cross_val_score(estimator = classifier_cv, 
                             X         = X_train, # the data to fit.
                             y         = y_train, # the target to predict.
                             cv        = 10,      # number of folds.
                             n_jobs    = -1       # '-1' means all CPU's.
                            )

In [35]:
accuracies

array([0.85374999, 0.86500001, 0.8775    , 0.86124998, 0.87      ,
       0.85624999, 0.86000001, 0.85124999, 0.84875   , 0.86750001])

In [36]:
mean            = accuracies.mean()
mean

0.8611249983310699

In [37]:
variance        = accuracies.std()
variance

0.008559537963463334

In [None]:
"""
we are in 'Low Bias Low Variance'.
means, best accuracy low varince
accuracy : % 86.1
variance : % 0.85
"""

## 2. Improving the ANN

In [None]:
"""
Dropout Regularization:
it is the solution for overfitting in deep learning.
Overfitting is when your model was trained too much
on the training set, too much that it becomes much less
performance on the test set and we can observe this
when we have large difference of accuracies between
training set and the test set.
Generally, when overfitting happens, you have a much
higher accuracy on the training set than the test set.
And another way to detect overfitting is when you
observe high variance when applying k-fold cv
because indeed,  when it's overfitted on the training
set, that is when your model learn too much and
this may cause your model won't succeed on 'other' test
sets because the correlations learned too much.
"""

In [None]:
"""
Dropout works this way;
At each iteration of the training, some neurons of your
ANN are randomly disabled to prevent them for being too
dependent on each other when they learn the correlations
and therefore, by over-writing these neurons, the ANN
learns several independent correlations in the data,
because each time there is not the same configuration
of the neurons.
And the fact that we get these independent correlations
of the data, thanks to the fact that the neurons work
more independently, that prevents the neurons from learning
too much and therefore that prevents overfitting.
"""

In [None]:
# Dropout Regularization to reduce overfitting if needed.
from keras.layers import Dropout

classifier = Sequential()

# Adding the input layer and the first hidden layer with dropout
classifier.add(Dense(6, activation = 'relu', input_shape = (11, )))
classifier.add(Dropout(p = 0.1)) # p : fraction of the input units to drop.

# Adding the second hidden layer
classifier.add(Dense(6, activation = 'relu'))
classifier.add(Dropout(p = 0.1))

# Adding the output layer
classifier.add(Dense(1, activation = 'sigmoid'))
classifier.add(Dropout(p = 0.1))

classifier.compile(optimizer = 'adam', 
                   loss      = 'binary_crossentropy',
                   metrics   = ['accuracy']
                  )

classifier.fit(X_train, y_train, batch_size = 10, epochs = 100)

In [31]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

accuracy = (cm[0, 0] + cm[1, 1]) / np.sum(cm)

accuracy

0.801

## 3. Tuning the ANN

In [45]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

def build_classifier(optimizer):
    classifier = Sequential()
    
    classifier.add(Dense(6, activation = 'relu', input_shape = (11, )))
    classifier.add(Dense(6, activation = 'relu'))
    classifier.add(Dense(1, activation = 'sigmoid'))
    
    classifier.compile(optimizer = optimizer, 
                       loss      = 'binary_crossentropy',
                       metrics   = ['accuracy']
                       )
    return classifier

In [46]:
classifier_cv   = KerasClassifier(build_fn = build_classifier)

parameters      = {'batch_size' : [25, 32],
                   'epochs'     : [100, 500],
                   'optimizer'  : ['adam', 'rmsprop']
                   }

grid_search     = GridSearchCV(estimator  = classifier_cv,
                               param_grid = parameters,
                               scoring    = 'accuracy',
                               cv         = 10,
                               verbose    = 1
                               )

In [None]:
grid_search_cv   = grid_search.fit(X = X_train, y = y_train)
best_parameters  = grid_search_cv.best_params_
best_accuracy    = grid_search_cv.best_score_

In [None]:
# best_parameters = {'batch_size' : 32, 'epochs' : 100, 'optimizer' : 'rmsprop'}

In [None]:
from keras.layers import Dropout
from keras.models import Sequential
from keras.layers import Dense

classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(6, activation = 'relu', input_shape = (11, )))

# Adding the second hidden layer
classifier.add(Dense(6, activation = 'relu'))

# Adding the output layer
classifier.add(Dense(1, activation = 'sigmoid'))

classifier.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy',
                   metrics = ['accuracy'])

classifier.fit(X_train, y_train, batch_size = 32, epochs = 100)

In [39]:
y_pred_tuned   = classifier.predict(X_test)
y_pred_tuned   = (y_pred_tuned > 0.5)

In [40]:
cm_tuned       = confusion_matrix(y_test, y_pred_tuned)
accuracy_tuned = (cm_tuned[0, 0] + cm_tuned[1, 1]) / np.sum(cm_tuned)

In [41]:
accuracy_tuned

0.862

------------------