In [27]:
import numpy as np
import pandas as pd
import keras
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense

## Data preprocessing

In [28]:
# Read the customer churn data and preview the first rows
csv_path = 'data/Churn_Modelling.csv'
ds = pd.read_csv(csv_path)
ds.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
# Select the model inputs (columns 3-12) and the target (column 13)
feature_cols = slice(3, 13)
target_col = 13
X = ds.iloc[:, feature_cols].values
y = ds.iloc[:, target_col].values
print(X[0])

In [None]:
# Replace the string categories in columns 1 (country) and 2 (gender)
# with integer codes, using one encoder per column
le_country, le_gender = LabelEncoder(), LabelEncoder()
for col, encoder in ((1, le_country), (2, le_gender)):
    X[:, col] = encoder.fit_transform(X[:, col])
print(X[0])

In [None]:
# One-hot encode the country column (index 1); all other columns are passed
# through unchanged.
# OneHotEncoder's `categorical_features` argument no longer exists in
# scikit-learn, so the column is selected with a ColumnTransformer instead.
# The layout matches the old behaviour: the dummy columns come first,
# followed by the untouched columns in their original order.
# sparse_threshold=0 forces a dense result, which is then cast to float.
ohe = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [1])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.asarray(ohe.fit_transform(X), dtype=float)
print(X[0])

In [None]:
# Drop the first dummy column to avoid the dummy variable trap.
# NOTE: not idempotent - re-running this cell removes another column.
X = np.delete(X, 0, axis=1)
print(X[0])

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [3]:
# Standardize the features. The scaler is fitted on the training set only
# and then applied unchanged to both the training and the test set.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
X_train[0]

NameError: name 'X_train' is not defined

In [9]:
# Inspect the scaled training features (NumPy abbreviates the printed array)
X_train

array([[-0.5698444 ,  1.74309049,  0.16958176, ...,  0.64259497,
        -1.03227043,  1.10643166],
       [ 1.75486502, -0.57369368, -2.30455945, ...,  0.64259497,
         0.9687384 , -0.74866447],
       [-0.5698444 , -0.57369368, -1.19119591, ...,  0.64259497,
        -1.03227043,  1.48533467],
       ..., 
       [-0.5698444 , -0.57369368,  0.9015152 , ...,  0.64259497,
        -1.03227043,  1.41231994],
       [-0.5698444 ,  1.74309049, -0.62420521, ...,  0.64259497,
         0.9687384 ,  0.84432121],
       [ 1.75486502, -0.57369368, -0.28401079, ...,  0.64259497,
        -1.03227043,  0.32472465]])

In [10]:
# Number of rows in the training set
X_train.shape[0]

8000

## Create ANN model

![Neural network](../img/neural_net.jpg)

### Initialize deep learning model

In [11]:
# define the model as a sequence of layers
classifier = Sequential()

## Add the input layer and first hidden layer
### units
Previously output_dim.<br>
Units is the number of neurons (nodes) in the layer, not the number of layers. A rule of thumb is that the number of nodes in a hidden layer is the average of the number of input nodes and the number of output nodes. Since there are 11 independent variables and the output is binary (one output node), the hidden layer gets (11 + 1)/2 = 6 nodes.
<br>
The number can be fine tuned by parameter tuning.

### kernel_initializer
Previously init.<br>
It defines how the weights are randomly initialized. Uniform means that the initial values are drawn from a uniform distribution.

### activation
Activation function for the hidden layer. The rectifier (ReLU) activation function is used.

![Rectifier](../img/rectifier_activation_function.png)

### input_dim
Number of independent variables

In [12]:
# Input layer plus first hidden layer: 6 ReLU units fed by the 11 input features
first_hidden = Dense(
    units=6,
    kernel_initializer='uniform',
    activation='relu',
    input_dim=11,
)
classifier.add(first_hidden)

## second hidden layer
The parameter input_dim is not needed since the second layer "knows" how many values the first hidden layer will output.

In [13]:
# Second hidden layer: 6 ReLU units; the input size is taken from the previous layer
second_hidden = Dense(
    units=6,
    kernel_initializer='uniform',
    activation='relu',
)
classifier.add(second_hidden)

## Output layer
The output is a yes or no, so the sigmoid activation function is used; it returns the probability of "yes" as a value between 0 and 1.
![sigmoid](../img/sigmoid_activation_function.png)

In [14]:
# Output layer: a single sigmoid unit producing a value between 0 and 1
output_layer = Dense(
    units=1,
    kernel_initializer='uniform',
    activation='sigmoid',
)
classifier.add(output_layer)

## Compile Neural Network
### optimizer
Algorithm to optimize the weights

### loss
Loss function that measures the error the optimizer tries to minimize. Since the outcome is binary, binary_crossentropy is used.

### metrics
Metrics reported during training and evaluation to judge the model. They do not influence the weight updates. Accuracy is used here.

In [15]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Fit model to training set

### batch_size
Number of rows processed before the weights are adjusted.

### epoch
Number of complete passes over the training set used to optimize the weights in the model.


In [16]:
# Train on the scaled training data: batches of 5 rows, 20 epochs
classifier.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=5,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x10f37d828>

## Evaluate performance of model

In [17]:
# Model output for every row of the scaled test set
y_pred = classifier.predict(X_test)
y_pred

array([[ 0.21840937],
       [ 0.35601658],
       [ 0.16434492],
       ..., 
       [ 0.17787409],
       [ 0.17337835],
       [ 0.12702857]], dtype=float32)

The predicted y values are floats between 0 and 1. The confusion matrix compares class labels, so the predictions must be converted first: values above 0.5 become True (1) and all other values become False (0).

In [18]:
# Convert the model outputs to booleans: True where the value exceeds 0.5, else False
y_pred = np.greater(y_pred, 0.5)

In [19]:
# Compare the true test labels with the thresholded predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[1549,   46],
       [ 268,  137]])

## Prediction for one specific customer
Geography: France<br>
Credit score: 600<br>
Gender: Male<br>
Age: 40<br>
Tenure: 3<br>
Balance: 60000<br>
Num products: 2<br>
Has creditcard: Yes<br>
Is active: Yes<br>
Estimated salary: 50000<br>

In [20]:
# Build the feature row in the same column order the model was trained on:
# [country dummy 1, country dummy 2, credit score, gender, age, tenure,
#  balance, num products, has credit card, is active, estimated salary]
# France: 0,0
# Male: 1
# Feature scaling must be applied with the scaler fitted on the training data
new_customer = np.array([[0.0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])
new_customer_scaled = sc.transform(new_customer)
pred = classifier.predict(new_customer_scaled)

In [21]:
# Model output for the single customer (first row, first column)
pred[0, 0]

0.083431952