In [61]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import tree

import jb_helper_functions_prep
from jb_helper_functions_prep import create_enc

In [2]:
df = pd.read_csv('Churn_Modelling.csv')
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
df = create_enc(df, ['Geography', 'Gender'])
df = df[['CreditScore', 'Geography_enc',
       'Gender_enc', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited']]

In [4]:
train, test = train_test_split(df, test_size=.3, random_state=123, stratify=df[['Exited']])

In [5]:
print('Train shape: ' + str(train.shape))
print('Test shape: ' + str(test.shape))

Train shape: (7000, 11)
Test shape: (3000, 11)


In [6]:
y_train = train[['Exited']]
y_test = test[['Exited']]

In [28]:
X_train = train[['IsActiveMember', 'Balance', 'CreditScore', 'Tenure', 'Age']]
X_test = test[['IsActiveMember', 'Balance', 'CreditScore', 'Tenure', 'Age']]

In [29]:
log_reg = LogisticRegression(random_state=123, solver='saga').fit(X_train, y_train)

In [30]:
y_pred = log_reg.predict(X_train)

In [31]:
print('Accuracy of Logistic Regression classifier on training set: {:.6f}'
     .format(log_reg.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.796286


In [32]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      1.00      0.89      5574
           1       0.00      0.00      0.00      1426

   micro avg       0.80      0.80      0.80      7000
   macro avg       0.40      0.50      0.44      7000
weighted avg       0.63      0.80      0.71      7000



In [33]:
cm = pd.DataFrame(confusion_matrix(y_train, y_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])

cm

Unnamed: 0,Pred -,Pred +
Actual -,5574,0
Actual +,1426,0


### Although this is 80% accurate, it's just not predicting that a customer will churn, which is not good.

### Let's try a decision tree this time.

In [69]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=123).fit(X_train, y_train)

In [70]:
y_pred = clf.predict(X_train)
y_pred_proba = clf.predict_proba(X_train)

print('Accuracy of Decision Tree classifier on training set: {:.6f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.830571


In [71]:
cm = pd.DataFrame(confusion_matrix(y_train, y_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])

cm

Unnamed: 0,Pred -,Pred +
Actual -,5349,225
Actual +,961,465


### Things are looking better.  We're getting a mix of predictions and accuracy is increasing.  We can improve on our false negatives though.

### What about K-Nearest Neighbors?

In [59]:
knn = KNeighborsClassifier(n_neighbors=4, weights='uniform')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
print('Accuracy of KNN classifier on training set: {:.6f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.816000


In [60]:
cm = pd.DataFrame(confusion_matrix(y_train, y_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])

cm

Unnamed: 0,Pred -,Pred +
Actual -,5508,66
Actual +,1222,204


### There is improvement from Logistic Regression, but Decision Tree is still performing better.  Let's try Random Forest.

In [67]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=4, 
                            random_state=123)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_train)
print('Accuracy of random forest classifier on training set: {:.6f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.834714


In [65]:
cm = pd.DataFrame(confusion_matrix(y_train, y_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])

cm

Unnamed: 0,Pred -,Pred +
Actual -,5439,135
Actual +,1022,404


### We're seeing slightly better results with a random forest, but not by much.  Since decision tree can be more explainable, let's use that model to run on the test set.

In [72]:
print('Accuracy of Decision Tree classifier on test set: {:.6f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on test set: 0.832667


In [None]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(cf.score(X_test, y_test)))