In [18]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import tree

import jb_helper_functions_prep
from jb_helper_functions_prep import create_enc

In [19]:
# Read the churn dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')
df.head(n=3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
# create_enc is a project helper (jb_helper_functions_prep); presumably it adds
# '<column>_enc' columns for the listed categoricals -- TODO confirm in helper.
encoded_df = create_enc(df, ['Geography', 'Gender'])

# Keep only the modelling columns; the target 'Exited' stays last.
model_columns = [
    'CreditScore',
    'Geography_enc',
    'Gender_enc',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
    'Exited',
]
df = encoded_df[model_columns]

In [4]:
# Partition the customers on the IsActiveMember flag (1 = active, 0 = not).
act_df = df.loc[df['IsActiveMember'].eq(1)]
non_act_df = df.loc[df['IsActiveMember'].eq(0)]

In [5]:
# Hold out 30% of each subgroup, stratifying on the target column 'Exited';
# the fixed random_state makes the split repeatable.
split_opts = dict(test_size=.3, random_state=123)

non_train, non_test = train_test_split(
    non_act_df, stratify=non_act_df[['Exited']], **split_opts
)
act_train, act_test = train_test_split(
    act_df, stratify=act_df[['Exited']], **split_opts
)

In [6]:
# Pull the target out of every partition as a one-column DataFrame
# (double brackets keep the DataFrame type rather than returning a Series).
y_train_non, y_test_non, y_train_act, y_test_act = (
    partition[['Exited']]
    for partition in (non_train, non_test, act_train, act_test)
)

In [7]:
# Predictors shared by both sub-models, defined once so the four frames cannot
# drift apart.
# 'IsActiveMember' is deliberately NOT included: the active / non-active frames
# were produced by filtering on IsActiveMember == 1 / == 0, so that column is
# constant inside every frame below and cannot contribute to any tree split.
feature_cols = ['Balance', 'CreditScore', 'Tenure', 'Age']

X_train_non = non_train[feature_cols]
X_test_non = non_test[feature_cols]
X_train_act = act_train[feature_cols]
X_test_act = act_test[feature_cols]

In [8]:
# Both subgroup models use the same hyper-parameters: entropy criterion,
# depth capped at 4, fixed seed.
tree_params = dict(criterion='entropy', max_depth=4, random_state=123)

clf_non = DecisionTreeClassifier(**tree_params)
clf_non.fit(X_train_non, y_train_non)

clf_act = DecisionTreeClassifier(**tree_params)
clf_act.fit(X_train_act, y_train_act)

In [10]:
# NOTE: everything in this cell is computed on the TRAINING partitions.

# --- non-active customers ---
y_pred_non = clf_non.predict(X_train_non)
y_pred_proba_non = clf_non.predict_proba(X_train_non)
train_acc_non = clf_non.score(X_train_non, y_train_non)
print('Accuracy of Decision Tree classifier on non_training set: {:.6f}'.format(train_acc_non))

# --- active customers ---
y_pred_act = clf_act.predict(X_train_act)
y_pred_proba_act = clf_act.predict_proba(X_train_act)
train_acc_act = clf_act.score(X_train_act, y_train_act)
print('Accuracy of Decision Tree classifier on act_training set: {:.6f}'.format(train_acc_act))

Accuracy of Decision Tree classifier on non_training set: 0.812316
Accuracy of Decision Tree classifier on act_training set: 0.858252


In [13]:
# Training-set confusion matrix for the non-active model:
# rows are the actual class, columns the predicted class.
cm_non = pd.DataFrame(
    data=confusion_matrix(y_train_non, y_pred_non),
    index=['Actual -', 'Actual +'],
    columns=['Pred -', 'Pred +'],
)
print('Non-active: ')
cm_non

Non-active: 


Unnamed: 0,Pred -,Pred +
Actual -,2316,167
Actual +,470,441


In [15]:
# Training-set confusion matrix for the active model:
# rows are the actual class, columns the predicted class.
cm_act = pd.DataFrame(
    data=confusion_matrix(y_train_act, y_pred_act),
    index=['Actual -', 'Actual +'],
    columns=['Pred -', 'Pred +'],
)
print('Active: ')
cm_act

Active: 


Unnamed: 0,Pred -,Pred +
Actual -,3090,1
Actual +,510,4


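### We have improved on active customers. Predictions for non-active customers can still improve; that subgroup most likely has different key indicators.

Note: the metrics above are computed on the training data, and the active-customer model predicts churn for only 5 of 3,605 customers (4 of the 514 actual churners), so accuracy alone may overstate its usefulness.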