<a href="https://colab.research.google.com/github/jmohsbeck1/jpmc_mle/blob/Mar.-29/Churn_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# John Mohsbeck
# Data Bank Churn dataset
# Data preparation
# EDA
# Feature Importance
# Logistic Regression model
# Performance Metrics

import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
# Read the raw bank-churn CSV, report how many records it holds,
# and preview the first rows.
df = pd.read_csv("Churn_Modelling.csv")
print(df.shape[0])
df.head()

10000


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Data Prep

Dataset

records: 10000

columns: 14

In [5]:
df.shape

(10000, 14)

In [6]:
df.head().T

Unnamed: 0,0,1,2,3,4
RowNumber,1,2,3,4,5
CustomerId,15634602,15647311,15619304,15701354,15737888
Surname,Hargrave,Hill,Onio,Boni,Mitchell
CreditScore,619,608,502,699,850
Geography,France,Spain,France,France,Spain
Gender,Female,Female,Female,Female,Female
Age,42,41,42,39,43
Tenure,2,1,8,1,2
Balance,0.0,83807.86,159660.8,0.0,125510.82
NumOfProducts,1,1,3,2,1


In [7]:
df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [8]:
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [9]:
df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

Replace blanks with underscore "_" and transform column names to lower case.

In [10]:
# Normalise the header: lower-case every column name and swap spaces for underscores.
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

# Columns stored with the generic 'object' dtype are the text columns.
column_dtypes = df.dtypes
string_columns = column_dtypes[column_dtypes == 'object'].index.tolist()

# Apply the same lower-case / underscore normalisation to the text values themselves.
for col in string_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

In [12]:
df.head()

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,1,15634602,hargrave,619,france,female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,hill,608,spain,female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,onio,502,france,female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,boni,699,france,female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,mitchell,850,spain,female,43,2,125510.82,1,1,1,79084.1,0


In [13]:
df.nunique()

rownumber          10000
customerid         10000
surname             2931
creditscore          460
geography              3
gender                 2
age                   70
tenure                11
balance             6382
numofproducts          4
hascrcard              2
isactivemember         2
estimatedsalary     9999
exited                 2
dtype: int64

In [14]:
df['exited'].unique()

array([1, 0])

In [15]:
df.dtypes

rownumber            int64
customerid           int64
surname             object
creditscore          int64
geography           object
gender              object
age                  int64
tenure               int64
balance            float64
numofproducts        int64
hascrcard            int64
isactivemember       int64
estimatedsalary    float64
exited               int64
dtype: object

In [16]:
df['exited'].nunique()

2

In [17]:
df['exited'].sample(25)

1454    1
4036    1
3977    0
9561    0
5409    0
4097    0
4542    0
6536    0
859     1
6802    0
8992    0
5927    1
3299    0
8289    0
3375    1
2466    0
1757    1
807     0
1848    0
7605    0
3984    0
8675    0
2278    0
2854    0
9082    0
Name: exited, dtype: int64

In [18]:
df.exited.value_counts()

0    7963
1    2037
Name: exited, dtype: int64

Checking for duplicates.
Found 0 duplicates.

In [20]:
# Boolean mask of rows that exactly repeat an earlier row, followed by the count of such rows.
duplicates = df.duplicated()
n_duplicates = duplicates.sum()
print(n_duplicates)

0


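## Dataset is Imbalanced

Class counts in the 80% training portion (df_train_full); the full dataset has 2037 exited and 7963 stayed.

Exited: 1622
Stay: 6378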

1. Split df(80/20):  80% => df_train_full, 20% => df_test.

2. Split df_train_full(67/33):  67% => df_train, 33% => df_val.

In [24]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of all rows as the test set.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Second cut: carve a 33% validation set out of the remaining rows.
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

# Pull the target out of the feature frames so it cannot leak into the model inputs.
# df_train_full keeps its 'exited' column.
y_train = df_train.pop('exited').values
y_val = df_val.pop('exited').values


# EDA

In [25]:
df_train_full.isnull().sum()

rownumber          0
customerid         0
surname            0
creditscore        0
geography          0
gender             0
age                0
tenure             0
balance            0
numofproducts      0
hascrcard          0
isactivemember     0
estimatedsalary    0
exited             0
dtype: int64

In [26]:
df_train_full.exited.value_counts()

0    6378
1    1622
Name: exited, dtype: int64

In [27]:
# Share of churned customers in df_train_full, derived from the data itself
# rather than from counts copied by hand out of the previous cell's output.
exited_counts = df_train_full.exited.value_counts()
float(exited_counts.loc[1] / exited_counts.sum())

0.20275

Global Mean

In [28]:
# Overall churn rate across df_train_full; used as the baseline for the
# per-group risk ratios computed later.
global_mean = df_train_full['exited'].mean()
round(global_mean, 5)

0.20275

In [29]:
df.dtypes

rownumber            int64
customerid           int64
surname             object
creditscore          int64
geography           object
gender              object
age                  int64
tenure               int64
balance            float64
numofproducts        int64
hascrcard            int64
isactivemember       int64
estimatedsalary    float64
exited               int64
dtype: object

List the categorical features and the numeric features.

In [30]:
# Columns handled as categorical features.
categorical = [
    'geography',
    'gender',
    'tenure',
    'numofproducts',
    'hascrcard',
    'isactivemember',
]

# Columns handled as numeric features.
numerical = [
    'creditscore',
    'age',
    'balance',
    'estimatedsalary',
]

In [31]:
df_train_full[categorical].nunique()

geography          3
gender             2
tenure            11
numofproducts      4
hascrcard          2
isactivemember     2
dtype: int64

In [32]:
df_train_full[numerical].describe()

Unnamed: 0,creditscore,age,balance,estimatedsalary
count,8000.0,8000.0,8000.0,8000.0
mean,650.313625,38.88725,76463.443272,100143.429649
std,96.67699,10.468894,62372.543408,57574.884151
min,350.0,18.0,0.0,91.75
25%,583.0,32.0,0.0,51014.8375
50%,651.0,37.0,97055.145,99836.215
75%,718.0,44.0,127639.3725,149373.5275
max,850.0,92.0,250898.09,199970.74


In [33]:
# Pairwise Pearson correlations between the numeric columns only.
# df_train_full still holds the text columns (surname, geography, gender);
# pandas >= 2.0 raises when corr() meets non-numeric data, so they are
# filtered out explicitly. Older pandas dropped them silently, so the
# resulting matrix is the same.
df_train_full.select_dtypes(include='number').corr()

Unnamed: 0,rownumber,customerid,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
rownumber,1.0,0.001165,0.003744,-0.009885,-0.008162,-0.008754,0.008319,-0.002024,-0.002751,-0.003735,-0.026526
customerid,0.001165,1.0,0.004212,0.010086,-0.004541,-0.006472,0.013681,-0.01393,-0.0021,0.021884,-0.010516
creditscore,0.003744,0.004212,1.0,-0.007697,-0.001942,-0.001585,0.015114,-0.001258,0.031762,0.000399,-0.02642
age,-0.009885,0.010086,-0.007697,1.0,-0.012859,0.02893,-0.029657,-0.014857,0.088084,-0.010799,0.278079
tenure,-0.008162,-0.004541,-0.001942,-0.012859,1.0,-0.007597,0.015562,0.024802,-0.030183,0.000881,-0.008248
balance,-0.008754,-0.006472,-0.001585,0.02893,-0.007597,1.0,-0.313202,-0.01375,-0.011692,0.009864,0.116926
numofproducts,0.008319,0.013681,0.015114,-0.029657,0.015562,-0.313202,1.0,0.005514,0.006243,0.018054,-0.040868
hascrcard,-0.002024,-0.01393,-0.001258,-0.014857,0.024802,-0.01375,0.005514,1.0,-0.006068,-0.0062,-0.014177
isactivemember,-0.002751,-0.0021,0.031762,0.088084,-0.030183,-0.011692,0.006243,-0.006068,1.0,-0.006182,-0.159633
estimatedsalary,-0.003735,0.021884,0.000399,-0.010799,0.000881,0.009864,0.018054,-0.0062,-0.006182,1.0,0.006483


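Age is positively correlated with exited (0.28).

Balance is positively correlated with exited (0.12).

numofproducts (-0.04), hascrcard (-0.01) and isactivemember (-0.16) are negatively correlated with exited; of these, only isactivemember shows a non-negligible correlation.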

# Feature Importance

In [34]:
# Churn rate among female customers in df_train_full.
is_female = df_train_full.gender == 'female'
female_mean = df_train_full.loc[is_female, 'exited'].mean()
print('gender == female:', round(female_mean, 3))

# Churn rate among male customers in df_train_full.
is_male = df_train_full.gender == 'male'
male_mean = df_train_full.loc[is_male, 'exited'].mean()
print('gender == male:  ', round(male_mean, 3))

gender == female: 0.249
gender == male:   0.164


Female risk ratio: 1.229;  high > 1.0 => risky

In [35]:
female_mean / global_mean


1.2296637092961358

Male risk ratio: 0.807;  < 1.0 => lower churn risk than the average customer (and lower than females)

In [36]:
male_mean / global_mean

0.8076816986463685

In [37]:
# Per-gender churn rate, its absolute difference from the global churn rate
# ('diff') and its ratio to the global churn rate ('risk').
df_group = (
    df_train_full
    .groupby(by='gender')
    .exited
    .agg(['mean'])
    .assign(
        diff=lambda rates: rates['mean'] - global_mean,
        risk=lambda rates: rates['mean'] / global_mean,
    )
)
df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.249314,0.046564,1.229664
male,0.163757,-0.038993,0.807682


isactivemember risk ratio

In [38]:
# Churn rate of customers flagged as inactive (isactivemember == 0).
inactive_customers = df_train_full.query('isactivemember == 0')
isactivemember_no = inactive_customers.exited.mean()
print('isactivemember No:', round(isactivemember_no, 3))

# Churn rate of customers flagged as active (isactivemember == 1).
active_customers = df_train_full.query('isactivemember == 1')
isactivemember_yes = active_customers.exited.mean()
print('isactivemember Yes:', round(isactivemember_yes, 3))

isactivemember No: 0.268
isactivemember Yes: 0.14


isactivemember No: has risk ratio > 1.0 => risky

In [39]:
isactivemember_no / global_mean

1.3239149009209348

In [40]:
isactivemember_yes / global_mean

0.690650187488022

hascrcard risk ratio

In [41]:
# Work on the target column directly and slice it by the credit-card flag.
exited_flag = df_train_full.exited

# Churn rate of customers without a credit card (hascrcard == 0).
hascrcard_no = exited_flag[df_train_full.hascrcard == 0].mean()
print('hascrcard No:', round(hascrcard_no, 3))

# Churn rate of customers with a credit card (hascrcard == 1).
hascrcard_yes = exited_flag[df_train_full.hascrcard == 1].mean()
print('hascrcard yes:', round(hascrcard_yes, 3))

hascrcard No: 0.212
hascrcard yes: 0.199


In [42]:
hascrcard_no / global_mean

1.0436700151079652

Hascrcard No: risk ratio 1.04, only marginally > 1.0 => slightly riskier than average

In [43]:
hascrcard_yes / global_mean


0.9819019597925971

Numofproducts risk ratio

In [44]:
# Churn rate for each observed product count (1 to 4) in df_train_full.
product_count = df_train_full.numofproducts

numofproducts_one = df_train_full.loc[product_count == 1, 'exited'].mean()
print('number of products 1:', round(numofproducts_one, 3))

numofproducts_two = df_train_full.loc[product_count == 2, 'exited'].mean()
print('number of products 2:', round(numofproducts_two, 3))

numofproducts_three = df_train_full.loc[product_count == 3, 'exited'].mean()
print('number of products 3:', round(numofproducts_three, 3))

numofproducts_four = df_train_full.loc[product_count == 4, 'exited'].mean()
print('number of products 4:', round(numofproducts_four, 3))

number of products 1: 0.274
number of products 2: 0.077
number of products 3: 0.817
number of products 4: 1.0


number of products of 1, 3 & 4; risk ratio > 1.0 => risky

In [45]:
numofproducts_one / global_mean

1.3514325731676886

In [46]:
numofproducts_two / global_mean

0.37764122068156847

In [47]:
numofproducts_three / global_mean

4.031327241299708

In [48]:
numofproducts_four / global_mean

4.932182490752157

In [49]:
from IPython.display import display
global_mean = df_train_full.exited.mean()
global_mean

0.20275

Compute the risk ratio and risk difference of each categorical feature value against exited

In [50]:
# For every categorical feature, show the churn rate of each value together with
# its difference from ('diff') and ratio to ('risk') the global churn rate.
for col in categorical:
    churn_rate = df_train_full.groupby(by=col).exited.agg(['mean'])
    df_group = churn_rate.assign(
        diff=churn_rate['mean'] - global_mean,
        risk=churn_rate['mean'] / global_mean,
    )
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
geography,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
france,0.160991,-0.041759,0.794039
germany,0.318227,0.115477,1.569552
spain,0.168522,-0.034228,0.831183


Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.249314,0.046564,1.229664
male,0.163757,-0.038993,0.807682


Unnamed: 0_level_0,mean,diff,risk
tenure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.239521,0.036771,1.181361
1,0.213483,0.010733,1.052938
2,0.181168,-0.021582,0.893554
3,0.223881,0.021131,1.10422
4,0.2025,-0.00025,0.998767
5,0.197789,-0.004961,0.97553
6,0.201777,-0.000973,0.995199
7,0.17407,-0.02868,0.858543
8,0.193267,-0.009483,0.953227
9,0.220102,0.017352,1.085582


Unnamed: 0_level_0,mean,diff,risk
numofproducts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.274003,0.071253,1.351433
2,0.076567,-0.126183,0.377641
3,0.817352,0.614602,4.031327
4,1.0,0.79725,4.932182


Unnamed: 0_level_0,mean,diff,risk
hascrcard,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.211604,0.008854,1.04367
1,0.199081,-0.003669,0.981902


Unnamed: 0_level_0,mean,diff,risk
isactivemember,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.268424,0.065674,1.323915
1,0.140029,-0.062721,0.69065


Compute Mutual Information Scores

In [55]:
from sklearn.metrics import mutual_info_score


def calculate_mi(series):
    """Return the mutual information score between ``series`` and ``df_train_full.exited``."""
    target = df_train_full.exited
    return mutual_info_score(series, target)


# Score every categorical column against the target and rank from highest to lowest.
mi_scores = df_train_full[categorical].apply(calculate_mi)
df_mi = mi_scores.sort_values(ascending=False).to_frame(name='MI')

display(df_mi.head(10))
display(df_mi.tail(5))

# Highest MI score with exited: numofproducts.
# Lowest MI scores: tenure, hascrcard.

Unnamed: 0,MI
numofproducts,0.069025
geography,0.013117
isactivemember,0.012872
gender,0.005598
tenure,0.000904
hascrcard,0.0001


Unnamed: 0,MI
geography,0.013117
isactivemember,0.012872
gender,0.005598
tenure,0.000904
hascrcard,0.0001


# One Hot Encoding

In [56]:
from sklearn.feature_extraction import DictVectorizer

# One {column: value} record per training row, restricted to the model's feature columns.
feature_columns = categorical + numerical
train_dict = df_train[feature_columns].to_dict(orient='records')


In [57]:
train_dict[0]

{'geography': 'spain',
 'gender': 'male',
 'tenure': 9,
 'numofproducts': 2,
 'hascrcard': 1,
 'isactivemember': 0,
 'creditscore': 648,
 'age': 46,
 'balance': 127209.0,
 'estimatedsalary': 77405.95}

In [58]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [59]:
X_train = dv.transform(train_dict)
X_train.shape

(5360, 13)

In [60]:
type(X_train)

numpy.ndarray

In [61]:
dv.feature_names_

['age',
 'balance',
 'creditscore',
 'estimatedsalary',
 'gender=female',
 'gender=male',
 'geography=france',
 'geography=germany',
 'geography=spain',
 'hascrcard',
 'isactivemember',
 'numofproducts',
 'tenure']

# Train our Logistic Regression Model

In [63]:
from sklearn.linear_model import LogisticRegression

# Fit the classifier on the vectorised training rows.
# NOTE(review): features are passed unscaled (balance and estimatedsalary are
# orders of magnitude larger than the 0/1 indicators); standardising them may
# help the solver. TODO confirm.
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

# Encode the validation rows with the vectoriser fitted on the training data.
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

# predict_proba returns one column per class; column 1 is the probability of exited == 1.
y_pred = model.predict_proba(X_val)[:, 1]
y_pred




array([0.08624693, 0.08447113, 0.25434001, ..., 0.28506909, 0.29989179,
       0.17403539])

In [64]:
len(y_val)

2640

In [65]:
len(y_pred)

2640

In [66]:
len(X_train)

5360

In [67]:
from sklearn.metrics import accuracy_score

# Hard class predictions for the rows the model was fitted on. They are stored
# under their own name so the validation probabilities in y_pred are not overwritten.
y_train_pred = model.predict(X_train)

# This is accuracy on the TRAINING data, not on held-out data: it shows how
# well the model fits what it has already seen.
accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training set accuracy: {accuracy:.2f}")

Test set accuracy: 0.79


In [68]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [69]:
y_pred = model.predict_proba(X_val)

In [70]:
y_pred

array([[0.91375307, 0.08624693],
       [0.91552887, 0.08447113],
       [0.74565999, 0.25434001],
       ...,
       [0.71493091, 0.28506909],
       [0.70010821, 0.29989179],
       [0.82596461, 0.17403539]])

In [71]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([0.08624693, 0.08447113, 0.25434001, ..., 0.28506909, 0.29989179,
       0.17403539])

Show Max & Min of y_pred

In [72]:
max(y_pred)

0.7592834881929926

In [73]:
min(y_pred)

0.03685726521795917

In [74]:
exited = y_pred > 0.5

In [None]:
(y_val == exited).mean()

0.7901515151515152

Show Y intercept

In [75]:
model.intercept_[0]

-0.0005703091676461757

Show Coefficients

In [76]:
dict(zip(dv.feature_names_, model.coef_[0].round(7)))

{'age': 0.0413732,
 'balance': 3.3e-06,
 'creditscore': -0.0048121,
 'estimatedsalary': -1.5e-06,
 'gender=female': 0.0015991,
 'gender=male': -0.0021694,
 'geography=france': -0.0016313,
 'geography=germany': 0.0016032,
 'geography=spain': -0.0005422,
 'hascrcard': -0.0007043,
 'isactivemember': -0.0031144,
 'numofproducts': -0.0010604,
 'tenure': -0.0056406}

# Let's use the model on a few single records from the dataset to check their predicted probability




In [77]:
# Single customer record to score; keys match the feature columns used for training.
customer = dict(
    creditscore=549,
    geography='spain',
    gender='female',
    age=24,
    tenure=9,
    balance=0,
    numofproducts=2,
    hascrcard=1,
    isactivemember=1,
    estimatedsalary=14406.41,
)

In [78]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

0.15102699837385597

In [79]:
print(list(X_test[0]))

[24.0, 0.0, 549.0, 14406.41, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 2.0, 9.0]


In [80]:
# Second customer record to score; keys match the feature columns used for training.
customer = dict(
    creditscore=475,
    geography='france',
    gender='female',
    age=45,
    tenure=0,
    balance=134264.04,
    numofproducts=1,
    hascrcard=1,
    isactivemember=0,
    estimatedsalary=27822.99,
)

In [81]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

0.49249714958118584

In [82]:
print(list(X_test[0]))

[45.0, 134264.04, 475.0, 27822.99, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0]


In [83]:
# Third customer record to score; keys match the feature columns used for training.
customer = dict(
    creditscore=576,
    geography='germany',
    gender='female',
    age=71,
    tenure=6,
    balance=140273.47,
    numofproducts=1,
    hascrcard=1,
    isactivemember=1,
    estimatedsalary=193135.25,
)

In [84]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

0.5741919442980223

In [85]:
print(list(X_test[0]))


[71.0, 140273.47, 576.0, 193135.25, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 6.0]


##############################################################################

Performance Metrics
