<a href="https://colab.research.google.com/github/jmohsbeck1/jpmc_mle/blob/module2/JPMC_Ramu_Ranjani_John_Day1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
df = pd.read_csv("Churn_Modelling.csv")
print(len(df))
df.head()

10000


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Data Prep

Dataset

records: 10000

columns: 11

In [37]:
df.shape

(10000, 11)

In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
RowNumber,1,2,3,4,5
CustomerId,15634602,15647311,15619304,15701354,15737888
Surname,Hargrave,Hill,Onio,Boni,Mitchell
CreditScore,619,608,502,699,850
Geography,France,Spain,France,France,Spain
Gender,Female,Female,Female,Female,Female
Age,42,41,42,39,43
Tenure,2,1,8,1,2
Balance,0.0,83807.86,159660.8,0.0,125510.82
NumOfProducts,1,1,3,2,1


In [6]:
df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

Remove the following columns.
They do not add value to modeling the exited target.

"Rownumber",
"CustomerId",
"Surname"

In [7]:
df = df.drop(['RowNumber','CustomerId','Surname'],axis=1)

In [38]:
df.head()

Unnamed: 0,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,619,france,female,42,2,0.0,1,1,1,101348.88,1
1,608,spain,female,41,1,83807.86,1,0,1,112542.58,0
2,502,france,female,42,8,159660.8,3,1,0,113931.57,1
3,699,france,female,39,1,0.0,2,0,0,93826.63,0
4,850,spain,female,43,2,125510.82,1,1,1,79084.1,0


In [39]:
df.isnull().sum()

creditscore        0
geography          0
gender             0
age                0
tenure             0
balance            0
numofproducts      0
hascrcard          0
isactivemember     0
estimatedsalary    0
exited             0
dtype: int64

In [40]:
# No Nulls hence commenting...
#df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
#df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [41]:
df.dtypes

creditscore          int64
geography           object
gender              object
age                  int64
tenure               int64
balance            float64
numofproducts        int64
hascrcard            int64
isactivemember       int64
estimatedsalary    float64
exited               int64
dtype: object

Replace blanks with underscore "_" and transform column names to lower case.

In [42]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [44]:
df.nunique()

creditscore         460
geography             3
gender                2
age                  70
tenure               11
balance            6382
numofproducts         4
hascrcard             2
isactivemember        2
estimatedsalary    9999
exited                2
dtype: int64

In [45]:
df['exited'].unique()

array([1, 0])

In [46]:
df.dtypes

creditscore          int64
geography           object
gender              object
age                  int64
tenure               int64
balance            float64
numofproducts        int64
hascrcard            int64
isactivemember       int64
estimatedsalary    float64
exited               int64
dtype: object

In [47]:
# Already Exited column is encoded...commenting
# Encoding the Target with 0's and 1's
#df.exited = (df.exited == 'yes').astype(int)

In [48]:
df['exited'].nunique()

2

In [49]:
df['exited'].sample(25)

2418    1
7205    0
9497    0
5106    0
8369    1
3253    0
5866    1
2360    0
1278    0
1755    1
4735    0
5404    0
7695    1
8917    0
7317    1
3704    0
1998    0
6992    1
3105    1
3286    0
1333    0
6592    0
7730    0
4353    0
1329    1
Name: exited, dtype: int64

In [50]:
df.exited.value_counts()

0    7963
1    2037
Name: exited, dtype: int64

##Dataset is Imbalanced

Exited: 1622
Stay: 6378

Split dataset 80/20 to df_train_full, df_test.
Then split df_train_full 67%, 33% to df_train, df_val.

In [51]:
from sklearn.model_selection import train_test_split
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)
y_train = df_train.exited.values
y_val = df_val.exited.values
del df_train['exited']
del df_val['exited']

# EDA

In [52]:
df_train_full.isnull().sum()

creditscore        0
geography          0
gender             0
age                0
tenure             0
balance            0
numofproducts      0
hascrcard          0
isactivemember     0
estimatedsalary    0
exited             0
dtype: int64

In [53]:
df_train_full.exited.value_counts()

0    6378
1    1622
Name: exited, dtype: int64

In [54]:
1622 / (6378 + 1622)

0.20275

Global Mean

In [77]:
global_mean = df_train_full.exited.mean()
round(global_mean,5)

0.20275

In [59]:
df.dtypes

creditscore          int64
geography           object
gender              object
age                  int64
tenure               int64
balance            float64
numofproducts        int64
hascrcard            int64
isactivemember       int64
estimatedsalary    float64
exited               int64
dtype: object

List categorial features and numeric features.

In [65]:
categorical = ['geography', 'gender','tenure','numofproducts', 'hascrcard', 'isactivemember']
numerical = ['creditscore','age','balance','estimatedsalary']

In [66]:
df_train_full[categorical].nunique()

geography          3
gender             2
tenure            11
numofproducts      4
hascrcard          2
isactivemember     2
dtype: int64

In [67]:
df_train_full[numerical].describe()

Unnamed: 0,creditscore,age,balance,estimatedsalary
count,8000.0,8000.0,8000.0,8000.0
mean,650.313625,38.88725,76463.443272,100143.429649
std,96.67699,10.468894,62372.543408,57574.884151
min,350.0,18.0,0.0,91.75
25%,583.0,32.0,0.0,51014.8375
50%,651.0,37.0,97055.145,99836.215
75%,718.0,44.0,127639.3725,149373.5275
max,850.0,92.0,250898.09,199970.74


In [68]:
df_train_full.corr()

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
creditscore,1.0,-0.007697,-0.001942,-0.001585,0.015114,-0.001258,0.031762,0.000399,-0.02642
age,-0.007697,1.0,-0.012859,0.02893,-0.029657,-0.014857,0.088084,-0.010799,0.278079
tenure,-0.001942,-0.012859,1.0,-0.007597,0.015562,0.024802,-0.030183,0.000881,-0.008248
balance,-0.001585,0.02893,-0.007597,1.0,-0.313202,-0.01375,-0.011692,0.009864,0.116926
numofproducts,0.015114,-0.029657,0.015562,-0.313202,1.0,0.005514,0.006243,0.018054,-0.040868
hascrcard,-0.001258,-0.014857,0.024802,-0.01375,0.005514,1.0,-0.006068,-0.0062,-0.014177
isactivemember,0.031762,0.088084,-0.030183,-0.011692,0.006243,-0.006068,1.0,-0.006182,-0.159633
estimatedsalary,0.000399,-0.010799,0.000881,0.009864,0.018054,-0.0062,-0.006182,1.0,0.006483
exited,-0.02642,0.278079,-0.008248,0.116926,-0.040868,-0.014177,-0.159633,0.006483,1.0


# Feature Importance

In [70]:
female_mean = df_train_full[df_train_full.gender == 'female'].exited.mean()
print('gender == female:', round(female_mean, 3))

male_mean = df_train_full[df_train_full.gender == 'male'].exited.mean()
print('gender == male:  ', round(male_mean, 3))

gender == female: 0.249
gender == male:   0.164


Female risk ratio is high > 1.0 => risky

In [71]:
# global mean = 27%... then if i did 27/27 = 1... or 100%
# female_mean / global_mean ... then the closer that I am to 1... the less important that feature
# male_mean / global_mean ... same thing
female_mean / global_mean


1.2296637092961358

Male risk ratio, also high, above 0.50

In [72]:
male_mean / global_mean

0.8076816986463685

In [74]:
isactivemember_no = df_train_full[df_train_full.isactivemember == 0].exited.mean()
print('isactivemember == no:', round(isactivemember_no, 3))

isactivemember_yes = df_train_full[df_train_full.isactivemember == 1].exited.mean()
print('isactivemember == yes :', round(isactivemember_yes, 3))

isactivemember == no: 0.268
isactivemember == yes : 0.14


In [75]:
isactivemember_no / global_mean

1.3239149009209348

In [76]:
isactivemember_yes / global_mean

0.690650187488022

In [None]:
df_group = df_train_full.groupby(by='gender').churn.agg(['mean'])
df_group['diff'] = df_group['mean'] - global_mean
df_group['risk'] = df_group['mean'] / global_mean
df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


In [None]:
from IPython.display import display
global_mean = df_train_full.churn.mean()
global_mean

0.26996805111821087

In [None]:
for col in categorical:
    df_group = df_train_full.groupby(by=col).churn.agg(['mean'])
    df_group['diff'] = df_group['mean'] - global_mean
    df_group['risk'] = df_group['mean'] / global_mean
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.027698,0.897403
1,0.413377,0.143409,1.531208


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059841,1.221659
yes,0.205033,-0.064935,0.759472


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.043792,1.162212
yes,0.165666,-0.104302,0.613651


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028652,0.89387
yes,0.273049,0.003081,1.011412


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012561,0.953474
no_phone_service,0.241316,-0.028652,0.89387
yes,0.290742,0.020773,1.076948


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077621,0.712482
fiber_optic,0.425171,0.155203,1.574895
no,0.077805,-0.192163,0.288201


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150953,1.559152
no_internet_service,0.077805,-0.192163,0.288201
yes,0.153226,-0.116742,0.56757


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134355,1.497672
no_internet_service,0.077805,-0.192163,0.288201
yes,0.217232,-0.052736,0.80466


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125907,1.466379
no_internet_service,0.077805,-0.192163,0.288201
yes,0.230412,-0.039556,0.85348


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148946,1.551717
no_internet_service,0.077805,-0.192163,0.288201
yes,0.159926,-0.110042,0.59239


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072864,1.269897
no_internet_service,0.077805,-0.192163,0.288201
yes,0.302723,0.032755,1.121328


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068938,1.255358
no_internet_service,0.077805,-0.192163,0.288201
yes,0.307273,0.037305,1.138182


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161733,1.599082
one_year,0.120573,-0.149395,0.446621
two_year,0.028274,-0.241694,0.10473


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097897,0.637375
yes,0.338151,0.068183,1.25256


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101797,0.622928
credit_card_(automatic),0.164339,-0.10563,0.608733
electronic_check,0.45589,0.185922,1.688682
mailed_check,0.19387,-0.076098,0.718121


In [None]:
from sklearn.metrics import mutual_info_score
def calculate_mi(series):
    return mutual_info_score(series, df_train_full.churn)

df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')


display(df_mi.head())
display(df_mi.tail())

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923


Unnamed: 0,MI
partner,0.009968
seniorcitizen,0.00941
multiplelines,0.000857
phoneservice,0.000229
gender,0.000117


# One Hot Encoding

In [None]:
from sklearn.feature_extraction import DictVectorizer
train_dict = df_train[categorical + numerical].to_dict(orient='records')


In [None]:
train_dict[0]

{'gender': 'male',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'no',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'dsl',
 'onlinesecurity': 'yes',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'two_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'bank_transfer_(automatic)',
 'tenure': 71,
 'monthlycharges': 86.1,
 'totalcharges': 6045.9}

In [None]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [None]:
X_train = dv.transform(train_dict)
X_train.shape

(3774, 45)

In [None]:
dv.feature_names_

['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender=female',
 'gender=male',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'multiplelines=no',
 'multiplelines=no_phone_service',
 'multiplelines=yes',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'partner=no',
 'partner=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'phoneservice=no',
 'phoneservice=yes',
 'seniorcitizen',
 'streamingmovies=no',
 'streamingmovies=no_internet_service',
 'streamingmovies=yes',
 'streamingtv=no',
 'streamingtv=no_internet_servic

# Train our Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

In [None]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
y_pred = model.predict_proba(X_val)

In [None]:
y_pred

array([[0.76508784, 0.23491216],
       [0.73113015, 0.26886985],
       [0.68054704, 0.31945296],
       ...,
       [0.94274614, 0.05725386],
       [0.38476895, 0.61523105],
       [0.93872763, 0.06127237]])

In [None]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([0.23491216, 0.26886985, 0.31945296, ..., 0.05725386, 0.61523105,
       0.06127237])

In [None]:
churn = y_pred > 0.5

In [None]:
(y_val == churn).mean()

0.8016129032258065

In [None]:
model.intercept_[0]

-0.12198896163042137

In [None]:
dict(zip(dv.feature_names_, model.coef_[0].round(7)))

{'contract=month-to-month': 0.5633504,
 'contract=one_year': -0.0859199,
 'contract=two_year': -0.5994195,
 'dependents=no': -0.030282,
 'dependents=yes': -0.0917069,
 'deviceprotection=no': 0.0999283,
 'deviceprotection=no_internet_service': -0.1158683,
 'deviceprotection=yes': -0.106049,
 'gender=female': -0.0273677,
 'gender=male': -0.0946213,
 'internetservice=dsl': -0.323349,
 'internetservice=fiber_optic': 0.3172283,
 'internetservice=no': -0.1158683,
 'monthlycharges': 0.0007843,
 'multiplelines=no': -0.1680968,
 'multiplelines=no_phone_service': 0.127134,
 'multiplelines=yes': -0.0810262,
 'onlinebackup=no': 0.1357062,
 'onlinebackup=no_internet_service': -0.1158683,
 'onlinebackup=yes': -0.1418268,
 'onlinesecurity=no': 0.2578617,
 'onlinesecurity=no_internet_service': -0.1158683,
 'onlinesecurity=yes': -0.2639824,
 'paperlessbilling=no': -0.2126151,
 'paperlessbilling=yes': 0.0906262,
 'partner=no': -0.0480303,
 'partner=yes': -0.0739587,
 'paymentmethod=bank_transfer_(automa

# Let's use the model

In [None]:
customer = {
    'customerid': '8879-zkjof',
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',
    'tenure': 41,
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'no',
    'deviceprotection': 'yes',
    'techsupport': 'yes',
    'streamingtv': 'yes',
    'streamingmovies': 'yes',
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    'monthlycharges': 79.85,
    'totalcharges': 3320.75,
}

In [None]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

0.07332111084949638

In [None]:
print(list(X_test[0]))

[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 79.85, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 41.0, 3320.75]


In [None]:
customer = {
    'gender': 'female',
    'seniorcitizen': 1,
    'partner': 'no',
    'dependents': 'no',
    'phoneservice': 'yes',
    'multiplelines': 'yes',
    'internetservice': 'fiber_optic',
    'onlinesecurity': 'no',
    'onlinebackup': 'no',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'yes',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 85.7,
    'totalcharges': 85.7

In [None]:
customer = {
    'gender': 'female',
    'seniorcitizen': 1,
    'partner': 'no',
    'dependents': 'no',
    'phoneservice': 'yes',
    'multiplelines': 'yes',
    'internetservice': 'fiber_optic',
    'onlinesecurity': 'no',
    'onlinebackup': 'no',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'yes',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 85.7,
    'totalcharges': 85.7
}

In [None]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

0.8321656556055403