<a href="https://colab.research.google.com/github/jmohsbeck1/jpmc_mle/blob/Mar.-29/Churn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# John Mohsbeck
# Data Bank Churn dataset
# Data preparation
# EDA
# Feature Importance
# Logistic Regression model
# Performance Metrics

import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv("Churn_Modelling.csv")
print(len(df))
df.head()

FileNotFoundError: ignored

# Data Prep

Dataset

records: 10000

columns: 14

In [None]:
df.shape

(10000, 14)

In [None]:
df.head().T

Unnamed: 0,0,1,2,3,4
RowNumber,1,2,3,4,5
CustomerId,15634602,15647311,15619304,15701354,15737888
Surname,Hargrave,Hill,Onio,Boni,Mitchell
CreditScore,619,608,502,699,850
Geography,France,Spain,France,France,Spain
Gender,Female,Female,Female,Female,Female
Age,42,41,42,39,43
Tenure,2,1,8,1,2
Balance,0.0,83807.86,159660.8,0.0,125510.82
NumOfProducts,1,1,3,2,1


In [None]:
df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

Remove the following columns.
They do not add value to modeling the exited target.

"Rownumber",
"CustomerId",
"Surname"

In [None]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
df.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [None]:
df.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

Replace blanks with underscore "_" and transform column names to lower case.

In [None]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [None]:
df.nunique()

creditscore         460
geography             3
gender                2
age                  70
tenure               11
balance            6382
numofproducts         4
hascrcard             2
isactivemember        2
estimatedsalary    9999
exited                2
dtype: int64

In [None]:
df['exited'].unique()

array([1, 0])

In [None]:
df.dtypes

creditscore          int64
geography           object
gender              object
age                  int64
tenure               int64
balance            float64
numofproducts        int64
hascrcard            int64
isactivemember       int64
estimatedsalary    float64
exited               int64
dtype: object

In [None]:
df['exited'].nunique()

2

In [None]:
df['exited'].sample(25)

1702    0
5994    0
6954    0
5702    0
4906    0
4134    0
4887    0
3032    1
6142    0
5635    0
3101    0
3913    1
5221    0
133     0
380     0
2870    0
1731    0
5988    0
4558    0
9096    0
3328    1
9584    1
8527    0
8879    0
2202    0
Name: exited, dtype: int64

In [None]:
df.exited.value_counts()

0    7963
1    2037
Name: exited, dtype: int64

Checking for duplicates.
Found 0 duplicates.

In [None]:
duplicates = df.duplicated()
print(duplicates.sum())

0


##Dataset is Imbalanced

Exited: 1622
Stay: 6378

Split dataset 80/20 to df_train_full, df_test.
Then split df_train_full 67%, 33% to df_train, df_val.

In [None]:
from sklearn.model_selection import train_test_split
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)
y_train = df_train.exited.values
y_val = df_val.exited.values
del df_train['exited']
del df_val['exited']


# EDA

In [None]:
df_train_full.isnull().sum()

creditscore        0
geography          0
gender             0
age                0
tenure             0
balance            0
numofproducts      0
hascrcard          0
isactivemember     0
estimatedsalary    0
exited             0
dtype: int64

In [None]:
df_train_full.exited.value_counts()

0    6378
1    1622
Name: exited, dtype: int64

In [None]:
1622 / (6378 + 1622)

0.20275

Global Mean

In [None]:
global_mean = df_train_full.exited.mean()
round(global_mean,5)

0.20275

In [None]:
df.dtypes

creditscore          int64
geography           object
gender              object
age                  int64
tenure               int64
balance            float64
numofproducts        int64
hascrcard            int64
isactivemember       int64
estimatedsalary    float64
exited               int64
dtype: object

List categorial features and numeric features.

In [None]:
categorical = ['geography', 'gender', 'tenure', 'numofproducts', 'hascrcard', 'isactivemember']
numerical = ['creditscore', 'age', 'balance', 'estimatedsalary']

In [None]:
df_train_full[categorical].nunique()

geography          3
gender             2
tenure            11
numofproducts      4
hascrcard          2
isactivemember     2
dtype: int64

In [None]:
df_train_full[numerical].describe()

Unnamed: 0,creditscore,age,balance,estimatedsalary
count,8000.0,8000.0,8000.0,8000.0
mean,650.313625,38.88725,76463.443272,100143.429649
std,96.67699,10.468894,62372.543408,57574.884151
min,350.0,18.0,0.0,91.75
25%,583.0,32.0,0.0,51014.8375
50%,651.0,37.0,97055.145,99836.215
75%,718.0,44.0,127639.3725,149373.5275
max,850.0,92.0,250898.09,199970.74


In [None]:
df_train_full.corr()

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
creditscore,1.0,-0.007697,-0.001942,-0.001585,0.015114,-0.001258,0.031762,0.000399,-0.02642
age,-0.007697,1.0,-0.012859,0.02893,-0.029657,-0.014857,0.088084,-0.010799,0.278079
tenure,-0.001942,-0.012859,1.0,-0.007597,0.015562,0.024802,-0.030183,0.000881,-0.008248
balance,-0.001585,0.02893,-0.007597,1.0,-0.313202,-0.01375,-0.011692,0.009864,0.116926
numofproducts,0.015114,-0.029657,0.015562,-0.313202,1.0,0.005514,0.006243,0.018054,-0.040868
hascrcard,-0.001258,-0.014857,0.024802,-0.01375,0.005514,1.0,-0.006068,-0.0062,-0.014177
isactivemember,0.031762,0.088084,-0.030183,-0.011692,0.006243,-0.006068,1.0,-0.006182,-0.159633
estimatedsalary,0.000399,-0.010799,0.000881,0.009864,0.018054,-0.0062,-0.006182,1.0,0.006483
exited,-0.02642,0.278079,-0.008248,0.116926,-0.040868,-0.014177,-0.159633,0.006483,1.0


Age is positively correleated to exited.

Balance is positively correlated to exited.

numofproducts, hascrcard and isactivemember are negatively correlated to exited

# Feature Importance

In [None]:
female_mean = df_train_full[df_train_full.gender == 'female'].exited.mean()
print('gender == female:', round(female_mean, 3))

male_mean = df_train_full[df_train_full.gender == 'male'].exited.mean()
print('gender == male:  ', round(male_mean, 3))

gender == female: 0.249
gender == male:   0.164


Female risk ratio: 1.229;  high > 1.0 => risky

In [None]:
female_mean / global_mean


1.2296637092961358

Male risk ratio: 0.807;  > 0.50 but < 1.0 => less risky than females

In [None]:
male_mean / global_mean

0.8076816986463685

In [None]:
df_group = df_train_full.groupby(by='gender').exited.agg(['mean'])
df_group['diff'] = df_group['mean'] - global_mean
df_group['risk'] = df_group['mean'] / global_mean
df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.249314,0.046564,1.229664
male,0.163757,-0.038993,0.807682


isactivemeber risk ratio

In [None]:
isactivemember_no = df_train_full[df_train_full.isactivemember == 0].exited.mean()
print('isactivemember No:', round(isactivemember_no, 3))

isactivemember_yes = df_train_full[df_train_full.isactivemember == 1].exited.mean()
print('isactivemember Yes:', round(isactivemember_yes, 3))

isactivemember No: 0.268
isactivemember Yes: 0.14


isactivemember No: has risk ratio > 1.0 => risky

In [None]:
isactivemember_no / global_mean

1.3239149009209348

In [None]:
isactivemember_yes / global_mean

0.690650187488022

hascrcard risk ratio

In [None]:
hascrcard_no = df_train_full[df_train_full.hascrcard == 0].exited.mean()
print('hascrcard No:', round(hascrcard_no, 3))

hascrcard_yes = df_train_full[df_train_full.hascrcard == 1].exited.mean()
print('hascrcard yes:', round(hascrcard_yes, 3))

hascrcard No: 0.212
hascrcard yes: 0.199


In [None]:
hascrcard_no / global_mean

1.0436700151079652

Hascrcard No: risk ratio > 1.0 => risky

In [None]:
hascrcard_yes / global_mean


0.9819019597925971

Numofproducts risk ratio

In [None]:
numofproducts_one = df_train_full[df_train_full.numofproducts == 1].exited.mean()
print('number of products 1:', round(numofproducts_one, 3))

numofproducts_two = df_train_full[df_train_full.numofproducts == 2].exited.mean()
print('number of products 2:', round(numofproducts_two, 3))

numofproducts_three = df_train_full[df_train_full.numofproducts == 3].exited.mean()
print('number of products 3:', round(numofproducts_three, 3))

numofproducts_four = df_train_full[df_train_full.numofproducts == 4].exited.mean()
print('number of products 4:', round(numofproducts_four, 3))

number of products 1: 0.274
number of products 2: 0.077
number of products 3: 0.817
number of products 4: 1.0


number of products of 1, 3 & 4; risk ratio > 1.0 => risky

In [None]:
numofproducts_one / global_mean

1.3514325731676886

In [None]:
numofproducts_two / global_mean

0.37764122068156847

In [None]:
numofproducts_three / global_mean

4.031327241299708

In [None]:
numofproducts_four / global_mean

4.932182490752157

In [None]:
from IPython.display import display
global_mean = df_train_full.exited.mean()
global_mean

0.20275

Compute Risk ratio and Risk differnces across categorical feature vs exited

In [None]:
for col in categorical:
    df_group = df_train_full.groupby(by=col).exited.agg(['mean'])
    df_group['diff'] = df_group['mean'] - global_mean
    df_group['risk'] = df_group['mean'] / global_mean
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
geography,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
france,0.160991,-0.041759,0.794039
germany,0.318227,0.115477,1.569552
spain,0.168522,-0.034228,0.831183


Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.249314,0.046564,1.229664
male,0.163757,-0.038993,0.807682


Unnamed: 0_level_0,mean,diff,risk
tenure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.239521,0.036771,1.181361
1,0.213483,0.010733,1.052938
2,0.181168,-0.021582,0.893554
3,0.223881,0.021131,1.10422
4,0.2025,-0.00025,0.998767
5,0.197789,-0.004961,0.97553
6,0.201777,-0.000973,0.995199
7,0.17407,-0.02868,0.858543
8,0.193267,-0.009483,0.953227
9,0.220102,0.017352,1.085582


Unnamed: 0_level_0,mean,diff,risk
numofproducts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.274003,0.071253,1.351433
2,0.076567,-0.126183,0.377641
3,0.817352,0.614602,4.031327
4,1.0,0.79725,4.932182


Unnamed: 0_level_0,mean,diff,risk
hascrcard,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.211604,0.008854,1.04367
1,0.199081,-0.003669,0.981902


Unnamed: 0_level_0,mean,diff,risk
isactivemember,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.268424,0.065674,1.323915
1,0.140029,-0.062721,0.69065


Compute Mutual Information Scores

In [None]:
from sklearn.metrics import mutual_info_score
def calculate_mi(series):
    return mutual_info_score(series, df_train_full.exited)

df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')


display(df_mi.head())
display(df_mi.tail())

Unnamed: 0,MI
numofproducts,0.069025
geography,0.013117
isactivemember,0.012872
gender,0.005598
tenure,0.000904


Unnamed: 0,MI
geography,0.013117
isactivemember,0.012872
gender,0.005598
tenure,0.000904
hascrcard,0.0001


# One Hot Encoding

In [None]:
from sklearn.feature_extraction import DictVectorizer
train_dict = df_train[categorical + numerical].to_dict(orient='records')


In [None]:
train_dict[0]

{'geography': 'spain',
 'gender': 'male',
 'tenure': 9,
 'numofproducts': 2,
 'hascrcard': 1,
 'isactivemember': 0,
 'creditscore': 648,
 'age': 46,
 'balance': 127209.0,
 'estimatedsalary': 77405.95}

In [None]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [None]:
X_train = dv.transform(train_dict)
X_train.shape

(5360, 13)

In [None]:
type(X_train)

numpy.ndarray

In [None]:
dv.feature_names_

['age',
 'balance',
 'creditscore',
 'estimatedsalary',
 'gender=female',
 'gender=male',
 'geography=france',
 'geography=germany',
 'geography=spain',
 'hascrcard',
 'isactivemember',
 'numofproducts',
 'tenure']

# Train our Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict_proba(X_val)

y_pred

y_pred = model.predict_proba(X_val)[:,1]
y_pred




array([0.08624693, 0.08447113, 0.25434001, ..., 0.28506909, 0.29989179,
       0.17403539])

In [None]:
len(y_val)

2640

In [None]:
len(y_pred)

5360

In [None]:
len(X_train)

5360

In [None]:
from sklearn.metrics import accuracy_score

# Make predictions on the test set
y_pred = model.predict(X_train)

# Calculate the accuracy
# accuracy = accuracy_score(y_test, y_pred)
accuracy = accuracy_score(y_train, y_pred)
print(f"Test set accuracy: {accuracy:.2f}")

Test set accuracy: 0.79


In [None]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
y_pred = model.predict_proba(X_val)

In [None]:
y_pred

array([[0.91375307, 0.08624693],
       [0.91552887, 0.08447113],
       [0.74565999, 0.25434001],
       ...,
       [0.71493091, 0.28506909],
       [0.70010821, 0.29989179],
       [0.82596461, 0.17403539]])

In [None]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([0.08624693, 0.08447113, 0.25434001, ..., 0.28506909, 0.29989179,
       0.17403539])

Show Max & Min of y_pred

In [None]:
max(y_pred)

0.7592834881929926

In [None]:
min(y_pred)

0.03685726521795917

In [None]:
exited = y_pred > 0.5

In [None]:
(y_val == exited).mean()

0.7901515151515152

Show Y intercept

In [None]:
model.intercept_[0]

-0.0005703091676461757

Show Coefficients

In [None]:
dict(zip(dv.feature_names_, model.coef_[0].round(7)))

{'age': 0.0413732,
 'balance': 3.3e-06,
 'creditscore': -0.0048121,
 'estimatedsalary': -1.5e-06,
 'gender=female': 0.0015991,
 'gender=male': -0.0021694,
 'geography=france': -0.0016313,
 'geography=germany': 0.0016032,
 'geography=spain': -0.0005422,
 'hascrcard': -0.0007043,
 'isactivemember': -0.0031144,
 'numofproducts': -0.0010604,
 'tenure': -0.0056406}

# Let's use the model on a few single records from the dataset to check their predict probablity




In [None]:
customer = {
    'creditscore' : 549,
    'geography' : 'spain',
    'gender' : 'female',
    'age' :   24,
    'tenure' : 9,
    'balance' : 0,
    'numofproducts' : 2,
    'hascrcard' : 1,
    'isactivemember' : 1,
    'estimatedsalary' : 14406.41
}

In [None]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

0.15102699837385597

In [None]:
print(list(X_test[0]))

[24.0, 0.0, 549.0, 14406.41, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 2.0, 9.0]


In [None]:
customer = {
   'creditscore' : 475,
    'geography' : 'france',
    'gender' : 'female',
    'age' :   45,
    'tenure' : 0,
    'balance' : 134264.04,
    'numofproducts' : 1,
    'hascrcard' : 1,
    'isactivemember' : 0,
    'estimatedsalary' : 27822.99 

}

In [None]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

0.49249714958118584

In [None]:
print(list(X_test[0]))

[45.0, 134264.04, 475.0, 27822.99, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0]


In [None]:
customer = {
   'creditscore' : 576,
    'geography' : 'germany',
    'gender' : 'female',
    'age' :   71,
    'tenure' : 6,
    'balance' : 140273.47,
    'numofproducts' : 1,
    'hascrcard' : 1,
    'isactivemember' : 1,
    'estimatedsalary' :  193135.25

}

In [None]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

0.5741919442980223

In [None]:
print(list(X_test[0]))


[71.0, 140273.47, 576.0, 193135.25, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 6.0]


##############################################################################

Part 2

1. Load Churn Modeling dataset
2. Pipeline process churn data
3. Logistic Regression model on X_preprocessed
3. Check accuracy

In [None]:
# Get the data and target separately
churn_data = pd.read_csv("Churn_Modelling.csv")

feature_names = list(churn_data.columns)

churn_df = pd.DataFrame(churn_data, columns=feature_names)
churn_df['Exited'] = churn_data['Exited']

# convert to lowercase
# remove underscore
churn_df.columns = churn_df.columns.str.lower().str.replace(' ', '_')

string_columns = list(churn_df.dtypes[churn_df.dtypes == 'object'].index)

for col in string_columns:
    churn_df[col] = churn_df[col].str.lower().str.replace(' ', '_')

churn_df.head()

X = churn_df.drop(['exited'], axis=1)
y = churn_data['Exited']

churn_df.head()

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,1,15634602,hargrave,619,france,female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,hill,608,spain,female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,onio,502,france,female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,boni,699,france,female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,mitchell,850,spain,female,43,2,125510.82,1,1,1,79084.1,0


Pipeline Processing

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Define preprocessing steps
categorical_features = ['geography', 'gender', 'tenure', 'numofproducts', 'hascrcard', 'isactivemember']
numeric_features = ['creditscore', 'age', 'balance', 'estimatedsalary']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Preprocess the data
X_preprocessed = preprocessor.fit_transform(X)

1. Call Logistic Regression using outout from Feature Selector: train_removed
2. Check accuracy
3. Look at Results.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y, test_size=0.2, random_state=42)

model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy:.2f}")

Test set accuracy: 0.84


In [None]:
model.intercept_[0]

0.028050428477543205

In [None]:
model.coef_[0].round(7)

array([-0.0707292,  0.7311439, -0.0471405,  0.0077193, -0.3371375,
        0.6166293, -0.230214 ,  0.2872191, -0.2379413,  0.2813457,
        0.0813207,  0.0119279, -0.0306781,  0.0527577, -0.106347 ,
       -0.0050978, -0.0856137, -0.0886975,  0.0465241, -0.1081643,
       -1.1960248, -2.7245915,  1.5114417,  2.4584524,  0.0401502,
        0.0091276,  0.570354 , -0.5210762])

Conclusion:
1. Using Pipeline with Logistic Regression resulted in a higher accuracy:
    84% compared to 79% previously.

2. The Linear regression model conatined many more coefficients than previous model, due to OneHot encoding.

3. Appears that, using Pipeline  + Logistic Regression improves accuracy with this Churn_Modeling dataset.
