# Bank Customer Churn Machine Learning Predictive Models

In [None]:
pip install imblearn

In [2]:

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics


from imblearn.combine import SMOTEENN


import pandas as pd
import numpy as np

In [3]:
dataset = pd.read_csv ('dataset.csv') 

In [4]:
# drop off unimportant column
dataset.drop(['RowNumber','CustomerId','Surname'],axis = 1, inplace = True)

In [5]:
# male, female and geography are converted into 0 and 1 
le=LabelEncoder()
dataset['Geography']=le.fit_transform(dataset['Geography'])
dataset['Gender']=le.fit_transform(dataset['Gender'])

In [6]:
# Churned customers (Dependent Variable(Y))
Y=dataset['Exited']

In [7]:
# Independent Variables (X)
ind_vars=['CreditScore','Geography','Gender','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary']
X=dataset[ind_vars]

In [8]:
# train and test data
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

## Logistics Regression Classifier

In [9]:
logreg = LogisticRegression(solver='liblinear', random_state=100)
logreg.fit(x_train, y_train)

LogisticRegression(random_state=100, solver='liblinear')

In [10]:
y_pred = logreg.predict(x_test)

In [11]:
print('Accuracy:',metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.78


In [12]:
print(classification_report(y_test,y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.80      0.97      0.87      2379
           1       0.33      0.06      0.11       621

    accuracy                           0.78      3000
   macro avg       0.57      0.52      0.49      3000
weighted avg       0.70      0.78      0.72      3000



Data is imbalanced and model is not properly created

In [13]:
print(confusion_matrix(y_test,y_pred))

[[2301   78]
 [ 582   39]]


As precision, recall and f1-scores are not good. Hence, model is either underfit or overfit

In [14]:
sm = SMOTEENN()
X_resampled,y_resampled = sm.fit_resample(X,Y)

In [15]:
xlr_train, xlr_test, ylr_train, ylr_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=0)

In [16]:
model_lr_smote = LogisticRegression(solver='liblinear',random_state=100)

In [17]:
model_lr_smote.fit(xlr_train,ylr_train)

LogisticRegression(random_state=100, solver='liblinear')

In [18]:
y_pred_smote_lr = model_lr_smote.predict(xlr_test)

In [19]:
print(classification_report(ylr_test,y_pred_smote_lr, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.70      0.56      0.62       888
           1       0.70      0.81      0.75      1120

    accuracy                           0.70      2008
   macro avg       0.70      0.68      0.68      2008
weighted avg       0.70      0.70      0.69      2008



In [20]:
print('Accuracy:',metrics.accuracy_score(ylr_test, y_pred_smote_lr),'after smote')

Accuracy: 0.6962151394422311 after smote


# Decision Tree Classifier

In [21]:
model_dt = DecisionTreeClassifier(criterion='gini',random_state=100,max_depth=6,min_samples_leaf=8)

In [22]:
model_dt.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [23]:
y_pred = model_dt.predict(x_test)

In [24]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
#model_dt.score(y_test,y_pred)

In [26]:
print(classification_report(y_test,y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92      2379
           1       0.79      0.44      0.57       621

    accuracy                           0.86      3000
   macro avg       0.83      0.71      0.74      3000
weighted avg       0.85      0.86      0.84      3000



Data is imbalanced and model is not properly created

In [27]:
print(confusion_matrix(y_test,y_pred))

[[2307   72]
 [ 347  274]]


As precision, recall and f1-scores are not good. Hence, model is either underfit or overfit

Combine over and under sampling using SMOTE

In [28]:
sm = SMOTEENN()
X_resampled,y_resampled = sm.fit_resample(X,Y)

In [29]:
xr_train, xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=0)

In [30]:
model_dt_smote = DecisionTreeClassifier(criterion='gini',random_state=100,max_depth=6,min_samples_leaf=8)

In [31]:
model_dt_smote.fit(xr_train,yr_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [32]:
y_pred_smote = model_dt_smote.predict(xr_test)

In [33]:
print(classification_report(yr_test,y_pred_smote, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.79      0.79      0.79       878
           1       0.83      0.83      0.83      1086

    accuracy                           0.81      1964
   macro avg       0.81      0.81      0.81      1964
weighted avg       0.81      0.81      0.81      1964



In [34]:
print('Accuracy:',metrics.accuracy_score(yr_test, y_pred_smote),'after smote')

Accuracy: 0.8131364562118126 after smote


## Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
model_rf = RandomForestClassifier(n_estimators = 100, criterion = 'gini',random_state=100,max_depth=6,min_samples_leaf=8)
model_rf.fit(x_train,y_train)
y_pred_rf = model_rf.predict(x_test)

In [37]:
print(classification_report(y_test,y_pred_rf, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.87      0.98      0.92      2379
           1       0.83      0.42      0.55       621

    accuracy                           0.86      3000
   macro avg       0.85      0.70      0.74      3000
weighted avg       0.86      0.86      0.84      3000



Combine over and under sampling using SMOTE

In [38]:
sm = SMOTEENN()
X_resampled,y_resampled = sm.fit_resample(X,Y)

In [39]:
xrf_train, xrf_test, yrf_train, yrf_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=0)

In [40]:
model_smote_rf = RandomForestClassifier(n_estimators = 100,criterion='gini',random_state=100,max_depth=6,min_samples_leaf=8)

In [41]:
model_smote_rf.fit(xrf_train,yrf_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [42]:
y_pred_smote_rf = model_smote_rf.predict(xrf_test)

In [43]:
print(classification_report(yrf_test,y_pred_smote_rf, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.80      0.81       868
           1       0.85      0.87      0.86      1108

    accuracy                           0.84      1976
   macro avg       0.84      0.84      0.84      1976
weighted avg       0.84      0.84      0.84      1976



In [44]:
print('Accuracy:',metrics.accuracy_score(yrf_test, y_pred_smote_rf),'after smote')

Accuracy: 0.8405870445344129 after smote


random forest have produced better results and hence it will use in the Power BI prediction model

 Calculation of prediction and probability using best fit model

In [45]:
y_predictLR=model_smote_rf.predict(X)
y_probabilityLR=model_smote_rf.predict_proba(X)[:,1]
y_pred = model_smote_rf.predict(x_test)

In [46]:
y_predictLR

array([1, 0, 1, ..., 0, 0, 1], dtype=int64)

In [47]:
y_probabilityLR

array([0.56467295, 0.41180708, 0.86842061, ..., 0.42034422, 0.42316199,
       0.72369429])

In [48]:
# Adding column Probability and Predict to dataset

dataset['Probability']=y_probabilityLR
dataset['Predict']=y_predictLR

Storing best fit model which can be use later in API

In [49]:
import pickle

In [50]:
filename = 'model.sav'

In [51]:
pickle.dump(model_smote_rf, open(filename, 'wb'))

In [52]:
load_model = pickle.load(open(filename, 'rb'))

In [54]:
# Model Score
print('Accuracy:',load_model.score(xrf_test, yrf_test))

Accuracy: 0.8405870445344129
