# Model Building for Bank Churn Prediction

In [4]:
# Data Wrangling
import pandas as pd

# To ignore warnings
import warnings
warnings.filterwarnings('ignore')

#Machine Learning Classes
from sklearn.preprocessing import LabelEncoder # To convert categorical variables to numerical ones 
from sklearn.model_selection import train_test_split # To split into train and test datasets
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

# Import Evaluation Modules
from sklearn import metrics
from sklearn.metrics import accuracy_score, auc, confusion_matrix, roc_auc_score, roc_curve, recall_score, precision_score

In [5]:
# pip install xgboost

In [6]:
# Import and Read Data
path = 'Churn_Modelling.csv'
df = pd.read_csv(path)
df.head()
# df.shape

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## Feature Selection

In [7]:
# Dropping unnecessary columns
df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], axis=1,inplace=True)

In [8]:
# Converting Categorical Variables to numerical variables
le = LabelEncoder()
cat_var = df[['Geography','Gender']]
for x in cat_var:
    df[x] = le.fit_transform(df[x])
df.dtypes

CreditScore          int64
Geography            int64
Gender               int64
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

**These are the features we would use to predict churn**

#### Creating Dependent and Independend Variables

In [9]:
# Spliting data to independent and target variables
X = df.drop('Exited', axis=1) # Independent Variable
y = df['Exited'] # Target Variable
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.0,1,1,1,101348.88
1,608,2,0,41,1,83807.86,1,0,1,112542.58
2,502,0,0,42,8,159660.8,3,1,0,113931.57
3,699,0,0,39,1,0.0,2,0,0,93826.63
4,850,2,0,43,2,125510.82,1,1,1,79084.1


#### Spliting Data into Train and Test sets

In [10]:
# Splitting Data to Training and testing splits
X_train , X_test, y_train, y_test =  train_test_split(X,y, test_size=0.2, random_state=40) 
# 80% for train, 20% for test

## Model Building and Selection

### Logistic Regression


In [11]:
# Fit Model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Prediction for training set
log_reg_train = log_reg.predict(X_train) # Baseline Model
# Prediction for testing set
log_reg_pred = log_reg.predict(X_test)

# Evaluating Model Results
# Accuracy Scores
print('Accuracy Score on Train: ', accuracy_score(y_train, log_reg_train))
accuracy_lr = accuracy_score(y_test, log_reg_pred).round(2) # Storing score in variable for display 
print(f'Accuracy Score on Test: {accuracy_lr}')
print('--'*20)

#Precision Scores 
print('Precision Score on Train: ', precision_score(y_train, log_reg_train))
precision_lr = precision_score(y_test, log_reg_pred).round(2) # Storing score in variable for display 
print(f'Precision Score on Test: {precision_lr}')
print('--'*20)

#Recall Scores
print('Recall Score on Train: ', recall_score(y_train, log_reg_train))
recall_lr = recall_score(y_test, log_reg_pred).round(2) # Storing score in variable for display 
print(f'Recall Score on Test: {recall_lr}')
print('--'*20)

# roc_auc_score
print('AUC-ROC Score on Train: ', roc_auc_score(y_train, log_reg_train))
roc_auc_lr = roc_auc_score(y_test, log_reg_pred).round(2) # Storing score in variable for display 
print(f'ROC_AUC Score on Test: {roc_auc_lr}')
print('--'*20)

# Kappa Score
kappa_lr = metrics.cohen_kappa_score(y_test, log_reg_pred).round(2) # Storing score in variable for display 
print(f'Kappa Score on Test: {kappa_lr}')

Accuracy Score on Train:  0.788
Accuracy Score on Test: 0.8
----------------------------------------
Precision Score on Train:  0.41365461847389556
Precision Score on Test: 0.38
----------------------------------------
Recall Score on Train:  0.062310949788263764
Recall Score on Test: 0.07
----------------------------------------
AUC-ROC Score on Train:  0.5196539781240042
ROC_AUC Score on Test: 0.52
----------------------------------------
Kappa Score on Test: 0.06


### Decision Tree


In [12]:
# Fit Model 
deci_tree = DecisionTreeClassifier()
deci_tree.fit(X_train, y_train)

# Prediction for train
deci_tree_train = deci_tree.predict(X_train) # Overfitting
# Prediction for test
deci_tree_pred = deci_tree.predict(X_test)

# Evaluating Model Results
# Acuracy Score
print('Accuracy Score on Train: ', accuracy_score(y_train, deci_tree_train)) 
accuracy_DT = accuracy_score(y_test, deci_tree_pred).round(2) # Storing score in variable for display 
print(f'Accuracy Score on Test: {accuracy_DT}')
print('--'*20)

# Precision Score
print('Precision Score on Train: ', precision_score(y_train, deci_tree_train)) 
precision_DT = precision_score(y_test, deci_tree_pred).round(2) # Storing score in variable for display 
print(f'Precision Score on Test: {precision_DT}') 
print('--'*20)

# Recall Score
print('Recall Score on Train: ', recall_score(y_train, deci_tree_train)) 
recall_DT = recall_score(y_test, deci_tree_pred).round(2) # Storing score in variable for display 
print(f'Recall Score on Test: {recall_DT}')
print('--'*20)

# roc_auc_score
print('AUC-ROC Score on Train: ', roc_auc_score(y_train, deci_tree_train))
roc_auc_DT = roc_auc_score(y_test, deci_tree_pred).round(2) # Storing score in variable for display 
print(f'AUC_ROC Score on Test: {roc_auc_DT}')
print('--'*20)

# Kappa Score
kappa_DT = metrics.cohen_kappa_score(y_test, deci_tree_pred).round(2) # Storing score in variable for display 
print(f'Kappa Score on Test: {kappa_DT}')

Accuracy Score on Train:  1.0
Accuracy Score on Test: 0.79
----------------------------------------
Precision Score on Train:  1.0
Precision Score on Test: 0.46
----------------------------------------
Recall Score on Train:  1.0
Recall Score on Test: 0.51
----------------------------------------
AUC-ROC Score on Train:  1.0
AUC_ROC Score on Test: 0.68
----------------------------------------
Kappa Score on Test: 0.35


### Random Forest 

In [13]:
# Defining Model
RF = RandomForestClassifier()
# Fitting Model
RF.fit(X_train, y_train)

# Model Prediction for train
RF_train = RF.predict(X_train) #Overfitting
# Prediction for test
RF_pred = RF.predict(X_test)

# Evaluation of Results
print('Accuracy Score on Train: ', accuracy_score(y_train, RF_train))
accuracy_RF = accuracy_score(y_test, RF_pred).round(2) # Storing score in variable for display 
print(f'Accuracy Score on Test: {accuracy_RF}')
print('--'*20)

#Precision Score 
print('Precision Score on train: ', precision_score(y_train, RF_train))
precision_RF = precision_score(y_test, RF_pred).round(2) # Storing score in variable for display 
print(f'Precision Score on Test: {precision_RF}')
print('--'*20)

# Recall Score
print('Recall Score on train: ', recall_score(y_train, RF_train))
recall_RF = recall_score(y_test, RF_pred).round(2) # Storing score in variable for display 
print(f'Recall Score on Test: {recall_RF}')
print('--'*20)

# roc_auc_score
print('AUC-ROC Score on Train: ', roc_auc_score(y_train, RF_train))
roc_auc_RF = roc_auc_score(y_test, RF_pred).round(2)
print(f'Recall Score on Test: {roc_auc_RF}')
print('--'*20)

# Kappa Score
kappa_RF = metrics.cohen_kappa_score(y_test, RF_pred).round(2)
print(f'Kappa Score on Test: {kappa_RF}')

Accuracy Score on Train:  1.0
Accuracy Score on Test: 0.87
----------------------------------------
Precision Score on train:  1.0
Precision Score on Test: 0.75
----------------------------------------
Recall Score on train:  1.0
Recall Score on Test: 0.51
----------------------------------------
AUC-ROC Score on Train:  1.0
Recall Score on Test: 0.74
----------------------------------------
Kappa Score on Test: 0.54


### Naive Bayes

In [14]:
# Defining Model
NB = GaussianNB()
# Fitting Model
NB.fit(X_train, y_train)

# Model Prediction for train 
NB_train = NB.predict(X_train) 
# prediction for test
NB_pred = NB.predict(X_test)

# Evaluation of Results
print('Accuracy Score on Train: ', accuracy_score(y_train, NB_train))
accuracy_NB = accuracy_score(y_test, NB_pred).round(2) # Storing score in variable for display 
print(f'Accuracy Score on Test: {accuracy_NB}')
print('--'*20)

#Precision Score 
print('Precision Score on train: ', precision_score(y_train, NB_train))
precision_NB = precision_score(y_test, NB_pred).round(2) # Storing score in variable for display 
print(f'Precision Score on Test: {precision_NB}')
print('--'*20)

# Recall Score
print('Recall Score on train: ', recall_score(y_train, NB_train))
recall_NB = recall_score(y_test, NB_pred).round(2) # Storing score in variable for display 
print(f'Recall Score on Test: {recall_NB}')
print('--'*20)

# roc_auc_score
print('AUC-ROC Score on Train: ', roc_auc_score(y_train, NB_train))
roc_auc_NB = roc_auc_score(y_test, NB_pred).round(2) # Storing score in variable for display 
print(f'AUC_ROC Score on Test: {roc_auc_NB}')
print('--'*20)

# Kappa Score
kappa_NB = metrics.cohen_kappa_score(y_test, NB_pred).round(2) # Storing score in variable for display 
print(f'Kappa Score on Test: {kappa_DT}')

Accuracy Score on Train:  0.781875
Accuracy Score on Test: 0.79
----------------------------------------
Precision Score on train:  0.375
Precision Score on Test: 0.33
----------------------------------------
Recall Score on train:  0.08348457350272233
Recall Score on Test: 0.09
----------------------------------------
AUC-ROC Score on Train:  0.5236234904696533
AUC_ROC Score on Test: 0.52
----------------------------------------
Kappa Score on Test: 0.35


### Support Vector Machine

In [15]:
# Defining Model
SVM = SVC(kernel='linear')
SVM.fit(X_train, y_train)

# Prediction on Train
SVM_train = SVM.predict(X_train)
# Rpediction on Test
SVM_pred = SVM.predict(X_test)

# Evaluation of Results
print('Accuracy Score on Train: ', accuracy_score(y_train, SVM_train)) 
accuracy_svm = accuracy_score(y_test, SVM_pred).round(2) # Storing score in variable for display 
print(f'Accuracy Score on Test: {accuracy_svm}')
print('--'*20)

#Precision Score
print('Precision Score on Train:', metrics.precision_score(y_train, SVM_train))
precision_svm = metrics.precision_score(y_test, SVM_pred).round(2) # Storing score in variable for display 
print(f'Precision Score on Test: {precision_svm}')
print('--'*20)

# Recall Score
print('Recall Score on Train:', metrics.recall_score(y_train, SVM_train))
recall_svm = metrics.recall_score(y_test, SVM_pred).round(2) # Storing score in variable for display 
print(f'Recall Score on Test: {recall_svm}')
print('--'*20)

# roc_auc_score
print('AUC-ROC Score on Train: ', roc_auc_score(y_train, SVM_train))
roc_auc_svm = roc_auc_score(y_test, SVM_pred).round(2) # Storing score in variable for display 
print(f'AUC_ROC Score on Test: {roc_auc_svm}')
print('--'*20)

# Kappa Score
kappa_svm = metrics.cohen_kappa_score(y_test, SVM_pred).round(2) # Storing score in variable for display 
print(f'Kappa Score on Test: {kappa_svm}')

Accuracy Score on Train:  0.778375
Accuracy Score on Test: 0.79
----------------------------------------
Precision Score on Train: 0.2777777777777778
Precision Score on Test: 0.24
----------------------------------------
Recall Score on Train: 0.045372050816696916
Recall Score on Test: 0.05
----------------------------------------
AUC-ROC Score on Train:  0.5073244372564656
AUC_ROC Score on Test: 0.51
----------------------------------------
Kappa Score on Test: 0.02


### XGBoost 

In [17]:
# Defining and Fitting Model
xgboost = xgb.XGBClassifier()
xgboost.fit(X_train, y_train)

# Prediction for Train
xgb_train = xgboost.predict(X_train) 
# Predictioon for test
xgb_pred = xgboost.predict(X_test)

# Evaluation of Results
print('Accuracy Score on Train: ', accuracy_score(y_train, xgb_train)) 
accuracy_xgb = accuracy_score(y_test, xgb_pred).round(2) # Storing score in variable for display 
print(f'Accuracy Score on Test: {accuracy_xgb}')
print('--'*20)

#Precision Score
print('Precision Score on Train:', precision_score(y_train, xgb_train))
precision_xgb = precision_score(y_test, xgb_pred).round(2) # Storing score in variable for display 
print(f'Precision Score on Test: {precision_xgb}')
print('--'*20)

# Recall Score
print('Recall Score on Train:', recall_score(y_train, xgb_train))
recall_xgb = recall_score(y_test, xgb_pred).round(2) # Storing score in variable for display 
print(f'Recall Score on Test: {recall_xgb}')
print('--'*20)

# roc_auc_score
print('AUC-ROC Score on Train: ', roc_auc_score(y_train, xgb_train))
roc_auc_xgb = roc_auc_score(y_test, xgb_pred).round(2) # Storing score in variable for display 
print(f'AUC_ROC Score on Test: {roc_auc_xgb}')
print('--'*20)

# Kappa Score
kappa_xgb = metrics.cohen_kappa_score(y_test, xgb_pred).round(2) # Storing score in variable for display 
print(f'Kappa Score on Test: {kappa_xgb}')

Accuracy Score on Train:  0.957375
Accuracy Score on Test: 0.87
----------------------------------------
Precision Score on Train: 0.9746743849493488
Precision Score on Test: 0.69
----------------------------------------
Recall Score on Train: 0.8148820326678766
Recall Score on Test: 0.55
----------------------------------------
AUC-ROC Score on Train:  0.9046838082041132
AUC_ROC Score on Test: 0.75
----------------------------------------
Kappa Score on Test: 0.54


### Model Evaluation and Results

In [20]:
pip install prettytable

Collecting prettytable
  Downloading prettytable-3.8.0-py3-none-any.whl (27 kB)
Installing collected packages: prettytable
Successfully installed prettytable-3.8.0
Note: you may need to restart the kernel to use updated packages.


In [21]:
# Importing prettytable library for display of results
from prettytable import PrettyTable

# Create an instance of PrettyTable
table = PrettyTable()

# Define the column names
table.field_names = ["Model", "Accuracy", "Precision", "Recall", "AUC-ROC", 'Kappa']

# Add rows for each model's performance
table.add_row(["Logistic Regression", accuracy_lr, precision_lr, recall_lr, roc_auc_lr, kappa_lr])
table.add_row(["Decision Tree", accuracy_DT, precision_DT, recall_DT, roc_auc_DT, kappa_DT])
table.add_row(["Random Forest", accuracy_RF, precision_RF, recall_RF, roc_auc_RF, kappa_RF])
table.add_row(["Naive Bayes", accuracy_NB, precision_NB, recall_NB, roc_auc_NB, kappa_NB])
table.add_row(["Support Vector Machine", accuracy_svm, precision_svm, recall_svm, roc_auc_svm, kappa_svm])
table.add_row(["XGBoost", accuracy_xgb, precision_xgb, recall_xgb, roc_auc_xgb,kappa_xgb])



#### Displaying Evaluation Scores of Models

In [22]:
# Displaying PrettyTable that shows model and its accuracy on the test 
table.sortby = ('Accuracy')
table.reversesort = True
print(table)

+------------------------+----------+-----------+--------+---------+-------+
|         Model          | Accuracy | Precision | Recall | AUC-ROC | Kappa |
+------------------------+----------+-----------+--------+---------+-------+
|        XGBoost         |   0.87   |    0.69   |  0.55  |   0.75  |  0.54 |
|     Random Forest      |   0.87   |    0.75   |  0.51  |   0.74  |  0.54 |
|  Logistic Regression   |   0.8    |    0.38   |  0.07  |   0.52  |  0.06 |
| Support Vector Machine |   0.79   |    0.24   |  0.05  |   0.51  |  0.02 |
|      Naive Bayes       |   0.79   |    0.33   |  0.09  |   0.52  |  0.06 |
|     Decision Tree      |   0.79   |    0.46   |  0.51  |   0.68  |  0.35 |
+------------------------+----------+-----------+--------+---------+-------+


#### Model Selection

The **Random Forest** model performed the best with the best accuracy score, and has the highest chance of selecting the true positives. <br>
- It has an accuracy score of `87%` meaning it can reliably predict churn.
- It's precision and recall scores are `0.74` and `0.5` respectively, meaning it has a fairly high rate of correctly identifying True Positives.

