Data Science Project #1: Bank Customer Churn Analysis using a Classification Model
Goal: Get back into Python and learn about different classification ML models
Models Used: RandomForestClassifier, SVM, KNN

In [60]:
import os
import pandas as pd 
import numpy as np

#for data processing and testing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

#importing models
from sklearn.ensemble import RandomForestClassifier

#importing performance KPIs
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score, f1_score

In [24]:
#Set working directory
#The project folder can be overridden with the CHURN_PROJECT_DIR environment
#variable so the notebook is not tied to a single user's home directory.
project_dir = os.environ.get(
    'CHURN_PROJECT_DIR',
    '/Users/michaelyoo/Documents/Projects/Python Projects/Bank Customer Churn Project',
)

#Only switch directories when the folder actually exists; otherwise keep the
#current working directory (e.g. when Jupyter was launched from the project
#folder) instead of failing with FileNotFoundError.
if os.path.isdir(project_dir):
    os.chdir(project_dir)
else:
    print(f"Working directory not changed: '{project_dir}' does not exist. Using '{os.getcwd()}'.")

In [43]:
#Load the raw churn data and preview the first ten rows
df = pd.read_csv("Churn_Modeling.csv")
print(df.head(10))

#Drop the junk columns (row number, customer id, surname), then one-hot encode
#the two categorical features. drop_first=True removes one dummy per category
#to prevent multicollinearity.
df_encoded = pd.get_dummies(
    df.drop(columns=["RowNumber", "CustomerId", "Surname"]),
    columns=['Gender', 'Geography'],
    drop_first=True,
)
print(df_encoded.head())

#From here on, df refers to the encoded frame
df = df_encoded

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   
5          6    15574012       Chu          645     Spain    Male   44   
6          7    15592531  Bartlett          822    France    Male   50   
7          8    15656148    Obinna          376   Germany  Female   29   
8          9    15792365        He          501    France    Male   44   
9         10    15592389        H?          684    France    Male   27   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0       

In [56]:
#Features are every column except the "Exited" label; the label is the target
x = df.drop(columns="Exited")
y = df["Exited"]

#70% of the rows go to training, the remainder to testing; the seed keeps the
#split repeatable between runs
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.70,
    random_state=42,
)

In [68]:
#Random Forest Classifier
#random_state is fixed so the fitted forest, and therefore every metric printed
#below, is reproducible across kernel restarts (the same seed is used for the
#train/test split).
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

#Predictions on the held-out test set; all metrics below are computed from these
y_pred = model.predict(x_test)

# Counts of true/false positives and negatives
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Overall correctness of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Out of predicted positives, how many are correct
precision = precision_score(y_test, y_pred, average='binary')
print("Precision:", precision)

# Out of actual positives, how many did the model detect
recall = recall_score(y_test, y_pred, average='binary')
print("Recall:", recall)

# Harmonic mean of precision and recall, balancing both
f1 = f1_score(y_test, y_pred, average='binary')
print("F1 Score:", f1)

Confusion Matrix:
 [[2331   85]
 [ 309  275]]
Accuracy: 0.8686666666666667
Precision: 0.7638888888888888
Recall: 0.4708904109589041
F1 Score: 0.5826271186440678
