In [94]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics

import matplotlib.pyplot as plt

In [95]:
df = pd.read_csv("dataset/BankChurners.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       9000 non-null   int64  
 1   Geography        9000 non-null   object 
 2   Tenure           9000 non-null   int64  
 3   Balance          9000 non-null   float64
 4   NumOfProducts    9000 non-null   int64  
 5   HasCrCard        9000 non-null   int64  
 6   IsActiveMember   9000 non-null   int64  
 7   EstimatedSalary  9000 non-null   float64
 8   Exited           9000 non-null   int64  
 9   CreditLevel      9000 non-null   int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 703.2+ KB


## Clean

In [96]:
df_clean = df.copy()

In [98]:
geo_onehot = pd.get_dummies(df_clean["Geography"], prefix="Geo")
df_clean = df_clean.join(geo_onehot)

In [99]:
df_clean.head()

Unnamed: 0,CustomerId,Geography,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditLevel,Geo_France,Geo_Germany,Geo_Spain
0,15762418,Spain,3,121681.82,1,1,0,128643.35,1,8,0,0,1
1,15749905,Spain,6,0.0,1,1,0,50213.81,1,7,0,0,1
2,15600911,France,2,182888.08,1,1,0,3061.0,0,7,1,0,0
3,15572762,Germany,2,102278.79,2,1,0,89822.48,0,2,0,1,0
4,15627848,France,7,109346.13,2,1,0,102665.92,0,7,1,0,0


In [100]:
std_cols = preprocessing.StandardScaler().fit_transform(df_clean[["Tenure", "Balance", "NumOfProducts", "EstimatedSalary"]])
df_clean[["Tenure", "Balance", "NumOfProducts", "EstimatedSalary"]] = std_cols

In [101]:
feature_names=["Geo_France","Geo_Germany","Geo_Spain","Tenure","Balance","NumOfProducts","HasCrCard","IsActiveMember","EstimatedSalary","Exited"]
label_name="CreditLevel"

X = df_clean[feature_names]
y = df_clean[label_name]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape)

(6300, 10) (2700, 10)


In [107]:
# Define model. Specify a number for random_state to ensure same results each run
credit_model = DecisionTreeClassifier(
    criterion = 'entropy', 
    max_depth = 20, # default is None, max is 32
    min_samples_leaf = 1, # default
    min_samples_split = 2, # default
    random_state = 40
)  
credit_model.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state=40)

In [108]:
y_pred= credit_model.predict(X_test)

In [109]:
metrics.accuracy_score(y_test, y_pred)

0.14814814814814814

In [110]:
res = pd.DataFrame({"y_true":y_test, "y_pred":y_pred})
res.head()
res.to_csv("decision_tree.csv", index=False)

In [93]:
from sklearn.tree import export_text

r = export_text(credit_model, feature_names=feature_names)
print(r)

|--- Exited <= 0.50
|   |--- EstimatedSalary <= 1.16
|   |   |--- EstimatedSalary <= 0.63
|   |   |   |--- EstimatedSalary <= -0.64
|   |   |   |   |--- Balance <= 0.23
|   |   |   |   |   |--- Balance <= 0.09
|   |   |   |   |   |   |--- Tenure <= -1.56
|   |   |   |   |   |   |   |--- NumOfProducts <= -0.05
|   |   |   |   |   |   |   |   |--- EstimatedSalary <= -1.45
|   |   |   |   |   |   |   |   |   |--- EstimatedSalary <= -1.60
|   |   |   |   |   |   |   |   |   |   |--- class: 7
|   |   |   |   |   |   |   |   |   |--- EstimatedSalary >  -1.60
|   |   |   |   |   |   |   |   |   |   |--- class: 9
|   |   |   |   |   |   |   |   |--- EstimatedSalary >  -1.45
|   |   |   |   |   |   |   |   |   |--- EstimatedSalary <= -1.11
|   |   |   |   |   |   |   |   |   |   |--- EstimatedSalary <= -1.35
|   |   |   |   |   |   |   |   |   |   |   |--- class: 6
|   |   |   |   |   |   |   |   |   |   |--- EstimatedSalary >  -1.35
|   |   |   |   |   |   |   |   |   |   |   |--- truncated br

# Random Forest

In [63]:
# define random forest
clf=RandomForestClassifier(n_estimators=1000)
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

In [64]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.16407407407407407
