# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### Model: Random Forest
---

# 1. Importing packages and libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV

from sklearn.metrics import make_scorer, recall_score, fbeta_score, roc_auc_score, ConfusionMatrixDisplay, confusion_matrix

# 2. Reading file and tidying up columns

In [2]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")


y_train = df_train[["attrition_flag"]]
x_train = df_train.drop("attrition_flag", axis=1)

y_test = df_test[["attrition_flag"]]
x_test = df_test.drop("attrition_flag", axis=1)

# 3. Model Performance before Hyperparameter tuning

In [3]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer

smote_sampler = SMOTE(random_state=2021)

rf_clf = RandomForestClassifier(random_state=2021)

scale_features = x_train.columns

scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ])

pipeline = Pipeline(steps = [['scaler', scaler],
                             ['smote', smote_sampler],
                             ['classifier', rf_clf]])

pipeline.fit(x_train, y_train.values.ravel() )

y_pred = pipeline.predict(x_test)
recall = recall_score(y_test, y_pred)
fbeta2 = fbeta_score(y_test, y_pred, beta=2)
auc = roc_auc_score(y_test, y_pred)

base_results = [recall, fbeta2, auc]

print("-----------------------PERFORMANCE EVALUATION--------------------  ")
print(f"Recall: {recall} ")
print(f"Fbeta2: {fbeta2} ")
print(f"AUC Score: {auc} ")

-----------------------PERFORMANCE EVALUATION--------------------  
Recall: 0.8369230769230769 
Fbeta2: 0.8287629494210846 
AUC Score: 0.8981793515126848 


# 4. Hyper parameter tuning with GridSearchCV

## 4.1 First GridSearchCV

In [4]:
# Creating parameter grid to search
n_estimators = [100, 300, 500, 700, 900]

max_features = ['sqrt', 'log2']

max_depth = [5, 10, 15, 20]

min_samples_split = [3, 5, 8, 10, 13]

min_samples_leaf = [1, 3, 5, 8, 10]

params_grid = {
                'classifier__n_estimators': n_estimators,
                'classifier__max_features': max_features,
                'classifier__min_samples_split': min_samples_split,
                'classifier__min_samples_leaf': min_samples_leaf,
                'classifier__max_depth': max_depth
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)
print('Across 5-folds =',total_combi*5)

classifier__n_estimators [100, 300, 500, 700, 900]
classifier__max_features ['sqrt', 'log2']
classifier__min_samples_split [3, 5, 8, 10, 13]
classifier__min_samples_leaf [1, 3, 5, 8, 10]
classifier__max_depth [5, 10, 15, 20]
-----------------
Total combinations: 1000
Across 5-folds = 5000


In [5]:
# # running grid search 1
# smote_sampler = SMOTE(random_state=2021)
# rf_clf = RandomForestClassifier(random_state=2021)

# scale_features = x_train.columns

# scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ])

# pipeline = Pipeline(steps = [['scaler', scaler],
#                              ['smote', smote_sampler],
#                              ['classifier', rf_clf]])

# stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

# rf_gridsearch = GridSearchCV(estimator = pipeline,
#                            param_grid = params_grid,
#                            scoring = 'recall',
#                            cv = stratified_kfold,
#                            refit = True,
#                            n_jobs = -1)

# rf_gridsearch.fit(x_train, y_train.values.ravel() )

# best_parameters = rf_gridsearch.best_params_
# print(best_parameters)

{'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 8, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 100}

## 4.2 Second GridSearchCV
This second parameter search space aims to narrow down the search space based on the results obtained in the first grid search.

**Previous parameter search space:**
- n_estimators [100, 300, 500, 700, 900]
- max_features ['sqrt', 'log2']
- min_samples_split [3, 5, 8, 10, 13]
- min_samples_leaf [1, 3, 5, 8, 10]
- max_depth [5, 10, 15, 20]

**Obtained results:**

| Feature           | Value | New search space |
|-------------------|:-----:|------------|
| n_estimators      | 100   | [50, 75, 100, 125, 150] |
| max_features      |'sqrt' | Leave as sqrt |
| min_samples_split | 3     | [2, 3, 4] |
| min_samples_leaf  | 8     | [6, 7, 8, 9] |
| max_depth         | 10    | [8, 9, 10, 11, 12] |


In [6]:
# Creating parameter grid to search
n_estimators = [50, 75, 100, 125, 150]

min_samples_split = [2, 3, 4]

min_samples_leaf = [6, 7, 8, 9, 10]

max_depth = [8, 9, 10, 11, 12]

params_grid = {
                'classifier__n_estimators': n_estimators,
                'classifier__min_samples_split': min_samples_split,
                'classifier__min_samples_leaf': min_samples_leaf,
                'classifier__max_depth': max_depth
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)
print('Across 5-folds =',total_combi*5)

classifier__n_estimators [50, 75, 100, 125, 150]
classifier__min_samples_split [2, 3, 4]
classifier__min_samples_leaf [6, 7, 8, 9, 10]
classifier__max_depth [8, 9, 10, 11, 12]
-----------------
Total combinations: 375
Across 5-folds = 1875


In [7]:
# # running grid search 2
# smote_sampler = SMOTE(random_state=2021)
# rf_clf = RandomForestClassifier(max_features='sqrt', random_state=2021)

# scale_features = x_train.columns

# scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ])

# pipeline = Pipeline(steps = [['scaler', scaler],
#                              ['smote', smote_sampler],
#                              ['classifier', rf_clf]])

# stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

# rf_gridsearch = GridSearchCV(estimator = pipeline,
#                            param_grid = params_grid,
#                            scoring = 'recall',
#                            cv = stratified_kfold,
#                            refit = True,
#                            n_jobs = -1)

# rf_gridsearch.fit(x_train, y_train.values.ravel() )

# best_parameters = rf_gridsearch.best_params_
# print(best_parameters)

{'classifier__max_depth': 10, 'classifier__min_samples_leaf': 8, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 125}

## 4.3 Third GridSearchCV
This third parameter search space aims to narrow down the search space based on the results obtained in the first grid search.

**Previous parameter search spaces:**
1. GridSearch 1
- n_estimators = [100, 300, 500, 700, 900]
- max_features = ['sqrt', 'log2']
- min_samples_split = [3, 5, 8, 10, 13]
- min_samples_leaf = [1, 3, 5, 8, 10]
- max_depth = [5, 10, 15, 20]

2. GridSearch 2
- n_estimators = [50, 75, 100, 125, 150]
- min_samples_split = [2, 3, 4]
- min_samples_leaf = [6, 7, 8, 9, 10]
- max_depth = [8, 9, 10, 11, 12]

**Obtained results:**

| Feature           | GridSearch1 | GridSearch2 |      New search space     |
|-------------------|:-----------:|:-----------:|:-------------------------:|
| n_estimators      |     100     |     125     | [100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150] |
| min_samples_split |      3      |      2      |         [2,3]             |
| min_samples_leaf  |      8      |      8      |         Leave as 8        |
|    max_depth      |     10      |     10      |         Leave as 10       |



In [8]:
# Creating parameter grid to search
n_estimators = [100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150]
min_samples_split = [2, 3]

params_grid = {
                'classifier__n_estimators': n_estimators,
                'classifier__min_samples_split': min_samples_split
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)
print('Across 5-folds =', total_combi*5)

classifier__n_estimators [100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150]
classifier__min_samples_split [2, 3]
-----------------
Total combinations: 22
Across 5-folds = 110


In [9]:
# # running grid search 3
# smote_sampler = SMOTE(random_state=2021)
# rf_clf = RandomForestClassifier(min_samples_leaf=8, max_depth=10, max_features='sqrt', random_state=2021)

# scale_features = x_train.columns

# scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ])

# pipeline = Pipeline(steps = [['scaler', scaler],
#                              ['smote', smote_sampler],
#                              ['classifier', rf_clf]])

# stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

# rf_gridsearch = GridSearchCV(estimator = pipeline,
#                            param_grid = params_grid,
#                            scoring = 'recall',
#                            cv = stratified_kfold,
#                            refit = True,
#                            n_jobs = -1)

# rf_gridsearch.fit(x_train, y_train.values.ravel() )

# best_parameters = rf_gridsearch.best_params_
# print(best_parameters)

{'classifier__min_samples_split': 2, 'classifier__n_estimators': 105}

# 4. Evaluation on Test

In [None]:
smote_sampler = SMOTE(random_state=2021)

rf_clf = RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=8,
                                min_samples_split=2, n_estimators=105,
                                random_state=2021)

scale_features = x_train.columns

scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ])

pipeline = Pipeline(steps = [['scaler', scaler],
                             ['smote', smote_sampler],
                             ['classifier', rf_clf]])

pipeline.fit(x_train, y_train.values.ravel() )

y_pred = pipeline.predict(x_test)

recall = recall_score(y_test, y_pred)
fbeta2 = fbeta_score(y_test, y_pred, beta=2)
auc = roc_auc_score(y_test, y_pred)

final_results = [recall, fbeta2, auc]

print("-----------------------PERFORMANCE EVALUATION--------------------  ")
print(f"Recall: {recall} ")
print(f"Fbeta2: {fbeta2} ")
print(f"AUC Score: {auc} ")

In [None]:
p

In [None]:
cm = confusion_matrix(y_test, y_pred)
labels = ['Existing Customer', 'Attrited Customer']

diagram = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
diagram.plot(cmap=plt.cm.Blues)
plt.show()