# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### Model: Random Forest
---

# 1. Importing packages and libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

# 2. Reading file and tidying up columns

In [2]:
# Load the pre-split training and test sets from the project's data folder.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# Separate features (x_*) from the target (y_*).
# The double brackets keep each target as a single-column DataFrame
# rather than a Series.
x_train = df_train.drop(columns="attrition_flag")
y_train = df_train[["attrition_flag"]]

x_test = df_test.drop(columns="attrition_flag")
y_test = df_test[["attrition_flag"]]

In [3]:
# def run_models(x, y, xt, yt, variation, model):
#     x_train = x.copy()
#     x_test = xt.copy()
       
#     if variation == 'correlation':
#         x_train.drop( ['avg_open_to_buy', 'customer_age', 'total_trans_count', 'months_on_book'], axis=1, inplace=True)
#         x_test.drop( ['avg_open_to_buy', 'customer_age', 'total_trans_count', 'months_on_book'], axis=1, inplace=True)
        
#     if variation == 'keep_yellow_only':
#         x_train.drop( ['total_trans_count', 'months_on_book','card_category','education_level',
#                'income_category', 'married', 'single','divorced', 'avg_open_to_buy', 'customer_age'], axis=1, inplace=True)
#         x_test.drop( ['total_trans_count', 'months_on_book','card_category','education_level',
#            'income_category', 'married', 'single', 'divorced', 'avg_open_to_buy', 'customer_age'], axis=1, inplace=True)
    
#     if variation == 'keep_yellow_blue':
#         x_train.drop( ['total_trans_count', 'months_on_book','income_category', 'married','single','divorced',
#                      'avg_open_to_buy', 'customer_age'], axis=1, inplace=True)
#         x_test.drop( ['total_trans_count', 'months_on_book','income_category', 'married','single','divorced',
#              'avg_open_to_buy', 'customer_age'], axis=1, inplace=True)  
    
#     # Oversample the training data with SMOTE, then fit the given model and predict on the test set
#     oversampler = SMOTE(random_state=2021)
#     x_train, y_train = oversampler.fit_resample(x_train, y)
    
#     model.fit(x_train, y_train.values.ravel() )
#     y_pred = model.predict(x_test)
    
#     print(f"-------------------------TEST SCORES for {variation}-----------------------")
#     print(f"Recall: {recall_score(y_test, y_pred)}")
#     print(f"Precision: {precision_score(y_test, y_pred)}")
#     print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
#     print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
#     print(f"AUC Score: {roc_auc_score(y_test, y_pred)}")
#     print()

In [4]:
# from sklearn.linear_model import LogisticRegression

# variations = ['base', 'correlation', 'keep_yellow_only', 'keep_yellow_blue']

# for variation in variations:
#     run_models(x_train, y_train, x_test, y_test, variation, LogisticRegression(solver='lbfgs', max_iter=500, random_state=2021))

In [5]:
# variations = ['base', 'correlation', 'keep_yellow_only', 'keep_yellow_blue']

# for variation in variations:
#     run_models(x_train, y_train, x_test, y_test, variation, RandomForestClassifier(random_state=2021))

# 3. Hyperparameter tuning with GridSearchCV

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from sklearn.metrics import confusion_matrix, recall_score, fbeta_score, roc_auc_score

In [7]:
# Candidate hyperparameter values explored by the grid search.
n_estimators = [100, 500, 900]

# Depths 2, 8, 14, 20, 26, plus None as an extra candidate.
max_depth = [*range(2, 32, 6), None]

# Every 4th feature count, from 1 up to the number of columns in x_train.
max_features = list(range(1, x_train.shape[1] + 1, 4))

# Keys use the '<step>__<parameter>' form so they can be routed to the
# pipeline step named 'classifier'.
params_grid = dict(
    classifier__max_features=max_features,
    classifier__n_estimators=n_estimators,
    classifier__max_depth=max_depth,
)

# Print each candidate list and work out the size of the full grid
# (the product of the list lengths).
total_combi = 1
for param in params_grid:
    value = params_grid[param]
    print(param, value)
    total_combi = total_combi * len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__max_features [1, 5, 9]
classifier__n_estimators [100, 500, 900]
classifier__max_depth [2, 8, 14, 20, 26, None]
-----------------
Total combinations: 54


In [None]:
# Building blocks of the pipeline; the shared seed keeps runs repeatable.
smote_sampler = SMOTE(random_state=2021)
rf_clf = RandomForestClassifier(random_state=2021)

# Min-max scale every column except the first one.
# NOTE(review): `remainder='drop'` is ColumnTransformer's default and is only
# spelled out here. It means the first column of x_train is discarded and
# never reaches SMOTE or the classifier. Confirm that this is intended (e.g.
# an ID/index column); if that column is a real feature, use
# remainder='passthrough' instead. Also note that the max_features grid is
# derived from x_train.shape[1], which still counts the dropped column.
scale_features = x_train.columns[1:]
scaler = ColumnTransformer(
    transformers=[('scaler', MinMaxScaler(), scale_features)],
    remainder='drop',
)

# imblearn's Pipeline is used (not sklearn's) because it accepts a sampler
# such as SMOTE as an intermediate step.
pipeline = Pipeline(
    steps=[
        ('scaler', scaler),
        ('smote', smote_sampler),
        ('classifier', rf_clf),
    ]
)

# 5-fold stratified, shuffled cross-validation.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)

# Exhaustive search over params_grid, ranking candidates by recall and
# refitting the best one at the end.
rf_gridsearch = GridSearchCV(
    estimator=pipeline,
    param_grid=params_grid,
    scoring='recall',
    cv=stratified_kfold,
    refit=True,
    n_jobs=-1,
)

# y_train is a single-column DataFrame; flatten it to a 1-D array for fitting.
train_labels = y_train.values.ravel()
rf_gridsearch.fit(x_train, train_labels)

best_parameters = rf_gridsearch.best_params_
print(best_parameters)

In [None]:
# Evaluate the best pipeline found by the grid search on the held-out test set.
# Note: from here on `rf_clf` is the whole fitted pipeline returned by
# GridSearchCV (scaler + SMOTE + classifier), not a bare RandomForestClassifier.
rf_clf = rf_gridsearch.best_estimator_

# Hard class labels for the threshold-based metrics (recall, F2).
y_pred = rf_clf.predict(x_test)

# ROC AUC must be computed from continuous scores, not thresholded labels:
# feeding it y_pred collapses the ROC curve to a single operating point.
# Column 1 of predict_proba corresponds to classes_[1], the greater label,
# which is the class roc_auc_score treats as positive for binary targets.
y_score = rf_clf.predict_proba(x_test)[:, 1]

print("-------------------------TEST SCORES-----------------------")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
print(f"AUC Score: {roc_auc_score(y_test, y_score)}")
print()