# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### Model: Random Forest
---

# 1. Importing packages and libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

# 2. Reading file and tidying up columns

In [17]:
# Load the pre-split training and test sets from disk.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# Separate each frame into features (every column except the label) and the
# label itself, which is kept as a single-column DataFrame.
x_train, y_train = df_train.drop(columns="attrition_flag"), df_train[["attrition_flag"]]
x_test, y_test = df_test.drop(columns="attrition_flag"), df_test[["attrition_flag"]]

In [29]:
# Show the feature column names left after removing the target column.
x_train.columns

Index(['customer_age', 'gender', 'dependent_count', 'education_level',
       'income_category', 'card_category', 'months_on_book',
       'total_relationship_count', 'months_inactive_12_mon',
       'contacts_count_12_mon', 'credit_limit', 'total_revolving_bal',
       'avg_open_to_buy', 'total_amt_change_q4_q1', 'total_trans_amt',
       'total_trans_count', 'total_count_change_q4_q1',
       'avg_utilization_ratio', 'divorced', 'married', 'single'],
      dtype='object')

In [50]:
def run_models(x, y, xt, yt, variation, model):
    """Fit ``model`` on a SMOTE-oversampled training set and print its test metrics.

    Parameters
    ----------
    x : pandas.DataFrame
        Training features. Not modified; columns are dropped on a new frame.
    y : pandas.DataFrame or pandas.Series
        Training labels passed to SMOTE together with ``x``.
    xt : pandas.DataFrame
        Test features. Not modified; the same columns are dropped as for ``x``.
    yt : array-like
        True test labels that the predictions are scored against.
    variation : str
        Feature-selection variant. ``'correlation'``, ``'keep_yellow_only'`` and
        ``'keep_yellow_blue'`` drop the columns listed in the mapping below; any
        other value (e.g. ``'base'``) keeps every column.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.

    Returns
    -------
    None
        Recall, precision, F2, accuracy and AUC are printed to stdout.
    """
    # One drop list per variation, applied to both train and test so the two
    # feature sets can never drift apart.
    # NOTE(review): 'yellow'/'blue' presumably refer to colour-coded feature
    # groups from an analysis not shown here -- confirm.
    dropped_columns = {
        'correlation': ['avg_open_to_buy', 'customer_age', 'total_trans_count', 'months_on_book'],
        'keep_yellow_only': ['total_trans_count', 'months_on_book', 'card_category', 'education_level',
                             'income_category', 'married', 'single', 'divorced',
                             'avg_open_to_buy', 'customer_age'],
        'keep_yellow_blue': ['total_trans_count', 'months_on_book', 'income_category', 'married',
                             'single', 'divorced', 'avg_open_to_buy', 'customer_age'],
    }
    to_drop = dropped_columns.get(variation, [])

    # drop() returns a new frame, so the caller's x / xt are left untouched.
    x_train = x.drop(columns=to_drop)
    x_test = xt.drop(columns=to_drop)

    # Oversample the training split only; the test split keeps its original
    # class balance.
    oversampler = SMOTE(random_state=2021)
    x_resampled, y_resampled = oversampler.fit_resample(x_train, y)

    # Fit the supplied model on the balanced data and predict the test set.
    model.fit(x_resampled, y_resampled.values.ravel())
    y_pred = model.predict(x_test)

    # Score against the labels passed in as `yt`, not a notebook-level global.
    print(f"-------------------------TEST SCORES for {variation}-----------------------")
    print(f"Recall: {recall_score(yt, y_pred)}")
    print(f"Precision: {precision_score(yt, y_pred)}")
    print(f"F2-Score: {fbeta_score(yt, y_pred, beta=2)}")
    print(f"Accuracy score: {accuracy_score(yt, y_pred)}")
    # NOTE(review): AUC is computed from hard class predictions rather than
    # probabilities/decision scores, so it reflects a single operating point.
    print(f"AUC Score: {roc_auc_score(yt, y_pred)}")
    print()

In [52]:
from sklearn.naive_bayes import BernoulliNB

# Evaluate a Bernoulli Naive Bayes model on every feature-selection variation.
variations = [
    'base',
    'correlation',
    'keep_yellow_only',
    'keep_yellow_blue',
]

for variation in variations:
    # A fresh, unfitted model is created for each variation.
    run_models(
        x=x_train,
        y=y_train,
        xt=x_test,
        yt=y_test,
        variation=variation,
        model=BernoulliNB(),
    )

-------------------------TEST SCORES for base-----------------------
Recall: 0.5415384615384615
Precision: 0.3651452282157676
F2-Score: 0.49382716049382713
Accuracy score: 0.7754195459032577
AUC Score: 0.6808221408221408

-------------------------TEST SCORES for correlation-----------------------
Recall: 0.5353846153846153
Precision: 0.359504132231405
F2-Score: 0.48766816143497754
Accuracy score: 0.7724580454096742
AUC Score: 0.676569438791661

-------------------------TEST SCORES for keep_yellow_only-----------------------
Recall: 0.5384615384615384
Precision: 0.36082474226804123
F2-Score: 0.49019607843137253
Accuracy score: 0.7729516288252715
AUC Score: 0.6781079003301225

-------------------------TEST SCORES for keep_yellow_blue-----------------------
Recall: 0.5384615384615384
Precision: 0.36082474226804123
F2-Score: 0.49019607843137253
Accuracy score: 0.7729516288252715
AUC Score: 0.6781079003301225



In [53]:
from sklearn.linear_model import LogisticRegression

# Evaluate logistic regression on every feature-selection variation.
variations = [
    'base',
    'correlation',
    'keep_yellow_only',
    'keep_yellow_blue',
]

for variation in variations:
    # A fresh, unfitted model is created for each variation.
    run_models(
        x=x_train,
        y=y_train,
        xt=x_test,
        yt=y_test,
        variation=variation,
        model=LogisticRegression(solver='lbfgs', max_iter=500, random_state=2021),
    )

-------------------------TEST SCORES for base-----------------------
Recall: 0.8092307692307692
Precision: 0.5560253699788583
F2-Score: 0.7416807670614777
Accuracy score: 0.8657453109575518
AUC Score: 0.8428869895536563

-------------------------TEST SCORES for correlation-----------------------
Recall: 0.7015384615384616
Precision: 0.48
F2-Score: 0.6422535211267606
Accuracy score: 0.8302073050345509
AUC Score: 0.7781648803871026

-------------------------TEST SCORES for keep_yellow_only-----------------------
Recall: 0.676923076923077
Precision: 0.4592901878914405
F2-Score: 0.6183249016301293
Accuracy score: 0.8203356367226061
AUC Score: 0.7623298512187401

-------------------------TEST SCORES for keep_yellow_blue-----------------------
Recall: 0.6861538461538461
Precision: 0.4714587737843552
F2-Score: 0.62887760857304
Accuracy score: 0.8262586377097729
AUC Score: 0.7695907384796273



In [48]:
# Evaluate a default random forest on every feature-selection variation.
variations = [
    'base',
    'correlation',
    'keep_yellow_only',
    'keep_yellow_blue',
]

for variation in variations:
    # A fresh, unfitted model is created for each variation.
    run_models(
        x=x_train,
        y=y_train,
        xt=x_test,
        yt=y_test,
        variation=variation,
        model=RandomForestClassifier(random_state=2021),
    )


-------------------------TEST SCORES for base-----------------------
Recall: 0.8215384615384616
Precision: 0.8612903225806452
F2-Score: 0.829192546583851
Accuracy score: 0.9501480750246791
AUC Score: 0.8981296070184959

-------------------------TEST SCORES for correlation-----------------------
Recall: 0.7692307692307693
Precision: 0.8090614886731392
F2-Score: 0.7768800497203232
Accuracy score: 0.9338598223099703
AUC Score: 0.8672726450504228

-------------------------TEST SCORES for keep_yellow_only-----------------------
Recall: 0.7815384615384615
Precision: 0.8220064724919094
F2-Score: 0.7893101305158485
Accuracy score: 0.9378084896347483
AUC Score: 0.8746022701578257

-------------------------TEST SCORES for keep_yellow_blue-----------------------
Recall: 0.7723076923076924
Precision: 0.8229508196721311
F2-Score: 0.7819314641744549
Accuracy score: 0.9368213228035538
AUC Score: 0.8702808302808304



# 3. Hyperparameter tuning with GridSearchCV

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import recall_score

In [11]:
# # Creating parameter grid to search
# n_estimators = [ n for n in range(100, 1500+1, 200) ]

# max_depth = [ depth for depth in range(5, 30, 5) ]
# max_depth.append( None )

# max_features = list(range(1, x_train.shape[1], 4))

# min_samples_leaf = [1, 2, 4]

# params_grid = {
#                'classifier__max_features': max_features,
#                'classifier__min_samples_leaf': min_samples_leaf,
#                'classifier__n_estimators': n_estimators,
#                'classifier__max_depth': max_depth
#               }

# total_combi = 1
# for param, value in params_grid.items():
#     print(param, value)
#     total_combi *= len(value)

# print('-----------------')
# print('Total combinations:', total_combi)

classifier__max_features [1, 5, 9, 13, 17]
classifier__min_samples_leaf [1, 2, 4]
classifier__n_estimators [100, 300, 500, 700, 900, 1100, 1300, 1500]
classifier__max_depth [5, 10, 15, 20, 25, None]
-----------------
Total combinations: 720


In [12]:
# smote_sampler = SMOTE(random_state=2021)
# rf_clf = RandomForestClassifier(random_state=2021)

# pipeline = Pipeline(steps = [['smote', smote_sampler],
#                              ['classifier', rf_clf]])

# stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

# rf_gridsearch = RandomizedSearchCV(estimator = pipeline,
#                            param_distributions = params_grid,
#                            scoring = 'recall',
#                            cv = stratified_kfold,
#                            refit = True,
#                            n_jobs = -1,
#                            random_state = 2021)

# rf_gridsearch.fit(x_train, y_train)

# best_parameters = rf_gridsearch.best_params_
# print(best_parameters)