# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### Model: Random Forest
---

# 1. Importing packages and libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

# 2. Reading file and tidying up columns

In [2]:
df = pd.read_csv("../Data/BankChurners.csv")
df.drop(columns=["CLIENTNUM", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"], axis=1, inplace=True)

# tidy up and standardise column naming
tidied_cols = []
for col_name in df.columns:
    col_name = col_name.lower()
    if '_ct' in col_name:
        col_name = col_name.replace('_ct', '_count')
    if '_chng' in col_name:
        col_name = col_name.replace('_chng', '_change')
    tidied_cols.append( col_name )

df.columns = tidied_cols.copy()

numerical = list(df.describe().columns)

categorical = [i for i in df.columns if i not in numerical and i != "attrition_flag"]

# 3. Create train test split

In [3]:
y = df[["attrition_flag"]]
x = df.drop("attrition_flag", axis=1)
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_age              10127 non-null  int64  
 1   gender                    10127 non-null  object 
 2   dependent_count           10127 non-null  int64  
 3   education_level           10127 non-null  object 
 4   marital_status            10127 non-null  object 
 5   income_category           10127 non-null  object 
 6   card_category             10127 non-null  object 
 7   months_on_book            10127 non-null  int64  
 8   total_relationship_count  10127 non-null  int64  
 9   months_inactive_12_mon    10127 non-null  int64  
 10  contacts_count_12_mon     10127 non-null  int64  
 11  credit_limit              10127 non-null  float64
 12  total_revolving_bal       10127 non-null  int64  
 13  avg_open_to_buy           10127 non-null  float64
 14  total_

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 2021)

# 4. Data Preprocessing

In [5]:
def impute_unknown(df_x):
    """
        Imputes unknown values to their mode
    """
    x = df_x.copy()

    marital_status_mode = x["marital_status"].mode()[0]
    x["marital_status"] = x["marital_status"].replace("Unknown", marital_status_mode)
    education_level_mode = x["education_level"].mode()[0]
    x["education_level"] = x["education_level"].replace("Unknown", education_level_mode)
    income_category_mode = x["income_category"].mode()[0]
    x["income_category"] = x["income_category"].replace("Unknown", income_category_mode)
    
    return x

In [6]:
def data_preprocessing(df_x, df_y):
    x = df_x.copy()
    y = df_y.copy()
    
    # Encoding features with binary categories
    label_enc = LabelEncoder()
    y["attrition_flag"] = label_enc.fit_transform(y["attrition_flag"])
    x["gender"] = label_enc.fit_transform(x["gender"])
    
    # Encoding features with multiple categories
    onehot_enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    values = onehot_enc.fit_transform(x[["marital_status"]])
    labels = np.array(["divorced", "married", "single"]).ravel()
    marital_status_df = pd.DataFrame(values, columns=labels)

    x.reset_index(drop=True, inplace=True)
    marital_status_df.reset_index(drop=True, inplace=True)
    x = pd.concat([x, marital_status_df], axis=1)
    x.drop("marital_status", axis=1, inplace=True)
    
    # Encoding Ordinal Features
    edu_level_mapper = {"Doctorate": 1, "Post-Graduate": 2, "Graduate": 3, "College": 4, "High School": 5, "Uneducated": 6}
    x["education_level"] = x["education_level"].replace(edu_level_mapper)

    income_cat_mapper = {"$120K +": 1, "$80K - $120K":2, "$60K - $80K":3, "$40K - $60K": 4, "Less than $40K": 5}
    x["income_category"] = x["income_category"].replace(income_cat_mapper)

    card_cat_mapper = {"Platinum":1, "Gold":2, "Silver":3, "Blue": 4}
    x["card_category"] = x["card_category"].replace(card_cat_mapper)
    
    # Feature Transformation — Scaling
    skewed = ["credit_limit", "total_amt_change_q4_q1", "total_trans_amt", "total_count_change_q4_q1"]

    for skewed_col in skewed:
#         x[skewed_col] = np.where(x[skewed_col] > 0 , np.log(x[skewed_col]), 0) # not using, leaving here for now
        x[skewed_col] = np.log(x[skewed_col].mask(x[skewed_col] <=0)).fillna(0)

    scaler = MinMaxScaler()
    x[numerical] = scaler.fit_transform(x[numerical]) 
    
    return x, y

### Execute preprocessing steps

In [7]:
# Data cleaning - imputing unknown
x_train = impute_unknown(x_train)
x_test = impute_unknown(x_test)

# Dropping correlated features
df = df.drop(columns=["customer_age", "avg_open_to_buy", "total_trans_count"])

# Feature engineering and feature scaling
x_train, y_train = data_preprocessing(x_train, y_train)
x_test, y_test = data_preprocessing(x_test, y_test)

# 5. Preprocessing Evaluation

In [8]:
def run_variation_model(x_train, y_train, x_test, y_test, variation):
    if "pca" in variation:
        pca = PCA(0.9, random_state=2021)
        x_train = pca.fit_transform(x_train)
        x_test = pca.fit_transform(x_test)
        
#     https://medium.com/analytics-vidhya/categorical-feature-selection-using-chi-squared-test-e4c0d0af6b7e
#     https://towardsdatascience.com/using-the-chi-squared-test-for-feature-selection-with-implementation-b15a4dad93f1
    if "chi_square" in variation:
        chi_scores = chi2(x_train, y_train)
        p_values = pd.Series(chi_scores[1], index = x_train.columns)
        p_values.sort_values(ascending = False , inplace = True)
#         p_values.plot.bar()
        x_train = x_train.drop(["card_category", "months_on_book", "divorced", "total_amt_change_q4_q1", "dependent_count", "single", "income_category", "married", "education_level", "credit_limit", "gender"], axis=1)
        x_test = x_test.drop(["card_category", "months_on_book", "divorced", "total_amt_change_q4_q1", "dependent_count", "single", "income_category", "married", "education_level", "credit_limit", "gender"], axis=1)
        
    if "smote" in variation:
        oversampler = SMOTE(random_state=2021)
        x_train, y_train = oversampler.fit_resample(x_train, y_train)
    
    # Instantiate RandomClassifier, fit and predict
    rf_clf = RandomForestClassifier(random_state=2021)
    
    rf_clf.fit(x_train, y_train.values.ravel() )
    y_pred = rf_clf.predict(x_test)
    
    print(f"-------------------------TEST SCORES for {variation}-----------------------")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
    print(f"AUC Score: {roc_auc_score(y_test, y_pred)}")
    print()

In [9]:
variations = ["base",
              "smote",
              "chi_square",
              "pca",
              "smote, chi_square",
              "smote, pca"]

for variation in variations:
    run_variation_model(x_train, y_train, x_test, y_test, variation)

-------------------------TEST SCORES for base-----------------------
Recall: 0.9876543209876543
Precision: 0.9621993127147767
F2-Score: 0.9824561403508772
Accuracy score: 0.9570582428430404
AUC Score: 0.8922886989553656

-------------------------TEST SCORES for smote-----------------------
Recall: 0.9717813051146384
Precision: 0.9677985948477752
F2-Score: 0.9709821428571428
Accuracy score: 0.9491609081934848
AUC Score: 0.9012752679419346

-------------------------TEST SCORES for chi_square-----------------------
Recall: 0.9829512051734274
Precision: 0.9720930232558139
F2-Score: 0.9807602064758331
Accuracy score: 0.9619940769990128
AUC Score: 0.91762944874056

-------------------------TEST SCORES for pca-----------------------
Recall: 0.9535567313345091
Precision: 0.8559366754617415
F2-Score: 0.9322910679388435
Accuracy score: 0.8262586377097729
AUC Score: 0.5567783656672546

-------------------------TEST SCORES for smote, chi_square-----------------------
Recall: 0.9653145208700764
Pre

# 6. Hyper paramter tuning with GridSearchCV

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import recall_score

In [11]:
# Creating parameter grid to search
n_estimators = [ n for n in range(100, 1500+1, 200) ]

max_depth = [ depth for depth in range(5, 30, 5) ]
max_depth.append( None )

max_features = list(range(1, x_train.shape[1], 4))

min_samples_leaf = [1, 2, 4]

params_grid = {
               'classifier__max_features': max_features,
               'classifier__min_samples_leaf': min_samples_leaf,
               'classifier__n_estimators': n_estimators,
               'classifier__max_depth': max_depth
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__max_features [1, 5, 9, 13, 17]
classifier__min_samples_leaf [1, 2, 4]
classifier__n_estimators [100, 300, 500, 700, 900, 1100, 1300, 1500]
classifier__max_depth [5, 10, 15, 20, 25, None]
-----------------
Total combinations: 720


In [12]:
# smote_sampler = SMOTE(random_state=2021)
# rf_clf = RandomForestClassifier(random_state=2021)

# pipeline = Pipeline(steps = [['smote', smote_sampler],
#                              ['classifier', rf_clf]])

# stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

# rf_gridsearch = RandomizedSearchCV(estimator = pipeline,
#                            param_distributions = params_grid,
#                            scoring = 'recall',
#                            cv = stratified_kfold,
#                            refit = True,
#                            n_jobs = -1,
#                            random_state = 2021)

# rf_gridsearch.fit(x_train, y_train)

# best_parameters = rf_gridsearch.best_params_
# print(best_parameters)