# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### Model: Random Forest
---

# 1. Importing packages and libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

# 2. Reading file and tidying up columns

In [2]:
df = pd.read_csv("../Data/BankChurners.csv")
df.drop(columns=["CLIENTNUM", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"], axis=1, inplace=True)
df = df.drop(columns=["Customer_Age", "Avg_Open_To_Buy", "Total_Trans_Ct"])
df.columns = ['attrition_flag', 'gender', 'dependent_count', 'education_level', 
              'marital_status', 'income_category', 'card_category', 'months_on_book', 
              'total_relationship_count', 'months_inactive_12_month', 'contacts_count_12_month', 
              'credit_limit', 'total_revolving_bal', 'total_amt_change_q4_q1', 
              'total_trans_amt','total_count_change_q4_q1', 'avg_utilization_ratio']

numerical = ['dependent_count', 'months_on_book', 
             'total_relationship_count', 'months_inactive_12_month',
             'contacts_count_12_month', 'credit_limit', 'total_revolving_bal',
             'total_amt_change_q4_q1', 'total_trans_amt',
             'total_count_change_q4_q1', 'avg_utilization_ratio']

# 3. Create train test split

In [3]:
y = df[["attrition_flag"]]
x = df.drop("attrition_flag", axis=1)
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   gender                    10127 non-null  object 
 1   dependent_count           10127 non-null  int64  
 2   education_level           10127 non-null  object 
 3   marital_status            10127 non-null  object 
 4   income_category           10127 non-null  object 
 5   card_category             10127 non-null  object 
 6   months_on_book            10127 non-null  int64  
 7   total_relationship_count  10127 non-null  int64  
 8   months_inactive_12_month  10127 non-null  int64  
 9   contacts_count_12_month   10127 non-null  int64  
 10  credit_limit              10127 non-null  float64
 11  total_revolving_bal       10127 non-null  int64  
 12  total_amt_change_q4_q1    10127 non-null  float64
 13  total_trans_amt           10127 non-null  int64  
 14  total_

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 2021)

# 4. Data Preprocessing

In [5]:
def impute_unknown(df_x):
    """
        Imputes unknown values to their mode
    """
    x = df_x.copy()

    marital_status_mode = x["marital_status"].mode()[0]
    x["marital_status"] = x["marital_status"].replace("Unknown", marital_status_mode)
    education_level_mode = x["education_level"].mode()[0]
    x["education_level"] = x["education_level"].replace("Unknown", education_level_mode)
    income_category_mode = x["income_category"].mode()[0]
    x["income_category"] = x["income_category"].replace("Unknown", income_category_mode)
    
    return x
    
x_train = impute_unknown(x_train)
x_test = impute_unknown(x_test)

In [6]:
def data_preprocessing(df_x, df_y):
    x = df_x.copy()
    y = df_y.copy()
    
    # Encoding features with binary categories
    label_enc = LabelEncoder()
    y["attrition_flag"] = label_enc.fit_transform(y["attrition_flag"])
    x["gender"] = label_enc.fit_transform(x["gender"])
    
    # Encoding features with multiple categories
    onehot_enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    values = onehot_enc.fit_transform(x[["marital_status"]])
    labels = np.array(["divorced", "married", "single"]).ravel()
    marital_status_df = pd.DataFrame(values, columns=labels)

    x.reset_index(drop=True, inplace=True)
    marital_status_df.reset_index(drop=True, inplace=True)
    x = pd.concat([x, marital_status_df], axis=1)
    x.drop("marital_status", axis=1, inplace=True)
    
    # Encoding Ordinal Features
    edu_level_mapper = {"Doctorate": 1, "Post-Graduate": 2, "Graduate": 3, "College": 4, "High School": 5, "Uneducated": 6}
    x["education_level"] = x["education_level"].replace(edu_level_mapper)

    income_cat_mapper = {"$120K +": 1, "$80K - $120K":2, "$60K - $80K":3, "$40K - $60K": 4, "Less than $40K": 5}
    x["income_category"] = x["income_category"].replace(income_cat_mapper)

    card_cat_mapper = {"Platinum":1, "Gold":2, "Silver":3, "Blue": 4}
    x["card_category"] = x["card_category"].replace(card_cat_mapper)
    
    # Feature Transformation — Scaling
    skewed = ["credit_limit", "total_amt_change_q4_q1", "total_trans_amt", "total_count_change_q4_q1"]

    for skewed_col in skewed:
#         x[skewed_col] = np.where(x[skewed_col] > 0 , np.log(x[skewed_col]), 0) # not using, leaving here for now
        x[skewed_col] = np.log(x[skewed_col].mask(x[skewed_col] <=0)).fillna(0)

    scaler = MinMaxScaler()
    x[numerical] = scaler.fit_transform(x[numerical]) 
    
    return x, y

In [7]:
# Execute preprocessing steps
x_train, y_train = data_preprocessing(x_train, y_train)
x_test, y_test = data_preprocessing(x_test, y_test)

# 5. Running Model

In [20]:
def run_variation_model(x_train, y_train, x_test, y_test, variation):
    if "pca" in variation:
        pca = PCA(0.9, random_state=2021)
        x_train = pca.fit_transform(x_train)
        x_test = pca.fit_transform(x_test)
        
#     https://medium.com/analytics-vidhya/categorical-feature-selection-using-chi-squared-test-e4c0d0af6b7e
#     https://towardsdatascience.com/using-the-chi-squared-test-for-feature-selection-with-implementation-b15a4dad93f1
    if "chi_square" in variation:
        selector = SelectKBest(score_func=chi2, k=8)
        fit = selector.fit(x_train, y_train)
        kbest_cols = x_train.columns[ selector.get_support() ]
        drop_cols = [ col for col in x_train.columns if col not in kbest_cols]
        print(drop_cols)
        
    if "smote" in variation:
        oversampler = SMOTE(random_state=2021)
        x_train, y_train = oversampler.fit_resample(x_train, y_train)
    
    # Instantiate RandomClassifier, fit and predict
    rf_clf = RandomForestClassifier(random_state=2021)
    
    rf_clf.fit(x_train, y_train.values.ravel() )
    y_pred = rf_clf.predict(x_test)
    
    print(f"-------------------------TEST SCORES for {variation}-----------------------")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
    print(f"AUC Score: {roc_auc_score(y_test, y_pred)}")
    print()

In [21]:
variations = ["base",
              "smote",
              "chi_square",
              "pca",
              "smote, chi_square",
              "smote, pca"]

for variation in variations:
    run_variation_model(x_train, y_train, x_test, y_test, variation)

-------------------------TEST SCORES for base-----------------------
Recall: 0.9870664315108759
Precision: 0.9491237987563595
F2-Score: 0.9792371398576928
Accuracy score: 0.9447186574531096
AUC Score: 0.8550716772938995

-------------------------TEST SCORES for smote-----------------------
Recall: 0.9653145208700764
Precision: 0.9546511627906977
F2-Score: 0.9631628343500703
Accuracy score: 0.9323790720631787
AUC Score: 0.8626572604350382

['dependent_count', 'education_level', 'income_category', 'card_category', 'months_on_book', 'credit_limit', 'total_amt_change_q4_q1', 'divorced', 'married', 'single']
-------------------------TEST SCORES for chi_square-----------------------
Recall: 0.9870664315108759
Precision: 0.9491237987563595
F2-Score: 0.9792371398576928
Accuracy score: 0.9447186574531096
AUC Score: 0.8550716772938995

-------------------------TEST SCORES for pca-----------------------
Recall: 0.974132863021752
Precision: 0.8441161487519103
F2-Score: 0.9450211018592449
Accuracy 