# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### RandomForests
---

# 1. Setting up the notebook

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set_theme(style = "darkgrid")

### Import dataset

In [2]:
data = pd.read_csv("../dataset/loanprediction.csv", index_col = 0)
data.head()

Unnamed: 0_level_0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [3]:
data.shape

(252000, 12)

### Renaming columns

In [4]:
data.columns = ['income', 'age', 'experience', 'marital_status', 'house_ownership',
       'car_ownership', 'profession', 'city', 'state', 'current_job_yrs',
       'current_house_yrs', 'risk_flag']

numerical = ["income", "age", "experience", "current_job_yrs", "current_house_yrs"]
categorical = ["marital_status", "house_ownership", "car_ownership", "profession", "city", "state"] 

# 2. Feature Engineering

## Checking cardinality of categorical features before encoding

In [5]:
for column in categorical:
    unique_categories = data[column].nunique()
    print( column, ":" + str(unique_categories) )

marital_status :2
house_ownership :3
car_ownership :2
profession :51
city :317
state :29


The following encoding will be done to the categorical features:

- marital_status, car_ownership – binarise
- house_ownership – one-hot
- profession, city, state – count encoding

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

In [7]:
label_encoder = LabelEncoder()

for col in ['marital_status','car_ownership']:
    data[col] = label_encoder.fit_transform( data[col] )

In [8]:
onehot_encoder = OneHotEncoder(sparse = False)
data['house_ownership'] = onehot_encoder.fit_transform(data['house_ownership'].values.reshape(-1, 1) )

In [9]:
high_card_features = ['profession', 'city', 'state']

count_encoder = ce.CountEncoder()

# Transform the features, rename the columns with the _count suffix, and join to dataframe
count_encoded = count_encoder.fit_transform( data[high_card_features] )
data = data.join(count_encoded.add_suffix("_count"))

In [10]:
# remove old columns
data.drop(labels=['profession', 'city', 'state'], axis=1, inplace=True)
data.head()

Unnamed: 0_level_0,income,age,experience,marital_status,house_ownership,car_ownership,current_job_yrs,current_house_yrs,risk_flag,profession_count,city_count,state_count
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1303834,23,3,1,0.0,0,3,13,0,5217,798,14122
2,7574516,40,10,1,0.0,0,9,13,0,5053,849,25562
3,3991815,66,4,0,0.0,0,4,10,0,5195,688,5805
4,6256451,41,2,1,0.0,1,2,12,1,5053,607,4658
5,5768871,47,11,1,0.0,0,3,14,1,4413,809,16537


## Splitting the data
### Split the dataset into train and test splits

In [11]:
x = data.drop("risk_flag", axis=1)
y = data["risk_flag"]

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 420)

# 3. Cross Validation

### Prepare Pipeline: Smote and RandomForestClassifier

In [13]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import StratifiedKFold
# from imblearn.over_sampling import SMOTE
# from imblearn.pipeline import Pipeline

# smote_sampler = SMOTE(random_state=420)

# pipeline = Pipeline(steps = [['smote', smote_sampler],
#                              ['classifier', rf_clf]])

In [14]:
# Bootstrapping
# rf_bootstrap = [True, False]

# # Split criterion
# rf_criterion = ['gini', 'entropy']

# params_grid = {
#                'classifier__criterion': rf_criterion,
#                'classifier__bootstrap': rf_bootstrap
#               }

### GridSearch Cross Validation

In [15]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import recall_score

# stratified_kfold = StratifiedKFold(shuffle=True, n_splits=10, random_state=420)

# rf_gridsearch = GridSearchCV(estimator = pipeline,
#                            param_grid = params_grid,
#                            scoring = 'recall',
#                            cv = stratified_kfold,
#                            n_jobs = -1 )

# rf_gridsearch.fit(x_train, y_train)

# best_parameters = rf_gridsearch.best_params_
# print(best_parameters)

In [16]:
# best_parameters = rf_gridsearch.best_params_
# print(best_parameters)

The best parameters from the GridSearch was <br>
`{'classifier__bootstrap': True, 'classifier__criterion': 'gini'}`

### Re-train with best parameters

In [17]:
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import make_scorer
# from sklearn.metrics import fbeta_score, recall_score

# rf_clf = RandomForestClassifier(criterion='gini', bootstrap=True, random_state=420)

# smote_sampler = SMOTE(random_state=420)

# pipeline = Pipeline(steps = [['smote', smote_sampler],
#                              ['classifier', rf_clf]])

# stratified_kfold = StratifiedKFold(shuffle=True, n_splits=10, random_state=420)

# cross_val = cross_val_score(estimator=pipeline,
#                             X = x_train,
#                             y = y_train,
#                             scoring='recall',
#                             cv=stratified_kfold,
#                             n_jobs=-1)

### Average recall score across 10 folds

In [18]:
# print( cross_val.sum() / 10 )

In [19]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

rf_clf = RandomForestClassifier(criterion='gini', bootstrap=True, random_state=420)

smote_sampler = SMOTE(random_state=420)

pipeline = Pipeline(steps = [['smote', smote_sampler],
                             ['classifier', rf_clf]])

pipeline.fit(x_train, y_train)

y_pred = pipeline.predict(x_test)

In [20]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

print("-------------------------TEST SCORES-----------------------") 
print(f"Recall: { round(recall_score(y_test, y_pred)*100, 2) }")
print(f"Precision: { round(precision_score(y_test, y_pred)*100, 2) }")
print(f"F1-Score: { round(f1_score(y_test, y_pred)*100, 4) }")
print(f"Accuracy score: { round(accuracy_score(y_test, y_pred)*100, 2) }")
print(f"AUC Score: { round(roc_auc_score(y_test, y_pred)*100, 2) }")

-------------------------TEST SCORES-----------------------
Recall: 81.64
Precision: 49.71
F1-Score: 61.7911
Accuracy score: 87.58
AUC Score: 85.03
