# IS424: Data Mining & Biz Analytics
## Team: G3T3
### Project: Predicting Loan Default based on Customer Profile
### Grid Search and Cross-validation: DecisionTreeClassifier
---

# 1. Setting up the notebook

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from category_encoders import TargetEncoder

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score

In [2]:
df_train = pd.read_csv("../dataset/train.csv") # local
df_test = pd.read_csv("../dataset/test.csv") # local

# df_train = pd.read_csv("../input/is424loanpredictionpreprocessed/train.csv") # kaggle
# df_test = pd.read_csv("../input/is424loanpredictionpreprocessed/test.csv") # kaggle

y_train = df_train[["risk_flag"]]
x_train = df_train.drop("risk_flag", axis=1)

y_test = df_test[["risk_flag"]]
x_test = df_test.drop("risk_flag", axis=1)

In [3]:
x_train.head()

Unnamed: 0,income,age,experience,marital_status,car_ownership,profession,city,state,rented
0,3278154,56,20,1,0,Technology_specialist,Gorakhpur,Uttar_Pradesh,1.0
1,1315700,53,4,1,0,Civil_engineer,Thoothukudi,Tamil_Nadu,1.0
2,5021070,47,20,1,0,Magistrate,Khammam,Telangana,1.0
3,2824175,54,0,1,0,Chef,Serampore,West_Bengal,1.0
4,8790456,41,2,1,0,Designer,Tiruppur,Tamil_Nadu,1.0


# 2. Performance of Base Random Forest Classifier

In [4]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.compose import ColumnTransformer

def cv_evaluate_model(rf_clf):
    smote_sampler = SMOTE(random_state=2021)
    te_features = ['profession', 'city','state']
    scale_features = ['income','age','experience']

    preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                                    ('scale_features', MinMaxScaler(), scale_features)])

    pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                                 ['smote', SMOTE(random_state=2021)],
                                 ['classifier', rf_clf]
                                ])
    
    stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

    scoring = {"recall": 'recall',
               "fbeta_2": make_scorer(fbeta_score, beta=2),
               "roc_auc": make_scorer(roc_auc_score),
              }

    scores = cross_validate(pipeline, x_train, y_train.values.ravel(), cv=stratified_kfold,
                           scoring = scoring)

    recall = [ val for val in scores['test_recall'] ]
    fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
    auc = [ val for val in scores['test_roc_auc'] ]
    recall.append( sum(recall) / len(recall) )
    fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
    auc.append( sum(auc) / len(auc) )

    score_df = pd.DataFrame(data=[recall, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                            index=['Recall','Fbeta2','AUC'])
    return score_df

In [5]:
base_rf_clf = DecisionTreeClassifier(random_state=2021)
results_base = cv_evaluate_model(base_rf_clf)
display(results_base)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Recall,0.842105,0.846359,0.838858,0.842441
Fbeta2,0.731677,0.734642,0.731034,0.732451
AUC,0.857067,0.859082,0.856409,0.857519


# 3. Hyper parameter tuning with GridSearch

## 3.1 Conducting Grid Search 1

In [9]:
# Creating parameter grid to search
criterion = ["gini","entropy"]

max_depth = [None, 5, 10, 15]

min_samples_split = [2, 10, 20, 30, 40]

min_samples_leaf = [1, 10, 20]

params_grid = {
                'classifier__criterion': criterion,
                'classifier__min_samples_split': min_samples_split,
                'classifier__min_samples_leaf': min_samples_leaf,
                'classifier__max_depth': max_depth
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__criterion ['gini', 'entropy']
classifier__min_samples_split [2, 10, 20, 30, 40]
classifier__min_samples_leaf [1, 10, 20]
classifier__max_depth [None, 5, 10, 15]
-----------------
Total combinations: 120


In [10]:
# conduct gridsearch 1
te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                                ('scale_features', MinMaxScaler(), scale_features)])

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', DecisionTreeClassifier(random_state=2021)]
                            ])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)


rf_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = 'recall',
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

rf_gridsearch.fit( x_train, y_train.values.ravel() )
best_parameters = rf_gridsearch.best_params_
print(best_parameters)

{'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 30}


{'classifier__max_depth': 15, 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 100}

In [15]:
# classifier trained on the latest hyper parameters
rf_clf2 = DecisionTreeClassifier(criterion = "entropy", max_depth = None, min_samples_leaf = 10, min_samples_split=30,random_state=2021)

results_gs1 = cv_evaluate_model(rf_clf2)
display(results_gs1)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Recall,0.845856,0.847931,0.848657,0.847481
Fbeta2,0.730375,0.732469,0.734002,0.732282
AUC,0.856686,0.858027,0.858942,0.857885


## 3.2 Second GridSearchCV
This second parameter search space aims to narrow down the search space based on the results obtained in the first grid search.

**Previous parameter search space:**
- classifier__criterion ['gini', 'entropy']
- classifier__min_samples_split [2, 10, 20, 30, 40]
- classifier__min_samples_leaf [1, 10, 20]
- classifier__max_depth [None, 5, 10, 15]

**Obtained results:**

| Feature           | Value | New search space |
|-------------------|:-----:|------------|
|classifier__criterion|'entropy'| ['gini', 'entropy']
|classifier__min_samples_split| 30 |[2, 10, 20, 30, 40]
|classifier__min_samples_leaf| 10 | [1, 10, 20]
|classifier__max_depth| None | [None, 5, 10, 15]

In [12]:
# Creating parameter grid to search
criterion = ["gini","entropy"]

max_depth = [None, 20,25,30]

min_samples_split = [25 , 27, 30, 32, 35]

min_samples_leaf = [5,7,10,12,15]

params_grid = {
                'classifier__criterion': criterion,
                'classifier__min_samples_split': min_samples_split,
                'classifier__min_samples_leaf': min_samples_leaf,
                'classifier__max_depth': max_depth
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__criterion ['gini', 'entropy']
classifier__min_samples_split [25, 27, 30, 32, 35]
classifier__min_samples_leaf [5, 7, 10, 12, 15]
classifier__max_depth [None, 20, 25, 30]
-----------------
Total combinations: 200


In [13]:
# conduct gridsearch 2
te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                                ('scale_features', MinMaxScaler(), scale_features)])

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', DecisionTreeClassifier(random_state=2021)]
                            ])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)


rf_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = 'recall',
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

rf_gridsearch.fit( x_train, y_train.values.ravel() )
best_parameters = rf_gridsearch.best_params_
print(best_parameters)

{'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 7, 'classifier__min_samples_split': 25}


In [16]:
# classifier trained on the latest hyper parameters
rf_clf2 = DecisionTreeClassifier(criterion = "entropy", max_depth = None, min_samples_leaf = 7, min_samples_split=25, random_state=2021)

results_gs1 = cv_evaluate_model(rf_clf2)
display(results_gs1)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Recall,0.848518,0.851198,0.850956,0.850224
Fbeta2,0.732138,0.73313,0.735266,0.733512
AUC,0.857906,0.858693,0.859854,0.858818
