# IS424: Data Mining & Biz Analytics
## Team: G3T3
### Project: Predicting Loan Default based on Customer Profile
### Grid Search and Cross-validation: DecisionTreeClassifier
---

# 1. Setting up the notebook

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from category_encoders import TargetEncoder

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score

In [2]:
df_train = pd.read_csv("../dataset/train.csv") # local
df_test = pd.read_csv("../dataset/test.csv") # local

# df_train = pd.read_csv("../input/is424loanpredictionpreprocessed/train.csv") # kaggle
# df_test = pd.read_csv("../input/is424loanpredictionpreprocessed/test.csv") # kaggle

y_train = df_train[["risk_flag"]]
x_train = df_train.drop("risk_flag", axis=1)

y_test = df_test[["risk_flag"]]
x_test = df_test.drop("risk_flag", axis=1)

In [3]:
x_train.head()

Unnamed: 0,income,age,experience,marital_status,car_ownership,profession,city,state,rented
0,3278154,56,20,1,0,Technology_specialist,Gorakhpur,Uttar_Pradesh,1.0
1,1315700,53,4,1,0,Civil_engineer,Thoothukudi,Tamil_Nadu,1.0
2,5021070,47,20,1,0,Magistrate,Khammam,Telangana,1.0
3,2824175,54,0,1,0,Chef,Serampore,West_Bengal,1.0
4,8790456,41,2,1,0,Designer,Tiruppur,Tamil_Nadu,1.0


# 2. Performance of Base Random Forest Classifier

In [4]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.compose import ColumnTransformer

def cv_evaluate_model(rf_clf):
    smote_sampler = SMOTE(random_state=2021)
    te_features = ['profession', 'city','state']
    scale_features = ['income','age','experience']

    preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                                    ('scale_features', MinMaxScaler(), scale_features)])

    pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                                 ['smote', SMOTE(random_state=2021)],
                                 ['classifier', rf_clf]
                                ])
    
    stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

    scoring = {"recall": 'recall',
               "fbeta_2": make_scorer(fbeta_score, beta=2),
               "roc_auc": make_scorer(roc_auc_score),
              }

    scores = cross_validate(pipeline, x_train, y_train.values.ravel(), cv=stratified_kfold,
                           scoring = scoring)

    recall = [ val for val in scores['test_recall'] ]
    fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
    auc = [ val for val in scores['test_roc_auc'] ]
    recall.append( sum(recall) / len(recall) )
    fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
    auc.append( sum(auc) / len(auc) )

    score_df = pd.DataFrame(data=[recall, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                            index=['Recall','Fbeta2','AUC'])
    return score_df

In [5]:
base_rf_clf = DecisionTreeClassifier(random_state=2021)
results_base = cv_evaluate_model(base_rf_clf)
display(results_base)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Recall,0.842105,0.846359,0.838858,0.842441
Fbeta2,0.731677,0.734642,0.731034,0.732451
AUC,0.857067,0.859082,0.856409,0.857519


# 3. Hyper parameter tuning with GridSearch

## 3.1 Conducting Grid Search 1

In [10]:
# Creating parameter grid to search
# max_depth = [None, 2, 8, 32]

# min_samples_split = [2, 10, 20, 30, 40]

# min_samples_leaf = [1, 10, 20]

# max_features = [None, 2, 4, 6]

# params_grid = {
#                 'classifier__max_depth': max_depth,
#                 'classifier__min_samples_split': min_samples_split,
#                 'classifier__min_samples_leaf': min_samples_leaf,
#                 'classifier__max_features': max_features
#               }

min_samples_split = range(1,100,5)

min_samples_leaf = range(1,100,5)
params_grid = {
#                 'classifier__max_depth': max_depth,
                'classifier__min_samples_split': min_samples_split,
                'classifier__min_samples_leaf': min_samples_leaf,
#                 'classifier__max_features': max_features
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__min_samples_split range(1, 100, 5)
classifier__min_samples_leaf range(1, 100, 5)
-----------------
Total combinations: 400


In [11]:
# conduct gridsearch 1
te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                                ('scale_features', MinMaxScaler(), scale_features)])

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', DecisionTreeClassifier(random_state=2021)]
                            ])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)


rf_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = 'recall',
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

rf_gridsearch.fit( x_train, y_train.values.ravel() )
best_parameters = rf_gridsearch.best_params_
print(best_parameters)

 0.84417481 0.8436102  0.84240036 0.84050494 0.83796429 0.83586725
 0.83372989 0.83171353 0.83010045 0.82808398 0.82699524 0.82622901
 0.82606768 0.82409171        nan 0.83990013 0.83990013 0.84300532
 0.84534434 0.8455863  0.84405391 0.84244079 0.84111001 0.83990012
 0.83836761 0.83631088 0.83393159 0.83211691 0.83066511 0.82913262
 0.82772116 0.82699526 0.82735823 0.8250193         nan 0.84679618
 0.84679618 0.84679618 0.84679618 0.84780436 0.84687681 0.84453788
 0.84159396 0.84078737 0.83977917 0.83792408 0.83453656 0.83268153
 0.83098778 0.82977791 0.82872943 0.82780189 0.82687438 0.8253823
        nan 0.84244077 0.84244077 0.84244077 0.84244077 0.84244077
 0.84244077 0.84211812 0.83994042 0.83792395 0.83683512 0.83502037
 0.83304432 0.83098763 0.82909222 0.82812436 0.82699524 0.82634999
 0.82526116 0.82393042        nan 0.83752056 0.83752056 0.83752056
 0.83752056 0.83752056 0.83752056 0.83752056 0.83752056 0.83493952
 0.83227794 0.83046321 0.82953569 0.82760002 0.82566431 0.82445

{'classifier__min_samples_leaf': 11, 'classifier__min_samples_split': 26}


In [12]:
# classifier trained on the latest hyper parameters
rf_clf2 = DecisionTreeClassifier(min_samples_leaf=11, min_samples_split = 26, random_state=2021)
results_gs1 = cv_evaluate_model(rf_clf2)
display(results_gs1)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Recall,0.852511,0.847689,0.843213,0.847804
Fbeta2,0.735352,0.731481,0.729614,0.732149
AUC,0.860055,0.857457,0.856016,0.857843


## 3.2 Second GridSearchCV
This second parameter search space aims to narrow down the search space based on the results obtained in the first grid search.

**Previous parameter search space:**
- classifier__max_depth [None, 2, 8, 32]
- classifier__min_samples_split [2, 10, 20, 30, 40]
- classifier__min_samples_leaf [1, 10, 20]
- classifier__max_features [None, 2, 4, 6]

**Obtained results:**

| Feature           | Value | New search space |
|-------------------|:-----:|------------|
|classifier__max_depth|None| [None, 64]
|classifier__min_samples_split| 30 |range(21,40)
|classifier__min_samples_leaf| 10 | range(8,12)
|classifier__max_features| 4 | [3,4,5]

In [13]:
# Creating parameter grid to search

# max_depth = [None, 64]

min_samples_split = range(21,32)

min_samples_leaf = range(6,17)

# max_features = [3, 4, 5]

params_grid = {
                #'classifier__max_depth': max_depth,
                'classifier__min_samples_split': min_samples_split,
                'classifier__min_samples_leaf': min_samples_leaf,
                #'classifier__max_features': max_features
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__min_samples_split range(21, 32)
classifier__min_samples_leaf range(6, 17)
-----------------
Total combinations: 121


In [14]:
# conduct gridsearch 2
te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                                ('scale_features', MinMaxScaler(), scale_features)])

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', DecisionTreeClassifier(random_state=2021)]
                            ])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)


rf_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = 'recall',
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

rf_gridsearch.fit( x_train, y_train.values.ravel() )
best_parameters = rf_gridsearch.best_params_
print(best_parameters)

{'classifier__min_samples_leaf': 9, 'classifier__min_samples_split': 26}


In [8]:
# classifier trained on the latest hyper parameters
rf_clf2 = DecisionTreeClassifier(max_depth = None, max_features =3, min_samples_leaf=9, min_samples_split = 25, random_state=2021)

results_gs1 = cv_evaluate_model(rf_clf2)
display(results_gs1)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Recall,0.851422,0.850472,0.853012,0.851635
Fbeta2,0.734045,0.733897,0.732328,0.733423
AUC,0.859231,0.859052,0.858413,0.858899
