# IS424: Data Mining & Biz Analytics
## Team: G3T3
### Project: Predicting Loan Default based on Customer Profile
### Grid Search and Cross-validation: RandomForest
---

# 1. Setting up the notebook

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from category_encoders import TargetEncoder

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score

In [7]:
# Data source: the Kaggle input paths are active; switch to the commented
# local paths below when running outside Kaggle.
# df_train = pd.read_csv("../dataset/train.csv") # local
# df_test = pd.read_csv("../dataset/test.csv") # local
df_train = pd.read_csv("../input/is424loanpredictionpreprocessed/train.csv") # kaggle
df_test = pd.read_csv("../input/is424loanpredictionpreprocessed/test.csv") # kaggle

TARGET_COL = "risk_flag"

# Features are every column except the target. The target is kept as a
# single-column DataFrame (double brackets), not a Series.
x_train, y_train = df_train.drop(columns=TARGET_COL), df_train[[TARGET_COL]]
x_test, y_test = df_test.drop(columns=TARGET_COL), df_test[[TARGET_COL]]

In [8]:
# Preview the first five rows of the training features (target column already split off).
x_train.head(n=5)

# 2. Performance of Base Random Forest Classifier

In [9]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.compose import ColumnTransformer

def cv_evaluate_model(rf_clf, n_splits=3):
    """Cross-validate a classifier inside the preprocessing + SMOTE pipeline.

    The classifier is wrapped in an imblearn ``Pipeline`` so that target
    encoding, min-max scaling and SMOTE oversampling are fitted on the
    training part of each fold only. The data comes from the notebook-level
    ``x_train`` / ``y_train`` variables.

    Parameters
    ----------
    rf_clf : estimator
        Unfitted classifier used as the final ``'classifier'`` pipeline step.
        It must expose ``predict_proba`` or ``decision_function`` because the
        ROC AUC scorer ranks samples by score.
    n_splits : int, default=3
        Number of stratified folds. The default reproduces the original
        three-fold evaluation.

    Returns
    -------
    pandas.DataFrame
        Rows ``Recall``, ``Fbeta2`` and ``AUC``; one ``Fold i`` column per
        fold followed by an ``Average`` column with the mean over folds.
    """
    te_features = ['profession', 'city', 'state']
    scale_features = ['income', 'age', 'experience']

    # Only the six columns listed above reach the classifier: ColumnTransformer
    # drops unlisted columns by default (remainder='drop').
    preprocessor = ColumnTransformer(transformers=[
        ('te_features', TargetEncoder(), te_features),
        ('scale_features', MinMaxScaler(), scale_features),
    ])

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=2021)),
        ('classifier', rf_clf),
    ])

    stratified_kfold = StratifiedKFold(shuffle=True, n_splits=n_splits, random_state=2021)

    scoring = {
        "recall": 'recall',
        "fbeta_2": make_scorer(fbeta_score, beta=2),
        # Use the built-in scorer: it feeds predicted probabilities to
        # roc_auc_score. make_scorer(roc_auc_score) would pass hard 0/1
        # predictions, which collapses the ROC curve to a single point.
        "roc_auc": 'roc_auc',
    }

    scores = cross_validate(pipeline, x_train, y_train.values.ravel(),
                            cv=stratified_kfold, scoring=scoring)

    # One row per metric: the per-fold scores followed by their mean.
    rows = []
    for key in ('test_recall', 'test_fbeta_2', 'test_roc_auc'):
        fold_scores = list(scores[key])
        rows.append(fold_scores + [sum(fold_scores) / len(fold_scores)])

    fold_columns = ['Fold {}'.format(i) for i in range(1, n_splits + 1)]
    score_df = pd.DataFrame(data=rows,
                            columns=fold_columns + ['Average'],
                            index=['Recall', 'Fbeta2', 'AUC'])
    return score_df

In [10]:
# Baseline: a random forest with library-default hyperparameters (only the seed is fixed),
# evaluated with the shared cross-validation routine.
base_rf_clf = RandomForestClassifier(random_state=2021)
results_base = cv_evaluate_model(rf_clf=base_rf_clf)
display(results_base)

# 3. Hyper parameter tuning with GridSearch

## 3.1 Conducting Grid Search 1

In [11]:
# Candidate values for the first grid search.
n_estimators = [100, 500, 900]
max_depth = [5, 10, 15]
min_samples_split = [3, 5, 8]
min_samples_leaf = [1, 3, 5]

# The 'classifier__' prefix routes each parameter to the pipeline step named 'classifier'.
params_grid = dict([
    ('classifier__n_estimators', n_estimators),
    ('classifier__min_samples_split', min_samples_split),
    ('classifier__min_samples_leaf', min_samples_leaf),
    ('classifier__max_depth', max_depth),
])

# List every parameter with its candidates.
for param, value in params_grid.items():
    print(param, value)

# The grid size is the product of the candidate-list lengths.
total_combi = 1
for value in params_grid.values():
    total_combi = total_combi * len(value)

print('-----------------')
print('Total combinations:', total_combi)

In [12]:
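# # conduct gridsearch 1 (kept commented out, presumably to avoid re-running the search; the dict below this cell appears to be its recorded best parameters)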
# te_features = ['profession', 'city','state']
# scale_features = ['income','age','experience']

# preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
#                                                 ('scale_features', MinMaxScaler(), scale_features)])

# pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
#                              ['smote', SMOTE(random_state=2021)],
#                              ['classifier', RandomForestClassifier(random_state=2021)]
#                             ])

# stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)


# rf_gridsearch = GridSearchCV(estimator = pipeline,
#                            param_grid = params_grid,
#                            scoring = 'recall',
#                            cv = stratified_kfold,
#                            refit = True,
#                            n_jobs = -1)

# rf_gridsearch.fit( x_train, y_train.values.ravel() )
# best_parameters = rf_gridsearch.best_params_
# print(best_parameters)

{'classifier__max_depth': 15, 'classifier__min_samples_leaf': 3, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 100}

In [13]:
# Random forest configured with the best hyperparameters reported by grid search 1.
# max_features is pinned explicitly; it was not one of the searched parameters.
gs1_best_params = dict(
    n_estimators=100,
    max_depth=15,
    min_samples_split=3,
    min_samples_leaf=3,
    max_features='sqrt',
)
rf_clf2 = RandomForestClassifier(random_state=2021, **gs1_best_params)

# Score it with the same cross-validation routine used for the baseline.
results_gs1 = cv_evaluate_model(rf_clf=rf_clf2)
display(results_gs1)

## 3.2 Second GridSearchCV
The second grid search narrows the parameter search space around the best values found in the first grid search.

**Previous parameter search space:**
- n_estimators = [100, 500, 900]
- max_depth = [5, 10, 15]
- min_samples_split = [3, 5, 8]
- min_samples_leaf = [1, 3, 5]

**Obtained results:**

| Feature           | Value | New search space |
|-------------------|:-----:|------------|
| n_estimators      | 100   | [100, 150, 200, 250] |
| max_depth         | 15    | [13, 15, 17, 20] |
| min_samples_split | 3     | [2, 3, 4] |
| min_samples_leaf  | 3     | [2, 3, 4] |

In [15]:
# Candidate values for the second grid search, centred on the best values from the first.
n_estimators = [100, 150, 200, 250]
max_depth = [13, 15, 17, 20]
min_samples_split = [2, 3, 4]
min_samples_leaf = [2, 3, 4]

# The 'classifier__' prefix routes each parameter to the pipeline step named 'classifier'.
params_grid = dict([
    ('classifier__n_estimators', n_estimators),
    ('classifier__min_samples_split', min_samples_split),
    ('classifier__min_samples_leaf', min_samples_leaf),
    ('classifier__max_depth', max_depth),
])

# List every parameter with its candidates.
for param, value in params_grid.items():
    print(param, value)

# The grid size is the product of the candidate-list lengths.
total_combi = 1
for value in params_grid.values():
    total_combi = total_combi * len(value)

print('-----------------')
print('Total combinations:', total_combi)

In [None]:
# conduct gridsearch 2
# Columns routed to each preprocessing branch.
te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(
    transformers=[
        ('te_features', TargetEncoder(), te_features),
        ('scale_features', MinMaxScaler(), scale_features),
    ]
)

# Preprocessing and SMOTE live inside the pipeline, so they are re-fitted
# on the training part of every fold during the search.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=2021)),
        ('classifier', RandomForestClassifier(random_state=2021)),
    ]
)

stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=2021)

# Exhaustive search over params_grid, ranked by recall; refit=True retrains
# the best configuration on all of x_train once the search finishes.
gridsearch_settings = dict(
    param_grid=params_grid,
    scoring='recall',
    cv=stratified_kfold,
    refit=True,
    n_jobs=-1,
)
rf_gridsearch = GridSearchCV(estimator=pipeline, **gridsearch_settings)
rf_gridsearch.fit(x_train, y_train.values.ravel())

best_parameters = rf_gridsearch.best_params_
print(best_parameters)