# IS424: Data Mining & Biz Analytics
## Team: G3T3
### Project: Predicting Loan Default based on Customer Profile
### Grid Search and Cross-validation: RandomForest
---

# 1. Setting up the notebook

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from category_encoders import TargetEncoder

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score

In [2]:
# Load the train and test splits from disk.
df_train = pd.read_csv("../dataset/train.csv")
df_test = pd.read_csv("../dataset/test.csv")

# Predictors: every column except the "risk_flag" label.
x_train = df_train.drop(columns="risk_flag")
x_test = df_test.drop(columns="risk_flag")

# Labels are kept as single-column DataFrames (list selector), not Series.
y_train = df_train.loc[:, ["risk_flag"]]
y_test = df_test.loc[:, ["risk_flag"]]

# 2. Preparations for GridSearch

In [3]:
# Hyper-parameter search space for the random forest.
# Keys use the "classifier__" prefix so they address the pipeline step
# named 'classifier'.

# Number of trees: 100, 200, ..., 1000.
n_estimators = list(range(100, 1001, 100))

# Maximum tree depth: 2, 6, ..., 30, followed by None.
max_depth = [*range(2, 32, 4), None]

# Features tried per split: 1, 5, ... in steps of 4, strictly below the
# number of columns in x_train.
max_features = [k for k in range(1, x_train.shape[1], 4)]

min_samples_leaf = [1, 3, 5, 8]

# Insertion order is kept identical so the printed summary is unchanged.
params_grid = dict(
    classifier__max_features=max_features,
    classifier__min_samples_leaf=min_samples_leaf,
    classifier__n_estimators=n_estimators,
    classifier__max_depth=max_depth,
)

# Print each candidate list and the size of the full Cartesian product.
total_combi = 1
for name in params_grid:
    candidates = params_grid[name]
    print(name, candidates)
    total_combi = total_combi * len(candidates)

print('-----------------')
print('Total combinations:', total_combi)

classifier__max_features [1, 5]
classifier__min_samples_leaf [1, 3, 5, 8]
classifier__n_estimators [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
classifier__max_depth [2, 6, 10, 14, 18, 22, 26, 30, None]
-----------------
Total combinations: 720


In [None]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer

# Categorical columns that are target-encoded.
te_features = ['profession', 'city','state']

# remainder='passthrough' keeps the columns that are NOT target-encoded.
# ColumnTransformer's default (remainder='drop') would silently discard them,
# leaving the forest only the three encoded columns -- fewer than the
# max_features candidates that the search space derives from x_train.shape[1].
# NOTE(review): assumes every remaining column of x_train is already numeric
# (SMOTE and the forest need numeric input) -- confirm against the dataset.
preprocessor = ColumnTransformer(
    transformers=[('te_features', TargetEncoder(), te_features)],
    remainder='passthrough',
)

# imblearn's Pipeline (not sklearn's) is required for a sampler step: SMOTE
# resamples only while fitting, so within cross-validation it sees the
# training folds and never the held-out fold.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=2021)),
    ('classifier', RandomForestClassifier(random_state=2021)),
])

# 3-fold stratified CV; shuffling is seeded for reproducibility.
stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

# Randomized (not exhaustive) search: n_iter candidates are sampled from
# params_grid. n_iter=10 is the library default, written out here so it is
# obvious that only 10 of the possible combinations are evaluated.
# Candidates are ranked by recall; refit=True retrains the best one on all of
# the training data.
rf_gridsearch = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=params_grid,
    n_iter=10,
    scoring='recall',
    cv=stratified_kfold,
    refit=True,
    n_jobs=-1,
    random_state=2021,
)

# y_train is a single-column DataFrame; flatten it to the 1-D array that
# scikit-learn expects for the target.
rf_gridsearch.fit(x_train, y_train.values.ravel())