# IS424: Data Mining & Biz Analytics
## Team: G3T3
### Project: Predicting Loan Default based on Customer Profile
### Grid Search and Cross-validation: RandomForest
---

# 1. Setting up the notebook

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from category_encoders import TargetEncoder

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score

In [2]:
df_train = pd.read_csv("../dataset/train.csv")
df_test = pd.read_csv("../dataset/test.csv")

y_train = df_train[["risk_flag"]]
x_train = df_train.drop("risk_flag", axis=1)

y_test = df_test[["risk_flag"]]
x_test = df_test.drop("risk_flag", axis=1)

# 2. Hyper parameter tuning with GridSearch

## 2.1 Conducting Grid Search 1

In [8]:
# Creating parameter grid to search
n_estimators = [100, 300, 500, 700, 900]

max_features = list(range(1, x_train.shape[1]+1, 2))

min_samples_leaf = [1, 30, 50, 70, 90]

params_grid = {
                'classifier__n_estimators': n_estimators,
                'classifier__max_features': max_features,
                'classifier__min_samples_leaf': min_samples_leaf
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__n_estimators [100, 300, 500, 700, 900]
classifier__max_features [1, 3, 5, 7, 9]
classifier__min_samples_leaf [1, 30, 50, 70, 90]
-----------------
Total combinations: 125


In [10]:
# initialising code for gridsearch
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer

te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                                ('scale_features', MinMaxScaler(), scale_features)])

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', RandomForestClassifier(random_state=2021)]
                            ])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

In [17]:
# results before tuning
pipeline.fit(x_train, y_train.values.ravel() )
y_pred = pipeline.predict(x_test)

print(f"-----------------------PERFORMANCE BEFORE TUNING-----------------------")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
print(f"AUC Score: {roc_auc_score(y_test, y_pred)}")
print()

-----------------------PERFORMANCE BEFORE TUNING-----------------------
Recall: 0.7860945313760284
F2-Score: 0.7166387246683726
AUC Score: 0.844066473398247



In [None]:
# conduct gridsearch 1
rf_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = 'recall',
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

rf_gridsearch.fit( x_train, y_train.values.ravel() )

In [None]:
best_parameters = rf_gridsearch.best_params_
print(best_parameters)

In [24]:
rf_gs1 = rf_gridsearch.best_estimator_

y_pred = rf_gs1.fit(x_train, y_train.values.ravel() )

print(f"-----------------------TEST SCORES-----------------------")
print(f"Recall: {recall_score(y_train, y_pred)}")
print(f"F2-Score: {fbeta_score(y_train, y_pred, beta=2)}")
print(f"AUC Score: {roc_auc_score(y_train, y_pred)}")
print()

-----------------------TEST SCORES-----------------------


TypeError: Expected sequence or array-like, got <class 'imblearn.pipeline.Pipeline'>