# IS424: Data Mining & Biz Analytics
## Team: G3T3
### Project: Predicting Loan Default based on Customer Profile
### Grid Search and Cross-validation: RandomForest
---

# 1. Setting up the notebook

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from category_encoders import TargetEncoder

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score

In [16]:
df_train = pd.read_csv("../dataset/train.csv")
df_test = pd.read_csv("../dataset/test.csv")

y_train = df_train[["risk_flag"]]
x_train = df_train.drop("risk_flag", axis=1)

y_test = df_test[["risk_flag"]]
x_test = df_test.drop("risk_flag", axis=1)

# 2. Preparations for GridSearch

In [40]:
from sklearn.base import BaseEstimator, TransformerMixin

class TargetTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.features = None
        self.label = None
        pass
        
    def fit(self, features, label=None):
        self.features = features
        return self
    
    def transform(self, features):
        x = features.copy()
        y_train = label.copy()
    
        # Target Encoding — categorical columns with high cardinality: profession, city, state
        profession_target_enc = TargetEncoder()
        x["profession_encoded"] = profession_target_enc.fit_transform(x["profession"], y_train)

        city_target_enc = TargetEncoder()
        x["city_encoded"] = city_target_enc.fit_transform(x["city"], y_train)

        state_target_enc = TargetEncoder()
        x["state_encoded"] = state_target_enc.fit_transform(x["state"], y_train)

        x.drop("profession", axis=1, inplace=True)
        x.drop("city", axis=1, inplace=True)
        x.drop("state", axis=1, inplace=True)
        
        return x

In [32]:
# from sklearn.model_selection import StratifiedKFold

# strat_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=2021)

# for train_ix, test_ix in kfold.split(x_train):

In [41]:
# n_estimators = [ n for n in range(100, 1500+1, 300) ]

max_depth = [ depth for depth in range(5, 30, 5) ]
max_depth.append( None )

max_features = list(range(1, x_train.shape[1], 4))

min_samples_leaf = [1, 2, 4]

params_grid = {
               'classifier__max_features': max_features,
               'classifier__min_samples_leaf': min_samples_leaf,
               'classifier__max_depth': max_depth
              }

In [42]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

pipeline = Pipeline(steps = [['target_encode', TargetTransformer() ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', RandomForestClassifier(random_state=2021)]
                            ])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

rf_gridsearch = RandomizedSearchCV(estimator = pipeline,
                           param_distributions = params_grid,
                           scoring = 'recall',
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1,
                           random_state = 2021)

rf_gridsearch.fit( x_train, y_train.values.ravel() )

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/imblearn/pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/imblearn/pipeline.py", line 210, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/Library/Frameworks/Python.framewor

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/imblearn/pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/imblearn/pipeline.py", line 210, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/Library/Frameworks/Python.framewor

TypeError: transform() missing 1 required positional argument: 'label'

In [27]:
y_pred = pipeline.predict(x_test)

ValueError: The length of X is 50400 but length of y is 201600.