# IS424: Data Mining & Biz Analytics
### Team: G3T3
### Project: Predicting Loan Default based on Customer Profile
### Model selection: <font color='#0041C2'>Random Forest</font>
---

# 1. Setting up the notebook

In [1]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE
from category_encoders import TargetEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, fbeta_score, roc_auc_score, make_scorer

In [2]:
df_train = pd.read_csv("../dataset/train.csv")

y_train = df_train[["risk_flag"]]
x_train = df_train.drop("risk_flag", axis=1)

# 2. Running base model

In [3]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer 


te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                               ('scale_features', MinMaxScaler(), scale_features)], remainder='passthrough')

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', RandomForestClassifier(random_state=2021)]]
                   )

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

scoring = {"recall": 'recall',
           "fbeta_2": make_scorer(fbeta_score, beta=2),
           "roc_auc": make_scorer(roc_auc_score),
          }

scores = cross_validate(pipeline, x_train, y_train.values.ravel(), cv=stratified_kfold,
                       scoring = scoring)


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [4]:
recall = [ val for val in scores['test_recall'] ]
fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
auc = [ val for val in scores['test_roc_auc'] ]
recall.append( sum(recall) / len(recall) )
fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
auc.append( sum(auc) / len(auc) )

score_df = pd.DataFrame(data=[recall, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                        index=['Recall','Fbeta2','AUC'])
display(score_df)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Recall,0.784634,0.784176,0.782724,0.783845
Fbeta2,0.716717,0.716607,0.716628,0.716651
AUC,0.843993,0.84389,0.843784,0.843889
