# IS424: Data Mining & Biz Analytics
### Team: G3T3
### Project: Predicting Loan Default based on Customer Profile
### Grid Search and Cross-Validation: <font color='#0041C2'>k-Nearest Neighbours</font>
---

# 1. Setting up the notebook

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from category_encoders import TargetEncoder

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score

In [2]:
# Read the pre-processed train/test splits from the Kaggle input directory.
df_train = pd.read_csv("../input/is424loanpredictionpreprocessed/train.csv")
df_test = pd.read_csv("../input/is424loanpredictionpreprocessed/test.csv")

# Split each frame into features and target. The target is kept as a
# single-column DataFrame; later cells flatten it with `.values.ravel()`.
x_train, y_train = df_train.drop(columns="risk_flag"), df_train[["risk_flag"]]
x_test, y_test = df_test.drop(columns="risk_flag"), df_test[["risk_flag"]]

# 2. Performance of Base KNN

In [None]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer


# Baseline: an untuned KNN evaluated with stratified 3-fold cross-validation.
# Columns that are target-encoded vs. min-max scaled. Only these six columns
# are routed through the ColumnTransformer below.
te_features = ['profession', 'city', 'state']
scale_features = ['income', 'age', 'experience']

preprocessor = ColumnTransformer(
    transformers=[
        ('te_features', TargetEncoder(), te_features),
        ('scale_features', MinMaxScaler(), scale_features),
    ]
)

# imblearn's Pipeline is used so the SMOTE sampler can sit between the
# preprocessing step and the classifier.
pipeline = Pipeline(
    steps=[
        ['preprocessor', preprocessor],
        ['smote', SMOTE(random_state=2021)],
        ['classifier', KNeighborsClassifier()],
    ]
)

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

# Metrics reported per fold.
# "roc_auc" uses scikit-learn's built-in scorer, which ranks on predicted
# scores/probabilities. `make_scorer(roc_auc_score)` would instead feed hard
# class labels from `predict()` to the metric, which does not measure ranking
# quality.
scoring = {
    "recall": 'recall',
    "fbeta_2": make_scorer(fbeta_score, beta=2),
    "roc_auc": 'roc_auc',
}

# `y_train` is a single-column DataFrame; flatten it to the 1-D array the
# estimators expect.
scores = cross_validate(
    pipeline,
    x_train,
    y_train.values.ravel(),
    cv=stratified_kfold,
    scoring=scoring,
)

In [None]:
# Per-fold test scores for each metric, copied out of the cross_validate result.
recall = list(scores['test_recall'])
fbeta_2 = list(scores['test_fbeta_2'])
auc = list(scores['test_roc_auc'])

# Append the mean over the folds as the last entry of every metric row.
for metric_row in (recall, fbeta_2, auc):
    metric_row.append(sum(metric_row) / len(metric_row))

# One row per metric, one column per fold plus the average.
score_df = pd.DataFrame(
    data=[recall, fbeta_2, auc],
    index=['Recall', 'Fbeta2', 'AUC'],
    columns=['Fold 1', 'Fold 2', 'Fold 3', 'Average'],
)
display(score_df)

# 3. Hyperparameter Tuning with GridSearchCV

In [3]:
# Search space for the grid search. Every key is prefixed with 'knn__', i.e. it
# addresses a parameter of the pipeline step named 'knn'.
# NOTE(review): `leaf_size` looks like a speed/memory knob rather than something
# that changes predictions, and `p` presumably only matters for the 'minkowski'
# metric ('euclidean' then duplicating minkowski with p=2) - confirm against the
# KNeighborsClassifier docs before paying for the full grid.
params_grid = dict(
    knn__n_neighbors=list(range(1, 21, 3)),
    knn__leaf_size=list(range(20, 40, 5)),
    knn__p=(1, 2),
    knn__weights=('uniform', 'distance'),
    knn__metric=('minkowski', 'chebyshev', 'euclidean'),
)

# Show each candidate list and accumulate the size of the full Cartesian
# product (the number of parameter combinations the search will evaluate).
total_combi = 1
for name in params_grid:
    candidates = params_grid[name]
    print(name, candidates)
    total_combi = total_combi * len(candidates)

print('-----------------')
print('Total combinations:', total_combi)

knn__n_neighbors [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
knn__leaf_size [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
knn__p (1, 2)
knn__weights ('uniform', 'distance')
knn__metric ('minkowski', 'chebyshev', 'manhattan', 'euclidean')
-----------------
Total combinations: 6400


In [9]:
from sklearn.model_selection import GridSearchCV


# Columns that are target-encoded vs. min-max scaled. Only these six columns
# are routed through the ColumnTransformer below.
te_features = ['profession', 'city', 'state']
scale_features = ['income', 'age', 'experience']

preprocessor = ColumnTransformer(
    transformers=[
        ('te_features', TargetEncoder(), te_features),
        ('scale_features', MinMaxScaler(), scale_features),
    ]
)

# The classifier step must be called 'knn' so that it matches the 'knn__'
# prefix used by the keys of `params_grid`.
pipeline = Pipeline(
    steps=[
        ['preprocessor', preprocessor],
        ['smote', SMOTE(random_state=2021)],
        ['knn', KNeighborsClassifier()],
    ]
)

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

# Metrics recorded for every candidate.
# "roc_auc" uses scikit-learn's built-in scorer, which ranks on predicted
# scores/probabilities. `make_scorer(roc_auc_score)` would instead feed hard
# class labels from `predict()` to the metric, which does not measure ranking
# quality.
scoring = {
    "recall": 'recall',
    "fbeta_2": make_scorer(fbeta_score, beta=2),
    "roc_auc": 'roc_auc',
}

# refit=False: no final model is refit on the full training set. Because the
# scoring is multi-metric, this also means `best_params_` is not populated;
# the winning candidate has to be read from `cv_results_`.
knn_gridsearch = GridSearchCV(
    estimator=pipeline,
    param_grid=params_grid,
    scoring=scoring,
    cv=stratified_kfold,
    refit=False,
    n_jobs=-1,  # use all available cores
)

In [None]:
# Run the exhaustive search: every grid candidate is fitted on every CV fold.
knn_gridsearch.fit(x_train, y_train.values.ravel())

# The search is multi-metric and was built with refit=False, so scikit-learn
# does not expose `best_params_` (accessing it raises AttributeError). Select
# the winning candidate explicitly from `cv_results_` instead.
# F-beta (beta=2) is used as the selection criterion; change SELECTION_METRIC
# to 'recall' or 'roc_auc' to rank candidates by a different metric.
SELECTION_METRIC = "fbeta_2"
cv_results = knn_gridsearch.cv_results_
# nanargmax skips candidates whose fits failed and were scored as NaN.
best_index = int(np.nanargmax(cv_results["mean_test_" + SELECTION_METRIC]))
best_parameters = cv_results["params"][best_index]
print(best_parameters)

In [None]:
# The grid-search parameter names carry the pipeline-step prefix
# ('knn__n_neighbors', ...). A bare KNeighborsClassifier only accepts the
# un-prefixed names, so strip everything up to and including the first '__'.
# `best_parameters` is the dict chosen in the grid-search fitting cell.
best_knn_params = {
    name.split("__", 1)[-1]: value for name, value in best_parameters.items()
}
best_knn_grid = KNeighborsClassifier(**best_knn_params)

# Same preprocessing and SMOTE steps as before, now with the tuned classifier.
pipeline = Pipeline(
    steps=[
        ['preprocessor', preprocessor],
        ['smote', SMOTE(random_state=2021)],
        ['classifier', best_knn_grid],
    ]
)

In [None]:
# Re-evaluate the tuned pipeline with the same folds and metrics as before.
# `y_train` is a single-column DataFrame, so flatten it to a 1-D array first.
y_train_flat = y_train.values.ravel()
scores = cross_validate(
    estimator=pipeline,
    X=x_train,
    y=y_train_flat,
    scoring=scoring,
    cv=stratified_kfold,
)

In [None]:
# (row label, key in the cross_validate result) for every reported metric.
row_sources = (
    ('Recall', 'test_recall'),
    ('Fbeta2', 'test_fbeta_2'),
    ('AUC', 'test_roc_auc'),
)

# Build one row per metric: the per-fold scores followed by their mean.
rows = []
for _, result_key in row_sources:
    per_fold = [*scores[result_key]]
    rows.append(per_fold + [sum(per_fold) / len(per_fold)])
recall, fbeta_2, auc = rows

score_df = pd.DataFrame(
    data=rows,
    index=[label for label, _ in row_sources],
    columns=['Fold 1', 'Fold 2', 'Fold 3', 'Average'],
)
display(score_df)