# IS424: Data Mining & Biz Analytics
### Team: G3T3
### Project: Predicting Loan Default based on Customer Profile
### GridSearch and CrossValidation: <font color='#0041C2'>k-Nearest Neighbours</font>
---

# 1. Setting up the notebook

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from category_encoders import TargetEncoder

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score

In [2]:
df_train = pd.read_csv("../dataset/train.csv") # kaggle
df_test = pd.read_csv("../dataset/test.csv") # kaggle

y_train = df_train[["risk_flag"]]
x_train = df_train.drop("risk_flag", axis=1)

y_test = df_test[["risk_flag"]]
x_test = df_test.drop("risk_flag", axis=1)

# 2. Performance of Base KNN

In [3]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer 


te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                               ('scale_features', MinMaxScaler(), scale_features)])

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', KNeighborsClassifier()]]
                   )

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

scoring = {"recall": 'recall',
           "fbeta_2": make_scorer(fbeta_score, beta=2),
           "roc_auc": make_scorer(roc_auc_score),
          }

scores = cross_validate(pipeline, x_train, y_train.values.ravel(), cv=stratified_kfold,
                       scoring = scoring)

In [4]:
recall = [ val for val in scores['test_recall'] ]
fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
auc = [ val for val in scores['test_roc_auc'] ]
recall.append( sum(recall) / len(recall) )
fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
auc.append( sum(auc) / len(auc) )

score_df = pd.DataFrame(data=[recall, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                        index=['Recall','Fbeta2','AUC'])
display(score_df)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Recall,0.844283,0.839826,0.838374,0.840828
Fbeta2,0.72955,0.725013,0.725396,0.726653
AUC,0.856086,0.853152,0.85324,0.854159


# 3. Hyperparameter Tuning with GridSearchCV

## 3.1 First GridSearchCV 

In [5]:
# Creating parameter grid to search
params_grid = {
            'knn__n_neighbors': list(range(1,21,4)),
            'knn__leaf_size': list(range(20,40,5)),
            'knn__p': (1,2),
            'knn__weights': ('uniform', 'distance'),
            'knn__metric': ('minkowski', 'chebyshev', 'euclidean')
            }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

knn__n_neighbors [1, 5, 9, 13, 17]
knn__leaf_size [20, 25, 30, 35]
knn__p (1, 2)
knn__weights ('uniform', 'distance')
knn__metric ('minkowski', 'chebyshev', 'euclidean')
-----------------
Total combinations: 240


In [6]:
from sklearn.model_selection import GridSearchCV


te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                               ('scale_features', MinMaxScaler(), scale_features)])

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['knn', KNeighborsClassifier()]]
                   )

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

scoring = {"recall": 'recall',
           "fbeta_2": make_scorer(fbeta_score, beta=2),
           "roc_auc": make_scorer(roc_auc_score),
          }

knn_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = "recall",
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

In [7]:
knn_gridsearch.fit( x_train, y_train.values.ravel() )
best_parameters = knn_gridsearch.best_params_
print(best_parameters)

{'knn__leaf_size': 20, 'knn__metric': 'chebyshev', 'knn__n_neighbors': 9, 'knn__p': 1, 'knn__weights': 'uniform'}


In [8]:
best_knn_grid = KNeighborsClassifier(leaf_size=20, metric='chebyshev', 
                                     n_neighbors=9, p=1, weights='uniform')

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', best_knn_grid]]
                   )

In [9]:
scores = cross_validate(pipeline, x_train, y_train.values.ravel(), cv=stratified_kfold,
                       scoring = scoring)

In [10]:
recall = [ val for val in scores['test_recall'] ]
fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
auc = [ val for val in scores['test_roc_auc'] ]
recall.append( sum(recall) / len(recall) )
fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
auc.append( sum(auc) / len(auc) )

score_df = pd.DataFrame(data=[recall, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                        index=['Recall','Fbeta2','AUC'])
display(score_df)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Recall,0.857713,0.852891,0.857489,0.856031
Fbeta2,0.734429,0.730146,0.733626,0.732734
AUC,0.860027,0.857182,0.859548,0.858919


## 3.2 Second GridSearchCV 

In [14]:
# Creating parameter grid to search
params_grid = {
            'knn__n_neighbors': [5,7,9,11],
            'knn__leaf_size': list(range(15,22,1)),
            'knn__p': (1,2),
            'knn__weights': ['uniform'],
            'knn__metric': ['chebyshev']
            }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

knn__n_neighbors [5, 7, 9, 11]
knn__leaf_size [15, 16, 17, 18, 19, 20, 21]
knn__p (1, 2)
knn__weights ['uniform']
knn__metric ['chebyshev']
-----------------
Total combinations: 56


In [15]:
from sklearn.model_selection import GridSearchCV


te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                               ('scale_features', MinMaxScaler(), scale_features)])

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['knn', KNeighborsClassifier()]]
                   )

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

scoring = {"recall": 'recall',
           "fbeta_2": make_scorer(fbeta_score, beta=2),
           "roc_auc": make_scorer(roc_auc_score),
          }

knn_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = "recall",
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

In [18]:
knn_gridsearch.fit( x_train, y_train.values.ravel() )
best_parameters = knn_gridsearch.best_params_
print(best_parameters)

{'knn__leaf_size': 15, 'knn__metric': 'chebyshev', 'knn__n_neighbors': 7, 'knn__p': 1, 'knn__weights': 'uniform'}


In [19]:
best_knn_grid = KNeighborsClassifier(leaf_size=15, metric='chebyshev', 
                                     n_neighbors=7, p=1, weights='uniform')

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', best_knn_grid]]
                   )

In [20]:
scores = cross_validate(pipeline, x_train, y_train.values.ravel(), cv=stratified_kfold,
                       scoring = scoring)

In [21]:
recall = [ val for val in scores['test_recall'] ]
fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
auc = [ val for val in scores['test_roc_auc'] ]
recall.append( sum(recall) / len(recall) )
fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
auc.append( sum(auc) / len(auc) )

score_df = pd.DataFrame(data=[recall, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                        index=['Recall','Fbeta2','AUC'])
display(score_df)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Recall,0.859528,0.857851,0.854706,0.857362
Fbeta2,0.734658,0.73236,0.731806,0.732941
AUC,0.860323,0.858873,0.858276,0.859157


## 3.3 Third GridSearchCV 

In [28]:
# Creating parameter grid to search
params_grid = {
            'knn__n_neighbors': [5,7,9,11],
            'knn__leaf_size': list(range(1,15,1)),
            'knn__p': [1],
            'knn__weights': ['uniform'],
            'knn__metric': ['chebyshev']
            }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

knn__n_neighbors [5, 7, 9, 11]
knn__leaf_size [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
knn__p [1]
knn__weights ['uniform']
knn__metric ['chebyshev']
-----------------
Total combinations: 56


In [29]:
from sklearn.model_selection import GridSearchCV


te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                               ('scale_features', MinMaxScaler(), scale_features)])

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['knn', KNeighborsClassifier()]]
                   )

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

scoring = {"recall": 'recall',
           "fbeta_2": make_scorer(fbeta_score, beta=2),
           "roc_auc": make_scorer(roc_auc_score),
          }

knn_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = "recall",
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

In [30]:
knn_gridsearch.fit( x_train, y_train.values.ravel() )
best_parameters = knn_gridsearch.best_params_
print(best_parameters)

{'knn__leaf_size': 8, 'knn__metric': 'chebyshev', 'knn__n_neighbors': 7, 'knn__p': 1, 'knn__weights': 'uniform'}


In [32]:
best_knn_grid = KNeighborsClassifier(leaf_size=8, metric='chebyshev', 
                                     n_neighbors=7, p=1, weights='uniform')

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', best_knn_grid]]
                   )

In [33]:
scores = cross_validate(pipeline, x_train, y_train.values.ravel(), cv=stratified_kfold,
                       scoring = scoring)

In [34]:
recall = [ val for val in scores['test_recall'] ]
fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
auc = [ val for val in scores['test_roc_auc'] ]
recall.append( sum(recall) / len(recall) )
fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
auc.append( sum(auc) / len(auc) )

score_df = pd.DataFrame(data=[recall, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                        index=['Recall','Fbeta2','AUC'])
display(score_df)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Recall,0.859407,0.855674,0.859545,0.858209
Fbeta2,0.734175,0.73112,0.734959,0.733418
AUC,0.860042,0.857979,0.860483,0.859501


# 4. Evaluate on Test Data

In [35]:
best_knn_grid = KNeighborsClassifier(leaf_size=8, metric='chebyshev', 
                                     n_neighbors=7, p=1, weights='uniform')

In [36]:
te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                                ('scale_features', MinMaxScaler(), scale_features)])

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', best_knn_grid]
                            ])

pipeline.fit(x_train, y_train.values.ravel() )

y_pred = pipeline.predict(x_test)

print("-----------------------PERFORMANCE EVALUATION--------------------  ")
print(f"Recall: {recall_score(y_test, y_pred)} ")
print(f"Fbeta2: {fbeta_score(y_test, y_pred, beta=2)} ")
print(f"AUC Score: {roc_auc_score(y_test, y_pred)} ")

-----------------------PERFORMANCE EVALUATION--------------------  
Recall: 0.8675592837554444 
Fbeta2: 0.732917223145902 
AUC Score: 0.8600822142177146 
