# GridSearch and CrossValidation: <font color='#0041C2'>k-Nearest Neighbours</font>
---

# 1. Setting up the notebook

In [21]:
import pandas as pd
import numpy as np

from category_encoders import TargetEncoder

from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score

from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer 

In [22]:
df_train = pd.read_csv("../dataset/train.csv")

y_train = df_train[["risk_flag"]]
x_train = df_train.drop("risk_flag", axis=1)

In [23]:
def cv_evaluate_model(rf_clf):
    te_features = ['profession', 'city','state']
    scale_features = ['income','age','experience']

    preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                                    ('scale_features', MinMaxScaler(), scale_features)], remainder="passthrough")

    pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                                 ['smote', SMOTE(random_state=2021)],
                                 ['classifier', rf_clf]
                                ])
    
    stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

    scoring = {"accuracy": "accuracy",
               "recall": 'recall',
               "precision": "precision",
               "fbeta_2": make_scorer(fbeta_score, beta=2),
               "roc_auc": make_scorer(roc_auc_score),
              }

    scores = cross_validate(pipeline, x_train, y_train.values.ravel(), cv=stratified_kfold,
                           scoring = scoring)

    accuracy = [ val for val in scores['test_accuracy'] ]
    recall = [ val for val in scores['test_recall'] ]
    precision = [ val for val in scores['test_precision'] ]
    fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
    auc = [ val for val in scores['test_roc_auc'] ]
    
    accuracy.append( sum(accuracy) / len(accuracy) )
    recall.append( sum(recall) / len(recall) )
    precision.append( sum(precision) / len(precision) )
    fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
    auc.append( sum(auc) / len(auc) )

    score_df = pd.DataFrame(data=[accuracy, recall, precision, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                            index=['Accuracy', 'Recall', 'Precision', 'Fbeta2', 'AUC'])
    return score_df

# 2. Grid Search

## 2.1 Grid Search 1

In [14]:
n_neighbours = list(range(1,21,4))
leaf_sizes = list(range(20,40,5))
p = (1,2)
weights = ["uniform", "distance"]
metrics = ["minkowski", "chebyshev", "euclidean"]

params_grid = {
                'knn__n_neighbors': n_neighbours,
                'knn__leaf_size': leaf_sizes,
                'knn__p': p,
                'knn__weights': weights,
                'knn__metric': metrics
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

knn__n_neighbors [1, 5, 9, 13, 17]
knn__leaf_size [20, 25, 30, 35]
knn__p (1, 2)
knn__weights ['uniform', 'distance']
knn__metric ['minkowski', 'chebyshev', 'euclidean']
-----------------
Total combinations: 240


In [15]:
te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                               ('scale_features', MinMaxScaler(), scale_features)], remainder="passthrough")

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['knn', KNeighborsClassifier()]]
                   )

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

scoring = {"recall": 'recall',
           "fbeta_2": make_scorer(fbeta_score, beta=2),
           "roc_auc": make_scorer(roc_auc_score),
          }

knn_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = "recall",
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

knn_gridsearch.fit( x_train, y_train.values.ravel() )
best_parameters = knn_gridsearch.best_params_
print(best_parameters)

  elif pd.api.types.is_categorical(cols):


{'knn__leaf_size': 20, 'knn__metric': 'chebyshev', 'knn__n_neighbors': 9, 'knn__p': 1, 'knn__weights': 'uniform'}


In [16]:
knn1 = KNeighborsClassifier(leaf_size=20, metric='chebyshev', 
                            n_neighbors=9, p=1, weights='uniform')

results_gs1 = cv_evaluate_model(knn1)
display(results_gs1)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Accuracy,0.861771,0.860417,0.861101,0.861096
Recall,0.857713,0.852891,0.857489,0.856031
Precision,0.46632,0.463389,0.46497,0.464893
Fbeta2,0.734429,0.730146,0.733626,0.732734
AUC,0.860027,0.857182,0.859548,0.858919


## 3.2 Second GridSearchCV 

In [18]:
n_neighbours = [5,7,9,11]
leaf_sizes = list(range(15,22,1))
p = (1,2)
weights = ["uniform"]
metrics = ["chebyshev"]

params_grid = {
                'knn__n_neighbors': n_neighbours,
                'knn__leaf_size': leaf_sizes,
                'knn__p': p,
                'knn__weights': weights,
                'knn__metric': metrics
              }
total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

knn__n_neighbors [5, 7, 9, 11]
knn__leaf_size [15, 16, 17, 18, 19, 20, 21]
knn__p (1, 2)
knn__weights ['uniform']
knn__metric ['chebyshev']
-----------------
Total combinations: 56


In [19]:
te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                               ('scale_features', MinMaxScaler(), scale_features)], remainder="passthrough")

pipeline = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['knn', KNeighborsClassifier()]]
                   )

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

scoring = {"recall": 'recall',
           "fbeta_2": make_scorer(fbeta_score, beta=2),
           "roc_auc": make_scorer(roc_auc_score),
          }

knn_gridsearch = GridSearchCV(estimator = pipeline,
                           param_grid = params_grid,
                           scoring = "recall",
                           cv = stratified_kfold,
                           refit = True,
                           n_jobs = -1)

knn_gridsearch.fit( x_train, y_train.values.ravel() )
best_parameters = knn_gridsearch.best_params_
print(best_parameters)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

  elif pd.api.types.is_categorical(cols):


{'knn__leaf_size': 15, 'knn__metric': 'chebyshev', 'knn__n_neighbors': 7, 'knn__p': 1, 'knn__weights': 'uniform'}


In [24]:
knn2 = KNeighborsClassifier(leaf_size=15, metric='chebyshev', 
                            n_neighbors=7, p=1, weights='uniform')

results_gs2 = cv_evaluate_model(knn2)
display(results_gs2)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Accuracy,0.860923,0.859643,0.860967,0.860511
Recall,0.859528,0.857851,0.854706,0.857362
Precision,0.464648,0.462015,0.464589,0.46375
Fbeta2,0.734658,0.73236,0.731806,0.732941
AUC,0.860323,0.858873,0.858276,0.859157
