1. Train Validate Data Split 
2. GridSearch for best parameter combination
3. KNN with best parameter combination


In [24]:
import re
from pathlib import Path
import numpy as np
import pandas as pd
import time
from hyperopt import STATUS_OK
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


In [10]:
data_path_raw = Path.cwd().parent.parent / "data" / "raw"

data_path_preprocessed = Path.cwd().parent.parent / "data" / "processed"


In [11]:
raw_df_processed = pd.read_csv(data_path_preprocessed / "training_set_features_encoded_imputed_standardized.csv", index_col="respondent_id")
labels_df = pd.read_csv(data_path_raw / "training_set_labels.csv", index_col="respondent_id")
test_df_processed = pd.read_csv(data_path_preprocessed / "test_set_features_encoded_imputed_standardized.csv", index_col="respondent_id")

all_raw_df = raw_df_processed.join(labels_df)

In [12]:

# Create the dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    raw_df_processed,
    labels_df,
    shuffle = True,
    test_size = 0.25,
    random_state = 10)



In [21]:
param = {
    'n_neighbors' : [1, 3, 5, 7, 9],
    'weights' : ['uniform', 'distance'],
    'p' : [1, 2]
    }
knn = KNeighborsClassifier()
    

clf = GridSearchCV(knn, param_grid = param, cv = 5, n_jobs=-1)
clf.fit(X_train, y_train)





{'mean_fit_time': array([1.65570102, 1.42086263, 1.17224278, 1.33470097, 1.43954864,
        1.0859304 , 1.10011306, 1.12572765, 1.04070244, 0.9834373 ,
        1.09257312, 1.17196398, 1.18226466, 1.34015241, 1.33047037,
        1.088798  , 1.07904549, 1.10687342, 1.27871461, 1.01740589]),
 'std_fit_time': array([0.19002084, 0.27861277, 0.04383297, 0.17160421, 0.20065829,
        0.02569707, 0.04426454, 0.020972  , 0.03088449, 0.03417254,
        0.04716863, 0.12994948, 0.11905636, 0.07984982, 0.08866542,
        0.05232486, 0.0126294 , 0.13990796, 0.08587741, 0.13389429]),
 'mean_score_time': array([16.00109596, 15.59435363, 15.75060439, 15.13066607, 15.96120696,
        15.95974379, 15.79670281, 14.93841319, 15.66205096, 15.29133248,
        15.77907486, 16.75441041, 17.6121172 , 17.21349773, 17.0717452 ,
        16.3696032 , 16.05460639, 16.19822636, 17.29297371, 10.8504086 ]),
 'std_score_time': array([0.12931597, 0.13471147, 0.10611433, 0.22689463, 0.32938509,
        0.10491116, 

In [23]:
clf.best_params_

{'n_neighbors': 9, 'p': 1, 'weights': 'distance'}

In [34]:
results = pd.DataFrame(clf.cv_results_)
results.sort_values(ascending = True, by = 'rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_p,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
17,1.106873,0.139908,16.198226,0.388573,9,1,distance,"{'n_neighbors': 9, 'p': 1, 'weights': 'distance'}",0.607589,0.615826,0.612082,0.595357,0.611083,0.608387,0.007024,1
16,1.079045,0.012629,16.054606,0.070468,9,1,uniform,"{'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}",0.607089,0.615826,0.614079,0.594358,0.610334,0.608337,0.007616,2
12,1.182265,0.119056,17.612117,0.151233,7,1,uniform,"{'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}",0.596106,0.600599,0.602097,0.591613,0.601598,0.598402,0.004001,3
13,1.340152,0.07985,17.213498,0.08235,7,1,distance,"{'n_neighbors': 7, 'p': 1, 'weights': 'distance'}",0.597853,0.59985,0.600849,0.590364,0.59985,0.597753,0.00382,4
18,1.278715,0.085877,17.292974,0.13367,9,2,uniform,"{'n_neighbors': 9, 'p': 2, 'weights': 'uniform'}",0.594109,0.59336,0.592611,0.585871,0.589616,0.591113,0.003033,5
19,1.017406,0.133894,10.850409,2.92176,9,2,distance,"{'n_neighbors': 9, 'p': 2, 'weights': 'distance'}",0.594608,0.591862,0.592361,0.585622,0.588617,0.590614,0.003146,6
8,1.040702,0.030884,15.662051,0.048602,5,1,uniform,"{'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}",0.592361,0.590864,0.594608,0.578382,0.587619,0.588767,0.005667,7
9,0.983437,0.034173,15.291332,0.143743,5,1,distance,"{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}",0.591613,0.590614,0.594109,0.578382,0.585871,0.588118,0.005553,8
14,1.33047,0.088665,17.071745,0.420577,7,2,uniform,"{'n_neighbors': 7, 'p': 2, 'weights': 'uniform'}",0.587868,0.580379,0.588867,0.581128,0.581877,0.584024,0.003592,9
15,1.088798,0.052325,16.369603,0.06531,7,2,distance,"{'n_neighbors': 7, 'p': 2, 'weights': 'distance'}",0.588118,0.57988,0.588867,0.580629,0.581628,0.583824,0.003859,10


In [25]:
knn = KNeighborsClassifier(n_neighbors=9, weights='distance', p = 1)

start = time.time()
knn.fit(X_train, y_train)
test_predictions = knn.predict(X_test)
test_probability = knn.predict_proba(X_test)
end = time.time()
print(end - start)




28.562514066696167


In [26]:
y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": test_probability[0][:, 1],
        "seasonal_vaccine": test_probability[1][:, 1],
    },
    index = y_test.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (6677, 2)


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2653,0.225337,0.894359
9506,0.231998,0.343616
23107,0.092172,0.142329
22648,0.0,0.561493
25589,0.212942,0.549736


In [27]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, y_preds))

0.7695095328560894


In [28]:
ROC_3nn =  [0.6996214808883037 , 29.148728132247925]
ROC_5nn = [0.7229525862994551 , 29.378359079360962]
ROC_7nn = [0.7397354504203681, 29.483131170272827]
ROC_5nn_weighed = [0.7265594059943381, 33.25474190711975]
ROC_7nn_weighed = [0.7431371259324109, 34.52661919593811]
ROC_Grid_search = [0.7695095328560894, 28.562514066696167]




ROC_Overview = pd.DataFrame( [ROC_3nn, ROC_5nn, ROC_7nn, ROC_5nn_weighed,ROC_7nn_weighed, ROC_Grid_search])
ROC_Overview.columns = ['ROC', 'Time']
ROC_Overview.index = ['ROC_3nn', 'ROC_5nn', 'ROC_7nn', 'ROC_5nn_weighed', 'ROC_7nn_weighed', 'ROC_Grid_search']
ROC_Overview

Unnamed: 0,ROC,Time
ROC_3nn,0.699621,29.148728
ROC_5nn,0.722953,29.378359
ROC_7nn,0.739735,29.483131
ROC_5nn_weighed,0.726559,33.254742
ROC_7nn_weighed,0.743137,34.526619
ROC_Grid_search,0.76951,28.562514


In [29]:
from sklearn.metrics import classification_report
#print(classification_report(y_train, train_predictions))
print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.63      0.33      0.43      1410
           1       0.70      0.69      0.69      3137

   micro avg       0.68      0.58      0.63      4547
   macro avg       0.66      0.51      0.56      4547
weighted avg       0.68      0.58      0.61      4547
 samples avg       0.32      0.30      0.30      4547



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
