1. split preprocessed data into train and validation sets
2. create pipeline for Grid Search Algorithm and execute grid search on several KNN parameters
3. apply KNN with best parameters on the split train data
4. compare ROC AUC to other models



In [1]:
import re
import time
from pathlib import Path

import numpy as np
import pandas as pd
from hyperopt import STATUS_OK
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline


In [2]:
# Data directories, resolved relative to the parent of the current working
# directory: <parent>/data/raw and <parent>/data/processed.
data_path_raw = Path.cwd().parent.joinpath("data", "raw")

data_path_preprocessed = Path.cwd().parent.joinpath("data", "processed")


In [None]:
# Load the preprocessed training features, the matching training labels and
# the preprocessed test features. All three frames are indexed by respondent_id.
processed_df = pd.read_csv(
    data_path_preprocessed / "training_set_features__nominal_ordinal_WOE_Impute_Dropped_balanced_stand.csv",
    index_col="respondent_id",
)
labels_df = pd.read_csv(
    data_path_preprocessed / "training_set_labels__balanced.csv",
    index_col="respondent_id",
)
test_df_processed = pd.read_csv(
    data_path_preprocessed / "test_set_features_nominal_ordinal_WOE_Impute_Dropped_Stand.csv",
    index_col="respondent_id",
)


In [None]:
## 1 Split the preprocessed data into train and validation sets
# 75 % of the rows are used for fitting and 25 % are held out for evaluation.
# Shuffling with a fixed random_state keeps the split reproducible across runs.
# `train_test_split` comes from the import cell at the top of the notebook,
# so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    processed_df,
    labels_df,
    shuffle=True,
    test_size=0.25,
    random_state=10,
)



In [None]:
## 2 Grid search over several KNN hyperparameters
# Candidate values: neighbourhood size, vote weighting and the Minkowski power
# parameter (p=1 -> Manhattan distance, p=2 -> Euclidean distance).
param = {
    'n_neighbors': [1, 3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}
knn = KNeighborsClassifier()

# Rank the candidates by ROC AUC, the metric this notebook reports further
# down. Without an explicit `scoring`, GridSearchCV falls back to the
# estimator's own `score` method (accuracy), so the search would optimise a
# different metric than the one used for the final comparison.
clf = GridSearchCV(knn, param_grid=param, scoring="roc_auc", cv=5, n_jobs=-1)
clf.fit(X_train, y_train)





In [None]:
## 2 Show the hyperparameter combination selected by the grid search
clf.best_params_

In [None]:
## 2 Tabulate the cross-validation results, best-ranked candidate first
results = pd.DataFrame(clf.cv_results_)
results.sort_values("rank_test_score", ascending=True)

In [None]:
## 3 Apply KNN with the best grid-search parameters on the train split

# Take the winning configuration straight from the grid search instead of
# copying the values by hand, so this cell cannot drift from the search result.
knn = KNeighborsClassifier(**clf.best_params_)

# Time fitting plus both prediction calls. perf_counter() is the clock intended
# for measuring elapsed intervals; time.time() can jump if the system clock is
# adjusted while the cell runs.
start = time.perf_counter()
knn.fit(X_train, y_train)
test_predictions = knn.predict(X_test)
test_probability = knn.predict_proba(X_test)
end = time.perf_counter()
print(end - start)




In [None]:
## 3 Collect the predicted probabilities into one frame
# test_probability is indexed per target; column 1 of each entry is taken as
# the score for that target.
# NOTE(review): assumes the label columns are ordered h1n1_vaccine,
# seasonal_vaccine and that column 1 is the positive class - confirm against labels_df.
y_preds = pd.DataFrame(
    dict(
        h1n1_vaccine=test_probability[0][:, 1],
        seasonal_vaccine=test_probability[1][:, 1],
    ),
    index=y_test.index,
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

In [None]:
## 3 Score the validation predictions with ROC AUC
# roc_auc_score is imported in the import cell at the top of the notebook.
# With two target columns and the default `average`, the per-column AUCs are
# macro-averaged into a single number.
print(roc_auc_score(y_test, y_preds))

In [None]:
## 4 compare to other executions

# Each entry is a [ROC, Time] pair for one KNN configuration.
# NOTE(review): the values are hard-coded, presumably copied from earlier
# runs of the cells above - confirm they are still current.
ROC_3nn = [0.6996214808883037, 29.148728132247925]
ROC_5nn = [0.7229525862994551, 29.378359079360962]
ROC_7nn = [0.7397354504203681, 29.483131170272827]
ROC_5nn_weighed = [0.7265594059943381, 33.25474190711975]
ROC_7nn_weighed = [0.7431371259324109, 34.52661919593811]
ROC_Grid_search = [0.7695095328560894, 28.562514066696167]

# One row per configuration, labelled with the configuration name.
ROC_Overview = pd.DataFrame(
    [ROC_3nn, ROC_5nn, ROC_7nn, ROC_5nn_weighed, ROC_7nn_weighed, ROC_Grid_search],
    columns=['ROC', 'Time'],
    index=['ROC_3nn', 'ROC_5nn', 'ROC_7nn', 'ROC_5nn_weighed', 'ROC_7nn_weighed', 'ROC_Grid_search'],
)
ROC_Overview