1. split preprocessed data in train und validation sets
2. create pipeline for Grid Search Algorithm and execute grid search on serveral KNN parameters 
3. apply KNN with best parameters on splitted train data 
4. compare ROC to other models  
5. fit knn model to complete train dataset and apply and test data
6. load submission dataset, sort probabilities accordingly and save submission dataset 






In [1]:
import re
from pathlib import Path
import numpy as np
import pandas as pd
import time
from hyperopt import STATUS_OK
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


In [2]:
data_path_raw = Path.cwd().parent / "data" / "raw"

data_path_preprocessed = Path.cwd().parent / "data" / "processed"


In [3]:
processed_df = pd.read_csv(data_path_preprocessed / "training_set_features__nominal_ordinal_WOE_Impute_Dropped_balanced_stand.csv", index_col="respondent_id")
labels_df = pd.read_csv(data_path_preprocessed / "training_set_labels__balanced.csv", index_col="respondent_id")
test_df_processed = pd.read_csv(data_path_preprocessed / "test_set_features_nominal_ordinal_WOE_Impute_Dropped_Stand.csv", index_col="respondent_id")


In [4]:
## 1 split preprocessed data in train und validation sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    processed_df,
    labels_df,
    shuffle = True,
    test_size = 0.25,
    random_state = 10)



In [5]:
## 2 create pipeline for Grid Search Algorithm and execute grid search on serveral KNN parameters 
param = {
    'n_neighbors' : [1, 3, 5, 7, 9],
    'weights' : ['uniform', 'distance'],
    'p' : [1, 2]
    }
knn = KNeighborsClassifier()
    

clf = GridSearchCV(knn, param_grid = param, cv = 5, n_jobs=-1)
clf.fit(X_train, y_train)





GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [1, 3, 5, 7, 9], 'p': [1, 2],
                         'weights': ['uniform', 'distance']})

In [6]:
## 2
clf.best_params_

{'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}

{'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}

In [7]:
## 2
results = pd.DataFrame(clf.cv_results_)
results.sort_values(ascending = True, by = 'rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_p,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,17.040592,0.822083,85.790601,0.4692,1,1,uniform,"{'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}",0.816472,0.818729,0.816222,0.814341,0.815971,0.816347,0.001405,1
1,16.466854,1.033414,85.171488,1.223535,1,1,distance,"{'n_neighbors': 1, 'p': 1, 'weights': 'distance'}",0.816472,0.818729,0.816222,0.814341,0.815971,0.816347,0.001405,1
5,11.526184,0.848806,89.601168,1.561541,3,1,distance,"{'n_neighbors': 3, 'p': 1, 'weights': 'distance'}",0.807572,0.803811,0.801554,0.804187,0.807948,0.805014,0.002419,3
9,5.358364,0.172744,80.660944,5.596294,5,1,distance,"{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}",0.79654,0.798044,0.796665,0.798546,0.799173,0.797794,0.001037,4
2,19.507741,0.566333,86.805318,0.379502,1,2,uniform,"{'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}",0.800301,0.797418,0.794158,0.796415,0.799298,0.797518,0.002165,5
3,13.52705,3.50051,86.313864,1.005382,1,2,distance,"{'n_neighbors': 1, 'p': 2, 'weights': 'distance'}",0.800301,0.797418,0.794158,0.796415,0.799298,0.797518,0.002165,5
13,7.421899,2.172124,72.481576,1.751846,7,1,distance,"{'n_neighbors': 7, 'p': 1, 'weights': 'distance'}",0.793531,0.792905,0.791275,0.792278,0.795412,0.79308,0.001384,7
17,5.070584,0.952319,73.498276,4.675019,9,1,distance,"{'n_neighbors': 9, 'p': 1, 'weights': 'distance'}",0.788392,0.787765,0.785383,0.783503,0.790648,0.787138,0.002473,8
4,14.716038,3.281873,92.853602,0.437452,3,1,uniform,"{'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}",0.778237,0.779616,0.775605,0.779867,0.781497,0.778965,0.001973,9
7,9.429813,1.292415,71.926762,1.225812,3,2,distance,"{'n_neighbors': 3, 'p': 2, 'weights': 'distance'}",0.769337,0.767206,0.76006,0.762442,0.770214,0.765852,0.003954,10


In [12]:
## 3 apply KNN with best parameters on splitted train data 


knn = KNeighborsClassifier(n_neighbors=1, weights='uniform', p = 1)

start = time.time()
knn.fit(X_train, y_train)
test_probability = knn.predict_proba(X_test)
end = time.time()
print(end - start)




109.24370288848877


In [13]:
## 3 
y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": test_probability[0][:, 1],
        "seasonal_vaccine": test_probability[1][:, 1],
    },
    index = y_test.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (13295, 2)


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
44713,1.0,1.0
7604,0.0,0.0
8302,0.0,1.0
33252,1.0,0.0
25581,0.0,1.0


In [14]:
## 3 
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, y_preds))

0.9019025457625856


In [15]:
## 4 compare to other executions 

ROC_3nn =  [0.6996214808883037 , 29.148728132247925]
ROC_5nn = [0.7229525862994551 , 29.378359079360962]
ROC_7nn = [0.7397354504203681, 29.483131170272827]
ROC_5nn_weighed = [0.7265594059943381, 33.25474190711975]
ROC_7nn_weighed = [0.7431371259324109, 34.52661919593811]
ROC_Grid_search_1 = [0.7695095328560894, 28.562514066696167]
ROC_Grid_search_2 = [0.9019025457625856, 109.24370288848877]




ROC_Overview = pd.DataFrame( [ROC_3nn, ROC_5nn, ROC_7nn, ROC_5nn_weighed,ROC_7nn_weighed, ROC_Grid_search])
ROC_Overview.columns = ['ROC', 'Time']
ROC_Overview.index = ['ROC_3nn', 'ROC_5nn', 'ROC_7nn', 'ROC_5nn_weighed', 'ROC_7nn_weighed', 'ROC_Grid_search']
ROC_Overview

Unnamed: 0,ROC,Time
ROC_3nn,0.699621,29.148728
ROC_5nn,0.722953,29.378359
ROC_7nn,0.739735,29.483131
ROC_5nn_weighed,0.726559,33.254742
ROC_7nn_weighed,0.743137,34.526619
ROC_Grid_search,0.76951,28.562514


In [16]:
## 5 fit knn model to complete train dataset and apply and test data

knn.fit(processed_df, labels_df)
test_probability = knn.predict_proba(test_df_processed)

In [17]:
## 6 load submission dataset, sort probabilities accordingly and save submission dataset 

predictions = pd.DataFrame(
    {
        "h1n1_vaccine": test_probability[0][:, 1],
        "seasonal_vaccine": test_probability[1][:, 1],
    },
    index = test_df_processed.index
)



In [18]:
## 6
submission_df = pd.read_csv(data_path_raw / "submission_format.csv", 
                            index_col="respondent_id")

In [19]:
## 6
np.testing.assert_array_equal(test_df_processed.index.values, 
                              submission_df.index.values)



In [20]:
## 6
# Save predictions to submission data frame
submission_df["h1n1_vaccine"] = predictions["h1n1_vaccine"]
submission_df["seasonal_vaccine"] = predictions["seasonal_vaccine"]

In [21]:
## 6

output_path = Path.cwd().parent / "models" / "submissions"


submission_df.to_csv(output_path /'KNN_GridSearch.csv', index=True)


