1. Reduce the number of features with PCA
2. Fit KNN and predict using a train/validation data split
3. Iterate over the PCA components, adding one column at a time


In [2]:
import re
from pathlib import Path
import numpy as np
import pandas as pd

In [3]:
# Data directories are resolved relative to the notebook's working directory:
# two levels up, then data/raw and data/processed.
project_root = Path.cwd().parent.parent

data_path_raw = project_root.joinpath("data", "raw")
data_path_preprocessed = project_root.joinpath("data", "processed")


In [4]:
# Every CSV is keyed on respondent_id, so the same reader options are shared.
csv_options = {"index_col": "respondent_id"}

# Pre-processed training features, raw training labels, pre-processed test features.
raw_df_processed = pd.read_csv(
    data_path_preprocessed / "training_set_features_encoded_imputed_standardized.csv",
    **csv_options,
)
labels_df = pd.read_csv(
    data_path_raw / "training_set_labels.csv",
    **csv_options,
)
test_df_processed = pd.read_csv(
    data_path_preprocessed / "test_set_features_encoded_imputed_standardized.csv",
    **csv_options,
)

# Training features with the label columns joined on the shared index.
all_raw_df = raw_df_processed.join(labels_df)

In [7]:
# Feature extraction with PCA

from sklearn.decomposition import PCA

# PCA(0.99): per the scikit-learn PCA API, a float in (0, 1) asks for as many
# components as are needed to reach that fraction of explained variance.
# NOTE(review): the PCA is fitted on every training row, i.e. before the
# train/validation split performed further down; confirm this is intended.
pca = PCA(0.99)
fit = pca.fit(raw_df_processed)

# Summarise the fitted components.
n_input_features = raw_df_processed.shape[1]
variance_ratios = fit.explained_variance_ratio_

print("Number of Features: %s" % n_input_features)
print("Number of Factors: %s" % len(variance_ratios))
print("Explained Variance: %s" % variance_ratios)


Number of Features: 74
Number of Factors: 65
Explained Variance: [0.05635598 0.04570851 0.0418888  0.04043737 0.03020325 0.02530618
 0.02363715 0.02177056 0.0202816  0.01893893 0.01851062 0.01790168
 0.01755754 0.01718299 0.0167233  0.01647727 0.01633766 0.01595887
 0.01576383 0.0156291  0.015322   0.01513473 0.01500742 0.01479398
 0.01468061 0.01462851 0.01455204 0.01452518 0.01450952 0.01443816
 0.01442042 0.01441855 0.01438084 0.01430489 0.01427156 0.01420145
 0.01400384 0.01375178 0.01353252 0.01321968 0.01266008 0.01257326
 0.01222445 0.01204249 0.01170588 0.01123729 0.0110767  0.01047624
 0.01008412 0.00951569 0.00915983 0.00882744 0.00860214 0.00832599
 0.00778612 0.00762792 0.00750646 0.00700126 0.00591798 0.00570602
 0.00526985 0.00493175 0.00464574 0.00396161 0.00345955]


In [8]:
# Project the training features onto the fitted PCA components.
raw_df_processed_PCA = pca.transform(raw_df_processed)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the PCA-transformed rows for validation; the fixed
# random_state keeps the shuffled split repeatable between runs.
split_options = {"shuffle": True, "test_size": 0.25, "random_state": 10}

X_train, X_test, y_train, y_test = train_test_split(
    raw_df_processed_PCA, labels_df, **split_options
)

In [11]:
from sklearn.neighbors import KNeighborsClassifier
import time

# 7-nearest-neighbour classifier on the PCA-reduced training split.
knn = KNeighborsClassifier(n_neighbors=7)

# The timed region covers fitting plus both prediction calls on the held-out split.
start = time.time()
knn.fit(X_train, y_train)
test_predictions = knn.predict(X_test)
test_probability = knn.predict_proba(X_test)
end = time.time()

elapsed_seconds = end - start
print(elapsed_seconds)

49.70987868309021


In [12]:
# Build one probability column per target, indexed like the held-out labels.
# test_probability is indexed per target (0, 1) and column 1 of each entry is
# kept -- presumably the positive-class probability; confirm against knn.classes_.
target_names = ("h1n1_vaccine", "seasonal_vaccine")
positive_class_probability = {
    target: test_probability[position][:, 1]
    for position, target in enumerate(target_names)
}

y_preds = pd.DataFrame(positive_class_probability, index=y_test.index)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (6677, 2)


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2653,0.285714,1.0
9506,0.285714,0.285714
23107,0.0,0.142857
22648,0.0,0.428571
25589,0.285714,0.857143


In [14]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the predicted probabilities against the held-out labels.
validation_auc = roc_auc_score(y_test, y_preds)
print(validation_auc)

0.7414210343064139


In [25]:
# Manually recorded results, each as [ROC AUC, elapsed seconds].
# The ROC_7nn_PCA pair matches the values printed by the PCA + 7-NN cells above;
# the other three are presumably from earlier runs not shown in this notebook.
ROC_3nn = [0.6996214808883037, 29.148728132247925]
ROC_5nn = [0.7229525862994551, 29.378359079360962]
ROC_7nn = [0.7397354504203681, 29.483131170272827]

ROC_7nn_PCA = [0.7414210343064139, 49.70987868309021]

# One row per experiment, labelled by experiment name.
ROC_Overview = pd.DataFrame.from_dict(
    {
        'ROC_3nn': ROC_3nn,
        'ROC_5nn': ROC_5nn,
        'ROC_7nn': ROC_7nn,
        'ROC_7nn_PCA': ROC_7nn_PCA,
    },
    orient='index',
    columns=['ROC', 'Time'],
)
ROC_Overview

Unnamed: 0,ROC,Time
ROC_3nn,0.699621,29.148728
ROC_5nn,0.722953,29.378359
ROC_7nn,0.739735,29.483131


In [12]:
from sklearn.metrics import classification_report

# Precision / recall / F1 summary of the hard predictions on the held-out split.
validation_report = classification_report(y_test, test_predictions)
print(validation_report)

              precision    recall  f1-score   support

           0       0.60      0.40      0.48      1410
           1       0.70      0.69      0.69      3137

   micro avg       0.68      0.60      0.64      4547
   macro avg       0.65      0.54      0.59      4547
weighted avg       0.67      0.60      0.63      4547
 samples avg       0.31      0.30      0.30      4547



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
# Wrap the PCA-transformed array in a DataFrame so that components can be
# selected by column label (0, 1, 2, ...).
# The respondent_id index of the frame the array was computed from is passed
# through explicitly; without it the constructor falls back to a positional
# RangeIndex and the rows can no longer be aligned by label with labels_df.
raw_df_processed_PCA_pd = pd.DataFrame(
    raw_df_processed_PCA,
    index=raw_df_processed.index,
)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,-1.17672,-4.133293,-1.614302,0.125509,0.841317,2.870943,0.81007,-0.234957,0.069456,-1.135411,...,0.172287,-0.17835,-0.160004,1.32743,0.413891,-0.70743,-0.002256,0.013678,0.217172,0.116268
1,1.092509,0.334422,0.279211,2.646643,-1.479065,0.416249,0.768287,-2.862104,-1.672479,0.464441,...,0.865187,-0.161551,0.051793,1.415675,-0.209171,-0.150391,-0.786256,0.170386,0.00724,-0.097913
2,-3.062687,-0.189232,-0.081252,1.227165,-0.030669,-0.259173,1.62766,-0.378301,0.123607,1.043371,...,-1.739835,-0.430126,-0.732408,0.254847,-0.551204,-0.227131,-0.455424,0.104135,0.152852,0.798172
3,2.067111,-2.344447,-2.004324,0.343302,1.087153,-1.244538,1.043666,-1.356447,-0.686102,0.070942,...,-0.665793,-0.020114,1.861965,-1.332726,0.919013,0.539391,-0.627539,0.101682,0.024875,0.196815
4,-0.504103,0.831651,0.188744,0.370539,-1.074134,1.041658,-1.599364,-0.088693,-1.574307,-1.390709,...,0.597414,-0.191024,-0.542859,-1.38342,-1.437496,-0.080171,-0.966547,-0.174311,0.026692,0.08702
5,0.39051,1.361206,0.783817,0.954122,-1.701061,-0.933688,0.438401,0.358026,-0.468991,1.025995,...,0.138713,0.045629,-2.235813,-0.245425,0.836297,1.642053,0.028971,-0.083367,0.252769,0.163192
6,-3.83926,-2.071766,-0.907928,1.108235,-0.24171,0.384708,2.44489,-1.429555,1.088153,0.157338,...,0.378221,0.239298,0.056421,-0.157976,0.082854,-0.177527,0.326814,0.207433,0.201848,1.620682
7,-0.927894,0.711832,-0.601985,0.179061,-2.032922,1.143397,1.242754,0.107562,0.764458,1.331587,...,0.458937,-1.433072,0.555745,0.036767,-0.937725,-0.616888,1.121992,0.076335,-0.024647,-0.1671
8,-2.15977,1.017582,-0.118141,0.426204,-2.647338,0.814559,-1.544176,-0.233231,2.070983,1.843378,...,0.338249,-0.525478,0.293791,0.044542,-0.012468,-0.337062,0.041035,0.129452,-0.068964,0.663234
9,0.275242,-2.104417,-1.968178,-0.448927,-0.152123,0.080479,0.095212,-0.673673,-0.298384,-0.246182,...,-0.30895,0.773089,-0.341984,-1.56145,-0.442716,-0.350061,-0.205435,-0.029574,0.172148,0.015117


In [34]:
# Evaluate a 5-nearest-neighbour classifier on a growing prefix of the PCA
# components: the first column only, then the first two, and so on. For every
# prefix the elapsed time and the ROC AUC on the held-out split are recorded.
#
# Changes from the previous version of this cell:
#   * Results are collected in a plain list and turned into a DataFrame once.
#     DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
#     re-copied the whole frame on every iteration.
#   * The train/validation split is computed once. With a fixed random_state
#     the selected rows depend only on the number of samples, so splitting the
#     full frame and then selecting columns gives the same rows as splitting
#     each column subset separately.
#   * 'last_feature' now keeps the column label's own dtype instead of being
#     coerced by the row-wise append.
testfeatures_list = []
roc_records = []

X_train_all, X_test_all, y_train, y_test = train_test_split(
    raw_df_processed_PCA_pd,
    labels_df,
    shuffle=True,
    test_size=0.25,
    random_state=10)

for column in raw_df_processed_PCA_pd.columns.tolist():
    # Grow the feature set by one component per iteration.
    testfeatures_list.append(column)

    X_train = X_train_all[testfeatures_list]
    X_test = X_test_all[testfeatures_list]

    knn = KNeighborsClassifier(n_neighbors=5)

    # The timed region covers fitting plus both prediction calls, matching
    # how the single-model cell above measures its run time.
    start = time.time()
    knn.fit(X_train, y_train)
    test_predictions = knn.predict(X_test)
    test_probability = knn.predict_proba(X_test)
    end = time.time()

    # Column 1 of each per-target probability array is kept -- presumably the
    # positive class; confirm against knn.classes_.
    y_preds = pd.DataFrame(
        {
            "h1n1_vaccine": test_probability[0][:, 1],
            "seasonal_vaccine": test_probability[1][:, 1],
        },
        index=y_test.index,
    )

    roc_records.append(
        {
            'last_feature': column,
            'time': end - start,
            'ROC': roc_auc_score(y_test, y_preds),
        }
    )

# Passing columns explicitly keeps the expected schema even when no
# component was evaluated.
ROC_Features_PCA_5nn = pd.DataFrame(roc_records, columns=['last_feature', 'time', 'ROC'])
    

In [37]:
# Raise the row display limit to 80 so the whole results table is rendered.
pd.set_option("display.max_rows", 80)

# Best-scoring feature prefixes first.
ROC_Features_PCA_5nn.sort_values("ROC", ascending=False)

Unnamed: 0,last_feature,time,ROC
10,10.0,1.852319,0.753225
11,11.0,2.160591,0.748513
12,12.0,2.829969,0.745222
13,13.0,3.723951,0.743347
15,15.0,4.908741,0.741993
14,14.0,4.294517,0.740341
8,8.0,1.234814,0.73972
9,9.0,1.415747,0.738475
17,17.0,9.63245,0.737909
18,18.0,7.660255,0.737573
