In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from numpy import mean
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, GridSearchCV
#from sklearn.externals.six import StringIO  
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error,confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier


In [3]:
%store -r wp4
wp4 = wp4
 
%store -r wp8
wp8 = wp8

%store -r dwt4
dwt4 = dwt4

%store -r dwt8
dwt8 = dwt8

In [4]:
x = dwt8.loc[:, dwt8.columns != 'state'] #features
y = dwt8.loc[:, dwt8.columns == 'state'] #supervisor

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.67)

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

knn_model = KNeighborsClassifier()
knn_model.fit(x_train, y_train.values.ravel())
y_pred = knn_model.predict(x_test)

accuracy = classification_report(y_test, y_pred, output_dict=True)['accuracy']
specificity = classification_report(y_test, y_pred, output_dict=True)['0']['recall'] #recall of the negative class = specificity
sensitivity = classification_report(y_test, y_pred, output_dict=True)['1']['recall'] #recall of the positive class = sensitivity

print("Accuracy: ", accuracy)
print("Specificity: ", specificity)
print("Sensitivity: ", sensitivity)

Accuracy:  0.9166666666666666
Specificity:  0.8235294117647058
Sensitivity:  0.98


## Grid Search Sampling Function

In [43]:
x = dwt8.loc[:, dwt8.columns != 'state'] #features
y = dwt8.loc[:, dwt8.columns == 'state'] #supervisor

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.67)

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

knn_model = KNeighborsClassifier()
knn_model.fit(x_train, y_train.values.ravel())
y_pred = knn_model.predict(x_test)

param_grid = {'n_neighbors': [2, 4, 6, 8, 10, 15, 20], 
              'weights': ['uniform', 'distance'],
              'metric': ['euclidean', 'manhattan']} 
  
grid_model = GridSearchCV(knn_model, param_grid, refit = True, verbose = 0)

grid_model.fit(x_train, y_train.values.ravel())

grid_predictions = grid_model.predict(x_test)
print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

           0       0.81      0.84      0.82        25
           1       0.93      0.92      0.92        59

    accuracy                           0.89        84
   macro avg       0.87      0.88      0.87        84
weighted avg       0.89      0.89      0.89        84



In [46]:
def knn_grid_reps(df, n_iter = 1000): #default 1k iterations
    df = df
    x = df.loc[:, df.columns != 'state'] #features
    y = df.loc[:, df.columns == 'state'] #supervisor

    eval_metrics = {} #empty dictionary to store metrics

    for i in range(n_iter): 

        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.67)

        scaler = StandardScaler()
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

        knn_model = KNeighborsClassifier()
        knn_model.fit(x_train, y_train.values.ravel())
        y_pred = knn_model.predict(x_test)

        param_grid = {'n_neighbors': [2, 4, 6, 8, 10, 15, 20], 
                    'weights': ['uniform', 'distance'],
                    'metric': ['euclidean', 'manhattan']} 
        
        grid_model = GridSearchCV(knn_model, param_grid, refit = True, verbose = 0)

        grid_model.fit(x_train, y_train.values.ravel())

        y_pred = grid_model.predict(x_test)

        accuracy = classification_report(y_test, y_pred, output_dict=True)['accuracy']
        specificity = classification_report(y_test, y_pred, output_dict=True)['0']['recall'] #recall of the negative class = specificity
        sensitivity = classification_report(y_test, y_pred, output_dict=True)['1']['recall'] #recall of the positive class = sensitivity

        metrics = [accuracy, specificity, sensitivity] 
        eval_metrics[i]=list(metrics)

    eval_metrics = pd.DataFrame.from_dict(eval_metrics).T 
    eval_metrics.columns = ['accuracy', 'specificity', 'sensitivity']
    eval_metrics.head()

    means = eval_metrics.mean(axis=0) #mean of each column
    
    return means

#### Wavelet Packet / Wang, 8-7-02 data (wp8)

In [49]:
knn_grid_reps(df = wp8, n_iter = 1000)

accuracy       0.927131
specificity    0.892994
sensitivity    0.947623
dtype: float64

#### Discrete Wavelet Transform, 8-7-02 data (dwt8)

In [50]:
knn_grid_reps(df = dwt8, n_iter=1000)

accuracy       0.900774
specificity    0.879739
sensitivity    0.914451
dtype: float64