# K-Nearest Neightbor Model

## Step 1: Import packages

In [9]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

import xray_data 

## Step 2: Load, pre-process and split data

In [3]:
labels = ['NORMAL','PNEUMONIA']
X_train, y_train = xray_data.load_train(label_filters=labels, sample='PROP')

100% (460 of 460) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (3875 of 3875) |####################| Elapsed Time: 0:01:41 Time:  0:01:41
100% (1341 of 1341) |####################| Elapsed Time: 0:01:34 Time:  0:01:34
100% (650 of 650) |######################| Elapsed Time: 0:00:00 Time:  0:00:00


In [4]:
print(X_train.shape)
print(y_train.shape)

(5216, 40000)
(5216,)


In [5]:
X_mini_train, X_dev, y_mini_train, y_dev = train_test_split(X_train, y_train, test_size = .2, stratify = y_train)

In [6]:
print(X_mini_train.shape)
print(X_dev.shape)

(4172, 40000)
(1044, 40000)


## Step 3: Train Default KNN Model

In [7]:
knn = KNeighborsClassifier()
knn.fit(X_mini_train, y_mini_train)
y_pred = knn.predict(X_dev)
accuracy = accuracy_score(y_pred,y_dev)

print('Default k-Nearest Neighbors')
print('\nmetric: minkowski\nn_neighbors: 5\naccuracy: %.3f ' % accuracy) 

Default k-Nearest Neighbors

metric: minkowski
n_neighbors: 5
accuracy: 0.942 


## Step 4: CrossValidated Gridsearch KNN models

In [8]:
knn = KNeighborsClassifier()

param_grid = [
    {
        'n_neighbors': np.arange(1, 16), 
        'metric': ['euclidean', 'minkowski']
    }
]

knn_gscv = GridSearchCV(knn, param_grid, cv=5, verbose=3)
knn_gscv.fit(X_train,y_train) 

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END ...metric=euclidean, n_neighbors=1;, score=0.917 total time=  13.4s
[CV 2/5] END ...metric=euclidean, n_neighbors=1;, score=0.933 total time=   9.7s
[CV 3/5] END ...metric=euclidean, n_neighbors=1;, score=0.910 total time=   9.7s
[CV 4/5] END ...metric=euclidean, n_neighbors=1;, score=0.925 total time=  11.1s
[CV 5/5] END ...metric=euclidean, n_neighbors=1;, score=0.932 total time=  12.2s
[CV 1/5] END ...metric=euclidean, n_neighbors=2;, score=0.896 total time=  12.1s
[CV 2/5] END ...metric=euclidean, n_neighbors=2;, score=0.913 total time=  10.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=2;, score=0.888 total time=  11.0s
[CV 4/5] END ...metric=euclidean, n_neighbors=2;, score=0.900 total time=  12.7s
[CV 5/5] END ...metric=euclidean, n_neighbors=2;, score=0.919 total time=  10.4s
[CV 1/5] END ...metric=euclidean, n_neighbors=3;, score=0.929 total time=  10.8s
[CV 2/5] END ...metric=euclidean, n_neighbors=3

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid=[{'metric': ['euclidean', 'minkowski'],
                          'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])}],
             verbose=3)

In [10]:
print('k-Nearest Neighbors Grid Search with 5-Fold Cross-Validation')
print('Best Score: %.3f' % knn_gscv.best_score_)
print('Optimal Parameters:') 
best_params = knn_gscv.best_params_
for param in best_params: 
    print('\t%s: %s' % (param, best_params[param]))

k-Nearest Neighbors Grid Search with 5-Fold Cross-Validation
Best Score: 0.934
Optimal Parameters:
	metric: euclidean
	n_neighbors: 5


In [11]:
y_pred = knn_gscv.predict(X_dev)
accuracy = accuracy_score(y_pred,y_dev)

print('Optimal k-Nearest Neighbors Model')
print('\nmetric: euclidean\nn_neighbors: 5\naccuracy: %.3f ' % accuracy) 

Optimal k-Nearest Neighbors Model

metric: euclidean
n_neighbors: 5
accuracy: 0.958 


## Part 5: Bacterial Pneumonia

### Load, pre-process and split data

In [3]:
X_train, y_train = xray_data.load_train(label_filters=labels, subset='PROP', pn_bacterial=True)
print(X_train.shape)
print(y_train.shape)

100% (460 of 460) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
100% (3875 of 3875) |####################| Elapsed Time: 0:01:09 Time:  0:01:09
100% (1341 of 1341) |####################| Elapsed Time: 0:02:11 Time:  0:02:11
100% (650 of 650) |######################| Elapsed Time: 0:00:00 Time:  0:00:00
- |#                                                  | 0 Elapsed Time: 0:00:00


PNEUMONIA: 2530
NORMAL: 1341
Total: 3871


In [5]:
X_mini_train, X_dev, y_mini_train, y_dev = train_test_split(X_train, y_train, test_size = .2, stratify = y_train)
print(X_mini_train.shape)
print(X_dev.shape)

(3096, 40000)
(775, 40000)


### Default KNN Model

In [6]:
knn = KNeighborsClassifier()
knn.fit(X_mini_train, y_mini_train)
y_pred = knn.predict(X_dev)
accuracy = accuracy_score(y_pred,y_dev)

print('Default k-Nearest Neighbors')
print('\nmetric: minkowski\nn_neighbors: 5\naccuracy: %.3f ' % accuracy) 

Default k-Nearest Neighbors

metric: minkowski
n_neighbors: 5
accuracy: 0.930 


### CrossValidated Gridsearch KNN models

In [8]:
knn = KNeighborsClassifier()

param_grid = [
    {
        'n_neighbors': np.arange(1, 16), 
        'metric': ['euclidean', 'minkowski']
    }
]

knn_gscv = GridSearchCV(knn, param_grid, cv=5, verbose=3)
knn_gscv.fit(X_train,y_train) 

print('k-Nearest Neighbors Grid Search with 5-Fold Cross-Validation')
print('Best Score: %.3f' % knn_gscv.best_score_)

print('Optimal Parameters:') 
best_params = knn_gscv.best_params_
for param in best_params: 
    print('\t%s: %s' % (param, best_params[param]))
    
y_pred = knn_gscv.predict(X_dev)
accuracy = accuracy_score(y_pred,y_dev)

print('Optimal k-Nearest Neighbors Model')
print('\nmetric: euclidean\nn_neighbors: 5\naccuracy: %.3f ' % accuracy) 

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END ...metric=euclidean, n_neighbors=1;, score=0.930 total time=   6.3s
[CV 2/5] END ...metric=euclidean, n_neighbors=1;, score=0.934 total time=   5.9s
[CV 3/5] END ...metric=euclidean, n_neighbors=1;, score=0.926 total time=   8.2s
[CV 4/5] END ...metric=euclidean, n_neighbors=1;, score=0.929 total time=   6.1s
[CV 5/5] END ...metric=euclidean, n_neighbors=1;, score=0.935 total time=   6.1s
[CV 1/5] END ...metric=euclidean, n_neighbors=2;, score=0.908 total time=   6.1s
[CV 2/5] END ...metric=euclidean, n_neighbors=2;, score=0.911 total time=   8.8s
[CV 3/5] END ...metric=euclidean, n_neighbors=2;, score=0.889 total time=   7.7s
[CV 4/5] END ...metric=euclidean, n_neighbors=2;, score=0.904 total time=   5.4s
[CV 5/5] END ...metric=euclidean, n_neighbors=2;, score=0.920 total time=   6.1s
[CV 1/5] END ...metric=euclidean, n_neighbors=3;, score=0.943 total time=   7.0s
[CV 2/5] END ...metric=euclidean, n_neighbors=3

## Part 6: Pneumonia, COVID & Tuberculosis (All Labels)

### Load, pre-process and split data

In [4]:
labels = ['NORMAL','PNEUMONIA','COVID19','TURBERCULOSIS']
X_train, y_train = xray_data.load_train(label_filters=labels, subset='PROP', pn_bacterial=True)
print(X_train.shape)
print(y_train.shape)

100% (460 of 460) |######################| Elapsed Time: 0:01:07 Time:  0:01:07
100% (3875 of 3875) |####################| Elapsed Time: 0:01:34 Time:  0:01:34
100% (1341 of 1341) |####################| Elapsed Time: 0:01:57 Time:  0:01:57
100% (650 of 650) |######################| Elapsed Time: 0:00:21 Time:  0:00:21


COVID19: 460
PNEUMONIA: 2530
NORMAL: 1341
TURBERCULOSIS: 650
Total: 4981
(4981, 40000)
(4981,)


In [5]:
X_mini_train, X_dev, y_mini_train, y_dev = train_test_split(X_train, y_train, test_size = .2, stratify = y_train)
print(X_mini_train.shape)
print(X_dev.shape)

(3984, 40000)
(997, 40000)


### Default KNN Model

In [6]:
knn = KNeighborsClassifier()
knn.fit(X_mini_train, y_mini_train)
y_pred = knn.predict(X_dev)
accuracy = accuracy_score(y_pred,y_dev)

print('Default k-Nearest Neighbors')
print('\nmetric: minkowski\nn_neighbors: 5\naccuracy: %.3f ' % accuracy) 

Default k-Nearest Neighbors

metric: minkowski
n_neighbors: 5
accuracy: 0.901 


### CrossValidated Gridsearch KNN models

In [7]:
knn = KNeighborsClassifier()

param_grid = [
    {
        'n_neighbors': np.arange(1, 16), 
        'metric': ['euclidean', 'minkowski']
    }
]

knn_gscv = GridSearchCV(knn, param_grid, cv=5, verbose=3)
knn_gscv.fit(X_train,y_train) 

print('k-Nearest Neighbors Grid Search with 5-Fold Cross-Validation')
print('Best Score: %.3f' % knn_gscv.best_score_)

print('Optimal Parameters:') 
best_params = knn_gscv.best_params_
for param in best_params: 
    print('\t%s: %s' % (param, best_params[param]))
    
y_pred = knn_gscv.predict(X_dev)
accuracy = accuracy_score(y_pred,y_dev)

print('Optimal k-Nearest Neighbors Model')
print('\nmetric: euclidean\nn_neighbors: 5\naccuracy: %.3f ' % accuracy) 

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END ...metric=euclidean, n_neighbors=1;, score=0.906 total time=   8.1s
[CV 2/5] END ...metric=euclidean, n_neighbors=1;, score=0.906 total time=   9.0s
[CV 3/5] END ...metric=euclidean, n_neighbors=1;, score=0.896 total time=  13.6s
[CV 4/5] END ...metric=euclidean, n_neighbors=1;, score=0.901 total time=  10.3s
[CV 5/5] END ...metric=euclidean, n_neighbors=1;, score=0.911 total time=   8.9s
[CV 1/5] END ...metric=euclidean, n_neighbors=2;, score=0.909 total time=   8.6s
[CV 2/5] END ...metric=euclidean, n_neighbors=2;, score=0.893 total time=   8.5s
[CV 3/5] END ...metric=euclidean, n_neighbors=2;, score=0.882 total time=   8.3s
[CV 4/5] END ...metric=euclidean, n_neighbors=2;, score=0.882 total time=   8.7s
[CV 5/5] END ...metric=euclidean, n_neighbors=2;, score=0.893 total time=   8.6s
[CV 1/5] END ...metric=euclidean, n_neighbors=3;, score=0.921 total time=   8.8s
[CV 2/5] END ...metric=euclidean, n_neighbors=3

In [10]:
confusion_matrix(y_dev, y_pred)

array([[258,  11,   0,   0],
       [  9, 497,   0,   0],
       [  0,   5,  85,   2],
       [  0,   7,  12, 111]])