# k Nearest Neighbours

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import argparse
import pathlib
import os
import jwt
import cv2
import numpy as np
import imutils

def image_to_feature_vector(image, size=(64, 64)):
    return cv2.resize(image, size).flatten()

# get images
def get_image_paths(rootPath):
    imagePaths = []
    for file in os.listdir(rootPath):
        if(file[0] == '.'): continue
        for f in os.listdir(rootPath+file):
            if(f[0] == '.'): continue
            imagePaths.append(os.path.abspath(rootPath+'/'+file+'/'+f))
#             print(os.path.abspath(file+'/'+f).split(os.path.sep)[-2])
    return imagePaths

def extract_color_histogram(image, bins=(8, 8, 8)):
    # extract a 3D color histogram from the HSV color space using
    # the supplied number of `bins` per channel
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins,
        [0, 256, 0, 256, 0, 256])
    cv2.normalize(hist, hist)
    # return the flattened histogram as the feature vector
    return hist.flatten()

all_images = get_image_paths('../dataset/train/')

rawImages = []
features = []
labels = []
files = []

for (i, imagePath) in enumerate(all_images):
    image = cv2.imread(imagePath) #sets image to BGR
    label = imagePath.split(os.path.sep)[-2]
    file = imagePath.split(os.path.sep)[-1]
    
    pixels = image_to_feature_vector(image)
    hist = extract_color_histogram(image)
    
    rawImages.append(pixels)
    features.append(hist)
    labels.append(label)
    files.append(file)
    
    if i > 0 and i % 500 == 0:
        print("[INFO] processed {}/{}".format(i, len(all_images)))

rawImages = np.array(rawImages)
features = np.array(features)
labels = np.array(labels)
print("[INFO] pixels matrix: {:.2f}MB".format(
	rawImages.nbytes / (1024 * 1000.0)))
print("[INFO] features matrix: {:.2f}MB".format(
	features.nbytes / (1024 * 1000.0)))

# model = KNeighborsClassifier(n_neighbors=3)

# ri_score = knnCrossValidate(features, labels, model)
# print("[INFO] Raw Images cross validation accuracy: {:.2f}%".format(ri_score * 100))

# feat_score = knnCrossValidate(features, labels, model)
# print("[INFO] Features cross validation accuracy: {:.2f}%".format(feat_score * 100))

(trainRI, testRI, trainRL, testRL) = train_test_split(
	rawImages, labels, test_size=0.25, random_state=42)
(trainFeat, testFeat, trainLabels, testLabels) = train_test_split(
	features, labels, test_size=0.25, random_state=42)

# train and evaluate a k-NN classifer on the raw pixel intensities
print("[INFO] evaluating raw pixel accuracy...")
ri_model = KNeighborsClassifier(n_neighbors=1)
ri_model.fit(trainRI, trainRL)
ri_acc = ri_model.score(testRI, testRL)
print("[INFO] raw pixel accuracy: {:.2f}%".format(ri_acc * 100))

# train and evaluate a k-NN classifer on the histogram
# representations
print("[INFO] evaluating histogram accuracy...")
fea_model = KNeighborsClassifier(n_neighbors=1)
fea_model.fit(trainFeat, trainLabels)
fea_acc = fea_model.score(testFeat, testLabels)
print("[INFO] histogram accuracy: {:.2f}%".format(fea_acc * 100))


[INFO] processed 500/5216
[INFO] processed 1000/5216
[INFO] processed 1500/5216
[INFO] processed 2000/5216
[INFO] processed 2500/5216
[INFO] processed 3000/5216
[INFO] processed 3500/5216
[INFO] processed 4000/5216
[INFO] processed 4500/5216
[INFO] processed 5000/5216
[INFO] pixels matrix: 62.59MB
[INFO] features matrix: 10.43MB
[INFO] evaluating raw pixel accuracy...
[INFO] raw pixel accuracy: 90.95%
[INFO] evaluating histogram accuracy...
[INFO] histogram accuracy: 81.67%


### Confusion Matrix

In [3]:
from sklearn import metrics
import pandas as pd

cm = metrics.confusion_matrix(testLabels, fea_model.predict(testFeat))
data = {
    "Normal": [cm[0][0], cm[1][0]],
    "Pneumonia": [cm[0][1], cm[1][1]]
}
df = pd.DataFrame(data, index=["Normal", "Pneumonia"])
print(df)
print("\n")
print(metrics.classification_report(testLabels, fea_model.predict(testFeat)))

           Normal  Pneumonia
Normal        234        123
Pneumonia     116        831


              precision    recall  f1-score   support

      NORMAL       0.67      0.66      0.66       357
   PNEUMONIA       0.87      0.88      0.87       947

    accuracy                           0.82      1304
   macro avg       0.77      0.77      0.77      1304
weighted avg       0.82      0.82      0.82      1304



In [6]:
import plotly.express as px


def plot_spread():
#     y_score = ri_model.predict_proba(testRI)[:, 1]
    y_score = fea_model.predict_proba(testFeat)[:, 1]

    fig = px.scatter(
        testFeat, x=0, y=1,
        color=y_score, color_continuous_scale='RdBu',
        symbol=testLabels, symbol_map={'0': 'square-dot', '1': 'circle-dot'},
        labels={'symbol': 'label', 'color': 'score of <br>first class'}
    )
    fig.update_traces(marker_size=12, marker_line_width=1.5)
    fig.update_layout(legend_orientation='h')
    fig.show()

print(features.shape)
print(labels.shape)

plot_spread()

(5216, 512)
(5216,)
