# KNN

### Load Requirements

In [1]:
%load_ext autoreload
%autoreload

import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier

from data_utils import get_training, get_testing, HEADER
from my_pca import my_pca
from validation import Validation

%matplotlib notebook

### KNN - Finding the Best Number of PCA Components

In [2]:
# Sweep the number of PCA components with a fixed 3-neighbour KNN and keep
# the component count whose cross-validated accuracy is highest.
X, y = get_training()
val = Validation(X, y, k=10)

pca_comp = [3, 5, 9, 11, 13, 15, 20, 25, 30]

# One list per metric; entry i corresponds to pca_comp[i].
pca_results = {metric: [] for metric in ('accuracy', 'recall', 'precision')}

# "Best so far" record; element 0 is the accuracy to beat, element 1 the
# component count that produced it.
bsf = (0, -1, 3)
mdl = KNeighborsClassifier(n_neighbors=3)

for comp in pca_comp:
    val.update(pca=comp)
    acc = val.cross_val_accuracy(mdl)
    r = val.get_detailed_results()

    # Store the mean of each detailed metric for plotting later.
    for metric in ('accuracy', 'recall', 'precision'):
        pca_results[metric].append(np.array(r[metric]).mean())

    # Strict comparison: on ties the earliest (smallest) component count wins.
    if acc > bsf[0]:
        bsf = (acc, comp)

best_pca_comp = bsf[1]

## Searching for best K

In [3]:
# Sweep the neighbourhood size k with the PCA setting chosen above and keep
# the k whose cross-validated accuracy is highest.
X, y = get_training()
val = Validation(X, y, k=10, pca=best_pca_comp)

k_list = [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 31, 51, 71, 91]

# One list per metric; entry i corresponds to k_list[i].
k_results = {metric: [] for metric in ('accuracy', 'recall', 'precision')}

# "Best so far" record: (accuracy, pca components, k). The initial value
# means best_k falls back to 3 if no candidate scores above 0.
bsf = (0, -1, 3)

for k in k_list:
    mdl = KNeighborsClassifier(n_neighbors=k)
    acc = val.cross_val_accuracy(mdl)
    r = val.get_detailed_results()

    # Store the mean of each detailed metric for plotting later.
    for metric in ('accuracy', 'recall', 'precision'):
        k_results[metric].append(np.array(r[metric]).mean())

    # Strict comparison: on ties the earliest (smallest) k wins.
    if acc > bsf[0]:
        bsf = (acc, best_pca_comp, k)

best_k = bsf[2]

## Plot Scores

In [5]:
# Accuracy / recall / precision as a function of the number of PCA components.
plt.close()
lines = [
    plt.plot(pca_comp, pca_results[metric], fmt)[0]
    for metric, fmt in (('accuracy', 'rd-'), ('recall', 'bs-'), ('precision', 'g^-'))
]
plt.legend(lines, ('accuracy', 'recall', 'precision'), loc='lower right')
plt.title('KNN vs Varying PCA Components (k = 3)')
plt.xlabel('Number of Components')
plt.show()

<IPython.core.display.Javascript object>

In [8]:
# Accuracy / recall / precision as a function of the neighbourhood size k.
plt.close()
lines = [
    plt.plot(k_list, k_results[metric], fmt)[0]
    for metric, fmt in (('accuracy', 'rd-'), ('recall', 'bs-'), ('precision', 'g^-'))
]
plt.legend(lines, ('accuracy', 'recall', 'precision'), loc='lower left')
plt.title('KNN')
plt.xlabel('Neighborhood Size (k)')
plt.show()

<IPython.core.display.Javascript object>

In [22]:
# Manually chosen hyper-parameters for the final model.
# Candidates noted by the author:
#   k = 3, pca = 30
#   k = 3, pca = 10 (more regularized)  <- the one selected here
# NOTE(review): 10 is not one of the values listed in pca_comp, so this exact
# setting does not appear to have been scored by the search above - confirm.
chosen_best_pca_comp, chosen_best_k = 10, 3

# Show the accuracy stored in the current "best so far" record.
bsf[0]

0.96887234042553205

## Confusion matrices

In [17]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        If True, every row is divided by its sum before plotting, so both
        the colours and the cell annotations show per-row fractions.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Also prints a one-line message stating whether normalization was applied.
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing so the heat map, the colour bar and the cell
    # text all describe the same values.
    if normalize:
        row_sums = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples would divide by zero and turn the whole
        # threshold into NaN; dividing by 1 leaves such a row as all zeros.
        safe_row_sums = np.where(row_sums == 0, 1, row_sums)
        cm = cm.astype('float') / safe_row_sums
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text, the rest black text,
    # to keep the annotation readable against the cell colour.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, '%.2f'%cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    # Extra bottom margin for the rotated x tick labels.
    plt.gcf().subplots_adjust(bottom=0.2)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [18]:
plt.figure(figsize=(6,5))
X_reduced = my_pca(X,chosen_best_pca_comp)
mdl = KNeighborsClassifier(n_neighbors=chosen_best_k)

# Use out-of-fold predictions: each sample is predicted by a model that was
# not fit on it. Fitting and predicting on the same samples gives an
# optimistic confusion matrix for KNN, since every training point is among
# its own nearest neighbours.
# cv=10 is presumably the same fold count as Validation(k=10) - confirm.
# NOTE(review): my_pca is still applied to all of X before the split; move it
# inside the folds if a fully leak-free estimate is required.
y_pred = cross_val_predict(mdl, X_reduced, y, cv=10)
cm_pca = confusion_matrix(y, y_pred)

# cross_val_predict fits clones only; fit mdl itself on all samples so it is
# left fitted after this cell, as before.
mdl.fit(X_reduced, y)

plot_confusion_matrix(cm_pca, classes=['Benign', 'Malignant'], normalize=True)


<IPython.core.display.Javascript object>

Normalized confusion matrix
