# K Nearest Neighbors
* KNN is a classification algorithm that assigns a data point the most likely label based on the label of the data points nearby.

### Training algorithm
1. Store all the data

### Prediction algorithm
1. Calculate the distance from x to all the points in your data.
2. Sort the points in your data by increasing distance from x.
3. Predict the majority label of the "k" closest points to x. 

Note: Choosing a "k" value will affect what class x is assigned to. 

Smaller k may overfit, larger k gives a cleaner cutoff point at the cost of mislabelling some points. It's up to you to find a healthy middle. 

### KNN Pros
* Very simple
* Training is trivial
* Works with any number of classes
* Easy to add more data
* Few parameters (just "k" and distance metric)

### KNN Cons
* High prediction cost (worse for large datasets)
* Not good with high dimensional data
* Categorical features don't work well

## Get the data 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [3]:
# Read in data
df = pd.read_csv('files/Classified Data')
df.info() # 1000 data points, 12 anonymized columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    1000 non-null   int64  
 1   WTT           1000 non-null   float64
 2   PTI           1000 non-null   float64
 3   EQW           1000 non-null   float64
 4   SBI           1000 non-null   float64
 5   LQE           1000 non-null   float64
 6   QWG           1000 non-null   float64
 7   FDJ           1000 non-null   float64
 8   PJF           1000 non-null   float64
 9   HQE           1000 non-null   float64
 10  NXJ           1000 non-null   float64
 11  TARGET CLASS  1000 non-null   int64  
dtypes: float64(10), int64(2)
memory usage: 93.9 KB


In [4]:
df.head() # Need to use classified features to predict target class

Unnamed: 0.1,Unnamed: 0,WTT,PTI,EQW,SBI,LQE,QWG,FDJ,PJF,HQE,NXJ,TARGET CLASS
0,0,0.913917,1.162073,0.567946,0.755464,0.780862,0.352608,0.759697,0.643798,0.879422,1.231409,1
1,1,0.635632,1.003722,0.535342,0.825645,0.924109,0.64845,0.675334,1.013546,0.621552,1.492702,0
2,2,0.72136,1.201493,0.92199,0.855595,1.526629,0.720781,1.626351,1.154483,0.957877,1.285597,0
3,3,1.234204,1.386726,0.653046,0.825624,1.142504,0.875128,1.409708,1.380003,1.522692,1.153093,1
4,4,1.279491,0.94975,0.62728,0.668976,1.232537,0.703727,1.115596,0.646691,1.463812,1.419167,1


In [6]:
# unnamed 0 just a duplicate index it seems
df['Unnamed: 0'].unique()

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [7]:
# Drop the useless column
df.drop('Unnamed: 0',axis=1, inplace=True)

In [8]:
df.describe() # Features around 1 with 0.2 std, let's standardize (here not that important all on similar scales)

Unnamed: 0,WTT,PTI,EQW,SBI,LQE,QWG,FDJ,PJF,HQE,NXJ,TARGET CLASS
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.949682,1.114303,0.834127,0.682099,1.032336,0.943534,0.963422,1.07196,1.158251,1.362725,0.5
std,0.289635,0.257085,0.291554,0.229645,0.243413,0.256121,0.255118,0.288982,0.293738,0.204225,0.50025
min,0.174412,0.441398,0.170924,0.045027,0.315307,0.262389,0.295228,0.299476,0.365157,0.639693,0.0
25%,0.742358,0.942071,0.615451,0.51501,0.870855,0.761064,0.784407,0.866306,0.93434,1.222623,0.0
50%,0.940475,1.118486,0.813264,0.676835,1.035824,0.941502,0.945333,1.0655,1.165556,1.375368,0.5
75%,1.163295,1.307904,1.02834,0.834317,1.19827,1.12306,1.134852,1.283156,1.383173,1.504832,1.0
max,1.721779,1.833757,1.722725,1.634884,1.65005,1.666902,1.713342,1.78542,1.88569,1.89395,1.0


## Standardize Features

In [9]:
# Import scaler
from sklearn.preprocessing import StandardScaler
# Instantiate it
scaler = StandardScaler()
# Fit it to the data 
scaler.fit(df.drop('TARGET CLASS',axis=1))
# Scale the features
scaled_features = scaler.transform(df.drop('TARGET CLASS',axis=1))

In [10]:
# Create a dataframe with the scaled features
df_scaled = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_scaled.head()

Unnamed: 0,WTT,PTI,EQW,SBI,LQE,QWG,FDJ,PJF,HQE,NXJ
0,-0.123542,0.185907,-0.913431,0.319629,-1.033637,-2.308375,-0.798951,-1.482368,-0.949719,-0.643314
1,-1.084836,-0.430348,-1.025313,0.625388,-0.444847,-1.152706,-1.129797,-0.20224,-1.828051,0.636759
2,-0.788702,0.339318,0.301511,0.755873,2.031693,-0.870156,2.599818,0.285707,-0.682494,-0.37785
3,0.982841,1.060193,-0.621399,0.625299,0.45282,-0.26722,1.750208,1.066491,1.241325,-1.026987
4,1.139275,-0.640392,-0.709819,-0.057175,0.822886,-0.936773,0.596782,-1.472352,1.040772,0.27651


## Split data into train and test

In [12]:
# import
from sklearn.model_selection import train_test_split
# instantiate 
X_train, X_test, y_train, y_test = train_test_split(df_scaled, df['TARGET CLASS'], test_size=0.3)

## Modelling with KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier

#### K = 1

In [14]:
# instantiate KNN with k=1 
knn = KNeighborsClassifier(n_neighbors=1)

In [15]:
# fit to the training data
knn.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=1)

In [16]:
# Predict class of test data
pred = knn.predict(X_test)

In [18]:
# Evaluate model with k=1
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(y_test,pred)) # 136 TN, 19 FP, 6 FN, 139 TP, 
print(classification_report(y_test,pred)) # 92% accuracy, precision, recall, f1

[[136  19]
 [  6 139]]
              precision    recall  f1-score   support

           0       0.96      0.88      0.92       155
           1       0.88      0.96      0.92       145

    accuracy                           0.92       300
   macro avg       0.92      0.92      0.92       300
weighted avg       0.92      0.92      0.92       300



## Choosing the best K: The elbow method

In [19]:
# Will hold error rate for all k values
error_rate = []
# Loop through various k values
for i in range(1,40):
    # instantiate KNN with k=i 
    knn = KNeighborsClassifier(n_neighbors=i)
    # fit to the training data
    knn.fit(X_train,y_train)
    # Predict class of test data
    pred_i = knn.predict(X_test)
    # Calculate mean error
    err = np.mean(pred_i != y_test)
    # Add error to list
    error_rate.append(err)

In [24]:
# Visualize error rate
import plotly.express as px
# Minimum at K=24
px.line(x=range(1,40), y=error_rate, title='Error Rate depending on K-value', labels={'x':'K','y':'Error Rate'})

## Use k=24 for lower error

In [25]:
# instantiate KNN with k=1 
knn = KNeighborsClassifier(n_neighbors=24)
# fit to the training data
knn.fit(X_train,y_train)
# Predict class of test data
pred = knn.predict(X_test)

In [26]:
# Evaluate model performance
print(confusion_matrix(y_test,pred)) # 144 TN, 11 FP, 6 FN, 139 TP, 
print(classification_report(y_test,pred)) # 94% accuracy, precision, recall, f1 (improved by 2%)

[[144  11]
 [  6 139]]
              precision    recall  f1-score   support

           0       0.96      0.93      0.94       155
           1       0.93      0.96      0.94       145

    accuracy                           0.94       300
   macro avg       0.94      0.94      0.94       300
weighted avg       0.94      0.94      0.94       300

