### K Nearest Neighbors Classifier (KNN)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# load the iris data through seaborn and preview its first three rows
iris = sns.load_dataset("iris")
iris.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


#### build the KNN classifier

- Similarity $\approx$ proximity in the feature space
- classify an example based on its k nearest neighbors (k = 1, 2, 3, ...)
- the class of a new observation is the majority class among its k nearest neighbors

##### train/test split

- split the dataset into two subsets: a ***training set*** and a ***test set***
- the test set simulates future ***unseen*** data

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# hold out 20% of the rows as the test set; the integer random_state keeps the split repeatable
X_train, X_test = train_test_split(
    iris,
    test_size = 0.2,
    random_state = 2873,
)
(X_train.shape, X_test.shape)

((120, 5), (30, 5))

In [5]:
# preview the training rows; the index still carries the row labels from iris
X_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
2,4.7,3.2,1.3,0.2,setosa
66,5.6,3.0,4.5,1.5,versicolor
114,5.8,2.8,5.1,2.4,virginica
87,6.3,2.3,4.4,1.3,versicolor
145,6.7,3.0,5.2,2.3,virginica


##### build the matrix of the training dataset

In [6]:
# Pick the feature columns by name rather than by counting back from the end:
# X_train gains helper columns later in the notebook, so a negative slice would
# return a different matrix (or fail on the species strings) if this cell is re-run.
# NOTE(review): petal_width is not part of the matrix, matching the three-value
# test vectors used below — confirm that leaving it out is intended.
I = X_train[['sepal_length', 'sepal_width', 'petal_length']].to_numpy(dtype = 'float')

##### set the number of nearest neighbors k (an odd k makes ties less likely)

In [7]:
# number of neighbors that vote on the class
# (with three species an odd k makes ties less likely but does not rule them out, e.g. a 2-2-1 vote)
k = 5

##### step-by-step classification of a random example from the test set

In [8]:
# select a random index from the test set
# a locally seeded generator keeps the choice identical after Restart & Run All
rndIdx = int(np.random.default_rng(2873).integers(0, X_test.shape[0]))
rndIdx

22

In [9]:
# show the test row at that position, features together with the true species label
X_test.iloc[rndIdx]

sepal_length           5.7
sepal_width            3.0
petal_length           4.2
petal_width            1.2
species         versicolor
Name: 95, dtype: object

In [10]:
# get vector data
# take as many leading columns as the training matrix I has, so the slice stays
# aligned with I even after extra columns have been appended to X_test
x = X_test.iloc[rndIdx, :I.shape[1]].to_numpy(dtype = 'float')
x

array([5.7, 3. , 4.2])

In [11]:
# Euclidean distance from x to every training row, stored as a new column of X_train
X_train['dist2x'] = np.linalg.norm(I - x, axis = 1)
X_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,dist2x
2,4.7,3.2,1.3,0.2,setosa,3.074085
66,5.6,3.0,4.5,1.5,versicolor,0.316228
114,5.8,2.8,5.1,2.4,virginica,0.927362
87,6.3,2.3,4.4,1.3,versicolor,0.943398
145,6.7,3.0,5.2,2.3,virginica,1.414214


In [12]:
# order the training rows from nearest to farthest and show the closest ones
X_train.sort_values(by = 'dist2x').head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,dist2x
96,5.7,2.9,4.2,1.3,versicolor,0.1
88,5.6,3.0,4.1,1.3,versicolor,0.141421
61,5.9,3.0,4.2,1.5,versicolor,0.2
99,5.7,2.8,4.1,1.3,versicolor,0.223607
66,5.6,3.0,4.5,1.5,versicolor,0.316228


In [13]:
# keep only the k rows with the smallest distance to x
X_train.sort_values(by = 'dist2x').head(k)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,dist2x
96,5.7,2.9,4.2,1.3,versicolor,0.1
88,5.6,3.0,4.1,1.3,versicolor,0.141421
61,5.9,3.0,4.2,1.5,versicolor,0.2
99,5.7,2.8,4.1,1.3,versicolor,0.223607
66,5.6,3.0,4.5,1.5,versicolor,0.316228


In [14]:
# tally how many of the k nearest neighbors belong to each species
(
    X_train
    .sort_values(by = 'dist2x')
    .head(k)
    .species
    .value_counts()
    .to_frame('n')
    .reset_index()
)

Unnamed: 0,index,n
0,versicolor,5


In [15]:
# the first row of the (descending) count table holds the majority species
(
    X_train
    .sort_values(by = 'dist2x')
    .head(k)
    .species
    .value_counts()
    .to_frame('n')
    .reset_index()
    .iloc[0, 0]
)

'versicolor'

##### compact version

In [16]:
# get random test datapoint (a fixed seed keeps the chosen example the same on every run)
rndIdx = int(np.random.default_rng(3782).integers(0, X_test.shape[0]))
print(X_test.iloc[rndIdx])
# get vector datapoint; its length follows the width of the training matrix I,
# so the slice stays valid even after extra columns have been appended to X_test
x = X_test.iloc[rndIdx, :I.shape[1]].to_numpy(dtype = 'float')
# compute distances
X_train['dist2x'] = np.sqrt(np.sum((x -I)**2, axis = 1))
# sort distances, get the k nearest neighbors and class counts
X_train.sort_values(by = 'dist2x')[:k].species.value_counts().to_frame('n').reset_index()

sepal_length          6.0
sepal_width           2.2
petal_length          5.0
petal_width           1.5
species         virginica
Name: 119, dtype: object


Unnamed: 0,index,n
0,virginica,3
1,versicolor,2


##### classify all examples in the test set

In [17]:
def predict_species(t):
    """Predict the species of one observation by majority vote of its k nearest training rows.

    Parameters
    ----------
    t : pandas.Series
        One row of the test frame. Its leading entries must be the numeric
        features, in the same column order that was used to build ``I``;
        any trailing entries (species, predicted, ...) are ignored.

    Returns
    -------
    str
        The most frequent ``species`` label among the ``k`` training rows
        with the smallest Euclidean distance to ``t``.

    Notes
    -----
    Reads the notebook-level ``I`` (training feature matrix), ``k`` and
    ``X_train`` and leaves all three unmodified. The rows of ``I`` are
    assumed to be in the same order as the rows of ``X_train``.
    """
    # Slice by the width of I instead of counting back from the end, so rows
    # that carry extra trailing columns are still classified correctly.
    x = t.iloc[:I.shape[1]].to_numpy(dtype = 'float')
    dist = np.sqrt(np.sum((x - I)**2, axis = 1))
    # positions of the k smallest distances; the stable sort makes equal distances
    # resolve in training-row order
    nearest = np.argsort(dist, kind = 'stable')[:k]
    # value_counts is sorted by descending count, so its first label is the majority class
    return X_train['species'].iloc[nearest].value_counts().index[0]

In [18]:
# classify every test row; a 'predicted' column left over from an earlier run of this
# cell is dropped first, so predict_species always receives rows in the original layout
X_test['predicted'] = X_test.drop(columns = 'predicted', errors = 'ignore').apply(predict_species, axis = 1)
X_test.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,predicted
102,7.1,3.0,5.9,2.1,virginica,virginica
25,5.0,3.0,1.6,0.2,setosa,setosa
119,6.0,2.2,5.0,1.5,virginica,virginica
128,6.4,2.8,5.6,2.1,virginica,virginica
138,6.0,3.0,4.8,1.8,virginica,versicolor


#### check predictions

In [20]:
# for each true species, count how often each label was predicted
X_test.groupby('species')['predicted'].value_counts().to_frame(name = 'n')

Unnamed: 0_level_0,Unnamed: 1_level_0,n
species,predicted,Unnamed: 2_level_1
setosa,setosa,9
versicolor,versicolor,10
versicolor,virginica,1
virginica,virginica,8
virginica,versicolor,2
