# kNN on a dataset of penguins

In [1]:
import seaborn as sns
from math import sqrt
from collections import Counter

### Get data

In [2]:
# "penguins" is one of the example datasets that seaborn can load by name
df = sns.load_dataset(name="penguins")
df.head(n=3)  # peek at the first three rows

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female


### Remove missing values

In [3]:
# Drop every row that has a missing value in any column.
# The result is rebound to `df` rather than mutating the frame with
# inplace=True, so the cell reads as a plain "input -> output" step.
df = df.dropna()

In [4]:
# the last rows are held out for predictions, everything before them is training data
n_holdout = 10
feature_cols = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm"]

Xtrain = df[feature_cols].iloc[:-n_holdout]  # multiple columns -> matrix
ytrain = df["species"].iloc[:-n_holdout]     # single column    -> vector

Xpred = df[feature_cols].iloc[-n_holdout:]   # some data for making predictions

In [5]:
# shapes of the training data: (rows, columns) for the feature matrix
# and (rows,) for the target vector
Xtrain.shape, ytrain.shape

((323, 3), (323,))

### Distances

In [6]:
def euclidean_distance(
    a,
    b,
    features=("bill_length_mm", "bill_depth_mm", "flipper_length_mm"),
) -> float:
    """calculates the Euclidean distance of two penguins

    a, b     -- the two penguins; anything that can be indexed by feature
                name works (e.g. a pandas row or a plain dict)
    features -- names of the numeric features that enter the distance;
                defaults to the three body measurements used in this notebook

    Returns the square root of the summed squared differences.
    """
    # one squared difference per feature, added up by sum()
    return sqrt(sum((a[name] - b[name]) ** 2 for name in features))

### Pseudocode of the kNN algorithm

To make a prediction for a new data point P:
    
1. go through all data points T
2. calculate the distance of T to P
3. find the k closest data points
4. calculate the average target value (regression) or majority vote (classification) among the closest points

In [7]:
# k is our hyperparameter: the number of closest training points
# that are considered when making a prediction
k = 7

In [8]:
p = Xpred.iloc[8]  # the penguin for which we make a prediction (row at position 8 of Xpred)

In [9]:
distances = []

# steps 1 and 2 of the pseudocode: visit every training penguin t, measure
# its distance to p and remember the species that belongs to that distance
for i in range(len(Xtrain)):
    t, species = Xtrain.iloc[i], ytrain.iloc[i]
    dist = euclidean_distance(t, p)
    distances.append((dist, species))

In [10]:
# the same (distance, species) list, built with a list comprehension;
# iterrows() yields (index label, row) pairs and the label is not needed here
distances = [
    (euclidean_distance(row, p), label)
    for (_, row), label in zip(Xtrain.iterrows(), ytrain.values)
]

In [11]:
# tuples compare by their first element, so this ranks the list by distance
# (equal distances fall back to comparing the species names)
distances = sorted(distances)

In [12]:
distances[:k]  # a list of the k closest (distance, species) tuples

[(0.424264068711927, 'Gentoo'),
 (1.0392304845413258, 'Gentoo'),
 (1.0440306508910562, 'Gentoo'),
 (1.6673332000533045, 'Gentoo'),
 (1.8275666882497072, 'Gentoo'),
 (2.042057785666214, 'Gentoo'),
 (2.0904544960366875, 'Gentoo')]

### Majority vote for classification

In [13]:
# keep only the species of the k closest points; the distance itself is dropped
pred = [label for _, label in distances[:k]]
pred

['Gentoo', 'Gentoo', 'Gentoo', 'Gentoo', 'Gentoo', 'Gentoo', 'Gentoo']

In [14]:
# count the votes per species; most_common(1) returns the winner with its vote count
votes = Counter(pred)
votes.most_common(1)

[('Gentoo', 7)]