### Penguins dataset: KNN with normalization

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# load the penguins example dataset through seaborn
# (seaborn is imported together with the other dependencies in the first cell)
df = sns.load_dataset("penguins")

In [3]:
# summarize columns, dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


#### prepare dataset

In [4]:
# drop rows with a missing value in any column other than 'sex'
# (rows that only miss 'sex' are kept: that value is inferred further down)
df = df.dropna(subset = df.columns.drop('sex'))

#### normalize numerical features

In [5]:
def minmax_(col):
    """Rescale a float column linearly to the range [0, 1]; return any other column unchanged."""
    if col.dtype != 'float':
        return col
    lo, hi = col.min(), col.max()
    return (col - lo) / (hi - lo)

df = df.apply(minmax_)
df.head(3)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,0.254545,0.666667,0.152542,0.291667,Male
1,Adelie,Torgersen,0.269091,0.511905,0.237288,0.305556,Female
2,Adelie,Torgersen,0.298182,0.583333,0.389831,0.152778,Female


#### infer sex for rows where sex is NaN

In [6]:
# get dummies dataframe
# dtype is pinned: pandas >= 2.0 creates bool dummies by default, which turns the
# mixed float/dummy matrices built below into object arrays; uint8 keeps the 0/1
# columns numeric on every pandas version
df_dumm = pd.get_dummies(df, columns = ['species', 'island'], dtype = 'uint8')
df_dumm.head(3)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen
0,0.254545,0.666667,0.152542,0.291667,Male,1,0,0,0,0,1
1,0.269091,0.511905,0.237288,0.305556,Female,1,0,0,0,0,1
2,0.298182,0.583333,0.389831,0.152778,Female,1,0,0,0,0,1


In [7]:
# reference dataframe: only the rows whose sex is known, re-indexed from 0
sex_known = df_dumm['sex'].notna()
X_train = df_dumm.loc[sex_known].reset_index(drop = True)
# reference matrix: keep every column that is not object-typed (i.e. everything but 'sex')
numeric_mask = df_dumm.dtypes != 'object'
X = X_train.loc[:, numeric_mask].to_numpy()

In [8]:
# inference function
def inferSex_(row, reference = None, labels = None):
    """Return the sex label of the reference sample nearest to `row` (1-nearest neighbour).

    Parameters
    ----------
    row : pandas.Series
        Numeric feature vector of one sample, in the column order of `reference`.
    reference : array-like, shape (n_reference, n_features), optional
        Feature matrix of the labelled samples. Defaults to the notebook-level `X`.
    labels : sequence, optional
        Labels row-aligned with `reference`. Defaults to the notebook-level `X_train.sex`.

    Returns
    -------
    The label of the reference row with the smallest squared Euclidean distance to `row`;
    on ties the first such row wins (np.argmin).
    """
    # fall back to the notebook-level reference data when none is passed explicitly
    ref = X if reference is None else reference
    lab = X_train.sex if labels is None else labels
    # cast to float so mixed float/dummy inputs never degrade to object-dtype arithmetic
    x = row.to_numpy(dtype = 'float')
    ref = np.asarray(ref, dtype = 'float')
    nnx = np.argmin(np.sum((x - ref)**2, axis = 1))
    # argmin yields a position, so look the label up positionally rather than by index label
    return np.asarray(lab)[nnx]

In [9]:
# keep every recorded sex and run the nearest-neighbour inference only where it is missing,
# so a known label can never be replaced by the label of an identical-looking neighbour
unknown = df_dumm.loc[df.sex.isna(), df_dumm.dtypes != 'object']
inferred = pd.Series([inferSex_(row) for _, row in unknown.iterrows()],
                     index = unknown.index, dtype = 'object')
df['sex_'] = df.sex.fillna(inferred)

In [10]:
# rows whose completed label differs from the recorded one (NaN never compares equal)
df[df['sex'].ne(df['sex_'])]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,sex_
8,Adelie,Torgersen,0.072727,0.595238,0.355932,0.215278,,Female
9,Adelie,Torgersen,0.36,0.845238,0.305085,0.430556,,Male
10,Adelie,Torgersen,0.207273,0.47619,0.237288,0.166667,,Female
11,Adelie,Torgersen,0.207273,0.5,0.135593,0.277778,,Female
47,Adelie,Dream,0.196364,0.690476,0.118644,0.076389,,Female
246,Gentoo,Biscoe,0.450909,0.142857,0.745763,0.388889,,Female
286,Gentoo,Biscoe,0.512727,0.154762,0.711864,0.541667,,Female
324,Gentoo,Biscoe,0.552727,0.083333,0.745763,0.5625,,Female
336,Gentoo,Biscoe,0.450909,0.309524,0.762712,0.604167,,Male


### make inference using all available information

#### train/test split

In [11]:
# hold out 20% of the rows for testing; the raw 'sex' column is left out, 'sex_' stays
Y_train, Y_test = train_test_split(
    df.drop(columns = 'sex'),
    test_size = 0.2,
    random_state = 2873,
)

#### define inference function

In [12]:
def infer_(row, Y, predict, k = 5, train = None):
    """Predict one attribute of a sample by majority vote among its k nearest neighbours.

    Parameters
    ----------
    row : pandas.Series
        Numeric feature vector of the sample, in the column order of `Y`.
    Y : array-like, shape (n_reference, n_features)
        Feature matrix of the reference samples, row-aligned with the reference frame.
    predict : str
        Column of the reference frame that holds the labels to vote on.
    k : int, default 5
        Number of nearest neighbours that vote.
    train : pandas.DataFrame, optional
        Reference frame holding the labels. Defaults to the notebook-level `Y_train`.

    Returns
    -------
    The most frequent value of `predict` among the k reference rows closest to `row`
    (Euclidean distance).

    Raises
    ------
    ValueError
        If `k` is smaller than 1.

    Notes
    -----
    The distances are written to column 'd2x' of the reference frame; the clean-up
    cells of this notebook drop that column again, so the side effect is kept.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    ref = Y_train if train is None else train
    y = row.to_numpy(dtype = 'float')
    # a matrix built from float + bool dummy columns has object dtype, on which np.sqrt fails
    Y = np.asarray(Y, dtype = 'float')
    ref['d2x'] = np.sqrt(np.sum((y - Y)**2, axis = 1))
    # stable sort: equidistant neighbours are ranked deterministically, in row order
    nearest = ref.sort_values(by = 'd2x', kind = 'mergesort')[:k]
    return nearest[predict].value_counts().index[0]

#### classify

In [13]:
# select attribute to predict
predict = 'species'
# one-hot encoded training features ('d2x' left over from an earlier run is not a feature);
# the frame is kept so the test set can reuse its exact column layout
ref_frame = pd.get_dummies(Y_train.drop(predict, axis = 1).drop(columns = 'd2x', errors = 'ignore'))
# reference matrix
Y = ref_frame.to_numpy()
# test data frame, aligned to the training columns: a category missing from one split
# would otherwise produce a different set of dummy columns and misaligned distances
test = pd.get_dummies(Y_test.drop(predict, axis = 1)).reindex(columns = ref_frame.columns, fill_value = 0)
# run inference function over the test set
Y_test['predicted'] = test.apply(lambda row: infer_(row, Y, predict, k = 5), axis = 1)
# show result
Y_test.groupby(predict).predicted.value_counts()

species    predicted
Adelie     Adelie       35
Chinstrap  Chinstrap     9
Gentoo     Gentoo       25
Name: predicted, dtype: int64

In [14]:
# clean previous prediction
# errors='ignore' makes the cell safe to re-run when the helper columns are already gone
Y_train = Y_train.drop(columns = 'd2x', errors = 'ignore')
Y_test = Y_test.drop(columns = 'predicted', errors = 'ignore')

In [15]:
# select attribute to predict
predict = 'island'
# one-hot encoded training features ('d2x' left over from an earlier run is not a feature);
# the frame is kept so the test set can reuse its exact column layout
ref_frame = pd.get_dummies(Y_train.drop(predict, axis = 1).drop(columns = 'd2x', errors = 'ignore'))
# reference matrix
Y = ref_frame.to_numpy()
# test data frame, aligned to the training columns: a category missing from one split
# would otherwise produce a different set of dummy columns and misaligned distances
test = pd.get_dummies(Y_test.drop(predict, axis = 1)).reindex(columns = ref_frame.columns, fill_value = 0)
# run inference function over the test set
Y_test['predicted'] = test.apply(lambda row: infer_(row, Y, predict, k = 5), axis = 1)
# show result
Y_test.groupby(predict).predicted.value_counts()

island     predicted
Biscoe     Biscoe       26
           Torgersen     6
           Dream         4
Dream      Dream        15
           Biscoe        4
           Torgersen     1
Torgersen  Torgersen     6
           Dream         5
           Biscoe        2
Name: predicted, dtype: int64

In [16]:
# clean previous prediction
# errors='ignore' makes the cell safe to re-run when the helper columns are already gone
Y_train = Y_train.drop(columns = 'd2x', errors = 'ignore')
Y_test = Y_test.drop(columns = 'predicted', errors = 'ignore')

In [17]:
# select attribute to predict
predict = 'sex_'
# one-hot encoded training features ('d2x' left over from an earlier run is not a feature);
# the frame is kept so the test set can reuse its exact column layout
ref_frame = pd.get_dummies(Y_train.drop(predict, axis = 1).drop(columns = 'd2x', errors = 'ignore'))
# reference matrix
Y = ref_frame.to_numpy()
# test data frame, aligned to the training columns: a category missing from one split
# would otherwise produce a different set of dummy columns and misaligned distances
test = pd.get_dummies(Y_test.drop(predict, axis = 1)).reindex(columns = ref_frame.columns, fill_value = 0)
# run inference function over the test set
Y_test['predicted'] = test.apply(lambda row: infer_(row, Y, predict, k = 5), axis = 1)
# show result
Y_test.groupby(predict).predicted.value_counts()

sex_    predicted
Female  Female       27
        Male          4
Male    Male         35
        Female        3
Name: predicted, dtype: int64