### Penguins dataset: use 1-NN to infer undefined sex values

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split

In [2]:
import seaborn as sns
# Load the "penguins" example dataset through seaborn into the DataFrame `df`.
df = sns.load_dataset("penguins")

In [3]:
# Column dtypes and non-null counts: shows which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


### 1. Find and remove rows with NaNs in any column except sex

In [4]:
# Flag rows that have a missing value in any column other than `sex`.
# `sex` is excluded by name rather than with `iloc[:, :-1]`: later cells append
# inferred-sex columns to `df`, after which `sex` is no longer the last column
# and a positional slice would wrongly treat it as a required column.
df.drop(columns='sex').isna().any(axis=1)

0      False
1      False
2      False
3       True
4      False
       ...  
339     True
340    False
341    False
342    False
343    False
Length: 344, dtype: bool

In [5]:
# Show the rows that have a missing value in any column other than `sex`.
# `sex` is excluded by name (not by position) so the selection stays correct
# even after extra columns have been appended to `df`.
df[df.drop(columns='sex').isna().any(axis=1)]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
3,Adelie,Torgersen,,,,,
339,Gentoo,Biscoe,,,,,


In [6]:
# get the index labels of the rows to be dropped
# (`sex` is excluded by name so the result does not depend on it being the
# last column of `df`)
df.index[df.drop(columns='sex').isna().any(axis=1)]

Int64Index([3, 339], dtype='int64')

In [7]:
# Drop, in place, every row lacking a value in any column other than `sex`.
# Rows where only `sex` is missing are kept: those are the ones to be inferred.
# The mask is built by column name instead of `iloc[:, :-1]`, so re-running this
# cell after inferred-sex columns were appended to `df` cannot drop the
# unlabeled rows by mistake.
incomplete = df.drop(columns='sex').isna().any(axis=1)
df.drop(index=df.index[incomplete], inplace=True)

In [8]:
# Re-inspect non-null counts after dropping the incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 342 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            342 non-null    object 
 1   island             342 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 21.4+ KB


### 2. Infer sex from the numerical attributes only

In [9]:
# Labelled penguins (sex recorded), renumbered 0..n-1 so that row positions
# and index labels coincide.
has_sex = df['sex'].notna()
X_train = df.loc[has_sex].reset_index(drop=True)

In [10]:
# use only numeric attributes
# The selection is derived from X_train's own dtypes; a boolean mask taken from
# `df.dtypes` fails to align as soon as `df` has columns that X_train lacks.
X_train.select_dtypes(exclude='object')

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
3,36.7,19.3,193.0,3450.0
4,39.3,20.6,190.0,3650.0
...,...,...,...,...
328,47.2,13.7,214.0,4925.0
329,46.8,14.3,215.0,4850.0
330,50.4,15.7,222.0,5750.0
331,45.2,14.8,212.0,5200.0


In [11]:
# reference matrix: one row per labelled penguin, one column per non-object
# attribute, in the same row order as X_train.
# Columns are chosen from X_train's own dtypes (not `df.dtypes`) so the cell
# still works when `df` has gained columns since X_train was built.
X = X_train.select_dtypes(exclude='object').to_numpy(dtype=float)
X

array([[  39.1,   18.7,  181. , 3750. ],
       [  39.5,   17.4,  186. , 3800. ],
       [  40.3,   18. ,  195. , 3250. ],
       ...,
       [  50.4,   15.7,  222. , 5750. ],
       [  45.2,   14.8,  212. , 5200. ],
       [  49.9,   16.1,  213. , 5400. ]])

In [12]:
def inferSex(row):
    """Return the sex of one penguin, inferring it by 1-NN only when unknown.

    Parameters
    ----------
    row : pandas.Series
        One row of ``df``, as passed by ``df.apply(inferSex, axis=1)``.

    Returns
    -------
    The row's own ``sex`` when it is recorded. Otherwise the ``sex`` of the
    labelled penguin whose non-object attributes are closest to this row's,
    using squared Euclidean distance against the reference matrix ``X``.

    Relies on the notebook globals ``df`` (column dtypes), ``X`` (reference
    matrix) and ``X_train`` (labelled rows, in the same order as ``X``).
    """
    # Never overwrite a recorded label: two labelled penguins with identical
    # measurements but different sex would otherwise both receive the label of
    # whichever comes first in X.
    if pd.notna(row['sex']):
        return row['sex']
    # The row arrives as an object Series; cast the features to float so the
    # distance is computed numerically.
    x = row.loc[df.dtypes != 'object'].to_numpy(dtype=float)
    # NOTE(review): features are not rescaled, so attributes with large
    # magnitudes (body_mass_g) dominate the distance.
    nnx = np.argmin(np.sum((x - X) ** 2, axis=1))
    # nnx is a row position in X, hence positional lookup in X_train.
    return X_train['sex'].iloc[nnx]

In [13]:
# Apply inferSex to every row and store the result as a new column.
sex_from_numeric = df.apply(inferSex, axis=1)
df['sex_nn1'] = sex_from_numeric

In [14]:
# Sanity check: list the rows where sex_nn1 differs from sex.
# Only rows whose sex is NaN should show up (NaN never compares equal).
differs_nn1 = df['sex'].ne(df['sex_nn1'])
df[differs_nn1]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,sex_nn1
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,,Male
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,,Male
10,Adelie,Torgersen,37.8,17.1,186.0,3300.0,,Female
11,Adelie,Torgersen,37.8,17.3,180.0,3700.0,,Female
47,Adelie,Dream,37.5,18.9,179.0,2975.0,,Female
246,Gentoo,Biscoe,44.5,14.3,216.0,4100.0,,Male
286,Gentoo,Biscoe,46.2,14.4,214.0,4650.0,,Female
324,Gentoo,Biscoe,47.3,13.8,216.0,4725.0,,Female
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,,Female


### 3. Infer sex from all attributes

In [15]:
# get dataframe with dummies: one 0/1 indicator column per species and island
# The dtype is pinned to int because the default differs between pandas
# versions (bool in recent releases), and bool columns mixed with floats would
# make the reference matrix an object array.
df_dumm = pd.get_dummies(df, columns=['species', 'island'], dtype=int)
df_dumm

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,sex_nn1,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen
0,39.1,18.7,181.0,3750.0,Male,Male,1,0,0,0,0,1
1,39.5,17.4,186.0,3800.0,Female,Female,1,0,0,0,0,1
2,40.3,18.0,195.0,3250.0,Female,Female,1,0,0,0,0,1
4,36.7,19.3,193.0,3450.0,Female,Female,1,0,0,0,0,1
5,39.3,20.6,190.0,3650.0,Male,Male,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
338,47.2,13.7,214.0,4925.0,Female,Female,0,0,1,1,0,0
340,46.8,14.3,215.0,4850.0,Female,Female,0,0,1,1,0,0
341,50.4,15.7,222.0,5750.0,Male,Male,0,0,1,1,0,0
342,45.2,14.8,212.0,5200.0,Female,Female,0,0,1,1,0,0


In [16]:
# Labelled penguins from the dummy-encoded frame, renumbered 0..n-1.
labelled = df_dumm['sex'].notna()
Y_train = df_dumm.loc[labelled].reset_index(drop=True)

In [17]:
# reference matrix: one row per labelled penguin, one column per non-object
# attribute (measurements plus species/island indicators), in Y_train's order.
# Columns are chosen from Y_train's own dtypes (not `df_dumm.dtypes`), and the
# result is forced to float so indicator columns of any dtype yield a numeric
# matrix.
Y = Y_train.select_dtypes(exclude='object').to_numpy(dtype=float)
Y

array([[ 39.1,  18.7, 181. , ...,   0. ,   0. ,   1. ],
       [ 39.5,  17.4, 186. , ...,   0. ,   0. ,   1. ],
       [ 40.3,  18. , 195. , ...,   0. ,   0. ,   1. ],
       ...,
       [ 50.4,  15.7, 222. , ...,   1. ,   0. ,   0. ],
       [ 45.2,  14.8, 212. , ...,   1. ,   0. ,   0. ],
       [ 49.9,  16.1, 213. , ...,   1. ,   0. ,   0. ]])

In [18]:
def inferSex_(row):
    """Return the sex of one penguin, inferring it by 1-NN only when unknown.

    Variant of the numeric-only inference that works on the dummy-encoded
    frame, so the species/island indicator columns take part in the distance.

    Parameters
    ----------
    row : pandas.Series
        One row of ``df_dumm``, as passed by ``df_dumm.apply(inferSex_, axis=1)``.

    Returns
    -------
    The row's own ``sex`` when it is recorded. Otherwise the ``sex`` of the
    labelled penguin whose non-object attributes are closest to this row's,
    using squared Euclidean distance against the reference matrix ``Y``.

    Relies on the notebook globals ``df_dumm`` (column dtypes), ``Y``
    (reference matrix) and ``Y_train`` (labelled rows, same order as ``Y``).
    """
    # Never overwrite a recorded label: a labelled penguin that shares its
    # feature vector with an earlier one of the other sex would otherwise get
    # that earlier penguin's label.
    if pd.notna(row['sex']):
        return row['sex']
    # The row arrives as an object Series; cast to float so measurements and
    # indicator columns are compared numerically.
    y = row.loc[df_dumm.dtypes != 'object'].to_numpy(dtype=float)
    # NOTE(review): features are not rescaled, so large-magnitude attributes
    # (body_mass_g) outweigh the 0/1 indicators in the distance.
    nny = np.argmin(np.sum((y - Y) ** 2, axis=1))
    # nny is a row position in Y, hence positional lookup in Y_train.
    return Y_train['sex'].iloc[nny]

In [19]:
# Apply inferSex_ to every dummy-encoded row; the result is aligned back onto
# `df` by index label when stored as a new column.
sex_from_all = df_dumm.apply(inferSex_, axis=1)
df['sex_nn2'] = sex_from_all

In [20]:
# Sanity check: list the rows where sex_nn2 differs from sex.
# Only rows whose sex is NaN should show up (NaN never compares equal).
differs_nn2 = df['sex'].ne(df['sex_nn2'])
df[differs_nn2]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,sex_nn1,sex_nn2
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,,Male,Male
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,,Male,Male
10,Adelie,Torgersen,37.8,17.1,186.0,3300.0,,Female,Female
11,Adelie,Torgersen,37.8,17.3,180.0,3700.0,,Female,Female
47,Adelie,Dream,37.5,18.9,179.0,2975.0,,Female,Female
246,Gentoo,Biscoe,44.5,14.3,216.0,4100.0,,Male,Male
286,Gentoo,Biscoe,46.2,14.4,214.0,4650.0,,Female,Female
324,Gentoo,Biscoe,47.3,13.8,216.0,4725.0,,Female,Female
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,,Female,Female
