In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Load the Palmer Penguins workbook into a DataFrame and preview its first five rows.
penguins_path = 'input/palmer_penguins.xlsx'
df = pd.read_excel(penguins_path)
df.head(5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [3]:
# Discard every row that has at least one missing value, and report the
# frame's shape before and after so the number of dropped rows is visible.
shape_before = df.shape
df.dropna(inplace=True)
shape_after = df.shape
print(shape_before)
print(shape_after)

(344, 8)
(333, 8)


In [4]:
# Recode the target from text labels to integers (female -> 0, male -> 1).
# Series.map is used instead of Series.replace: replacing strings with ints
# relies on pandas silently downcasting an object column to an integer dtype,
# which newer pandas releases deprecate. map() builds the new column directly
# from the lookup table, so the result is an integer column.
sex_codes = {'female': 0, 'male': 1}
df['sex_recoded'] = df['sex'].map(sex_codes)
# map() yields NaN for any label that is missing from sex_codes; fail loudly
# here rather than letting NaN reach the classifier.
assert df['sex_recoded'].notna().all(), 'unexpected value in sex column'
print(df.sex_recoded.unique())

[1 0]


In [5]:
# Select the two predictor columns (X) and the recoded target (y), then hold
# out 40% of the rows for testing. random_state is fixed so the split is
# reproducible.
feature_columns = ['bill_length_mm', 'flipper_length_mm']
X = df[feature_columns]
y = df['sex_recoded']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=123,
)

In [6]:
# Build a 7-nearest-neighbours classifier, train it on the training split,
# then predict a label for every row of the held-out test split.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = knn.predict(X_test)
print(y_pred)

[0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 0 1 0 1 1 1 1 1 0 0 0 1 0 1
 0 0 0 0 1 1 1 1 1 0 1 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1 1
 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 1 1 1 1 1 1 0 1 1 0 0
 0 1 1 0 1 0 1 0 1 1 1 0 0 1 0 1 1 0 0 1 0 0 1]


In [7]:
# evaluate the model: score() returns the mean accuracy on the test split
# (the fraction of test rows classified correctly) -- it is not a z-score
knn.score(X_test, y_test)

0.753731343283582

In [8]:
# Confusion matrix for the test split: rows are the actual classes and
# columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[48  9]
 [24 53]]


In [9]:
# Per-class summary of the test-set predictions. Column meanings:
#   precision - the percentage of predicted positives that are actually positive
#   recall    - the percentage of actual positives that are predicted positive
#   f1-score  - the harmonic mean of precision and recall
#   support   - the number of occurrences of each class
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.67      0.84      0.74        57
           1       0.85      0.69      0.76        77

    accuracy                           0.75       134
   macro avg       0.76      0.77      0.75       134
weighted avg       0.77      0.75      0.75       134

