In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN

In [2]:
# Load the raw penguins table; the path is relative to the notebook's folder.
df = pd.read_csv(filepath_or_buffer="../data/penguins.csv")

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [4]:
# Print column dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB


In [5]:
# Numeric body measurements used as model inputs.
# Kept as a list (not a tuple) so that df[features] selects several columns.
features = [
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]

In [8]:
# Keep only rows with no missing value in any column. dropna() returns a new
# frame, so the raw `df` is left untouched.
# NOTE(review): this also discards rows whose only missing value is `sex`,
# which is not one of the model features - confirm that is intended.
df_not_null = df.dropna()

# Feature matrix restricted to the selected measurement columns.
X = df_not_null[features]

# Encode the species names as integer class ids; `le` stays in scope so the
# ids can be mapped back to names later.
le = LabelEncoder()
y = le.fit_transform(df_not_null.species)

In [10]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified, so class proportions in the
# validation set may drift from the full data - consider stratify=y.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)

In [11]:
# Balance the classes by randomly duplicating minority-class rows.
# Only the training split is resampled; the validation split stays untouched.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(
    X_train,
    y_train,
)

In [13]:
# k-NN with default hyper-parameters, fitted on the current resampled training
# data and scored on the held-out validation split.
# NOTE(review): the features are unscaled (body_mass_g is in the thousands,
# bill depth in the tens), so distances are likely dominated by body mass -
# consider standardising before k-NN.
knn = KNeighborsClassifier().fit(X_resampled, y_resampled)
y_pred = knn.predict(X_val)

print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.83      0.68      0.75        28
           1       0.53      0.62      0.57        13
           2       0.86      0.96      0.91        26

    accuracy                           0.78        67
   macro avg       0.74      0.75      0.74        67
weighted avg       0.78      0.78      0.78        67



In [14]:
# Logistic regression (liblinear solver) fitted on the current resampled
# training data and scored on the held-out validation split.
lr = LogisticRegression(solver='liblinear').fit(X_resampled, y_resampled)
y_pred = lr.predict(X_val)

print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        26

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67



In [16]:
# Rebalance the training split with SMOTE (synthetic minority samples obtained
# by interpolation). The seed is fixed so that Restart & Run All reproduces the
# same synthetic rows and therefore the same metrics; 42 matches the seed used
# for RandomOverSampler.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [17]:
# Same default k-NN as before, refitted on the current (SMOTE) resampled
# training data and scored on the held-out validation split.
knn = KNeighborsClassifier().fit(X_resampled, y_resampled)
y_pred = knn.predict(X_val)

print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.77      0.71      0.74        28
           1       0.58      0.54      0.56        13
           2       0.83      0.92      0.87        26

    accuracy                           0.76        67
   macro avg       0.73      0.73      0.72        67
weighted avg       0.76      0.76      0.76        67



In [18]:
# Rebalance the training split with ADASYN (adaptive synthetic sampling).
# The seed is fixed so that Restart & Run All reproduces the same synthetic
# rows and therefore the same metrics; 42 matches the seed used for
# RandomOverSampler.
X_resampled, y_resampled = ADASYN(random_state=42).fit_resample(X_train, y_train)

In [19]:
# Same default k-NN as before, refitted on the current (ADASYN) resampled
# training data and scored on the held-out validation split.
knn = KNeighborsClassifier().fit(X_resampled, y_resampled)
y_pred = knn.predict(X_val)

print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.85      0.61      0.71        28
           1       0.47      0.69      0.56        13
           2       0.89      0.96      0.93        26

    accuracy                           0.76        67
   macro avg       0.74      0.75      0.73        67
weighted avg       0.79      0.76      0.76        67

