In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn_som.som import SOM
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report

In [3]:
penguin_data = pd.read_csv(
  'https://raw.githubusercontent.com/cdavidshaffer/CPSC4970-AI/master/data/penguins.csv',
  na_values=["?", " ?"]
)
penguin_data.info()
penguin_data = penguin_data.dropna()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [4]:
display(penguin_data)
penguin_data.info()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,FEMALE
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


<class 'pandas.core.frame.DataFrame'>
Index: 334 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            334 non-null    object 
 1   island             334 non-null    object 
 2   culmen_length_mm   334 non-null    float64
 3   culmen_depth_mm    334 non-null    float64
 4   flipper_length_mm  334 non-null    float64
 5   body_mass_g        334 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 20.9+ KB


In [12]:
feature_cols = ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g"]
X_numerical = penguin_data[feature_cols]
X_categorical = penguin_data[["island", "sex"]]
y = penguin_data["species"]

X_num_train, X_num_test, X_cat_train, X_cat_test, y_train, y_test = train_test_split(
    X_numerical, X_categorical, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

display(X_num_train)
display(X_cat_train)

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
297,51.1,16.3,220.0,6000.0
201,49.8,17.3,198.0,3675.0
340,46.8,14.3,215.0,4850.0
266,45.5,13.9,210.0,4200.0
13,38.6,21.2,191.0,3800.0
...,...,...,...,...
14,34.6,21.1,198.0,4400.0
102,37.7,16.0,183.0,3075.0
259,48.7,15.7,208.0,5350.0
288,43.5,14.2,220.0,4700.0


Unnamed: 0,island,sex
297,Biscoe,MALE
201,Dream,FEMALE
340,Biscoe,FEMALE
266,Biscoe,FEMALE
13,Torgersen,MALE
...,...,...
14,Torgersen,MALE
102,Biscoe,FEMALE
259,Biscoe,MALE
288,Biscoe,FEMALE


In [13]:
scaler_som = StandardScaler()
X_num_train_scaled = scaler_som.fit_transform(X_num_train)
X_num_test_scaled  = scaler_som.transform(X_num_test)

In [14]:
som_rows, som_cols = 3, 3 

som = SOM(
    m=som_rows,
    n=som_cols,
    dim=X_num_train_scaled.shape[1],
    lr=0.5,
    sigma=1.0,
    max_iter=3000,
    random_state=0
)

som.fit(X_num_train_scaled)

In [15]:
som_train_cat = som.predict(X_num_train_scaled) 
som_test_cat  = som.predict(X_num_test_scaled)

print(som_train_cat)
print(np.unique(som_train_cat))

[0 2 3 3 8 7 7 2 7 5 7 0 0 0 3 3 0 5 8 0 3 7 2 4 3 2 7 7 7 7 8 3 8 2 8 8 2
 5 8 0 0 7 7 0 3 8 8 3 2 7 0 3 0 3 0 8 2 2 3 7 3 5 5 7 0 2 0 5 0 2 7 8 5 4
 0 0 8 3 5 0 3 2 5 7 0 2 8 0 7 3 7 7 3 7 0 2 7 5 3 8 7 0 7 8 8 7 8 8 0 5 5
 2 2 0 2 2 0 8 3 2 3 3 5 8 8 7 0 8 7 0 2 0 2 7 2 0 8 0 5 3 7 5 0 3 4 0 5 8
 7 0 0 8 7 8 2 2 2 7 8 5 5 7 7 5 4 8 3 0 8 3 5 0 7 8 5 0 3 3 2 0 2 0 7 0 8
 2 0 0 2 0 7 5 8 8 2 7 0 3 0 7 3 0 8 3 3 7 5 7 2 7 7 6 7 3 0 7 6 0 8 7 6 8
 7 3 0 5 8 5 8 7 0 3 8]
[0 2 3 4 5 6 7 8]


In [21]:
X_train_mlp = X_cat_train.copy()
X_train_mlp["som_category"] = som_train_cat

display(X_train_mlp)

X_test_mlp = X_cat_test.copy()
X_test_mlp["som_category"] = som_test_cat

display(X_test_mlp)


Unnamed: 0,island,sex,som_category
297,Biscoe,MALE,0
201,Dream,FEMALE,2
340,Biscoe,FEMALE,3
266,Biscoe,FEMALE,3
13,Torgersen,MALE,8
...,...,...,...
14,Torgersen,MALE,8
102,Biscoe,FEMALE,7
259,Biscoe,MALE,0
288,Biscoe,FEMALE,3


Unnamed: 0,island,sex,som_category
180,Dream,FEMALE,5
235,Biscoe,MALE,0
289,Biscoe,MALE,0
308,Biscoe,FEMALE,3
81,Torgersen,MALE,4
...,...,...,...
274,Biscoe,FEMALE,0
203,Dream,MALE,2
146,Dream,MALE,8
65,Biscoe,MALE,5


In [26]:
X_train_mlp_oh = pd.get_dummies(
    X_train_mlp,
    columns=["island", "sex", "som_category"]
)

X_test_mlp_oh = pd.get_dummies(
    X_test_mlp,
    columns=["island", "sex", "som_category"]
)

display(X_train_mlp_oh)
display(X_test_mlp_oh)

# Align columns: ensure test has same dummy columns as train
X_test_mlp_oh = X_test_mlp_oh.reindex(columns=X_train_mlp_oh.columns, fill_value=0)


Unnamed: 0,island_Biscoe,island_Dream,island_Torgersen,sex_FEMALE,sex_MALE,som_category_0,som_category_2,som_category_3,som_category_4,som_category_5,som_category_6,som_category_7,som_category_8
297,True,False,False,False,True,True,False,False,False,False,False,False,False
201,False,True,False,True,False,False,True,False,False,False,False,False,False
340,True,False,False,True,False,False,False,True,False,False,False,False,False
266,True,False,False,True,False,False,False,True,False,False,False,False,False
13,False,False,True,False,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,False,False,True,False,True,False,False,False,False,False,False,False,True
102,True,False,False,True,False,False,False,False,False,False,False,True,False
259,True,False,False,False,True,True,False,False,False,False,False,False,False
288,True,False,False,True,False,False,False,True,False,False,False,False,False


Unnamed: 0,island_Biscoe,island_Dream,island_Torgersen,sex_FEMALE,sex_MALE,sex__,som_category_0,som_category_2,som_category_3,som_category_4,som_category_5,som_category_6,som_category_7,som_category_8
180,False,True,False,True,False,False,False,False,False,False,True,False,False,False
235,True,False,False,False,True,False,True,False,False,False,False,False,False,False
289,True,False,False,False,True,False,True,False,False,False,False,False,False,False
308,True,False,False,True,False,False,False,False,True,False,False,False,False,False
81,False,False,True,False,True,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,True,False,False,True,False,False,True,False,False,False,False,False,False,False
203,False,True,False,False,True,False,False,True,False,False,False,False,False,False
146,False,True,False,False,True,False,False,False,False,False,False,False,False,True
65,True,False,False,False,True,False,False,False,False,False,True,False,False,False


In [24]:
mlp = MLPClassifier(
    hidden_layer_sizes=(16, 8),
    activation="relu",
    solver="adam",
    max_iter=500,
    random_state=0
)

mlp.fit(X_train_mlp_oh, y_train)

y_train_pred = mlp.predict(X_train_mlp_oh)
y_test_pred  = mlp.predict(X_test_mlp_oh)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc  = accuracy_score(y_test, y_test_pred)

print("MLP training accuracy:", train_acc)
print("MLP test accuracy:", test_acc)
print("\nClassification report (test):")
print(classification_report(y_test, y_test_pred))

MLP training accuracy: 0.9828326180257511
MLP test accuracy: 0.9900990099009901

Classification report (test):
              precision    recall  f1-score   support

      Adelie       0.98      1.00      0.99        44
   Chinstrap       1.00      0.95      0.98        21
      Gentoo       1.00      1.00      1.00        36

    accuracy                           0.99       101
   macro avg       0.99      0.98      0.99       101
weighted avg       0.99      0.99      0.99       101

