In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import joblib

In [2]:
df = pd.read_csv("./parrots.csv").drop(columns=["Unnamed: 0"])
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,wing_length_mm,body_mass_g,sex
0,Philippine Hanging Parrot,Palawan,39.1,18.7,181.0,3750.0,Male
1,Philippine Hanging Parrot,Palawan,39.5,17.4,186.0,3800.0,Female
2,Philippine Hanging Parrot,Palawan,40.3,18.0,195.0,3250.0,Female
3,Philippine Hanging Parrot,Palawan,,,,,
4,Philippine Hanging Parrot,Palawan,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Eclectus Parrot,Solomon Islands,,,,,
340,Eclectus Parrot,Solomon Islands,46.8,14.3,215.0,4850.0,Female
341,Eclectus Parrot,Solomon Islands,50.4,15.7,222.0,5750.0,Male
342,Eclectus Parrot,Solomon Islands,45.2,14.8,212.0,5200.0,Female


In [3]:
df = df.dropna()

In [4]:
encode_cols = ["island", "sex"]
df[encode_cols]

Unnamed: 0,island,sex
0,Palawan,Male
1,Palawan,Female
2,Palawan,Female
4,Palawan,Female
5,Palawan,Male
...,...,...
338,Solomon Islands,Female
340,Solomon Islands,Female
341,Solomon Islands,Male
342,Solomon Islands,Female


In [5]:
onehot_encoder = OneHotEncoder(sparse_output=False)
encoded_df = onehot_encoder.fit_transform(df[["sex"]])

In [6]:
encoded_df = pd.DataFrame(encoded_df, columns=onehot_encoder.get_feature_names_out())
encoded_df

Unnamed: 0,sex_Female,sex_Male
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,0.0,1.0
...,...,...
328,1.0,0.0
329,1.0,0.0
330,0.0,1.0
331,1.0,0.0


In [7]:
onehot_encoder_species = LabelEncoder()
onehot_encoder_island = LabelEncoder()

encoded_target_species = onehot_encoder_species.fit_transform(df[["species"]])
encoded_target_island = onehot_encoder_island.fit_transform(df[["island"]])

target_island_df = pd.Series(encoded_target_island, name="island")
target_species_df = pd.Series(encoded_target_species, name="species")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(
    df[
        [
            "bill_length_mm",
            "bill_depth_mm",
            "wing_length_mm",
            "body_mass_g",
        ]
    ]
)

scaled_training_features = scaler.transform(
    df[
        [
            "bill_length_mm",
            "bill_depth_mm",
            "wing_length_mm",
            "body_mass_g",
        ]
    ]
)

scaled_training_features_df = pd.DataFrame(
    scaled_training_features,
    columns=[
        "bill_length_mm",
        "bill_depth_mm",
        "wing_length_mm",
        "body_mass_g",
    ],
)

joblib.dump(scaler, "scaler.joblib")

['scaler.joblib']

In [9]:
train_df = pd.concat(
    [target_island_df, target_species_df, encoded_df, scaled_training_features_df],
    axis=1,
).dropna()
train_df

Unnamed: 0,island,species,sex_Female,sex_Male,bill_length_mm,bill_depth_mm,wing_length_mm,body_mass_g
0,3,2,0.0,1.0,-0.896042,0.780732,-1.426752,-0.568475
1,3,2,1.0,0.0,-0.822788,0.119584,-1.069474,-0.506286
2,3,2,1.0,0.0,-0.676280,0.424729,-0.426373,-1.190361
3,3,2,1.0,0.0,-1.335566,1.085877,-0.569284,-0.941606
4,3,2,0.0,1.0,-0.859415,1.747026,-0.783651,-0.692852
...,...,...,...,...,...,...,...,...
328,4,0,1.0,0.0,0.587352,-1.762145,0.931283,0.892957
329,4,0,1.0,0.0,0.514098,-1.457000,1.002739,0.799674
330,4,0,0.0,1.0,1.173384,-0.744994,1.502928,1.919069
331,4,0,1.0,0.0,0.221082,-1.202712,0.788372,1.234995


In [10]:
X_train_species, X_test_species, Y_train_species, Y_test_species = train_test_split(
    train_df.drop(columns=["island", "species"]),
    train_df["species"],
    test_size=0.1,
)

X_train_island, X_test_island, Y_train_island, Y_test_island = train_test_split(
    train_df.drop(columns=["species", "island"]), train_df["island"], test_size=0.1
)

In [11]:
X_train_species


Unnamed: 0,sex_Female,sex_Male,bill_length_mm,bill_depth_mm,wing_length_mm,body_mass_g
311,0.0,1.0,1.301579,-0.338134,1.717295,1.297183
98,1.0,0.0,-1.115804,0.729875,-0.569284,-1.594587
219,1.0,0.0,0.459158,-1.863860,0.645461,0.426543
19,0.0,1.0,-0.950982,0.017869,-1.498207,-0.506286
153,0.0,1.0,1.338206,0.526444,-0.283462,-0.568475
...,...,...,...,...,...,...
151,1.0,0.0,0.221082,0.323014,-0.212006,-0.319720
108,1.0,0.0,-0.804474,1.797883,-0.712196,-0.381909
9,0.0,1.0,-1.720150,2.001313,-0.212006,0.239977
267,1.0,0.0,0.459158,-1.406143,1.145650,0.861863


In [12]:
for deg in range(16):
    model_species = SVC(kernel="poly", degree=deg + 1)
    model_species.fit(train_df.drop(columns=["species", "island"]), train_df["species"])

    model_island = SVC(kernel="poly", degree=deg + 1)
    model_island.fit(train_df.drop(columns=["species", "island"]), train_df["island"])

    print(f"Degree: {deg + 1}")
    print(
        f"Species Train Accuracy: {round(model_species.score(X_train_species, Y_train_species), 3)}"
    )
    print(
        f"Species Test Accuracy: {round(model_species.score(X_test_species, Y_test_species), 3)}"
    )
    print(
        f"Island Train Accuracy: {round(model_island.score(X_train_island, Y_train_island), 3)}"
    )
    print(
        f"Island Test Accuracy: {round(model_island.score(X_test_island, Y_test_island), 3)}"
    )
    print("==============")

Degree: 1
Species Train Accuracy: 0.993
Species Test Accuracy: 0.971
Island Train Accuracy: 0.726
Island Test Accuracy: 0.676
Degree: 2
Species Train Accuracy: 0.993
Species Test Accuracy: 0.971
Island Train Accuracy: 0.746
Island Test Accuracy: 0.647
Degree: 3
Species Train Accuracy: 0.997
Species Test Accuracy: 0.971
Island Train Accuracy: 0.749
Island Test Accuracy: 0.647
Degree: 4
Species Train Accuracy: 0.98
Species Test Accuracy: 0.971
Island Train Accuracy: 0.732
Island Test Accuracy: 0.676
Degree: 5
Species Train Accuracy: 0.96
Species Test Accuracy: 0.912
Island Train Accuracy: 0.716
Island Test Accuracy: 0.706
Degree: 6
Species Train Accuracy: 0.926
Species Test Accuracy: 0.853
Island Train Accuracy: 0.706
Island Test Accuracy: 0.706
Degree: 7
Species Train Accuracy: 0.9
Species Test Accuracy: 0.853
Island Train Accuracy: 0.706
Island Test Accuracy: 0.706
Degree: 8
Species Train Accuracy: 0.886
Species Test Accuracy: 0.794
Island Train Accuracy: 0.706
Island Test Accuracy: 0.

In [13]:
model_species = SVC(kernel="poly", degree=4)
model_species.fit(train_df.drop(columns=["species", "island"]), train_df["species"])

model_island = SVC(kernel="poly", degree=4)
model_island.fit(train_df.drop(columns=["species", "island"]), train_df["island"])

In [14]:
save_model = input("Save model? (y/n)")

if save_model == "y":
    file_name = input("Filename:")
    joblib.dump(model_species, f"{file_name}1.joblib")
    joblib.dump(model_island, f"{file_name}2.joblib")
else:
    print("Exiting...")