In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
df = pd.read_csv("./parrots.csv").drop(columns=["Unnamed: 0"])
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,wing_length_mm,body_mass_g,sex
0,Philippine Hanging Parrot,Palawan,39.1,18.7,181.0,3750.0,Male
1,Philippine Hanging Parrot,Palawan,39.5,17.4,186.0,3800.0,Female
2,Philippine Hanging Parrot,Palawan,40.3,18.0,195.0,3250.0,Female
3,Philippine Hanging Parrot,Palawan,,,,,
4,Philippine Hanging Parrot,Palawan,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Eclectus Parrot,Solomon Islands,,,,,
340,Eclectus Parrot,Solomon Islands,46.8,14.3,215.0,4850.0,Female
341,Eclectus Parrot,Solomon Islands,50.4,15.7,222.0,5750.0,Male
342,Eclectus Parrot,Solomon Islands,45.2,14.8,212.0,5200.0,Female


In [3]:
df = df.dropna()

In [4]:
df.shape

(333, 7)

In [5]:
encode_cols = ["island", "sex"]
df[encode_cols]

Unnamed: 0,island,sex
0,Palawan,Male
1,Palawan,Female
2,Palawan,Female
4,Palawan,Female
5,Palawan,Male
...,...,...
338,Solomon Islands,Female
340,Solomon Islands,Female
341,Solomon Islands,Male
342,Solomon Islands,Female


In [6]:
onehot_encoder = OneHotEncoder(sparse_output=False)
encoded_df = onehot_encoder.fit_transform(df[encode_cols])

In [7]:
encoded_df = pd.DataFrame(encoded_df, columns=onehot_encoder.get_feature_names_out())
encoded_df

Unnamed: 0,island_Cebu,island_Codfish Island,island_Mindoro,island_Palawan,island_Solomon Islands,sex_Female,sex_Male
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
328,0.0,0.0,0.0,0.0,1.0,1.0,0.0
329,0.0,0.0,0.0,0.0,1.0,1.0,0.0
330,0.0,0.0,0.0,0.0,1.0,0.0,1.0
331,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [8]:
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(df["species"])

target_df = pd.Series(encoded_target, name="species")
target_df

0      2
1      2
2      2
3      2
4      2
      ..
328    0
329    0
330    0
331    0
332    0
Name: species, Length: 333, dtype: int64

In [14]:
train_df = pd.concat(
    [
        target_df,
        df[
            [
                "bill_length_mm",
                "bill_depth_mm",
                "wing_length_mm",
                "body_mass_g",
            ]
        ],
        encoded_df,
    ],
    axis=1,
).dropna()
train_df

Unnamed: 0,species,bill_length_mm,bill_depth_mm,wing_length_mm,body_mass_g,island_Cebu,island_Codfish Island,island_Mindoro,island_Palawan,island_Solomon Islands,sex_Female,sex_Male
0,2.0,39.1,18.7,181.0,3750.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,2.0,39.5,17.4,186.0,3800.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,2.0,40.3,18.0,195.0,3250.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,2.0,36.7,19.3,193.0,3450.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
5,2.0,39.3,20.6,190.0,3650.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
328,0.0,43.3,14.0,208.0,4575.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
329,0.0,48.1,15.1,209.0,5500.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
330,0.0,50.5,15.2,216.0,5000.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
331,0.0,49.8,15.9,229.0,5950.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(
    train_df.drop(columns=["species"]), train_df["species"]
)

In [37]:
model = SVC(kernel="linear")
model.fit(X_train, Y_train)

In [38]:
print(f"Train Accuracy: {model.score(X_train, Y_train)}")
print(f"Test Accuracy: {model.score(X_test, Y_test)}")

Train Accuracy: 1.0
Test Accuracy: 1.0
