In [1]:
# Kennedi Todd
# July 31, 2024
# Support Vector Machine
# Classify penguin based on variables
# https://scatterplotpress.teachable.com/p/ml-code-exercises

# libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# import data
# read data
df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv')
df.head()

# drop row with missing values
df.dropna(axis = 0,        # 0 = row, 1 = column
          how = 'any',     # drop rows if at least one value missing
          subset = None,   # use all columns
          inplace = True)  # update rather than replace

# one-hot encoding
df = pd.get_dummies(df, columns = ['island','sex']) # species is the dependent var
df.head()


Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_FEMALE,sex_MALE
0,Adelie,39.1,18.7,181.0,3750.0,False,False,True,False,True
1,Adelie,39.5,17.4,186.0,3800.0,False,False,True,True,False
2,Adelie,40.3,18.0,195.0,3250.0,False,False,True,True,False
4,Adelie,36.7,19.3,193.0,3450.0,False,False,True,True,False
5,Adelie,39.3,20.6,190.0,3650.0,False,False,True,False,True


In [3]:
# Standardize the independent vars
scaler = StandardScaler()
scaler.fit(df.drop('species', axis = 1))
scaled_df = scaler.transform(df.drop('species', axis = 1))

# assign vars
X = scaled_df
y = df['species']

# split into testing/training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle = True)

# assign algorithm
model = SVC()
model.fit(X_train, y_train)

# make prediction
model_test = model.predict(X_test)

# evaluate
print(confusion_matrix(y_test, model_test))
print(classification_report(y_test, model_test))


[[38  1  0]
 [ 0 23  0]
 [ 0  0 38]]
              precision    recall  f1-score   support

      Adelie       1.00      0.97      0.99        39
   Chinstrap       0.96      1.00      0.98        23
      Gentoo       1.00      1.00      1.00        38

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100



In [4]:
# predict data point
penguin = [
    39,   # bill length
    18.5, # bill depth
    180,  # flipper length
    3750, # body mass
    0,    # island Biscoe
    0,    # island Dream
    1,    # island Torgersen
    1,    # male
    0,    # female
]
predict_penguin = model.predict([penguin])
predict_penguin


array(['Adelie'], dtype=object)