## Analyzing the Penguin Dataset with Support Vector Machines

In [1]:
import numpy as np
from numpy.random import default_rng
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import CategoricalColorMapper
from sklearn.svm import SVC
rng = default_rng(5)
output_notebook()
import pandas as pd

Importing functions

In [2]:
def hyperplane(P,x,z=0):
    """Given an SVC object P and an array of vectors x, computes the hyperplane wx+b=z"""
    alphas = P.dual_coef_
    svs = P.support_vectors_
    c = P.intercept_[0]-z
    a = np.sum(alphas.T*svs,axis=0)[0]
    b = np.sum(alphas.T*svs,axis=0)[1]
    return (-c-a*x)/b

def pts(P):
    """Given an SVC object P, returns the two closest points in the associated reduced convex hulls."""
    alphas = P.dual_coef_[0]
    svs = P.support_vectors_
    plus_indices = np.where(alphas>0)
    minus_indices = np.where(alphas<=0)
    alphas = alphas.reshape(-1,1)
    pluspt = np.sum(alphas[plus_indices]*svs[plus_indices],axis=0)/np.sum(alphas[plus_indices])
    minuspt = np.sum(alphas[minus_indices]*svs[minus_indices],axis=0)/np.sum(alphas[minus_indices])
    return pluspt, minuspt

Importing dataset

In [3]:
df = pd.read_csv('/Users/juliaandronowitz/Desktop/thesis/thesis/Data/penguins_size.csv')

Initial exploration of data

In [4]:
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [6]:
def rstr(df): return df.shape, df.apply(lambda x: [x.unique()])
rstr(df)

((344, 7),
                        species                      island  \
 0  [Adelie, Chinstrap, Gentoo]  [Torgersen, Biscoe, Dream]   
 
                                     culmen_length_mm  \
 0  [39.1, 39.5, 40.3, nan, 36.7, 39.3, 38.9, 39.2...   
 
                                      culmen_depth_mm  \
 0  [18.7, 17.4, 18.0, nan, 19.3, 20.6, 17.8, 19.6...   
 
                                    flipper_length_mm  \
 0  [181.0, 186.0, 195.0, nan, 193.0, 190.0, 180.0...   
 
                                          body_mass_g                     sex  
 0  [3750.0, 3800.0, 3250.0, nan, 3450.0, 3650.0, ...  [MALE, FEMALE, nan, .]  )

In [7]:
df.describe()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


In [8]:
df.isna().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

Dropping the rows with values of NaN

In [9]:
df = df.dropna()

Creating a list of labels for each penguin species

In [10]:
labels = []
x = 0
for i in df['species']:
    if i == 'Adelie':
        x = 0
    elif i == 'Chinstrap':
        x = 1
    elif i == 'Gentoo':
        x = 2
    labels.append(x)

These assignments will be used later in graphing the functions. Red will be Adelie, blue is Chinstrap, and green is Gentoo:

In [11]:
colors = ['red','blue','green']
penguin_colors = np.array([colors[i] for i in labels])

Removing species (the class label), island, and sex

In [12]:
data = df.drop(columns=['species','island','sex'])

In [13]:
data.head()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
4,36.7,19.3,193.0,3450.0
5,39.3,20.6,190.0,3650.0


Converting to numpy array

In [14]:
data = data.to_numpy()

In [15]:
data[:5]

array([[  39.1,   18.7,  181. , 3750. ],
       [  39.5,   17.4,  186. , 3800. ],
       [  40.3,   18. ,  195. , 3250. ],
       [  36.7,   19.3,  193. , 3450. ],
       [  39.3,   20.6,  190. , 3650. ]])

Plotting the raw data points

In [16]:
cl_cd=figure(title='Penguin Data: culmen length vs. culmen depth')
cl_cd.scatter(x=data[:,0],y=data[:,1],color=penguin_colors)
show(cl_cd)

In [17]:
cl_fl=figure(title='Penguin Data: culmen length vs. flipper length')
cl_fl.scatter(x=data[:,0],y=data[:,2],color=penguin_colors)
show(cl_fl)

In [18]:
cl_bm=figure(title='Penguin Data: culmen length vs. body mass')
cl_bm.scatter(x=data[:,0],y=data[:,3],color=penguin_colors)
show(cl_bm)

In [19]:
cd_fl=figure(title='Penguin Data: culmen depth vs. flipper length')
cd_fl.scatter(x=data[:,1],y=data[:,2],color=penguin_colors)
show(cd_fl)

In [20]:
cd_bm=figure(title='Penguin Data: culmen depth vs. body mass')
cd_bm.scatter(x=data[:,1],y=data[:,3],color=penguin_colors)
show(cd_bm)

In [21]:
fl_bm=figure(title='Penguin Data: flipper length vs. body mass')
fl_bm.scatter(x=data[:,2],y=data[:,3],color=penguin_colors)
show(fl_bm)

try 3 different classifications:

1. culmen length vs. culmen depth
2. culmen length vs. flipper length
3. culmen length vs. body mass

see which gives highest accuracy

### 1. Culmen Length vs. Culmen Depth

In [150]:
data_cl_cd = data[:,[0,1]]

In [153]:
data_cl_cd[:5]

array([[39.1, 18.7],
       [39.5, 17.4],
       [40.3, 18. ],
       [36.7, 19.3],
       [39.3, 20.6]])

In [156]:
fit_cl_cd = SVC(kernel = 'rbf', C=1000, gamma='scale').fit(data_cl_cd,labels)
fit_cl_cd.predict(data_cl_cd)
print('Classifier yields accuracy of {:2f}%'.format(fit_cl_cd.score(data_cl_cd,labels)))

Classifier yields accuracy of 0.976048%


### 2. Culmen Length vs. Flipper Depth

In [157]:
data_cl_fd = data[:,[0,2]]

In [158]:
data_cl_fd[:5]

array([[ 39.1, 181. ],
       [ 39.5, 186. ],
       [ 40.3, 195. ],
       [ 36.7, 193. ],
       [ 39.3, 190. ]])

In [163]:
fit_cl_fd = SVC(kernel = 'rbf', C=1000, gamma='scale').fit(data_cl_fd,labels)
fit_cl_fd.predict(data_cl_fd)
print('Classifier yields accuracy of {:2f}%'.format(fit_cl_fd.score(data_cl_fd,labels)))

Classifier yields accuracy of 0.955090%


### 3. Culmen Length vs. Body Mass

In [160]:
data_cl_bm = data[:,[0,3]]

In [161]:
data_cl_bm[:5]

array([[  39.1, 3750. ],
       [  39.5, 3800. ],
       [  40.3, 3250. ],
       [  36.7, 3450. ],
       [  39.3, 3650. ]])

In [164]:
fit_cl_bm = SVC(kernel = 'rbf', C=1000, gamma='scale').fit(data_cl_bm,labels)
fit_cl_bm.predict(data_cl_bm)
print('Classifier yields accuracy of {:2f}%'.format(fit_cl_bm.score(data_cl_bm,labels)))

Classifier yields accuracy of 0.829341%


## Using all 4 predictors

In [166]:
fit_all = SVC(kernel = 'rbf', C=1000, gamma='scale').fit(data,labels)
fit_all.predict(data)
print('Classifier yields accuracy of {:2f}%'.format(fit_all.score(data,labels)))

Classifier yields accuracy of 0.871257%


## Conclusion

The support vector machine using only culmen length and culmen depth predicts the species of penguin with the highest accuracy.

### to do:

- format comparison graphs into grid
- graphs for cases 1-3
- parameter tuning