# SVM Lab
In this lab we will work with the sklearn SVM Library to apply support vector machines to the penguin data set.

We load the usual libraries:
- numpy
- bokeh
- the SVC class from the sklearn library (SVC stands for support vector classifier)

In [None]:
import numpy as np
from numpy.random import default_rng
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import CategoricalColorMapper
from sklearn.svm import SVC
# Seeded generator (seed 5) used for the simulated data below.
rng = default_rng(5)
# Route Bokeh output to the notebook so show() renders inline.
output_notebook()

The following two functions are utilities that will help us to extract information from the SVC object in a form
suitable for plotting.  The key is that the vector $w$ that gives the optimal margin is $\sum t_{i}\alpha_{i}x_{i}$
where the $x_{i}$ are the "support vectors" in ```P.support_vectors_```, the $t_{i}$ are the associated labels, and the $\alpha_{i}$
are the "dual coefficients" in ```P.dual_coef_```.  See the documentation of the [mathematical formulation](https://scikit-learn.org/stable/modules/svm.html#svm-mathematical-formulation),
section 1.4.7.1.

In [None]:
def hyperplane(P,x,z=0):
    """Given a fitted SVC object P and an array x, solve w.(x,y)+b=z for y.

    The normal vector is rebuilt from the dual form, w = sum_i c_i * x_i, where
    the c_i are the signed dual coefficients in ``P.dual_coef_`` and the x_i are
    the rows of ``P.support_vectors_``; b is ``P.intercept_[0]``.

    P -- fitted classifier exposing ``dual_coef_``, ``support_vectors_`` and
         ``intercept_``.  Assumes a two-class fit on two features (a single
         row of dual coefficients); only w[0] and w[1] are used.
    x -- scalar or array of first-coordinate values.
    z -- level of the decision function: 0 gives the separating line,
         +1 and -1 give the two margin lines.

    Returns the second coordinate y for each x, so that (x, y) lies on the
    level set.  If w[1] is zero (a vertical line) the division is by zero and
    numpy yields inf/nan rather than raising.
    """
    # Build the normal vector once; both of its components come from the same sum.
    w = np.sum(P.dual_coef_.T * P.support_vectors_, axis=0)
    offset = P.intercept_[0] - z
    # w[0]*x + w[1]*y + offset = 0  =>  y = (-offset - w[0]*x) / w[1]
    return (-offset - w[0] * x) / w[1]

def pts(P):
    """Given an SVC object P, return one representative point per class.

    Each point is the weighted average of the support vectors whose dual
    coefficient has one sign, with the coefficients themselves as weights
    (the signs cancel in the ratio).  In the lab's setting these are the two
    closest points in the associated reduced convex hulls.

    Returns (pluspt, minuspt): the point built from the positive coefficients
    first, then the one built from the non-positive coefficients.
    """
    coef = P.dual_coef_[0]
    vectors = P.support_vectors_

    def weighted_mean(mask):
        # Column of weights so it broadcasts across the feature axis.
        weights = coef[mask].reshape(-1, 1)
        return np.sum(weights * vectors[mask], axis=0) / np.sum(weights)

    return weighted_mean(coef > 0), weighted_mean(coef <= 0)

## Simulated data illustration

The command
```P=SVC(kernel='linear',C=1000).fit(data,labels)```
computes the Support Vector Classifier.  Here C bounds the coefficients $\lambda_{i}$ (called $\alpha_{i}$ above), and assuming the point sets are
linearly separable you should take C large to find the optimal margin based on the convex hulls.  You can
read the "mathematical formulation" part of the SVC documentation to see the exact correspondence.

In [None]:
N = 200

# Two Gaussian clouds of N points each, centred at (-1,-1) and (1,1).
# A is drawn before B so the seeded generator is consumed in a fixed order.
A = rng.normal(-1, .3, size=(N, 2))
B = rng.normal(1, .3, size=(N, 2))

f = figure(x_range=[-2, 2], y_range=[-2, 2], height=500, width=500)
f.scatter(x=A[:, 0], y=A[:, 1], color='blue')
f.scatter(x=B[:, 0], y=B[:, 1], color='green')

# Stack the clouds row-wise; rows from A get label 0, rows from B get label 1.
data = np.concatenate((A, B))
labels = np.array([0] * N + [1] * N)

# Linear classifier with a large C.
P = SVC(kernel='linear', C=1000).fit(data, labels)

# Draw the separating line (level 0) and then the two margin lines
# (levels +1 and -1), all read off the fitted SVC object.
x = np.linspace(-2, 2, 100)
for y in (hyperplane(P, x, level) for level in (0, 1, -1)):
    f.line(x=x, y=y)
show(f)

In [None]:
# P.support_vectors_ holds the support vectors of the fitted classifier, one per row.
# For this separable example with large C they are expected to be the points
# that lie on the marginal hyperplanes.
P.support_vectors_

In [None]:
# P.dual_coef_ gives the associated lambdas multiplied by +/-1 depending on the label.
# These are the signed coefficients that hyperplane() and pts() read.
P.dual_coef_

In [None]:
# To find the "closest points" use the pts function.  It is called once and
# unpacked, rather than being re-evaluated for every coordinate.
pluspt, minuspt = pts(P)
# x- and y-coordinates of the two points, in the form the plotting calls expect.
xs = [pluspt[0], minuspt[0]]
ys = [pluspt[1], minuspt[1]]

In [None]:
# Overlay the segment joining the two closest points, and mark its endpoints,
# on the existing figure f; then display the figure again.
f.line(x=xs,y=ys,color='black',line_width=5)
f.scatter(x=xs,y=ys,color='black',size=8)
show(f)

## Penguins and multiclass classification

I've simplified life a little by removing missing data from the penguin data set and by restricting to  numerical features.

There are more than two features for the penguin data, but let's start by looking at the two we considered in the theoretical discussion:
culmen length and body mass.  These are features 0 and 3.

In [None]:
# Load the penguin features and the integer class labels; the first row of
# each CSV file is skipped as a header.
data = np.genfromtxt("penguin_data.csv",delimiter=',',skip_header=1)
labels = np.genfromtxt("penguin_labels.csv",delimiter=',',dtype=int,skip_header=1)
# Keep columns 0 and 3 only.  Indexing with a list returns a copy, so the
# rescaling on the next line does not modify `data`.
working_data=data[:,[0,3]]
# Divide the second kept column by 200 (the plots label it "body mass/200");
# presumably this brings it to a scale comparable with column 0 -- confirm.
working_data[:,1] = working_data[:,1]/200
# Plot colour for each label value 0, 1, 2, and the per-row colour array.
colors = ['red','blue','green']
penguin_colors = np.array([colors[i] for i in labels])

In [None]:
# Scatter plot of the two working features, coloured by class label.
f=figure(title='Penguin Data: culmen length vs body mass/200',x_range=[30,60],y_range=[10,35])
f.scatter(x=working_data[:,0],y=working_data[:,1],color=penguin_colors)
show(f)

We can try "one vs rest" classification where we consider each group against all of the other points.

In [None]:
# Row indices of each class (label 0 = red, 1 = blue, 2 = green), each kept
# as the tuple of index arrays that np.where/np.nonzero return.
red_points, blue_points, green_points = (
    np.nonzero(labels == k) for k in (0, 1, 2)
)

In [None]:
# One-vs-rest targets: 0 marks the named class, 1 marks every other row.
blue_vs_others = (labels != 1).astype(int)
red_vs_others = (labels != 0).astype(int)
green_vs_others = (labels != 2).astype(int)

In [None]:
# One linear, large-C classifier per one-vs-rest target, fitted in the
# order red, green, blue.
Pred, Pgreen, Pblue = (
    SVC(kernel='linear', C=1000).fit(working_data, target)
    for target in (red_vs_others, green_vs_others, blue_vs_others)
)

In [None]:
# Penguin scatter plot with the red-vs-others classifier only.
f = figure(
    title='Penguin Data: culmen length vs body mass/200',
    x_range=[30, 60],
    y_range=[10, 35],
)
f.scatter(x=working_data[:, 0], y=working_data[:, 1], color=penguin_colors)

x = np.linspace(30, 60, 100)
# Separating line (level 0) and the two margin lines (levels +1 and -1).
yred, y0, y1 = (hyperplane(Pred, x, level) for level in (0, 1, -1))

# Thick line for the separator, thin half-transparent lines for the margins;
# all three share one legend entry.
f.line(x=x, y=yred, line_width=3, color='black', line_dash='dashed',
       legend_label='red vs others')
for margin in (y0, y1):
    f.line(x=x, y=margin, line_width=1, alpha=.5, color='black',
           line_dash='dashed', legend_label='red vs others')

# The blue-vs-others and green-vs-others lines are not drawn in this plot.
show(f)

In [None]:
# Penguin scatter plot with all three one-vs-rest classifiers overlaid.
f = figure(
    title='Penguin Data: culmen length vs body mass/200',
    x_range=[30, 60],
    y_range=[10, 35],
    width=500,
    height=500,
)
f.scatter(x=working_data[:, 0], y=working_data[:, 1], color=penguin_colors)

x = np.linspace(30, 60, 100)
# Separating line of each classifier.
yred, yblue, ygreen = (hyperplane(clf, x) for clf in (Pred, Pblue, Pgreen))

# Per classifier: thick separator first, then the +1 and -1 margin lines drawn
# thin and half-transparent.  Each classifier has its own dash pattern and
# legend entry; drawing order is red, blue, green.
for clf, center, dash, legend in (
    (Pred, yred, 'dashed', 'red vs others'),
    (Pblue, yblue, 'dotted', 'blue vs others'),
    (Pgreen, ygreen, 'dotdash', 'green vs others'),
):
    f.line(x=x, y=center, line_width=3, color='black', line_dash=dash,
           legend_label=legend)
    y0 = hyperplane(clf, x, 1)
    y1 = hyperplane(clf, x, -1)
    for margin in (y0, y1):
        f.line(x=x, y=margin, line_width=1, alpha=.5, color='black',
               line_dash=dash, legend_label=legend)
show(f)

The SVC classifier computes a decision function by evaluating the different hyperplane functions.  So it looks at
fred(p), fblue(p), and fgreen(p).  It assigns the point to the class where the value is largest.  Choose C as large as possible
to use the most straightforward version of the problem.

In [None]:
# Fit a single SVC on the original three-valued labels.
Pall = SVC(kernel='linear', C=1000).fit(working_data, labels)

# Predict once and map each predicted label to its plotting colour.
predicted_colors = [colors[i] for i in Pall.predict(working_data)]

f = figure(title='predicted classification', x_range=[30, 60], y_range=[10, 35])
f.scatter(x=working_data[:, 0], y=working_data[:, 1], color=predicted_colors)

# Defined here as well, so this cell does not depend on an earlier plotting
# cell having been run; it spans the figure's x_range.
x = np.linspace(30, 60, 100)

# Separating line of each one-vs-rest classifier.
yred, yblue, ygreen = (hyperplane(clf, x) for clf in (Pred, Pblue, Pgreen))

# Per classifier: thick separator, then the +1 margin in black and the
# -1 margin in gray, both thin and half-transparent.
for clf, center, dash, legend in (
    (Pred, yred, 'dashed', 'red vs others'),
    (Pblue, yblue, 'dotted', 'blue vs others'),
    (Pgreen, ygreen, 'dotdash', 'green vs others'),
):
    f.line(x=x, y=center, line_width=3, color='black', line_dash=dash,
           legend_label=legend)
    y0 = hyperplane(clf, x, 1)
    y1 = hyperplane(clf, x, -1)
    f.line(x=x, y=y0, line_width=1, alpha=.5, color='black', line_dash=dash,
           legend_label=legend)
    f.line(x=x, y=y1, line_width=1, alpha=.5, color='gray', line_dash=dash,
           legend_label=legend)
show(f)

In [None]:
# score() returns the accuracy as a fraction, so scale it by 100 before
# printing it with a percent sign; ".2f" keeps two decimal places.
print('Classifier yields accuracy of {:.2f}%'.format(100 * Pall.score(working_data, labels)))

## Using all the features

We can train an SVC classifier on 25% of the data and still get exceptionally good predictions using all the features.

In [None]:
from sklearn.model_selection import train_test_split
# test_size=.75 holds out 75% of the rows, so the classifier is trained on the
# remaining 25%.  No random_state is passed, so the split can differ between runs.
data_train, data_test, labels_train, labels_test = train_test_split(data,labels,test_size=.75)
# Linear SVC on every column of `data`, with the library-default C.
P = SVC(kernel='linear').fit(data_train,labels_train)

In [None]:
# Accuracy of the classifier on the held-out rows.
P.score(data_test,labels_test)

# f-MNIST with SVM

The two files fmnist_data.csv and fmnist_labels.csv are a selection of 10000 records of the F-MNIST
data set, slightly cleaned up, for use in SVC classification.

The fmnist_data.csv file contains the 10000x784 records of the images, and fmnist_labels.csv contains the 10000 associated labels
0-9.

In [None]:

# Load the F-MNIST sample.  This rebinds `data` and `labels`, replacing the
# penguin arrays loaded earlier.
# NOTE(review): no skip_header and no dtype are given here, so labels are read
# with genfromtxt's default float dtype, unlike the penguin labels (dtype=int,
# skip_header=1) -- confirm this matches the CSV files.
data = np.genfromtxt('fmnist_data.csv',delimiter=',')
labels=np.genfromtxt('fmnist_labels.csv',delimiter=',')

In [None]:
# Hold out 20% of the records for testing.  Uses train_test_split, which is
# imported in an earlier cell.
data_train, data_test, labels_train, labels_test = train_test_split(data,labels,test_size=.2)

In [None]:
#P = SVC().fit(data_train,labels_train)