# Support Vector Machines

## Table of contents
* [0. Libraries and helper functions](#libraries)
* [1. A solver for quadratic programming](#solver)
* [2. SVM for the separable case](#separable-case)
* [3. SVM for the non-separable case](#non-separable-case)
* [4. Separable case implementing the primal problem](#separable-case-primal)
* [5. Nonlinear case](#nonlinear-case)


<a class="anchor" id="libraries"></a>
## 0. Libraries and helper functions (for plotly)

In [1]:
import numpy as np
from numpy                   import *
from scipy.optimize          import minimize, Bounds, LinearConstraint
from sklearn.model_selection import train_test_split
from plotly.express          import scatter, scatter_3d, imshow
from plotly.graph_objects    import Mesh3d, Layout, Figure, Contour, Scatter, Isosurface
from sklearn.datasets        import load_iris

In [2]:
iterable = lambda x: hasattr(x, '__iter__')
def rand(*args):
    if len(args) == 1 and iterable(args[0]): # allows rand([3,4])
        return np.random.rand(*args[0])      # allows rand(3,4)
    return np.random.rand(*args)
def randn(*args):
    if len(args) == 1 and iterable(args[0]):
        return np.random.randn(*args[0])  # allows randn([3,4])
    return np.random.randn(*args)         # allows randn(3,4)
def ones(*args):
    if len(args) == 1:
        return np.ones(args[0])   # allows ones([3,4])
    return np.ones(args)          # allows ones(3,4)
def zeros(*args):
    if len(args) == 1:
        return np.zeros(args[0])  # allows zeros([3,4])
    return np.zeros(args)         # allows zeros(3,4)

In [3]:
# visualize the data
def show_iris(x, y, labels=None, symbol_label=None, **kwargs):
    if labels is None:
        labels = {'x':x_names[0], 'y':x_names[1], 'z':x_names[2], 
                  'color':'species', 'symbol': symbol_label}
    fig = scatter_3d(x   = x[:,0],       # x axis of plot
                     y   = x[:,1],       # y axis of plot
                     z   = x[:,2],       # z axis of plot
                     color  = [y_names[n] for n in y],  # relabelled!
                     labels = labels,
                     **kwargs)
    fig['layout']['scene']['aspectmode'] = "data"
    return fig

In [4]:
# function to fix the axis limits
# we need this to plot a large triangle for the hyperplane 
# without automatically changing the axis limit
def lim(x):
    xmin, xmax = x.min(), x.max()
    xr         = xmax - xmin
    return [xmin - 0.05*xr, xmax + 0.05*xr]
def fix_axis(fig):
    xlim, ylim, zlim = lim(x[:,0]), lim(x[:,1]), lim(x[:,2])
    yx = (ylim[1]-ylim[0])/(xlim[1]-xlim[0])
    zx = (zlim[1]-zlim[0])/(xlim[1]-xlim[0])
    fig.update_layout(scene = dict(
        xaxis=dict(range=[xlim[0],xlim[1]]), 
        yaxis=dict(range=[ylim[0],ylim[1]]), 
        zaxis=dict(range=[zlim[0],zlim[1]]),
        aspectmode='manual',
        aspectratio=dict(x=1,y=yx,z=zx)))

In [5]:
# function to plot a plane
# this doesn't always show the plane where we are interested in it!
def add_plane(fig, w, b, opacity=0.5, scaling=100.0):
    # find three vectors that are orthogonal to w
    # vector w=(a, b, c) is orthogonal to (-b, a, 0), or (-c, 0, a) or (0, -c, b)
    # each row is a vector orthogonal to w = (a,b,c)
    x_plane = array([[-w[1],  w[0],     0],  # (-b, a, 0)
                     [-w[2],     0,  w[0]],  # (-c, 0, a)
                     [    0, -w[2],  w[1]]]) # ( 0,-c, b)
    # next shift in w direction, such that x_plane @ w + b = 0
    x_plane += -w * b / (w@w)  # shift in w direction
    tau = scaling   # scaling factor, scale in all directions
                    # increase if plane is not visible
    x_plane = array([[1-tau,   0  ,  tau ],
                     [ tau , 1-tau,   0  ],
                     [  0  ,  tau , 1-tau]]) @ x_plane
    fig.add_trace(Mesh3d(
        color='green', opacity=opacity,
        x = x_plane[:,0], y = x_plane[:,1], z = x_plane[:,2], # define three points
        i = [0], j = [1], k = [2]))                           # define a triangle
    return x_plane

In [6]:
def add_margin(fig, w, b):
    # fix the axis (the planes are much larger)
    fix_axis(fig)    # to keep the current view 
    # even when adding a large triangle for the plane
    # the separating hyperplane
    add_plane(fig, w, b)
    # the margin
    # note that for x on one margin we have x @ w + b == +1
    #          and the other margin we have x @ w + b == -1
    add_plane(fig, w, b + 1, opacity=0.15)
    add_plane(fig, w, b - 1, opacity=0.15)

In [7]:
def show_iris_solution(x, y, w, b, title=''):
    fig = show_iris(x, (y+1)//2)
    add_margin(fig, w, b)
    fig.update_layout(title={'text': title})
    fig.show()

<a class="anchor" id="solver"></a>
## 1. A solver for quadratic programming

In [8]:
# implement quadprog using scipy.optimize.minimize
def quadprog(Q, c, A=None, b=None, Aeq=None, beq=None, lb=None, ub=None, x0=None):
    # DESCRIPTION:
    # min  0.5 * x.T @ Q @ x + c.T @ x
    # s.t. A   @ x <= b
    #      Aeq @ x == beq
    #      lb <= x <= ub
    # init x with x0
    # the following ifs are necessary, because default parameters can not refer to other parameters
    if A   is None: A   =  zeros(0, c.shape[0])
    if b   is None: b   =  zeros(0)
    if Aeq is None: Aeq =  zeros(0, c.shape[0])
    if beq is None: beq =  zeros(0)
    if lb  is None: lb  = -inf * ones(c.shape[0])
    if ub  is None: ub  =  inf * ones(c.shape[0])
    if x0  is None: x0  =  zeros(c.shape[0])
    fun            = lambda x: 0.5 * x.T @ Q @ x + c.T @ x
    thebounds      = Bounds(lb, ub)
    theconstraints = LinearConstraint(vstack([A,Aeq]), 
                concatenate([-inf*ones(A.shape[0]), beq]),
                concatenate([b, beq]))
    return minimize(fun, x0, 
                    bounds=thebounds, 
                    constraints=theconstraints)

In [9]:
help(minimize)

Help on function minimize in module scipy.optimize._minimize:

minimize(fun, x0, args=(), method=None, jac=None, hess=None, hessp=None, bounds=None, constraints=(), tol=None, callback=None, options=None)
    Minimization of scalar function of one or more variables.
    
    Parameters
    ----------
    fun : callable
        The objective function to be minimized.
    
            ``fun(x, *args) -> float``
    
        where ``x`` is a 1-D array with shape (n,) and ``args``
        is a tuple of the fixed parameters needed to completely
        specify the function.
    x0 : ndarray, shape (n,)
        Initial guess. Array of real elements of size (n,),
        where ``n`` is the number of independent variables.
    args : tuple, optional
        Extra arguments passed to the objective function and its
        derivatives (`fun`, `jac` and `hess` functions).
    method : str or callable, optional
        Type of solver.  Should be one of
    
            - 'Nelder-Mead' :ref:`(see he

In [10]:
result = quadprog(Q  = array([[1,  -1],[-1,  2]]),
                  c  = array([-2, -6]), 
                  A  = array([[1, 1],[-1, 2],[2, 1]]), 
                  b  = array([2,2,3]), 
                  lb = zeros(2))
# RESULTS from Matlab:
# [x,f] = quadprog([1,-1;-1,2], [-2,-6], [1,1;-1,2;2,1], [2,2,3], [], [], [0,0])
# x==[0.66, 1.33]
# f==-8.222
print(result.x)
print(result.fun)

[0.66666667 1.33333333]
-8.222222222222266


<a class="anchor" id="separable-case"></a>
## 2. SVM for the separable case
### 2.1 Implementation

In [11]:
A = randn(5,5)
x = randn(5)
print(diag(x)@ A @ diag(x))
print( x.T * A * x)

[[-0.46379464 -0.02508694 -0.2000883  -0.07178973 -0.0474073 ]
 [ 0.11169901  0.19968138  0.08191201  0.04651784  0.07809708]
 [ 0.12831112  0.07349596  0.86041201 -0.34548452  0.12563547]
 [ 0.3031269   0.24655893  1.1722911  -0.6003873   0.38758834]
 [ 0.18185963 -0.0774232  -0.00153795  0.23054743 -0.02667619]]
[[-0.46379464 -0.01345939  0.29452839 -0.10804878 -0.01866955]
 [ 0.20819568  0.19968138 -0.22473744  0.1304967   0.05732522]
 [-0.08716835 -0.02678771  0.86041201  0.35324879 -0.03361208]
 [ 0.20140345  0.08789027 -1.14652461 -0.6003873   0.10141489]
 [ 0.46179338 -0.10547758  0.00574856  0.88110824 -0.02667619]]


In [12]:
# use quadprog for the implementation
def linear_svm_dual_separable_case(x, y, threshold=1e-10):
    n =  y.shape[0]
    Q =  diag(y) @ x @ x.T @ diag(y) # = y * (x@x.T) * y.T
    c = -ones(n)
    result = quadprog(Q, c, 
                      Aeq = y.reshape(1, n),
                      beq = zeros(1),
                      lb  = zeros(n))
    alpha = result.x
    support_vectors = (alpha>threshold)
    n_support_vectors = support_vectors.sum()  # sum up TRUE to get number
    alpha = alpha * (support_vectors)          # threshold alpha
    w     = (alpha * y) @ x
    b     = ((y - x @ w) * support_vectors).sum() / n_support_vectors
    return w, b, alpha
def predict(x, w, b):
    y_predicted = (x @ w + b > 0.0).astype(int)
    return y_predicted
def accuracy(x, y, w, b):
    return (y * (x@w + b) > 0.0).mean()

### 2.2 Toy data
#### 2.2.1 Balanced data

In [13]:
# simple balanced toy data set
# balanced ==> both classes have same size
n = 100
np.random.seed(17)
x, y = randn(n, 3), ones(n).astype(int)
x[:n//2,:] +=  5.0   # separate the classes
y[:n//2]   *= -1     # label the first half -1
w, b, alpha = linear_svm_dual_separable_case(x, y)
print(f"accuracy = {accuracy(x, y, w, b)}")
print(f"alpha =\n {alpha}")
print(f"w = {w}")
print(f"b = {b}")
fig = scatter_3d(x = x[:,0],       # x axis of plot
                 y = x[:,1],       # y axis of plot
                 z = x[:,2],       # z axis of plot
                 color = [['class -1', 'class +1'][n] for n in (y+1)//2])
fig['layout']['scene']['aspectmode'] = "data"
add_margin(fig, w, b)
fig.update_layout(title={'text': "BALANCED toy data"})
fig.show() 

accuracy = 1.0
alpha =
 [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.06587755 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.00246319 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.06834074 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.       

#### 2.2.2 Unbalanced data

In [14]:
# simple unbalanced toy data set
n_pos = 95
n_neg =  5
n     = n_pos + n_neg
np.random.seed(17)
x, y = randn(n, 3), ones(n).astype(int)
x[:n_neg,:] +=  5.0   # separate the classes
y[:n_neg]   *= -1     # label the first half -1
w, b, alpha = linear_svm_dual_separable_case(x, y)
print(f"accuracy = {accuracy(x, y, w, b)}")
print(f"alpha =\n {alpha}")
print(f"w = {w}")
print(f"b = {b}")
fig = scatter_3d(x   = x[:,0],       # x axis of plot
                 y   = x[:,1],       # y axis of plot
                 z   = x[:,2],       # z axis of plot
                 color = [['class -1', 'class +1'][n] for n in (y+1)//2])
fig['layout']['scene']['aspectmode'] = "data"
add_margin(fig, w, b)
fig.update_layout(title={'text': "UNBALANCED toy data"})
fig.show() 

accuracy = 1.0
alpha =
 [0.01810947 0.         0.         0.         0.03645164 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.04029447 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.00538764 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.008879
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0. 

Surprisingly, SVMs also work for unbalanced data sets, which is a very nice feature! 
Note that some SVM implementations allow you to assign weights to the data.  This is 
to modify the loss, not necessarily for unbalanced data sets.

### 2.3 Iris data

In [15]:
# load the iris data set 
# reduce dimensions to three and permute
iris    = load_iris()
i,j,k   = 2, 1, 0                       # pick three dimensions
x       = iris.data[:,[i,j,k]]          # select three columns
x_names = iris.feature_names            # for labelling axis
x_names = [x_names[i], x_names[j], x_names[k]]
y       = iris.target
y_names = iris.target_names    # for legend
show_iris(x, y)

For a separable two class problem, we merge virginica and versicolor.

In [16]:
y[y==0] = -1         # rename setosa      to -1
y[y==2] = +1         # rename virginica   to +1 
#                    # versicolor is already +1
y_names = [iris.target_names[0], iris.target_names[1]+'/'+iris.target_names[2]]
show_iris(x, (y+1)//2)   # remap {-1, 1} to {0, 1} for the legend

Looks separable, so we can apply our current implementation.

In [17]:
# split it into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)
x_all = vstack([x_train, x_test])    # first comes the train, then the test examples
y_all = hstack([y_train, y_test])
train_test = x_train.shape[0]*['train'] + x_test.shape[0]*['test']

In [18]:
# run the linear SVM for the separable case
w, b, alpha = linear_svm_dual_separable_case(x_train, y_train)
print(f"training accuracy = {accuracy(x_train, y_train, w, b)}")
print(f"test     accuracy = {accuracy(x_test , y_test , w, b)}")
print(f"alpha =\n {alpha}")
print(f"w = {w}")
print(f"b = {b}")
show_iris_solution(x_train, y_train, w, b, 
                   "TRAINING data with SVM hyperplane and margin")

training accuracy = 1.0
test     accuracy = 1.0
alpha =
 [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.85471821 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.28973436 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.56498385 0.         0.
 0.         0.         0.        ]
w = [ 1.22702741 -0.45053859  0.00434546]
b = -1.5776432878026527


Note that three points are touching the margin.  Those are the support vectors.

In [19]:
show_iris_solution(x_test, y_test, w, b, 
                   "TEST data with SVM hyperplane and margin")

Note that some points can be inside the margin.  However, they are correctly classified.  The support vectors are always only in the training set.

<a class="anchor" id="non-separable-case"></a>
## 3. SVM for the non-separable case

In [20]:
# the general case (can you spot the difference?)
def linear_svm_dual(x, y, C, threshold=1e-10):
    n   =  y.shape[0]
    Q   =  diag(y) @ x @ x.T @ diag(y)
    c   = -ones(n)
    result = quadprog(Q, c, 
                      Aeq = y.reshape(1, n),
                      beq = zeros(1),
                      lb  = zeros(n),
                      ub  = C*ones(n))
    alpha = result.x
    alpha[alpha < threshold  ] = 0.0   # threshold
    alpha[alpha > C-threshold] = C     # threshold
    # here support vectors are only C > \alpha > 0
    support_vectors = (alpha>threshold) & (alpha < (C-threshold))
    n_support_vectors = support_vectors.sum()  # sum up TRUE to get number
    w     = (alpha * y) @ x
    b     = ((y - x @ w) * support_vectors).sum() / n_support_vectors
    return w, b, alpha

In [21]:
# load the iris data set 
# reduce dimensions to three and permute
iris    = load_iris()
i,j,k   = 2, 1, 0                       # pick three dimensions
x       = iris.data[:,[i,j,k]]          # select three columns
x_names = iris.feature_names            # for labelling axis
x_names = [x_names[i], x_names[j], x_names[k]]
y       = iris.target
y_names = iris.target_names    # for legend
show_iris(x, y)

For a non-separable problem let's merge setosa and versicolor.

In [22]:
y[y==0] = -1         # rename setosa     to -1
y[y==1] = -1         # rename versicolor to -1
y[y==2] = +1         # rename virginica  to +1
y_names = [iris.target_names[0]+'/'+iris.target_names[1], iris.target_names[2]]
show_iris(x, (y+1)//2)   # remap {-1, 1} to {0, 1} for the legend

In [23]:
# split it into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)
x_all = vstack([x_train, x_test])    # first comes the train, then the test examples
y_all = hstack([y_train, y_test])
train_test = x_train.shape[0]*['train'] + x_test.shape[0]*['test']

In [24]:
# run the linear SVM for the nonseparable case
C = 1.0       # the hyperparameter of the SVM
w, b, alpha = linear_svm_dual(x_train, y_train, C)
print(f"training accuracy = {accuracy(x_train, y_train, w, b)}")
print(f"test     accuracy = {accuracy(x_test , y_test , w, b)}")
print(f"alpha =\n {alpha}")
print(f"w = {w}")
print(f"b = {b}")
show_iris_solution(x_train, y_train, w, b, 
                   "TRAINING data with SVM hyperplane and margin")

training accuracy = 0.96
test     accuracy = 0.96
alpha =
 [1.         0.06062526 0.         0.         0.         0.30946103
 0.         0.         0.         1.         0.         1.
 0.         0.         1.         0.         0.         1.
 0.         0.         0.         0.         0.         1.
 0.         0.         0.         0.         1.         0.
 0.         0.         0.         0.         1.         0.37008628
 1.         0.         1.         0.         1.         0.
 1.         0.         0.         1.         0.         0.
 1.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 1.         0.         0.         0.         0.         0.
 0.         1.         1.         0.         0.         1.
 0.         0.         0.        ]
w = [ 2.66512292 -0.59457095 -0.74913368]
b = -6.725415341738963


In [25]:
show_iris_solution(x_test, y_test, w, b, 
                   "TEST data with SVM hyperplane and margin")

What happens if we change the hyperparamter?

In [26]:
# run the linear SVM for the separable case
for C in [1.0, 0.1]:       # the hyperparameter of the SVM
    w, b, alpha = linear_svm_dual(x_train, y_train, C)
    show_iris_solution(x_train, y_train, w, b, 
                       "TRAINING data with SVM hyperplane and margin")


invalid value encountered in scalar divide



<a class="anchor" id="separable-case-primal"></a>
## 4. Separable case implementing the primal problem

In [27]:
#def linear_svm_primal_separable_case(x, y):
#    # YOUR CODE
#    return w, b
from ml_solutions import linear_svm_primal_separable_case

NameError: name 'prior_cov_fn' is not defined

In [None]:
n = 100
np.random.seed(17)
x, y = randn(n, 3), ones(n).astype(int)
x[:n//2,:] +=  5.0   # separate the classes
y[:n//2]   *= -1     # label the first half -1
w, b = linear_svm_primal_separable_case(x, y)
print(f"accuracy = {accuracy(x, y, w, b)}")
print(f"alpha =\n {alpha}")
print(f"w = {w}")
print(f"b = {b}")
fig = scatter_3d(x   = x[:,0],       # x axis of plot
                 y   = x[:,1],       # y axis of plot
                 z   = x[:,2],       # z axis of plot
                 color = [['class -1', 'class +1'][n] for n in (y+1)//2])
fig['layout']['scene']['aspectmode'] = "data"
add_margin(fig, w, b)
fig.update_layout(title={'text': "BALANCED toy data"})
fig.show()

<a class="anchor" id="nonlinear-case"></a>
## 5. Nonlinear case

## 5.1 Simple kernel trick example

In [None]:
def normalize(x,axis=None):
    # assumes the data is along `axis` to calculate the norm
    return x / sqrt((x*x).sum(axis=axis,keepdims=True))

In [None]:
n = 100
x = randn(n, 2)
y = ones(n).astype(int)
x[50:] +=  4.0*normalize(x[50:], axis=1)    # shift the second class examples from the origin
y[50:] *= -1      # label the second class
color = [['class -1', 'class +1'][n] for n in (y+1)//2]
fig=scatter(x=x[:,0],
            y=x[:,1],
            color=color,
            title='Input Space')
fig.update_yaxes(scaleanchor = "x", scaleratio = 1)
fig.update_traces(marker={'size': 15})
fig.show()

In [None]:
# transform the data nonlinearly
fig=scatter_3d(x=x[:,0],
               y=x[:,1],
               z=x[:,0]**2 + x[:,1]**2,
               color=color,
               title='Feature Space')
fig.show()

In [None]:
# transform the data nonlinearly
fig=scatter_3d(x= x[:,0]**2,
               y= x[:,1]**2,
               z= sqrt(2) * x[:,0] * x[:,1],
               color=color,
               title='Feature Space')
fig.show()

## 5.2 Kernel functions

In [None]:
# here we assume that each data point is a row!
def polynomial_kernel(x, x_prime, p, b=0):
    return (x @ x_prime.T + b)**p
def linear_kernel(x, x_prime):
    return polynomial_kernel(x, x_prime, 1, 0)
def gaussian_kernel(x, x_prime, sigma_sq=1.0):
    m,n = x.shape[0], x_prime.shape[0]
    all_distances_sq = (x**2).sum(1).reshape(m,1) \
        + (x_prime**2).sum(1).reshape(1,n)        \
        - 2*(x@x_prime.T)
    return exp(-all_distances_sq / (2*sigma_sq))

In [None]:
def sample_inside_out(n=100, d=2, class_shift=4.0, shift=0.0):
    x = randn(n, d)
    y = ones(n).astype(int)
    x[n//2:] +=  class_shift*normalize(x[n//2:], axis=1)    # shift the second class examples from the origin
    x += shift
    y[n//2:] *= -1      # label the second class
    return x, y
x, y = sample_inside_out(100)
imshow(gaussian_kernel(x,x, 5.0))
#imshow(polynomial_kernel(x, x, 2))

## 5.3 Implementation of nonlinear SVM

In [None]:
# the nonlinear case
def svm_dual(x, y, k, C, threshold=1e-10):
    # `k` is the kernel function.
    # note that `k` should not have additional parameters
    # e.g. fix the parameters by currying:
    #      k = lambda x,x_prime: gaussian_kernel(x,x_prime,2.0)
    #      k = lambda x,x_prime: polynomial_kernel(x,x_prime,2,1)
    n =  y.shape[0]
    K =  k(x,x)     # the kernel matrix
    Q =  y.reshape(n,1) * K * y.reshape(1,n)
    c = -ones(n)
    result = quadprog(Q, c, 
                      Aeq = y.reshape(1, n),
                      beq = zeros(1),
                      lb  = zeros(n),
                      ub  = C*ones(n))
    alpha = result.x
    alpha[alpha < threshold  ] = 0.0   # threshold
    alpha[alpha > C-threshold] = C     # threshold
    # here support vectors are only C > \alpha > 0
    support_vectors = (alpha>=threshold) & (alpha <= (C-threshold))
    n_support_vectors = support_vectors.sum()  # sum up TRUE to get number
    b = ((y - (alpha*y) @ K) * support_vectors).sum() / n_support_vectors
    f = lambda x_prime: (alpha*y)@k(x,x_prime) + b
    return f, alpha
def accuracy_svm(x, y, f):
    return ((y * f(x)) > 0).mean()

## 5.4 Two-dimensional toy example

In [None]:
def show_margin_2d(x, y, f_svm, alpha=None):
    # visualize the nonlinear decision function
    xx = linspace(x[:,1].min(), x[:,1].max(), 50)
    yy = linspace(x[:,0].min(), x[:,0].max(), 50)
    values = f_svm(array([[yi,xi] for xi in xx for yi in yy]))
    lightgreen = 'rgb(144,238,144)'
    if alpha is None:
        color = [['class -1', 'class +1'][n] for n in (y+1)//2]
    else:
        color = alpha*y
    fig=scatter(x=x[:,0],
                y=x[:,1],
                color=color,
                symbol=y)
    fig.update_yaxes(scaleanchor = "x", scaleratio = 1)
    fig.update_traces(marker={'size': 15})
    fig.add_trace(Contour(x=yy, y=xx, z=values.reshape(xx.shape[0],yy.shape[0]),
                          contours_coloring='lines',
                          colorscale=[[0.0,lightgreen],[0.5,'green'],[1.0,lightgreen]],
                          line_width=2,
                          contours=dict(start=-1,
                                        end=+1,
                                        size=1,
                                        showlabels = True)))
    fig.show()

In [None]:
# create training and test data
x_train, y_train = sample_inside_out(n=100, d=2, class_shift=2, shift=0.0)
x_test,  y_test  = sample_inside_out(n=100, d=2, class_shift=2, shift=0.0)
# run the SVM
k = lambda x, x_prime: gaussian_kernel(x, x_prime, 1.0)
#k = lambda x, x_prime: polynomial_kernel(x, x_prime, 2, 1)
f, alpha = svm_dual(x_train, y_train, k, C=0.08)   # change from 0.01 to 10.0
# calculate training and test error
print(f"training accuracy = {accuracy_svm(x_train, y_train, f)}")
print(f"test accuracy = {accuracy_svm(x_test, y_test, f)}")
show_margin_2d(x_train, y_train, f, alpha=None)


## 5.5 Three-dimensional toy example

In [None]:
def show_margin_3d(x, y, f_svm, alpha=None):
    # visualize the nonlinear decision function
    X, Y, Z = np.mgrid[x[:,0].min():x[:,0].max():20j, 
                       x[:,1].min():x[:,1].max():20j, 
                       x[:,2].min():x[:,2].max():20j]
    values = f_svm(stack([X.flatten(),Y.flatten(),Z.flatten()], axis=1))
    print(values)
    lightgreen = 'rgb(144,238,144)'
    if alpha is None:
        color = [['class -1', 'class +1'][n] for n in (y+1)//2]
    else:
        color = alpha*y
    fig=scatter_3d(x=x[:,0],
                   y=x[:,1],
                   z=x[:,2],
                   color=color)
    fig['layout']['scene']['aspectmode'] = "data"
    fig.add_trace(Isosurface(x=X.flatten(), y=Y.flatten(), z=Z.flatten(), 
                             value=values.flatten(),
                             opacity=0.4,
                             isomin=-1.0, isomax=1.0, 
                             surface_count=3,
                             colorbar_nticks=3,
                             colorscale=[[0.0,lightgreen],[0.5,'green'],[1.0,lightgreen]],
                             caps=dict(x_show=False, y_show=False, z_show=False)))
    fig.show()

In [None]:
# create training and test data
x_train, y_train = sample_inside_out(n=100, d=3, class_shift=2, shift=0.0)
x_test,  y_test  = sample_inside_out(n=100, d=3, class_shift=2, shift=0.0)
# run the SVM
#k = lambda x, x_prime: gaussian_kernel(x, x_prime, 2.0)
k = lambda x, x_prime: polynomial_kernel(x, x_prime, 2, 1)
f, alpha = svm_dual(x_train, y_train, k, C=20.0)
# calculate training and test error
print(f"training accuracy = {accuracy_svm(x_train, y_train, f)}")
print(f"test accuracy = {accuracy_svm(x_test, y_test, f)}")
show_margin_3d(x_train, y_train, f, alpha=None)


In [None]:
# create yet another one training and test data 
x_train, y_train = sample_inside_out(n=100, d=2, class_shift=4, shift=0.0)
x_test,  y_test  = sample_inside_out(n=100, d=2, class_shift=4, shift=0.0)
x_train = hstack([x_train, 10.0*rand(n,1)])
x_test  = hstack([x_test,  10.0*rand(n,1)])
# run the SVM
k = lambda x, x_prime: gaussian_kernel(x, x_prime, 10.0)
#k = lambda x, x_prime: polynomial_kernel(x, x_prime, 3, 1)
f, alpha = svm_dual(x_train, y_train, k, C=10)
# calculate training and test error
print(f"training accuracy = {accuracy_svm(x_train, y_train, f)}")
print(f"test accuracy = {accuracy_svm(x_test, y_test, f)}")
show_margin_3d(x_train, y_train, f, alpha=None)
