In [None]:
from cvxopt import solvers as cvxopt_solvers
from cvxopt import matrix as cvxopt_matrix
from sklearn import svm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import numpy as np
from sklearn import svm
import scipy

import cvxpy as cp
cvxopt_solvers.options['show_progress'] = False

"""
import matplotlib
from sklearn import svm
import plotly.express as px
from scipy.sparse import csr_matrix 
from numpy import linalg as LA
"""


In [None]:
# The CSV files have no header row, so columns are named 0..200 explicitly.
col_name = list(range(201))
train = pd.read_csv('train.csv', names=col_name)
data_test = pd.read_csv('test.csv', names=col_name)

# Hold out everything after the first 4000 rows of train.csv for validation.
data_train = train.iloc[:4000]
validation = train.iloc[4000:]

In [None]:
# describing data frame
def describe_data(df):
    """Print a short overview of a data frame.

    Shows the ``describe(include='all')`` table, reports how many rows are
    exact duplicates, and prints the row count together with the column
    count minus one.

    Args:
        df: pandas DataFrame to summarise.
    """
    print("statistical information: ")
    display(df.describe(include = 'all'))

    n_duplicates = df.duplicated().sum()
    if n_duplicates > 0:
        duplicate_report = f'The number of data duplicates is {n_duplicates}'
    else:
        duplicate_report = 'There are no duplicates in the data'
    print(duplicate_report)

    n_rows, n_cols = df.shape
    # One column is excluded from the reported feature count.
    print(f"Data contains {n_rows} samples/rows and {n_cols - 1} columns/features")

# Summarise the training split (still a DataFrame at this point).
describe_data(data_train)

In [None]:
# preprocessing
label_train = data_train.values[:,0] # y becomes our labels vector
label_train[label_train == 0] = -1 # replacing 0 classes with -1
data_train = data_train.values[:, 1:]

label_test = data_test.values[:,0] # y becomes our labels vector
label_test[label_test == 0] = -1
data_test = data_test.values[:, 1:]



In [None]:
# Same label/feature split as for the train and test sets: column 0 is the
# label (0 remapped to -1), the remaining columns are the features.
# np.where returns a fresh array, so the `validation` frame is not modified.
validation_values = validation.to_numpy()
label_validation = np.where(validation_values[:, 0] == 0, -1, validation_values[:, 0])
data_validation = validation_values[:, 1:]

In [None]:
# Quick visual check of the label vector and the feature matrix.
print(label_train)
print(data_train)

### question 2
Please implement the training and testing algorithms of the soft-margin Linear Support Vector Machine from its primal form using CVX.

By setting C = 100, run your implementation and report the solution of $b$ and the sum of all dimensions of the $w$ solution, e.g., `np.sum(w)` (for a quick check of the correctness of your code).

In [None]:
def svm_train_primal(x, y, c):
    """Train a soft-margin linear SVM by solving its primal problem with CVXPY.

    The problem solved is

        minimise    0.5 * ||w||^2 + (c / n) * sum(xi)
        subject to  y_i * (x_i . w + b) >= 1 - xi_i   for every sample i
                    xi_i >= 0                          for every sample i

    Args:
        x: (n, d) array of training samples, one sample per row.
        y: length-n array of labels in {-1, +1}.
        c: slack penalty; it is divided by n in the objective.

    Returns:
        Tuple ``(w, b, xi)`` holding the optimal weight vector (length d),
        the scalar bias, and the slack values (length n).

    Raises:
        RuntimeError: if the solver does not report an optimal solution.
    """
    n, d = x.shape
    # A flat float label vector keeps the elementwise product below (n,)-shaped.
    y = np.asarray(y, dtype=float).ravel()

    w = cp.Variable(d)   # weight vector
    b = cp.Variable()    # offset / bias
    xi = cp.Variable(n)  # one slack variable per sample

    objective = cp.Minimize(0.5 * cp.sum_squares(w) + (c / n) * cp.sum(xi))
    # Two vector constraints instead of 2n scalar ones: the model is the same,
    # but it is far cheaper for CVXPY to build and canonicalise.
    constraints = [
        cp.multiply(y, x @ w + b) >= 1 - xi,
        xi >= 0,
    ]
    problem = cp.Problem(objective, constraints)
    problem.solve()

    # Without this check a failed solve would hand back None values.
    if problem.status not in ("optimal", "optimal_inaccurate"):
        raise RuntimeError(f"primal SVM solve failed with status {problem.status!r}")
    return (w.value, b.value, xi.value)


# Fit the primal model on the training split with C = 100.
svm_primal_model = svm_train_primal(data_train, label_train, 100)

In [None]:

def show_values(svm_model):
    """Print the sum of the weight entries and the bias of a primal model.

    Args:
        svm_model: 3-tuple ``(w, b, xi)`` as returned by ``svm_train_primal``.
    """
    weights, bias, _slack = svm_model
    weight_total = np.sum(weights)
    print("sum of all dimenisions w: ", weight_total)
    print("primal value of b: ", bias)
    
def svm_predict_primal(x, y, svm_model):
    """Return the classification accuracy of a primal SVM model.

    Args:
        x: (n, d) array of samples, one per row.
        y: length-n array of true labels in {-1, +1}.
        svm_model: 3-tuple ``(w, b, xi)``; only ``w`` and ``b`` are used.

    Returns:
        Fraction of samples whose predicted sign equals the true label.
        A decision value of exactly 0 yields sign 0 and counts as a miss.

    Raises:
        ZeroDivisionError: if ``x`` has no rows.
    """
    w, b, _ = svm_model
    test_size = x.shape[0]
    # Flatten w and y so the comparison is elementwise over samples; a column
    # vector on either side would otherwise broadcast to an (n, n) matrix.
    predictions = np.sign(x @ np.ravel(w) + b)
    correct_predictions = int(np.count_nonzero(predictions == np.ravel(y)))
    return correct_predictions / test_size

# Report the quick-check values, then the accuracy of the primal model on the test set.
show_values(svm_primal_model)
accuracy = svm_predict_primal(data_test, label_test, svm_primal_model)
print("accuracy of primal form is", accuracy * 100, "%")


### Question 3
Please implement the training algorithm of the soft-margin Linear Support Vector Machine from its dual form:

In [None]:
def svm_train_dual(x, y, c):
    """Train a soft-margin linear SVM by solving its dual QP with CVXOPT.

    The problem handed to ``cvxopt.solvers.qp`` is

        minimise    0.5 * a^T H a - 1^T a,   H_ij = y_i y_j x_i . x_j
        subject to  0 <= a_i <= c / n
                    y^T a = 0

    The upper bound is ``c / n`` so that this is the dual of the problem
    solved by ``svm_train_primal``, whose slack penalty is ``(c / n) * sum(xi)``.

    Args:
        x: (n, d) array of training samples, one sample per row.
        y: length-n array of labels in {-1, +1}.
        c: slack penalty; divided by n to form the box bound on the multipliers.

    Returns:
        Tuple ``(alphas, y)``: the (n, 1) array of dual multipliers and the
        labels as an (n, 1) float column vector.
    """
    n = x.shape[0]
    # CVXOPT needs double-precision input, so the labels are cast to float.
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    X_dash = y * x
    # Gram matrix of the label-scaled samples.
    H = X_dash @ X_dash.T

    P = cvxopt_matrix(H)
    q = cvxopt_matrix(-np.ones((n, 1)))
    # Stacked inequalities G a <= h encode -a <= 0 and a <= c / n.
    G = cvxopt_matrix(np.vstack((-np.eye(n), np.eye(n))))
    h = cvxopt_matrix(np.hstack((np.zeros(n), np.full(n, c / n))))
    # Equality constraint y^T a = 0.
    A = cvxopt_matrix(y.reshape(1, -1))
    b = cvxopt_matrix(np.zeros(1))

    sol = cvxopt_solvers.qp(P, q, G, h, A, b)
    alphas = np.array(sol['x'])
    return alphas, y
    
# Fit the dual model on the training split with C = 100.
svm_dual_model = svm_train_dual(data_train, label_train , 100)

In [None]:
def display_results(svm_model):
    """Print the shape and the total of the dual multipliers.

    Args:
        svm_model: 2-tuple ``(alphas, y)`` as returned by ``svm_train_dual``.
    """
    multipliers, _labels = svm_model
    print(multipliers.shape)
    multiplier_total = np.sum(multipliers)
    print("sum of all dimenisions alpha: ", multiplier_total)
    
# Quick check of the dual solution.
display_results(svm_dual_model)


### Question 4
Write code to obtain the primal problem solution $w^*$, $b^*$ from its dual solution $\alpha^*$.


In [None]:
# primal problem solution from dual solution

def get_primal_solution_from_dual(x, svm_dual_model, alpha_upper_bound=None, tol=1e-4):
    """Recover the primal solution (w, b) from the dual multipliers.

    ``w`` is the multiplier-weighted sum of the label-scaled samples.  ``b``
    is averaged over the margin support vectors only (multipliers strictly
    between 0 and the upper bound), because ``y_i - w . x_i = b`` holds only
    for those samples; averaging over every sample gives a biased offset.

    Args:
        x: (n, d) array of training samples used to fit the dual model.
        svm_dual_model: 2-tuple ``(alphas, y)`` as returned by ``svm_train_dual``.
        alpha_upper_bound: box bound that was imposed on the multipliers.
            When omitted, the largest multiplier is used as the bound.
        tol: relative tolerance (as a fraction of the bound) for deciding
            whether a multiplier sits at 0 or at the bound.

    Returns:
        Tuple ``(w, b)`` with ``w`` a (d, 1) column vector and ``b`` a scalar.
    """
    alphas, y = svm_dual_model
    alphas = np.asarray(alphas, dtype=float).reshape(-1, 1)
    y = np.asarray(y, dtype=float).reshape(-1, 1)

    w = x.T @ (alphas * y)

    upper = float(alphas.max()) if alpha_upper_bound is None else float(alpha_upper_bound)
    eps = tol * upper
    flat_alphas = alphas[:, 0]
    is_support = flat_alphas > eps
    is_free = is_support & (flat_alphas < upper - eps)

    if is_free.any():
        chosen = is_free
    elif is_support.any():
        # Every support vector sits at the bound: best effort, use them all.
        chosen = is_support
    else:
        # Degenerate solution with no support vectors: use every sample.
        chosen = np.ones(flat_alphas.shape[0], dtype=bool)

    b = np.mean(y[chosen] - x[chosen] @ w)
    return w, b

def show(w, b):
    """Print the recovered primal weights and bias.

    Args:
        w: weight vector to print.
        b: bias value to print.
    """
    for tag, value in (("W* ", w), ("b*", b)):
        print(tag, value)
    
def svm_predict_dual(x, y, w, b):
    """Return the classification accuracy of a linear model ``sign(x . w + b)``.

    Args:
        x: (n, d) array of samples, one per row.
        y: length-n array of true labels in {-1, +1}.
        w: weight vector; a flat (d,) array or a (d, 1) column are both accepted.
        b: scalar bias.

    Returns:
        Fraction of samples whose predicted sign equals the true label.
        A decision value of exactly 0 yields sign 0 and counts as a miss.

    Raises:
        ZeroDivisionError: if ``x`` has no rows.
    """
    test_size = x.shape[0]
    # Flatten w and y so the comparison is elementwise over samples; a column
    # vector on either side would otherwise broadcast to an (n, n) matrix.
    predictions = np.sign(x @ np.ravel(w) + b)
    correct_predictions = int(np.count_nonzero(predictions == np.ravel(y)))
    return correct_predictions / test_size

# Recover (w, b) from the dual solution, score it on the test set, and print both.
w, b = get_primal_solution_from_dual(data_train, svm_dual_model)
dual_accuracy = svm_predict_dual(data_test, label_test, w, b)
show(w, b)
print("dual accuracy is ", dual_accuracy * 100, "%")


### Question 5
Write code to find the support vectors from the primal problem solutions

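Equation form: a training sample $x_i$ is a support vector when it lies on or inside the margin, i.e. $y_i (w^\top x_i + b) \le 1$, with equality for the vectors exactly on the margin.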

In [None]:
# Support vectors from the primal solution: the training samples that lie on
# or inside the margin, i.e. y_i * (w . x_i + b) <= 1.
x = data_train
w, b, _ = svm_primal_model

# The solver is only accurate to a finite tolerance, so samples sitting on the
# margin can evaluate to slightly more than 1; allow a small slack.
margin_tol = 1e-4
functional_margin = label_train * (x @ w + b)
primal_support_indices = np.flatnonzero(functional_margin <= 1 + margin_tol)
primal_support_vectors = x[primal_support_indices]

print("number of support vectors (primal):", primal_support_indices.size)
print("indices of support vectors (primal):", primal_support_indices)


### Question 6
Write code to find the support vectors from the dual problem solutions. Please copy the code snippet for the implementation

In [None]:
# Support vectors from the dual solution: the training samples whose
# multiplier alpha_i is strictly positive.
x = data_train
w, b = get_primal_solution_from_dual(data_train, svm_dual_model)

alphas, _ = svm_dual_model
alpha_flat = np.ravel(alphas)
# The solver returns tiny positive values instead of exact zeros, so compare
# against a threshold relative to the largest multiplier.
alpha_tol = 1e-4 * alpha_flat.max()
dual_support_indices = np.flatnonzero(alpha_flat > alpha_tol)
dual_support_vectors = x[dual_support_indices]

print("number of support vectors (dual):", dual_support_indices.size)
print("indices of support vectors (dual):", dual_support_indices)

### Question 7
Write code to choose C by using the validation set. Please copy the code snippet for the implementation. Report the test accuracy you get by using the optimal C found in the validation set. 

In [None]:
def test_model_fit(x, y, w, b):
    test_size = x.shape[0]
    correct_predictions = 0
    for i in range(test_size):
        y_pred = np.sign(np.dot(x[i,:], w.T) + b)
        if y_pred == y[i]:
            correct_predictions += 1

    return correct_predictions / test_size

In [None]:
c_values = [2e-10, 2e-8, 2e-6, 2e-4, 2e-2, 2e0, 2e2, 2e4, 2e6,2e7,2e8,2e10]
max_accuracy = 0
optimal_c = 0

# Only the first 6 candidates are tried for now because fitting is too slow
# for the larger values of C.
# Procedure: fit each candidate on the training split, score it on the
# held-out validation split, and keep the C with the best validation accuracy.
for c in c_values[:6]:
    clf = svm.SVC(C = c, kernel = 'linear')
    clf.fit(data_train, label_train)
    w = clf.coef_
    b = clf.intercept_
    svm_accuracy = test_model_fit(data_validation, label_validation, w, b)
    if svm_accuracy > max_accuracy:
        max_accuracy = svm_accuracy
        optimal_c = c

    print("accuracy of scikit learn svm model is", svm_accuracy * 100, "% with c value", c)

print("most optimal c is:", optimal_c)

In [None]:
# using primal form model
# Same selection procedure as with scikit-learn: fit each candidate C on the
# training split and keep the one with the best validation accuracy.
# The search state is reset here so the result does not depend on whichever
# cell set max_accuracy / optimal_c before.
max_accuracy = 0
optimal_c = 0

for c in c_values[:7]:
    # A separate name keeps the C = 100 model in svm_primal_model intact.
    candidate_model = svm_train_primal(data_train, label_train, c)
    candidate_w, candidate_b, _ = candidate_model
    svm_accuracy = test_model_fit(data_validation, label_validation, candidate_w, candidate_b)
    if svm_accuracy > max_accuracy:
        max_accuracy = svm_accuracy
        optimal_c = c

    print("accuracy of primal svm model is", svm_accuracy * 100, "% with c value", c)

print("most optimal c is:", optimal_c)


### Question 8

In [None]:
# NOTE(review): this value is hard-coded; it is not read from the `optimal_c`
# variable produced by the C-selection cells — confirm they agree.
optimal_C = 0.0002
# optimal_C = 100

# Linear-kernel SVC fitted on the training split with the chosen C.
clf = svm.SVC(C = optimal_C, kernel = 'linear')
clf.fit(data_train, label_train) # .fit(x, y)

def show(clf):
    """Print the fitted parameters of a scikit-learn SVC.

    Prints the intercept, support-vector indices, the support vectors,
    the per-class support-vector counts, the absolute dual coefficients,
    and finally the shape of the support-vector matrix.

    NOTE: this definition replaces the earlier ``show(w, b)`` helper once
    this cell has run.

    Args:
        clf: fitted classifier exposing ``intercept_``, ``support_``,
            ``support_vectors_``, ``n_support_`` and ``dual_coef_``.
    """
    report = [
        ('b = ', clf.intercept_),
        ('Indices of support vectors = ', clf.support_),
        ('Support vectors = ', clf.support_vectors_),
        ('Number of support vectors for each class = ', clf.n_support_),
        ('Coefficients of the support vector in the decision function = ', np.abs(clf.dual_coef_)),
    ]
    for caption, value in report:
        print(caption, value)
    print(clf.support_vectors_.shape)

# Print the fitted scikit-learn model's parameters.
show(clf)

In [None]:
# Score the fitted scikit-learn model using its linear weights and intercept.
# NOTE(review): accuracy is measured on the training split here, not on the
# test split — confirm this is intended.
w = clf.coef_
b = clf.intercept_
svm_accuracy = test_model_fit(data_train, label_train, w, b)
print("accuracy of scikit learn svm model is", svm_accuracy * 100, "%")