In [1]:
import numpy as np
from cvxopt import matrix as cvxopt_matrix
from cvxopt import solvers as cvxopt_solvers
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [2]:
#loading dataset
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

#selecting label (first column) and replacing 0 by -1
# Series.replace returns a new Series, so the relabelling never writes through
# a slice of df_train/df_test (no SettingWithCopyWarning, and the raw frames
# keep their original 0/1 labels, which makes this cell safe to re-run).
df_y_train = df_train.iloc[:,0].replace(0, -1)
df_y_test = df_test.iloc[:,0].replace(0, -1)

#selecting features (every column after the label column)
df_X_train = df_train.iloc[:,1:]
df_X_test = df_test.iloc[:,1:]

In [3]:
# Convert the pandas features/labels into plain NumPy arrays, which is what the
# SVM training and prediction functions below operate on.
X_train, y_train, X_test, y_test = (
    np.array(table)
    for table in (df_X_train, df_y_train, df_X_test, df_y_test)
)

In [4]:
def svm_train_primal(data_train, label_train, regularisation_para_C):
    """Train a soft-margin linear SVM by solving its primal QP with cvxopt.

    The optimisation variable is v = [w0, w_1..w_m, xi_1..xi_n] (bias, weights,
    one slack per sample) and the problem handed to the solver is

        minimise    1/2 * ||w||^2 + C * sum_i xi_i
        subject to  y_i * (w . x_i + w0) >= 1 - xi_i
                    xi_i >= 0

    expressed in cvxopt's ``1/2 v'Pv + q'v  s.t.  Gv <= h`` form.

    Parameters
    ----------
    data_train : array-like, shape (n_samples, n_features)
        Training features.
    label_train : array-like, length n_samples
        Training labels. The formulation assumes they are encoded as +1 / -1.
    regularisation_para_C : float
        Weight C of the slack penalty in the objective.

    Returns
    -------
    list
        ``[w, w0]``: the weight vector (1-D array of length n_features) and the
        bias term.

    Raises
    ------
    ValueError
        If ``data_train`` is not 2-D or the number of labels does not match
        the number of rows.
    """
    import warnings

    X = np.asarray(data_train, dtype=float)
    y = np.asarray(label_train, dtype=float).reshape(-1)
    C = float(regularisation_para_C)

    if X.ndim != 2:
        raise ValueError("data_train must be a 2-D array of shape (n_samples, n_features)")
    n, m = X.shape
    if y.shape[0] != n:
        raise ValueError("label_train must hold exactly one label per row of data_train")

    dim = 1 + m + n

    # Quadratic term: only the m weight entries are penalised (not bias, not slacks).
    P = np.zeros((dim, dim))
    weight_idx = np.arange(1, m + 1)
    P[weight_idx, weight_idx] = 1.0

    # Linear term: C on every slack variable, zero elsewhere.
    q = np.concatenate((np.zeros(1 + m), np.full(n, C)))

    # Margin constraints, rewritten as "<=":  -y_i*w0 - y_i*(x_i . w) - xi_i <= -1
    y_col = y.reshape(-1, 1)
    margin_rows = np.hstack((-y_col, -y_col * X, -np.eye(n)))
    # Non-negativity of the slacks:  -xi_i <= 0
    slack_rows = np.hstack((np.zeros((n, 1 + m)), -np.eye(n)))

    G = np.vstack((margin_rows, slack_rows))
    h = np.concatenate((-np.ones(n), np.zeros(n)))

    sol = cvxopt_solvers.qp(
        cvxopt_matrix(P), cvxopt_matrix(q), cvxopt_matrix(G), cvxopt_matrix(h)
    )
    if sol['status'] != 'optimal':
        # Keep returning the last iterate, but do not let it pass silently.
        warnings.warn(
            "cvxopt did not reach an optimal solution (status: %r); "
            "the returned model may be unreliable" % (sol['status'],),
            RuntimeWarning,
            stacklevel=2,
        )

    v = np.array(sol['x']).ravel()
    w0 = v[0]
    w = v[1:1 + m]

    return [w, w0]

In [5]:
def svm_predict_primal(data_test, label_test, svm_model):
    """Evaluate a trained linear SVM on a labelled data set.

    Each sample x is classified as +1 when ``w . x + w0 > 0`` and as -1
    otherwise (a decision value of exactly 0 maps to -1).

    Parameters
    ----------
    data_test : array-like, shape (n_samples, n_features)
        Samples to classify.
    label_test : array-like, length n_samples
        True labels, compared against the +1 / -1 predictions.
    svm_model : sequence
        ``[w, w0]`` as returned by ``svm_train_primal``.

    Returns
    -------
    The value of ``accuracy_score(label_test, predictions)``; note that the
    predictions themselves are not returned.
    """
    w = np.asarray(svm_model[0])
    w0 = svm_model[1]
    X = np.asarray(data_test)

    # One matrix-vector product for all samples instead of a Python loop over
    # rows; an empty input yields an empty prediction vector.
    decision = X @ w + w0 if X.size else np.empty(0)
    y_pred = np.where(decision > 0, 1, -1)

    return accuracy_score(label_test, y_pred)

In [6]:
# Fit the primal SVM on the full training set with C = 60; svm_model is the
# list returned by svm_train_primal.
svm_model = svm_train_primal(
    data_train=X_train,
    label_train=y_train,
    regularisation_para_C=60,
)

     pcost       dcost       gap    pres   dres
 0: -2.7551e+07  2.4397e+07  9e+07  2e+02  2e+02
 1:  3.5881e+06 -4.5732e+06  2e+07  2e+01  3e+01
 2:  1.0908e+06 -1.0638e+06  4e+06  4e+00  6e+00
 3:  6.3363e+05 -2.7716e+05  1e+06  1e+00  2e+00
 4:  2.9533e+05 -2.8381e+04  4e+05  3e-01  5e-01
 5:  1.0749e+05  1.5628e+04  1e+05  7e-02  1e-01
 6:  8.9949e+04  1.9494e+04  8e+04  4e-02  6e-02
 7:  7.3304e+04  2.1806e+04  6e+04  2e-02  3e-02
 8:  6.3213e+04  2.3456e+04  4e+04  1e-02  2e-02
 9:  5.9051e+04  2.4248e+04  4e+04  8e-03  1e-02
10:  5.4968e+04  2.5045e+04  3e+04  4e-03  5e-03
11:  4.9430e+04  2.6010e+04  2e+04  2e-03  3e-03
12:  4.5868e+04  2.6750e+04  2e+04  1e-03  2e-03
13:  4.3737e+04  2.7386e+04  2e+04  7e-04  1e-03
14:  4.1300e+04  2.8081e+04  1e+04  3e-04  5e-04
15:  3.9551e+04  2.8548e+04  1e+04  2e-04  3e-04
16:  3.7222e+04  2.9378e+04  8e+03  1e-04  2e-04
17:  3.5934e+04  2.9904e+04  6e+03  6e-05  9e-05
18:  3.4504e+04  3.0511e+04  4e+03  3e-05  4e-05
19:  3.3839e+04  3.07

In [12]:
# Show the fitted parameters: svm_model[1] is the bias, svm_model[0] the weights.
print(f"w0 {svm_model[1]!s}")
print(f"w {svm_model[0]!s}")

w0 2.1865136815114403
w [-3.47397297e-02 -7.38555434e-02  3.34996623e-02 -1.60486648e-02
  7.00195686e-02 -4.07421512e-02  6.56196368e-02  2.04228900e-02
 -1.48790169e-02 -7.23570390e-03  6.98748657e-02 -2.30935585e-02
  4.41810429e-02  6.86969617e-02  2.85422461e-02 -3.34525461e-02
 -5.33057649e-02 -6.57688458e-02 -7.14945100e-02 -3.58541373e-02
 -2.23557185e-02  2.20698870e-02  2.79265435e-02 -2.70907507e-02
 -5.03338423e-03  3.08974585e-02  3.14868724e-02  5.26287160e-02
 -6.39442274e-02  8.37327837e-02 -1.10960117e-01 -7.37757842e-03
  5.56393644e-02 -5.88326703e-03  2.32835204e-02 -5.01453509e-02
  2.03362568e-02 -2.40755728e-02  1.03212406e-01 -5.70291832e-02
  3.13139194e-02  5.84898617e-02 -3.26430832e-02  4.10083225e-02
 -5.58833977e-04 -2.97130961e-02  6.17110144e-02  2.21108870e-02
  2.99733560e-02  7.47989265e-02 -5.86580266e-02 -4.52792776e-02
  7.61154715e-02 -2.50155178e-02  3.52435401e-02 -1.61112702e-02
 -1.92682847e-02  4.07564415e-02 -2.30339099e-02  8.71051263e-01
 

In [8]:
# Accuracy of svm_model on the training data. NOTE: despite its name,
# `test_accuracy` holds the training-set accuracy; the name is kept so any
# code that already reads it keeps working.
test_accuracy = svm_predict_primal(X_train , y_train , svm_model)
print("accuracy on training set",test_accuracy)
# Accuracy of the same model on the held-out test data.
test_accuracy_2 = svm_predict_primal(X_test , y_test, svm_model)
print("accuracy on testing set", test_accuracy_2)

accucracy on training set 0.9778797505588893
accuracy on testing set 0.9673115410273516


In [6]:
#tuning C by k-fold cross validation
Cs = [30, 60, 90]

#Kfold with k = 5
kf = KFold(n_splits=5,shuffle=False)

# Flat list of per-fold accuracies in C-major order: all folds for Cs[0],
# then all folds for Cs[1], and so on.
accuracy_scores = []


for C in Cs:
    print("Calculating average accuracy score with C =", C)
    for train_index, test_index in kf.split(df_X_train):
        # KFold yields positional row numbers, so both features and labels are
        # selected with .iloc; label-based indexing would only be correct while
        # the Series happens to carry a default 0..n-1 index.
        df_X_train_sub, df_X_test_sub = df_X_train.iloc[train_index], df_X_train.iloc[test_index]
        df_y_train_sub, df_y_test_sub = df_y_train.iloc[train_index], df_y_train.iloc[test_index]

        X_train_sub = np.array(df_X_train_sub)
        X_test_sub = np.array(df_X_test_sub)
        y_train_sub = np.array(df_y_train_sub)
        y_test_sub = np.array(df_y_test_sub)
        # Use a dedicated name for the per-fold model so the model trained on
        # the full training set (svm_model) is not overwritten by this loop.
        fold_model = svm_train_primal(X_train_sub,y_train_sub, C)
        test_accuracy_d = svm_predict_primal(X_test_sub,y_test_sub,fold_model)
        accuracy_scores.append(test_accuracy_d)

Calculating average accuracy score with C = 30
     pcost       dcost       gap    pres   dres
 0: -5.5007e+06  4.9187e+06  2e+07  8e+01  2e+02
 1:  7.2833e+05 -9.1499e+05  3e+06  9e+00  2e+01
 2:  2.1363e+05 -2.0344e+05  7e+05  2e+00  5e+00
 3:  1.2139e+05 -5.5527e+04  3e+05  6e-01  2e+00
 4:  5.7653e+04 -2.0878e+03  8e+04  1e-01  4e-01
 5:  3.0914e+04  6.0064e+03  3e+04  5e-02  1e-01
 6:  2.6836e+04  7.3994e+03  2e+04  3e-02  7e-02
 7:  2.3128e+04  8.2303e+03  2e+04  2e-02  4e-02
 8:  2.0659e+04  8.8834e+03  1e+04  1e-02  3e-02
 9:  1.9286e+04  9.3483e+03  1e+04  6e-03  2e-02
10:  1.7926e+04  9.7358e+03  9e+03  4e-03  9e-03
11:  1.6584e+04  1.0045e+04  7e+03  2e-03  6e-03
12:  1.5632e+04  1.0284e+04  6e+03  1e-03  3e-03
13:  1.4849e+04  1.0526e+04  4e+03  6e-04  2e-03
14:  1.4199e+04  1.0740e+04  4e+03  4e-04  9e-04
15:  1.3486e+04  1.1007e+04  3e+03  2e-04  5e-04
16:  1.3193e+04  1.1096e+04  2e+03  1e-04  3e-04
17:  1.2660e+04  1.1344e+04  1e+03  5e-05  1e-04
18:  1.2401e+04  1.1479

     pcost       dcost       gap    pres   dres
 0: -2.2043e+07  1.8922e+07  7e+07  2e+02  2e+02
 1:  2.5353e+06 -3.4823e+06  1e+07  2e+01  2e+01
 2:  7.4146e+05 -7.6183e+05  3e+06  4e+00  4e+00
 3:  4.3046e+05 -2.1376e+05  1e+06  1e+00  2e+00
 4:  1.8709e+05 -1.3704e+04  3e+05  2e-01  3e-01
 5:  7.3949e+04  1.1604e+04  8e+04  6e-02  8e-02
 6:  6.4076e+04  1.3994e+04  6e+04  4e-02  5e-02
 7:  4.7854e+04  1.6088e+04  4e+04  2e-02  2e-02
 8:  4.5137e+04  1.7169e+04  3e+04  1e-02  1e-02
 9:  3.9666e+04  1.8305e+04  2e+04  7e-03  9e-03
10:  3.7664e+04  1.8980e+04  2e+04  5e-03  6e-03
11:  3.4888e+04  1.9742e+04  2e+04  3e-03  3e-03
12:  3.3462e+04  2.0051e+04  1e+04  2e-03  2e-03
13:  3.0872e+04  2.0719e+04  1e+04  1e-03  1e-03
14:  3.0009e+04  2.0974e+04  9e+03  6e-04  8e-04
15:  2.8051e+04  2.1631e+04  7e+03  3e-04  4e-04
16:  2.7369e+04  2.1835e+04  6e+03  2e-04  3e-04
17:  2.6163e+04  2.2300e+04  4e+03  1e-04  1e-04
18:  2.5648e+04  2.2503e+04  3e+03  5e-05  7e-05
19:  2.4766e+04  2.29

29:  2.4766e+04  2.4765e+04  1e+00  4e-10  6e-10
30:  2.4765e+04  2.4765e+04  1e-01  3e-11  3e-11
31:  2.4765e+04  2.4765e+04  1e-03  3e-13  3e-12
Optimal solution found.
Calculating average accuracy score with C = 90
     pcost       dcost       gap    pres   dres
 0: -4.9626e+07  4.2011e+07  2e+08  2e+02  2e+02
 1:  5.4265e+06 -7.7029e+06  3e+07  3e+01  2e+01
 2:  1.5852e+06 -1.6733e+06  6e+06  5e+00  4e+00
 3:  9.3247e+05 -4.7025e+05  2e+06  2e+00  1e+00
 4:  4.0295e+05 -3.6883e+04  6e+05  4e-01  3e-01
 5:  1.3702e+05  1.6198e+04  2e+05  8e-02  7e-02
 6:  1.0265e+05  2.1146e+04  1e+05  4e-02  3e-02
 7:  8.1369e+04  2.3523e+04  7e+04  2e-02  2e-02
 8:  7.2230e+04  2.4998e+04  5e+04  2e-02  1e-02
 9:  6.4341e+04  2.6541e+04  4e+04  9e-03  8e-03
10:  6.0204e+04  2.7609e+04  4e+04  6e-03  5e-03
11:  5.3766e+04  2.9167e+04  3e+04  4e-03  3e-03
12:  5.1814e+04  2.9684e+04  2e+04  2e-03  2e-03
13:  4.8595e+04  3.0428e+04  2e+04  1e-03  1e-03
14:  4.5171e+04  3.1424e+04  1e+04  9e-04  7e-04

21:  3.8834e+04  3.6094e+04  3e+03  6e-06  5e-06
22:  3.8205e+04  3.6472e+04  2e+03  3e-06  3e-06
23:  3.7817e+04  3.6692e+04  1e+03  2e-06  1e-06
24:  3.7591e+04  3.6808e+04  8e+02  6e-07  5e-07
25:  3.7388e+04  3.6955e+04  4e+02  2e-07  2e-07
26:  3.7272e+04  3.7045e+04  2e+02  1e-07  9e-08
27:  3.7209e+04  3.7093e+04  1e+02  3e-08  2e-08
28:  3.7168e+04  3.7129e+04  4e+01  7e-09  6e-09
29:  3.7152e+04  3.7143e+04  1e+01  2e-09  1e-09
30:  3.7148e+04  3.7146e+04  1e+00  2e-10  2e-10
31:  3.7147e+04  3.7147e+04  2e-01  2e-11  2e-11
32:  3.7147e+04  3.7147e+04  3e-03  2e-13  6e-12
Optimal solution found.


In [7]:
scores = np.array(accuracy_scores)
# One row per candidate C, one column per fold. Deriving the row count from Cs
# (instead of hard-coding the number of folds) keeps the grouping right if the
# fold count changes, and raises instead of silently mis-grouping when the
# number of collected scores is not a multiple of len(Cs).
scores = scores.reshape(len(Cs), -1)
#calculating average accuracy score for each C in Cs
avg_scores = np.mean(scores, axis = 1)
print(avg_scores)

[0.96623169 0.96634934 0.96634934]
