In [3]:
import numpy as np
from cvxopt import matrix as cvxopt_matrix
from cvxopt import solvers as cvxopt_solvers
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [4]:
#loading dataset
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

#selecting label and replacing 0 by -1
df_y_train = df_train.iloc[:,0]
df_y_train[df_y_train==0]=-1
df_y_test = df_test.iloc[:,0]
df_y_test[df_y_test==0]=-1

#selecting features
df_X_train = df_train.iloc[:,1:]
df_X_test = df_test.iloc[:,1:]

In [5]:
X_train = np.array(df_X_train)
y_train = np.array(df_y_train)
X_test = np.array(df_X_test)
y_test = np.array(df_y_test)

In [6]:
def svm_train_dual(data_train , label_train , regularisation_para_C):
    X_train = data_train
    y_train = label_train
    C = regularisation_para_C
    
    #number of samples
    N = len(y_train)

    # H = yi * yj * Xi * Xj = (y*X)T  * (y*X)
    y_train = y_train.reshape(-1,1)
    X_ = y_train * X_train
    H = X_ @ X_.T

    #initializing cvxopt parameters
    G = np.r_[(np.eye(N)*-1,np.eye(N))]
    q = np.ones(N) * -1
    h = np.r_[np.zeros(N), np.ones(N) * C]
    b = np.zeros(1)
    A = y_train.reshape(1,-1)

    P = cvxopt_matrix(H)
    G = cvxopt_matrix(G)
    q = cvxopt_matrix(q)
    h = cvxopt_matrix(h)
    b = cvxopt_matrix(b)
    A = cvxopt_matrix(A)
    
    sol = cvxopt_solvers.qp(P, q, G, h, A, b)
    alphas = np.array(sol['x'])


    # w = sum(alpha * y * x) 
    w = 0
    for i in range(N):
        w += alphas[i] * y_train[i] * X_train[i]
    
    
    # w0 = 1/yi - wTxi
    # because of wide range of w0s, we will get the mean
    epsilon = 1e-6
    w0 = []
    for i in range(N):
        if (alphas[i] > epsilon):
            w0.append(1/y_train[i] - w.T @ X_train[i])

    w0 = np.array(w0)
    w0 = w0.mean()
    
    return [w, w0]

In [7]:
def svm_predict_dual(data_test , label_test , svm_model_d):
    X_test = data_test
    y_test = label_test
    w0 = svm_model_d[1]
    w = svm_model_d[0]
    
    y_pred = []
    for x in X_test:
        f = w.T @ x + w0
        if f > 0 :
            y_pred.append(1)
        else:
            y_pred.append(-1)
    
    return accuracy_score(y_test, y_pred)

In [8]:
svm_model_d = svm_train_dual(X_train , y_train , 60 )

     pcost       dcost       gap    pres   dres
 0: -5.0574e+04 -2.7391e+07  9e+07  1e+00  5e-11
 1: -3.5471e+04 -9.9083e+06  2e+07  1e-01  5e-11
 2: -2.1944e+04 -2.4874e+06  4e+06  3e-02  4e-11
 3: -1.8432e+04 -1.0759e+06  1e+06  9e-03  3e-11
 4: -1.8114e+04 -4.1635e+05  4e+05  2e-03  3e-11
 5: -1.9140e+04 -1.3401e+05  1e+05  5e-04  3e-11
 6: -2.0952e+04 -1.0396e+05  8e+04  3e-04  4e-11
 7: -2.2395e+04 -8.0681e+04  6e+04  2e-04  4e-11
 8: -2.3752e+04 -6.7728e+04  4e+04  9e-05  4e-11
 9: -2.4417e+04 -6.1909e+04  4e+04  6e-05  4e-11
10: -2.5114e+04 -5.6274e+04  3e+04  3e-05  4e-11
11: -2.6045e+04 -5.0208e+04  2e+04  2e-05  4e-11
12: -2.6767e+04 -4.6303e+04  2e+04  9e-06  5e-11
13: -2.7396e+04 -4.4001e+04  2e+04  5e-06  5e-11
14: -2.8086e+04 -4.1422e+04  1e+04  2e-06  5e-11
15: -2.8550e+04 -3.9629e+04  1e+04  2e-06  5e-11
16: -2.9379e+04 -3.7265e+04  8e+03  9e-07  5e-11
17: -2.9905e+04 -3.5956e+04  6e+03  4e-07  5e-11
18: -3.0511e+04 -3.4514e+04  4e+03  2e-07  5e-11
19: -3.0758e+04 -3.38

In [13]:
print('w0',svm_model_d[1])
print('w',svm_model_d[0])

w0 2.1625100637154056
w [-3.47397296e-02 -7.38555435e-02  3.34996623e-02 -1.60486648e-02
  7.00195686e-02 -4.07421512e-02  6.56196368e-02  2.04228900e-02
 -1.48790169e-02 -7.23570390e-03  6.98748657e-02 -2.30935585e-02
  4.41810429e-02  6.86969617e-02  2.85422461e-02 -3.34525461e-02
 -5.33057649e-02 -6.57688458e-02 -7.14945100e-02 -3.58541373e-02
 -2.23557185e-02  2.20698870e-02  2.79265435e-02 -2.70907507e-02
 -5.03338425e-03  3.08974585e-02  3.14868724e-02  5.26287160e-02
 -6.39442274e-02  8.37327838e-02 -1.10960117e-01 -7.37757841e-03
  5.56393644e-02 -5.88326700e-03  2.32835203e-02 -5.01453509e-02
  2.03362568e-02 -2.40755729e-02  1.03212406e-01 -5.70291831e-02
  3.13139194e-02  5.84898617e-02 -3.26430832e-02  4.10083225e-02
 -5.58833974e-04 -2.97130961e-02  6.17110144e-02  2.21108870e-02
  2.99733560e-02  7.47989265e-02 -5.86580266e-02 -4.52792776e-02
  7.61154715e-02 -2.50155178e-02  3.52435401e-02 -1.61112702e-02
 -1.92682847e-02  4.07564415e-02 -2.30339099e-02  8.71051263e-01
 

In [9]:
test_accuracy_d = svm_predict_dual(X_train , y_train , svm_model_d)
print("accucracy on training set",test_accuracy_d)
test_accuracy_d_2 = svm_predict_dual(X_test , y_test, svm_model_d)
print("accuracy on testing set", test_accuracy_d_2)

accucracy on training set 0.9777620896576068
accuracy on testing set 0.9679786524349566


In [12]:
#tuning C by k-fold cross validation
Cs = [30, 60, 90]

#Kfold with k = 5
kf = KFold(n_splits=5,shuffle=False)
kf.split(df_X_train)

accuracy_scores = []


for C in Cs:
    print("Calculating average accuracy score with C =", C)
    for train_index, test_index in kf.split(df_X_train):
        df_X_train_sub, df_X_test_sub = df_X_train.iloc[train_index], df_X_train.iloc[test_index]
        df_y_train_sub, df_y_test_sub = df_y_train[train_index], df_y_train[test_index]

        X_train_sub = np.array(df_X_train_sub)
        X_test_sub = np.array(df_X_test_sub)
        y_train_sub = np.array(df_y_train_sub)
        y_test_sub = np.array(df_y_test_sub)
        svm_model_d = svm_train_dual(X_train_sub,y_train_sub, C)
        test_accuracy_d = svm_predict_dual(X_test_sub,y_test_sub,svm_model_d)
        accuracy_scores.append(test_accuracy_d)


Calculating average accuracy score with C = 30
     pcost       dcost       gap    pres   dres
 0: -2.0566e+04 -5.5165e+06  2e+07  1e+00  2e-11
 1: -1.4342e+04 -1.9935e+06  3e+06  1e-01  2e-11
 2: -8.6340e+03 -4.7936e+05  7e+05  3e-02  2e-11
 3: -7.2078e+03 -2.1107e+05  3e+05  9e-03  1e-11
 4: -6.9589e+03 -7.8903e+04  8e+04  2e-03  1e-11
 5: -7.5168e+03 -3.8082e+04  3e+04  7e-04  1e-11
 6: -8.0545e+03 -3.0628e+04  2e+04  4e-04  2e-11
 7: -8.5673e+03 -2.5495e+04  2e+04  2e-04  2e-11
 8: -9.0578e+03 -2.2083e+04  1e+04  1e-04  2e-11
 9: -9.4426e+03 -2.0133e+04  1e+04  9e-05  2e-11
10: -9.7878e+03 -1.8443e+04  9e+03  5e-05  2e-11
11: -1.0072e+04 -1.6894e+04  7e+03  3e-05  2e-11
12: -1.0298e+04 -1.5811e+04  6e+03  2e-05  2e-11
13: -1.0532e+04 -1.4933e+04  4e+03  9e-06  2e-11
14: -1.0744e+04 -1.4250e+04  4e+03  5e-06  2e-11
15: -1.1009e+04 -1.3513e+04  3e+03  3e-06  2e-11
16: -1.1097e+04 -1.3210e+04  2e+03  2e-06  2e-11
17: -1.1345e+04 -1.2667e+04  1e+03  8e-07  2e-11
18: -1.1479e+04 -1.2405

     pcost       dcost       gap    pres   dres
 0: -4.0383e+04 -2.1315e+07  7e+07  1e+00  4e-11
 1: -2.7800e+04 -7.5326e+06  1e+07  1e-01  4e-11
 2: -1.6373e+04 -1.7622e+06  3e+06  3e-02  3e-11
 3: -1.3651e+04 -7.7508e+05  1e+06  9e-03  3e-11
 4: -1.3220e+04 -2.5902e+05  3e+05  2e-03  3e-11
 5: -1.4131e+04 -9.2477e+04  8e+04  5e-04  3e-11
 6: -1.5248e+04 -7.5327e+04  6e+04  3e-04  3e-11
 7: -1.6501e+04 -5.3309e+04  4e+04  1e-04  3e-11
 8: -1.7399e+04 -4.8464e+04  3e+04  8e-05  3e-11
 9: -1.8428e+04 -4.1783e+04  2e+04  5e-05  3e-11
10: -1.9050e+04 -3.8982e+04  2e+04  3e-05  4e-11
11: -1.9777e+04 -3.5623e+04  2e+04  2e-05  4e-11
12: -2.0074e+04 -3.3977e+04  1e+04  1e-05  4e-11
13: -2.0731e+04 -3.1189e+04  1e+04  8e-06  4e-11
14: -2.0980e+04 -3.0180e+04  9e+03  4e-06  4e-11
15: -2.1634e+04 -2.8148e+04  7e+03  2e-06  4e-11
16: -2.1837e+04 -2.7427e+04  6e+03  1e-06  4e-11
17: -2.2300e+04 -2.6192e+04  4e+03  7e-07  4e-11
18: -2.2504e+04 -2.5663e+04  3e+03  4e-07  4e-11
19: -2.2976e+04 -2.47

29: -2.4765e+04 -2.4766e+04  1e+00  3e-12  5e-11
30: -2.4765e+04 -2.4765e+04  1e-01  3e-13  6e-11
31: -2.4765e+04 -2.4765e+04  1e-03  3e-14  6e-11
Optimal solution found.
Calculating average accuracy score with C = 90
     pcost       dcost       gap    pres   dres
 0: -6.0114e+04 -4.7396e+07  2e+08  1e+00  7e-11
 1: -4.0991e+04 -1.6620e+07  3e+07  1e-01  6e-11
 2: -2.3701e+04 -3.8462e+06  6e+06  3e-02  5e-11
 3: -1.9716e+04 -1.6881e+06  2e+06  8e-03  4e-11
 4: -1.9320e+04 -5.5754e+05  6e+05  2e-03  4e-11
 5: -2.0478e+04 -1.7361e+05  2e+05  4e-04  4e-11
 6: -2.2564e+04 -1.2028e+05  1e+05  2e-04  4e-11
 7: -2.4074e+04 -9.0677e+04  7e+04  1e-04  5e-11
 8: -2.5327e+04 -7.8748e+04  5e+04  7e-05  5e-11
 9: -2.6708e+04 -6.8222e+04  4e+04  4e-05  5e-11
10: -2.7717e+04 -6.2953e+04  4e+04  3e-05  5e-11
11: -2.9221e+04 -5.5397e+04  3e+04  2e-05  5e-11
12: -2.9717e+04 -5.2873e+04  2e+04  1e-05  5e-11
13: -3.0445e+04 -4.9193e+04  2e+04  7e-06  6e-11
14: -3.1433e+04 -4.5555e+04  1e+04  4e-06  6e-11

21: -3.6094e+04 -3.8837e+04  3e+03  3e-08  7e-11
22: -3.6472e+04 -3.8206e+04  2e+03  2e-08  7e-11
23: -3.6692e+04 -3.7817e+04  1e+03  8e-09  7e-11
24: -3.6808e+04 -3.7591e+04  8e+02  3e-09  7e-11
25: -3.6955e+04 -3.7388e+04  4e+02  1e-09  7e-11
26: -3.7045e+04 -3.7272e+04  2e+02  5e-10  7e-11
27: -3.7093e+04 -3.7209e+04  1e+02  1e-10  8e-11
28: -3.7129e+04 -3.7168e+04  4e+01  4e-11  8e-11
29: -3.7143e+04 -3.7152e+04  1e+01  8e-12  8e-11
30: -3.7146e+04 -3.7148e+04  1e+00  9e-13  8e-11
31: -3.7147e+04 -3.7147e+04  2e-01  1e-12  8e-11
32: -3.7147e+04 -3.7147e+04  3e-03  3e-12  9e-11
Optimal solution found.


In [22]:
scores = np.array(accuracy_scores)
scores = scores.reshape(-1,5)
#calculating average accuracy scores of C= 30, 60, 90
avg_scores = np.mean(scores, axis = 1)
print(avg_scores)

[0.96587875 0.96587882 0.96623176]
