In [1]:
import pandas as pd
import numpy as np
from cvxopt import matrix, solvers
from cvxopt.modeling import variable
from sklearn.svm import SVC

In [15]:
data = pd.read_csv('2019EE10577.csv', header = None)

t1 = 3
t2 = 7
num_f = 25
train_ex = 550

bin_data = data[(data[25] == t1) | (data[25] == t2)].sample(frac=1)
X_bin = np.array(bin_data.loc[:,:num_f-1])
t_bin = np.array(bin_data.loc[:,25])
t_bin[t_bin == t1] = -1
t_bin[t_bin==t2] = 1

X_bin_train = X_bin[:train_ex]
t_bin_train = t_bin[:train_ex]
X_bin_val = X_bin[train_ex:]
t_bin_val = t_bin[train_ex:]

In [17]:
ker = 'rbf'
if ker is 'linear':
    C = 0.01
if ker is 'rbf':
    C = 0.1
    gamma = 0.1

n_samples, n_features = X_bin_train.shape
K = np.zeros((n_samples, n_samples))

for i in range(n_samples):
    for j in range(n_samples):
        if ker is 'linear':
            K[i,j] = np.dot(X_bin_train[i],np.transpose(X_bin_train[j]))
        if ker is 'rbf':
            K[i,j] = np.exp(-1*gamma*np.sum(np.square(X_bin_train[i]-X_bin_train[j])))

P = matrix(np.outer(t_bin_train,t_bin_train) * K)
q = matrix(np.ones(n_samples) * -1)
A = matrix(t_bin_train, (1,n_samples))
A = matrix(A,(1,n_samples),'d')
b = matrix(0.0)
G = matrix(np.vstack((np.diag(np.ones(n_samples) * -1), np.identity(n_samples))))
h = matrix(np.hstack((np.zeros(n_samples), np.ones(n_samples) * C)))
solution = solvers.qp(P, q, G, h, A, b)
a = np.ravel(solution['x'])
sv = a > 1e-5

     pcost       dcost       gap    pres   dres
 0: -4.9528e+01 -1.1100e+02  2e+03  2e+01  4e-16
 1: -3.1886e+01 -9.9382e+01  1e+02  6e-01  4e-16
 2: -2.8786e+01 -4.5140e+01  2e+01  1e-15  5e-16
 3: -3.0766e+01 -3.3423e+01  3e+00  1e-15  3e-16
 4: -3.1597e+01 -3.2178e+01  6e-01  3e-16  2e-16
 5: -3.1824e+01 -3.1874e+01  5e-02  6e-16  3e-16
 6: -3.1847e+01 -3.1849e+01  2e-03  7e-16  3e-16
 7: -3.1848e+01 -3.1848e+01  6e-05  2e-16  3e-16
 8: -3.1848e+01 -3.1848e+01  2e-06  3e-16  3e-16
Optimal solution found.


In [18]:


lm = a[sv]
lm = lm.reshape(len(lm),1)
sv_t = t_bin_train[sv]
sv_t = sv_t.reshape(len(sv_t),1)
sv_x = X_bin_train[sv]



In [19]:
w = np.sum(lm*sv_t*sv_x,axis=0)
print('w =', w)

w = [-30.48453053  88.72882281  29.2280091    8.51882917 -32.67431905
  21.51407817  27.35737663  -6.2340093   -1.53189171   8.37425632
 -11.55679634 -21.14731712   9.28183891  -1.23713614  -4.27008555
   4.24755615   5.53982768  -2.16756143   1.1195974    7.07656947
  -1.72789216   5.02643467   0.96919255  -2.3020067    3.77825609]


In [20]:
b = np.mean(t_bin_train - np.dot(X_bin_train,w))
print('b =', b)

b = -8.609506062503877


In [21]:
y_train = np.dot(X_bin_train,w) + b
y_val = np.dot(X_bin_val,w)+b
y_train[y_train<0] = -1
y_train[y_train>=0] = 1
y_val[y_val<0] = -1
y_val[y_val>=0] = 1
y_train = y_train.astype(int)
y_val = y_val.astype(int)

In [22]:


res1 = (y_train == t_bin_train)
res2 = (y_val == t_bin_val)
print('Training score =',(res1==True).sum()/len(res1))
print('Validation score =',(res2==True).sum()/len(res2))
print(str(len(sv_x))+' support vectors using CVXOPT:', np.where(sv==True)[0].tolist())



Training score = 0.9618181818181818
Validation score = 0.9923076923076923
481 support vectors using CVXOPT: [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75, 76, 80, 81, 82, 83, 85, 86, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166, 167, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 183, 186, 187, 188, 189, 190, 192, 193, 196, 198, 200, 202, 203, 204, 206, 207, 208, 209, 210, 211, 212, 214, 215, 216, 217, 218, 219, 220, 221, 223, 224, 225, 226, 227, 22

In [24]:
if ker is 'linear':
    s = SVC(kernel='linear',C = 1)
if ker is 'rbf':
    s = SVC(kernel='rbf', C=0.1, gamma=0.1)
s.fit(X_bin_train,t_bin_train)
print('Training score using SVC =',s.score(X_bin_train,t_bin_train))
print('Validation score using SVC =',s.score(X_bin_val,t_bin_val))
print(str(len(s.support_))+' support vectors using SVC:', sorted(s.support_))

Training score using SVC = 0.9636363636363636
Validation score using SVC = 0.9230769230769231
478 support vectors using SVC: [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75, 76, 80, 81, 82, 83, 85, 86, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 165, 166, 167, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 183, 186, 187, 188, 189, 190, 192, 193, 196, 198, 200, 202, 203, 204, 206, 207, 208, 209, 210, 212, 214, 215, 216, 217, 218, 219, 220, 221, 223, 224, 225, 226,