# Processing

In [129]:
import pandas as pd
import numpy as np
from cvxopt import matrix, solvers

from datahandler import datahandler
import predictors
from kernels import Kernel
from sklearn import datasets
import matplotlib.pyplot as plt

In [130]:
# Relative locations of the training inputs and of their labels.
path_data, path_label = (
    'dataset/Xtr2_mat100.csv',
    'dataset/Ytr2.csv',
)

In [131]:
# Build the data handler from the two CSV paths.
# features_generated=True is forwarded as-is; presumably it tells the handler
# that the file already contains numeric features — TODO confirm in datahandler.
dataset = datahandler(
    path_data,
    path_label,
    features_generated=True,
)
X = dataset.X
y = dataset.Y

# Train/validation split; the split policy is whatever datahandler.train_val implements.
X_train, X_val, Y_train, Y_val = dataset.train_val(X, y)

# Predictors

#### Ridge regression

In [132]:
# Ridge regression baseline on the current train/validation split.
# The labels are copied first (presumably so the shared split stays untouched).
Y_train_ridge = Y_train.copy()
Y_val_ridge = Y_val.copy()

# Value handed to predictors.RR as its single constructor argument.
lambda_reg = 1e-5
rr = predictors.RR(lambda_reg)
rr.fit(X_train, Y_train_ridge)

# Predictions for both subsets (validation first, as in the fitting order above).
Y_val_predict = rr.predict(X_val)
Y_train_predict = rr.predict(X_train)

# Accuracy = number of element-wise label matches divided by the subset size.
train_matches = Y_train_predict == Y_train_ridge
val_matches = Y_val_predict == Y_val_ridge
train_acc = np.sum(train_matches)/len(Y_train_ridge)
val_acc = np.sum(val_matches)/len(Y_val_predict)

print('Train Accuracy: {}%'.format(train_acc*100))
print('Val Accuracy: {}%'.format(val_acc*100))
    

Train Accuracy: 72.8870858688303%
Val Accuracy: 67.17850287907869%


#### Kernel ridge regression

In [133]:
# Kernel ridge regression with a Gaussian kernel on the current split.
# Labels are copied once; fitting and scoring below both use these copies.
Y_train_ridge, Y_val_ridge = Y_train.copy(), Y_val.copy()

sigma = 0.06        # argument of Kernel.gaussian
lambda_reg = 0.007  # argument of predictors.Kernel_RR

kernel = Kernel(Kernel.gaussian(sigma))
# Kernel matrix computed from the training inputs only.
K_RR = kernel.kernel_matrix(X_train)
# Fit on the same label copy that the training accuracy is scored against.
alpha = predictors.Kernel_RR(lambda_reg).fit(K_RR, Y_train_ridge)

# Training predictions reuse the kernel matrix computed above;
# validation predictions are delegated to kernel.predict.
Y_train_predict = np.sign(np.dot(K_RR, alpha))
Y_val_predict = kernel.predict(X_train, X_val, alpha)

# Accuracy = element-wise label matches divided by the number of labels.
train_acc = np.sum(Y_train_predict == Y_train_ridge)/len(Y_train_ridge)
val_acc = np.sum(Y_val_predict == Y_val_ridge)/len(Y_val_ridge)

print('Train Accuracy: {}%'.format(train_acc*100))
print('Val Accuracy: {}%'.format(val_acc*100))

100%|█████████████████████████████████████████████████████████████████████████████| 1479/1479 [00:14<00:00, 105.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 521/521 [00:09<00:00, 56.06it/s]

Train Accuracy: 98.85057471264368%
Val Accuracy: 68.71401151631477%





### SVM

#### Toy Dataset - Double moons

In [9]:
# Sanity check of the kernel SVM on scikit-learn's two-moons toy dataset.
# The toy arrays carry a `_toy` suffix so that this cell does not overwrite
# X, X_train, Y_train, X_val or Y_val.
TOY_SEED = 0  # fixed seed: the generated points and the split are reproducible

# `shuffle` is the second parameter of make_moons; it is passed by keyword
# because recent scikit-learn releases reject it as a positional argument.
X_toy, Y_toy = datasets.make_moons(n_samples=150, shuffle=True, random_state=TOY_SEED)
Y_toy[Y_toy == 0] = -1  # relabel the classes from {0, 1} to {-1, +1}
# Column vector of floats (the labels are kept as floats for the solver call below).
Y_toy = Y_toy.reshape(-1, 1).astype(float)

# Random split: each sample goes to the training set with probability 0.75.
toy_rng = np.random.RandomState(TOY_SEED)
train_mask = toy_rng.rand(X_toy.shape[0]) < 0.75
X_train_toy, Y_train_toy = X_toy[train_mask], Y_toy[train_mask]
X_val_toy, Y_val_toy = X_toy[~train_mask], Y_toy[~train_mask]

print('X_train and Y_train shape: {},{}'.format(X_train_toy.shape, Y_train_toy.shape))
print('X_val and Y_val shape: {},{}'.format(X_val_toy.shape, Y_val_toy.shape))

Y_train_svm, Y_val_svm = Y_train_toy.copy(), Y_val_toy.copy()

sigma = 0.1       # argument of Kernel.gaussian
lambda_reg = 100  # third argument of predictors.SVM.fit

kernel = Kernel(Kernel.gaussian(sigma))
# Kernel matrix computed from the toy training inputs only.
K_SVM = kernel.kernel_matrix(X_train_toy)
alpha = predictors.SVM.fit(K_SVM, Y_train_svm, lambda_reg)

# Training predictions reuse the kernel matrix; validation goes through kernel.predict.
Y_train_predict = np.sign(np.dot(K_SVM, alpha))
Y_val_predict = kernel.predict(X_train_toy, X_val_toy, alpha)

# Accuracy = element-wise label matches divided by the number of labels.
train_acc = np.sum(Y_train_predict == Y_train_svm)/len(Y_train_svm)
val_acc = np.sum(Y_val_predict == Y_val_svm)/len(Y_val_svm)

print('Train Accuracy: {}%'.format(train_acc*100))
print('Val Accuracy: {}%'.format(val_acc*100))

X_train and Y_train shape: (110, 2),(110, 1)
X_val and Y_val shape: (40, 2),(40, 1)




#### Real data

In [67]:
# Kernel SVM (Gaussian kernel) on the real dataset.
# The split is rebuilt from `dataset` so this cell does not depend on whatever
# an earlier cell last stored in X_train / Y_train / X_val / Y_val (for
# example a toy dataset) when the notebook is run top to bottom.
# NOTE(review): train_val is called again here; if it splits at random, this
# split differs from the one used by earlier cells — confirm that is acceptable.
X_train, X_val, Y_train, Y_val = dataset.train_val(dataset.X, dataset.Y)
Y_train_svm, Y_val_svm = Y_train.copy(), Y_val.copy()

sigma = 0.043      # argument of Kernel.gaussian
lambda_reg = 1e-5  # third argument of predictors.SVM.fit

kernel = Kernel(Kernel.gaussian(sigma))
# Kernel matrix computed from the training inputs only.
K_SVM = kernel.kernel_matrix(X_train)
alpha = predictors.SVM.fit(K_SVM, Y_train_svm, lambda_reg)

# Training predictions reuse the kernel matrix; validation goes through kernel.predict.
Y_train_predict = np.sign(np.dot(K_SVM, alpha))
Y_val_predict = kernel.predict(X_train, X_val, alpha)

# Accuracy = element-wise label matches divided by the number of labels.
train_acc = np.sum(Y_train_predict == Y_train_svm)/len(Y_train_svm)
val_acc = np.sum(Y_val_predict == Y_val_svm)/len(Y_val_svm)

print('Train Accuracy: {}%'.format(train_acc*100))
print('Val Accuracy: {}%'.format(val_acc*100))

100%|█████████████████████████████████████████████████████████████████████████████| 1517/1517 [00:14<00:00, 107.57it/s]


     pcost       dcost       gap    pres   dres
 0:  3.4998e+04 -1.4472e+05  2e+05  3e-17  3e-14
 1:  4.9182e+03 -8.4190e+03  1e+04  2e-16  3e-14
 2:  3.8851e+02 -1.3852e+03  2e+03  2e-16  8e-15
 3: -1.2581e+02 -3.5718e+02  2e+02  2e-16  4e-15
 4: -1.6052e+02 -1.7765e+02  2e+01  2e-16  2e-15
 5: -1.6090e+02 -1.6119e+02  3e-01  2e-16  1e-15
 6: -1.6090e+02 -1.6090e+02  3e-03  2e-16  1e-15
 7: -1.6090e+02 -1.6090e+02  3e-05  2e-16  1e-15
Optimal solution found.


100%|████████████████████████████████████████████████████████████████████████████████| 483/483 [00:08<00:00, 55.35it/s]

Train Accuracy: 100.0%
Val Accuracy: 59.42028985507246%





# String kernels

In [117]:
import pandas as pd
import numpy as np
from cvxopt import matrix, solvers

from datahandler import datahandler
import predictors
from kernels import Kernel
from sklearn import datasets
import matplotlib.pyplot as plt

In [118]:
# Relative locations of the inputs and labels used by the string-kernel cells.
path_data, path_label = (
    'dataset/Xtr2.csv',
    'dataset/Ytr2.csv',
)

# features_generated=False is forwarded as-is; presumably the handler then
# keeps the raw inputs so they can be embedded later — TODO confirm in datahandler.
dataset = datahandler(
    path_data,
    path_label,
    features_generated=False,
)
X = dataset.X
y = dataset.Y

### Spectrum Kernel

In [115]:
# Spectrum-kernel experiment: embed the inputs, then train an SVM with a
# dot-product kernel on the embedding.
# The same integer is passed to both handler calls; presumably it is the
# substring length of the spectrum kernel — TODO confirm in datahandler.
spectrum_k = 8
dataset.compute_vocabulary(spectrum_k)
dataset.spectral_embedding(spectrum_k, dataset.vocab)

# Split the embedded inputs (read from dataset.X_embedded) together with the labels.
X_train, X_val, Y_train, Y_val = dataset.train_val(dataset.X_embedded, y)
Y_train_svm = Y_train.copy()
Y_val_svm = Y_val.copy()

lambda_reg = 0.1  # third argument of predictors.SVM.fit

kernel = Kernel(Kernel.dot_product())

# Kernel matrix computed from the training embeddings only.
K_SVM = kernel.kernel_matrix(X_train)
alpha = predictors.SVM.fit(K_SVM, Y_train_svm, lambda_reg)

# Training predictions reuse the kernel matrix; validation goes through kernel.predict.
Y_train_predict = np.sign(np.dot(K_SVM, alpha))
Y_val_predict = kernel.predict(X_train, X_val, alpha)

# Accuracy = element-wise label matches divided by the number of labels.
train_matches = Y_train_predict == Y_train_svm
val_matches = Y_val_predict == Y_val_svm
train_acc = np.sum(train_matches)/len(Y_train_svm)
val_acc = np.sum(val_matches)/len(Y_val_svm)

print('Train Accuracy: {}%'.format(train_acc*100))
print('Val Accuracy: {}%'.format(val_acc*100))

100%|███████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 23836.69it/s]

Computing vocabulary...



100%|██████████████████████████████████████████████████████████████████████████████| 1523/1523 [00:31<00:00, 47.84it/s]


     pcost       dcost       gap    pres   dres
 0: -1.3523e+01 -1.8824e+01  4e+03  6e+01  1e-15
 1: -1.3491e+01 -1.8342e+01  4e+02  6e+00  7e-16
 2: -1.2586e+01 -1.6340e+01  8e+01  1e+00  1e-15
 3: -8.1576e+00 -1.2361e+01  1e+01  1e-01  4e-15
 4: -6.6340e+00 -8.5367e+00  2e+00  3e-03  2e-15
 5: -6.8726e+00 -7.1855e+00  3e-01  4e-04  8e-16
 6: -6.9428e+00 -6.9977e+00  6e-02  6e-05  7e-16
 7: -6.9582e+00 -6.9667e+00  9e-03  6e-06  7e-16
 8: -6.9613e+00 -6.9618e+00  5e-04  2e-07  7e-16
 9: -6.9615e+00 -6.9615e+00  1e-05  4e-09  8e-16


  1%|▌                                                                                 | 3/477 [00:00<00:16, 28.26it/s]

10: -6.9615e+00 -6.9615e+00  5e-07  7e-11  8e-16
Optimal solution found.


100%|████████████████████████████████████████████████████████████████████████████████| 477/477 [00:20<00:00, 23.74it/s]

Train Accuracy: 94.28759028233749%
Val Accuracy: 65.61844863731656%





### Mismatch Kernel

In [13]:
# Mismatch-kernel experiment: embed the inputs, then train an SVM with a
# dot-product kernel on the embedding.
# The first integer is shared by both handler calls; presumably it is the
# substring length and the second one the number of allowed mismatches —
# TODO confirm in datahandler.
mismatch_k = 6
mismatch_m = 1
dataset.compute_vocabulary(mismatch_k)
dataset.mismatch_embedding(mismatch_k, mismatch_m)

# Split the embedded inputs (read from dataset.X_embedded) together with the labels.
X_train, X_val, Y_train, Y_val = dataset.train_val(dataset.X_embedded, y)
Y_train_svm = Y_train.copy()
Y_val_svm = Y_val.copy()

lambda_reg = 0.1  # third argument of predictors.SVM.fit

kernel = Kernel(Kernel.dot_product())

# Kernel matrix computed from the training embeddings only.
K_SVM = kernel.kernel_matrix(X_train)
alpha = predictors.SVM.fit(K_SVM, Y_train_svm, lambda_reg)

# Training predictions reuse the kernel matrix; validation goes through kernel.predict.
Y_train_predict = np.sign(np.dot(K_SVM, alpha))
Y_val_predict = kernel.predict(X_train, X_val, alpha)

# Accuracy = element-wise label matches divided by the number of labels.
train_matches = Y_train_predict == Y_train_svm
val_matches = Y_val_predict == Y_val_svm
train_acc = np.sum(train_matches)/len(Y_train_svm)
val_acc = np.sum(val_matches)/len(Y_val_svm)

print('Train Accuracy: {}%'.format(train_acc*100))
print('Val Accuracy: {}%'.format(val_acc*100))

     pcost       dcost       gap    pres   dres
 0: -4.4647e+00 -9.5650e+00  3e+03  6e+01  2e-13
 1: -4.4494e+00 -9.3208e+00  2e+02  3e+00  2e-13
 2: -3.6450e+00 -8.3706e+00  4e+01  5e-01  1e-13
 3: -2.3852e+00 -7.1195e+00  1e+01  1e-01  8e-14
 4: -1.9639e+00 -4.6086e+00  4e+00  2e-02  3e-14
 5: -1.8683e+00 -2.4995e+00  7e-01  2e-03  3e-14
 6: -1.9329e+00 -2.0509e+00  1e-01  3e-04  3e-14
 7: -1.9513e+00 -1.9674e+00  2e-02  3e-05  3e-14
 8: -1.9543e+00 -1.9569e+00  3e-03  4e-06  3e-14
 9: -1.9549e+00 -1.9550e+00  1e-04  1e-07  3e-14
10: -1.9549e+00 -1.9549e+00  3e-06  2e-09  3e-14
11: -1.9549e+00 -1.9549e+00  8e-08  3e-11  3e-14
Optimal solution found.
Train Accuracy: 98.14077025232405%
Val Accuracy: 72.8744939271255%
