In [1]:
from kernels import * 
from learning_models import *
from tools import *
import pandas as pd
import numpy as np
from time import time 
from sklearn.svm import SVC
from tqdm import tqdm
from autoreload import superreload

In [2]:
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [42]:
def spectrum_kernel(X, length):
    all_sequence = {}
    
    for idx in range(len(X)):
        data = X[idx]
        for i in range(len(data)-length + 1):
            seq1 = data[i:i+length]
            if seq1 in all_sequence:
                if idx in all_sequence[seq1]:
                    all_sequence[seq1][idx] += 1
                else:
                    all_sequence[seq1][idx] = 1
            else:
                all_sequence[seq1] = {}
                all_sequence[seq1][idx] = 1
    
    kernel = np.zeros((len(X), len(X)))
    
    for seq in all_sequence:
        for key1 in all_sequence[seq]:
            for key2 in all_sequence[seq]:
                kernel[key1][key2] += all_sequence[seq][key1]*all_sequence[seq][key2]
                
    
    return kernel

In [3]:
kernel = pd.read_csv('Kernel_MisMatch_5_1.csv', index_col=0).values

In [63]:
data = pd.read_csv("./data/Xtr2.csv", header=None)[0].values.tolist()

In [64]:
%%time
kernel = spectrum_kernel(data,5)

CPU times: user 28.6 s, sys: 141 ms, total: 28.8 s
Wall time: 29 s


In [65]:
y = pd.read_csv("./data/Ytr2.csv", index_col=0)['Bound'].values
#dataY = pd.read_csv("Data/Train/Ytr0.csv", index_col=0)
#y = dataY['Bound'].values

In [49]:
print(y)

[1 0 1 ..., 1 0 0]


In [5]:
X = load_and_transform("./data/Xtr0.csv")

In [6]:
lambdas = [0.001,0.01,0.05,0.1,1,10,100,1000]

# SVM Performance

## 1. Hinge Loss

In [7]:
svm = SVM(loss='hinge')

In [68]:
from sklearn.svm import SVC

for c in [1e-6,1e-5,1e-4,0.001,0.01,0.05,0.1,1,10,100,1000]:
    print("------- {} ---------".format(c))
    svc = SVC(C = c, kernel='precomputed')
    
    K_norm = normalize_kernel(kernel)

    X_train, X_test, y_train, y_test, K_train, K_test = train_test_split(X,y,kernel,test_size=0.2,
                                                                                     verbose=False)
    svc.fit(K_train, y_train)

    y_pred = svc.predict(K_test)
    y_pred_t = svc.predict(K_train)
    print(accuracy_score(y_test, y_pred))
    print(accuracy_score(y_train, y_pred_t))

------- 1e-06 ---------
0.56
0.58125
------- 1e-05 ---------
0.61
0.625625
------- 0.0001 ---------
0.54
0.60125
------- 0.001 ---------
0.5675
0.725
------- 0.01 ---------
0.605
0.825
------- 0.05 ---------
0.635
0.87625
------- 0.1 ---------
0.58
0.90375
------- 1 ---------
0.5975
0.9725
------- 10 ---------
0.58
1.0
------- 100 ---------
0.575
1.0
------- 1000 ---------
0.595
1.0


### 1.1 With Kernel unmodified

In [66]:
df = evaluate_model(svm,X,y,kernel, lambdas=lambdas, n_validations=10, normalize=False)

% of validation rounds: 100%|██████████| 10/10 [04:38<00:00, 27.87s/it]

Maximal Testing Accuracy Score obtained with Lambda = 0.1
	Train accuracy score : 0.788
	Test accuracy score : 0.638





In [67]:
df

Unnamed: 0,avg Training Score,avg Testing Score
0.001,0.936375,0.59875
0.01,0.866375,0.61025
0.05,0.814312,0.634
0.1,0.788312,0.6385
1.0,0.675875,0.61
10.0,0.674813,0.608
100.0,0.674813,0.60825
1000.0,0.674812,0.60825


In [14]:
N_kernel = normalize_kernel(kernel)

In [21]:
print(N_kernel[0])

[ 1.          0.52899877  0.52787922 ...,  0.54665661  0.54742772
  0.52422398]


In [14]:
df = evaluate_model(svm,X,y,kernel, lambdas=lambdas, n_validations=10, normalize=True)

% of validation rounds: 100%|██████████| 10/10 [04:13<00:00, 25.35s/it]

Maximal Testing Accuracy Score obtained with Lambda = 0.001
	Train accuracy score : 0.770
	Test accuracy score : 0.560





### 1.2 With Kernel taking into account the bias (1 added to every entry)

In [15]:
df = evaluate_model(svm,X,y,kernel+1, lambdas=lambdas, n_validations=10, normalize=False)

% of validation rounds: 100%|██████████| 10/10 [04:58<00:00, 29.84s/it]

Maximal Testing Accuracy Score obtained with Lambda = 0.05
	Train accuracy score : 0.754
	Test accuracy score : 0.556





In [16]:
df = evaluate_model(svm,X,y,kernel+1, lambdas=lambdas, n_validations=10, normalize=True)

% of validation rounds: 100%|██████████| 10/10 [04:37<00:00, 27.79s/it]

Maximal Testing Accuracy Score obtained with Lambda = 0.001
	Train accuracy score : 0.769
	Test accuracy score : 0.564





## 2. Squared Hinge Loss

In [17]:
svm = SVM(loss='squared_hinge')

### 2.1 With Kernel unmodified

In [18]:
df = evaluate_model(svm,X,y,kernel, lambdas=lambdas, n_validations=10, normalize=False)

% of validation rounds: 100%|██████████| 10/10 [01:25<00:00,  8.54s/it]

Maximal Testing Accuracy Score obtained with Lambda = 0.05
	Train accuracy score : 0.837
	Test accuracy score : 0.567





In [19]:
df = evaluate_model(svm,X,y,kernel, lambdas=lambdas, n_validations=10, normalize=True)

% of validation rounds: 100%|██████████| 10/10 [01:08<00:00,  6.86s/it]

Maximal Testing Accuracy Score obtained with Lambda = 0.05
	Train accuracy score : 0.775
	Test accuracy score : 0.569





### 2.2 With Kernel taking into account the bias (1 added to every entry)

In [20]:
df = evaluate_model(svm,X,y,kernel+1, lambdas=lambdas, n_validations=10, normalize=False)

% of validation rounds: 100%|██████████| 10/10 [01:18<00:00,  7.84s/it]

Maximal Testing Accuracy Score obtained with Lambda = 1.0
	Train accuracy score : 0.781
	Test accuracy score : 0.571





In [21]:
df = evaluate_model(svm,X,y,kernel+1, lambdas=lambdas, n_validations=10, normalize=True)

% of validation rounds: 100%|██████████| 10/10 [01:08<00:00,  6.87s/it]

Maximal Testing Accuracy Score obtained with Lambda = 0.01
	Train accuracy score : 0.788
	Test accuracy score : 0.565





# KRR Performance

In [22]:
krr = KRR()

### 1. With Kernel unmodified

In [24]:
df = evaluate_model(krr,X,y,kernel, lambdas=lambdas, n_validations=10, normalize=False, binary=False)

% of validation rounds: 100%|██████████| 10/10 [00:06<00:00,  1.55it/s]

Maximal Testing Accuracy Score obtained with Lambda = 0.1 and threshold = 0.5
	Train accuracy score : 0.793
	Test accuracy score : 0.567





In [25]:
df = evaluate_model(krr,X,y,kernel, lambdas=lambdas, n_validations=10, normalize=True, binary=False)

% of validation rounds: 100%|██████████| 10/10 [00:06<00:00,  1.64it/s]

Maximal Testing Accuracy Score obtained with Lambda = 0.001 and threshold = 0.5
	Train accuracy score : 0.839
	Test accuracy score : 0.553





### 2. With Kernel taking into account the bias (1 added to every entry)

In [26]:
df = evaluate_model(krr,X,y,kernel+1, lambdas=lambdas, n_validations=10, normalize=False, binary=False)

% of validation rounds: 100%|██████████| 10/10 [00:06<00:00,  1.57it/s]

Maximal Testing Accuracy Score obtained with Lambda = 0.05 and threshold = 0.5
	Train accuracy score : 0.826
	Test accuracy score : 0.564





In [27]:
df = evaluate_model(krr,X,y,kernel+1, lambdas=lambdas, n_validations=10, normalize=True, binary=False)

% of validation rounds: 100%|██████████| 10/10 [00:06<00:00,  1.66it/s]

Maximal Testing Accuracy Score obtained with Lambda = 0.001 and threshold = 0.5
	Train accuracy score : 0.838
	Test accuracy score : 0.570



