In [1]:
from kernels import * 
from learning_models import *
from tools import *
import pandas as pd
import numpy as np
from time import time 
from sklearn.svm import SVC
from tqdm import tqdm
from autoreload import superreload

In [2]:
import sys
import warnings
# Install a blanket "ignore" warnings filter, but only when no warning options
# were passed to the interpreter (sys.warnoptions is empty); explicit -W style
# options are therefore left in charge.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
def load_and_transform(data_path):
    '''
    Loads the sequences located at data_path and encodes them as an array of integer values in {0,1,2,3}.

    The file is read with pandas as a header-less CSV and only its first column is used: every entry of that
    column is treated as one sequence. Nucleotides are encoded as A -> 0, T -> 1, C -> 2, G -> 3, regardless of
    case.

    Parameters
    ----------
    data_path: string
        the path to the data to be loaded

    Returns
    -------
    numpy.ndarray
        the encoded sequences, one row per sequence and one integer code per nucleotide

    Raises
    ------
    ValueError
        if a sequence contains a character other than A, T, C or G
    '''
    nucleotide_codes = {"A": 0, "T": 1, "C": 2, "G": 3}
    sequences = pd.read_csv(data_path, header=None)[0].values.tolist()
    transformed = []
    for row, sequence in enumerate(sequences):
        try:
            # An explicit lookup (rather than replace + int) guarantees that only the four nucleotides can
            # produce a code, so stray digits or letters cannot leak values outside {0,1,2,3}.
            transformed.append([nucleotide_codes[base] for base in sequence.upper()])
        except KeyError as err:
            raise ValueError(
                "Unexpected character {!r} in sequence {} of {}".format(err.args[0], row, data_path)
            ) from err
    return np.array(transformed)

In [5]:
# Load the training sequences from disk and integer-encode them with load_and_transform.
X_train = load_and_transform("Data/Train/Xtr0.csv")

In [8]:
# Load the test (submission) sequences from disk and integer-encode them with load_and_transform.
X_sub = load_and_transform("Data/Test/Xte0.csv")

In [19]:
# Compute a kernel with MismatchTrie and report the wall-clock time taken, in minutes.
# NOTE(review): `X` is not assigned by any visible top-level cell (only X_train and X_sub are),
# and this cell's execution count is out of order, so on a fresh kernel this presumably raises
# NameError. X was possibly the concatenation of X_train and X_sub -- confirm and define it explicitly.
trie = MismatchTrie(verbose=0)
print("-------- Computing mismatch kernel --------")
start = time()
# NOTE(review): the meaning of the arguments 4, 3, 1 is fixed by MismatchTrie.traverse, which is not
# visible here -- presumably alphabet size, k-mer length and number of mismatches; confirm.
# Only the first element of the value returned by traverse is kept.
kern = trie.traverse(X, 4, 3, 1)[0]
# Normalization of the kernel is currently disabled:
#kern = normalize_kernel(kern)
end = time()
print("-------- Done in {:.3f} minutes --------".format((end-start)/60))

-------- Computing mismatch kernel --------
-------- Done in 20.122 minutes --------


In [17]:
def submission_kernel_pipeline(train_data_path, submission_data_path, compute_kernel, normalize=False):
    '''
    Builds the two kernel blocks needed to train on the whole training set and predict on the submission data.

    The training and submission files are loaded and encoded with load_and_transform, stacked (training rows
    first) and handed to compute_kernel in a single call. The resulting matrix is optionally normalized as a
    whole, then sliced into the two returned blocks.

    Parameters
    ----------
    train_data_path: string
        path to the training data
    submission_data_path: string
        path to the submission data
    compute_kernel: function
        a function which computes a kernel matrix when given the stacked data
    normalize: Boolean, default False
        whether or not to normalize the (whole) kernel before splitting it into the train and submission blocks

    Returns
    -------
    tuple
        (K_train, K_sub) where K_train is the block of the kernel indexed by training rows and training columns,
        and K_sub is the block indexed by submission rows and training columns
    '''
    train_sequences = load_and_transform(train_data_path)
    submission_sequences = load_and_transform(submission_data_path)
    n_train = len(train_sequences)
    stacked = np.concatenate((train_sequences, submission_sequences))
    print('------ Data loaded, transformed and concatenated ------')

    print('------ Computing Kernel ------')
    full_kernel = compute_kernel(stacked)
    if normalize:
        full_kernel = normalize_kernel(full_kernel)

    # Training samples occupy the first n_train rows/columns; submission samples come after them.
    train_idx = slice(None, n_train)
    submission_idx = slice(n_train, None)
    return full_kernel[train_idx, train_idx], full_kernel[submission_idx, train_idx]

In [18]:
# Build the train-vs-train block (K_train) and the test-vs-train block (K_test) of the kernel produced by
# polynomial_kernel_matrix with deg=3; normalization is left at its default (off).
K_train, K_test = submission_kernel_pipeline("Data/Train/Xtr0.csv", "Data/Test/Xte0.csv", lambda x: polynomial_kernel_matrix(x, deg=3))

------ Data loaded, transformed and concatenated ------
------ Computing Kernel ------


In [50]:
# Read the training labels, using the first CSV column as the index, and keep the 'Bound' column as an array.
dataY = pd.read_csv("Data/Train/Ytr0.csv", index_col=0)
y_train = dataY['Bound'].values

In [53]:
# Fit the project's SVM on the precomputed training kernel and score it on that same training data.
lmbd = 0.001  # passed to SVM as `lmbd` -- presumably the regularization strength; confirm in learning_models
model = SVM(lmbd=lmbd)
# The kernel matrix is shifted by +1 both for training and for prediction.
model.train(K_train+1, y_train)
y_train_pred = model.predict(K_train+1)
# NOTE(review): despite its name, train_error holds the value returned by accuracy_score and is
# reported below as an accuracy, not an error rate.
train_error = accuracy_score(y_train, y_train_pred)
print('Accuracy score on training set : {:.3f}'.format(train_error))

Accuracy score on training set : 0.600
