In [1]:
import numpy as np
import pandas as pd
from kernels import * 
from learning_models import *
from tools import *
import pandas as pd
import numpy as np
from time import time 
from sklearn.svm import SVC
from tqdm import tqdm
from autoreload import superreload
import operator

In [2]:
def spectrum_kernel(X, length):
    all_sequence = {}
    
    for idx in range(len(X)):
        data = X[idx]
        for i in range(len(data)-length + 1):
            seq1 = data[i:i+length]
            if seq1 in all_sequence:
                if idx in all_sequence[seq1]:
                    all_sequence[seq1][idx] += 1
                else:
                    all_sequence[seq1][idx] = 1
            else:
                all_sequence[seq1] = {}
                all_sequence[seq1][idx] = 1
    
    kernel = np.zeros((len(X), len(X)))
    
    for seq in all_sequence:
        for key1 in all_sequence[seq]:
            for key2 in all_sequence[seq]:
                kernel[key1][key2] += all_sequence[seq][key1]*all_sequence[seq][key2]
                
    return kernel

In [3]:
def embedding_mismatch_kernel(X, lengths, mismatch, verbose = False):
    
    all_sequences_index = {}
    id_last_seq = 0
    for data in X:
        for i in range(len(data)-lengths + 1):
            seq = data[i:i+lengths]
            if seq not in all_sequences_index:
                all_sequences_index[seq] = id_last_seq
                id_last_seq += 1

    
    vectors = np.zeros((len(X),len(all_sequences_index)))
    
    for idx in tqdm(range(len(X))):
        data = X[idx]
        for i in range(len(data)-lengths + 1):
            seq = data[i:i+lengths]
            vectors[idx, all_sequences_index[seq]] += 1
            if mismatch >= 1:
                for seq_mis in one_mismatch_away(seq):
                    if seq_mis in all_sequences_index:
                        vectors[idx, all_sequences_index[seq_mis]] += 1/2
            if mismatch >= 2:
                for seq_mis in two_mismatch_away(seq):
                    if seq_mis in all_sequences_index:
                        vectors[idx, all_sequences_index[seq_mis]] += 1/3
     
    if verbose:
        print(all_sequences_index)
        print('Embedding :')
        print(vectors)
    return np.dot(vectors,vectors.T)

In [4]:
def one_mismatch_away(s):
    res = []
    for i,letter in enumerate(s):
        if letter not in ['A','T','C','G']:
            print('Please use only letters in ATCG')
        if letter != 'A':
            res.append(str(s[:i]) + 'A' + str(s[i+1:]))
        if letter != 'T':
            res.append(str(s[:i]) + 'T' + str(s[i+1:]))
        if letter != 'C':
            res.append(str(s[:i]) + 'C' + str(s[i+1:]))
        if letter != 'G':
            res.append(str(s[:i]) + 'G' + str(s[i+1:]))
    return res

In [5]:
def two_mismatch_away(s):
    res = []
    for a in one_mismatch_away(s):
        for t in one_mismatch_away(a):
            if t not in res and t != s and t not in one_mismatch_away(s):
                res.append(t)
    return res

In [6]:
def submitResults(filename, y_pred_final):
    '''
    Creates, from a 1d array of final predicted values, a file named filename formated appropriately in order to make a
    submission on the Kaggle platform.
    '''
    y = np.concatenate([y_pred_final[i] for i in [0,1,2]])
    with open("data/submission/{}.csv".format(filename), 'w') as f:
        string = "Id,Bound\n"
        for j in range(0,3000):
            string += str(j)+','+str(int(y[j]))+'\n'
        f.write(string)
    print("----------------- Prediction written on {}.csv ---------------".format(filename))

In [7]:
X = ['TTCG',
    'ATCG',
    'ATCG',
    'ATCA']
m1 = embedding_mismatch_kernel(X,3,1, False)
print(m1)

100%|██████████| 4/4 [00:00<00:00, 6505.32it/s]

[[ 2.5   2.25  2.25  2.  ]
 [ 2.25  2.5   2.5   2.25]
 [ 2.25  2.5   2.5   2.25]
 [ 2.    2.25  2.25  2.5 ]]





## Cross validation to launch

In [8]:
y_pred_final = []
length = [11,11,8]
kernels = []
nb_iter = 20
lambdas = [np.logspace(-1.5, 0, 15, endpoint=True),
           np.logspace(-2, -1, 15, endpoint=True),
           np.logspace(-0.5, 1, 15, endpoint=True)]

for file in [0,1,2]:
    X_train = pd.read_csv("data/Xtr{}.csv".format(file), sep=' ',header=None)[0].values.tolist()
    Y_train = pd.read_csv("./data/Ytr{}.csv".format(file), index_col=0)['Bound'].values
    print("----- File {} read ------".format(file))

    res = {}
    
    for lmdb in lambdas[file]:
        res[lmdb] = [0,0]

    begin = time()
    K_train = embedding_mismatch_kernel(X_train,length[file],2)
    kernels.append(K_train)
    end = time()
    print("----- Kernel {} computed in {} sec ------".format(file, end - begin))

    
    for i in tqdm(range(nb_iter)):
        for lmdb in lambdas[file]:
            X_train_c, X_test_c, Y_train_c, Y_test_c, K_train_c, K_test_c = K_train_test_split(X_train,Y_train,K_train,test_size=0.2)
            Y_train_c.shape = (Y_train_c.shape[0])
            Y_test_c.shape = (Y_test_c.shape[0])
            Y_train_c.shape = (Y_train_c.shape[0])
            model = SVM(lmbd=lmdb)
            model.train(K_train_c, Y_train_c)
            Y_test_pred = list(map(int,model.predict(K_test_c)))
            Y_train_pred = list(map(int,model.predict(K_train_c)))
            res[lmdb][1] += accuracy_score(Y_train_c, Y_train_pred)/nb_iter
            res[lmdb][0] += accuracy_score(Y_test_c, Y_test_pred)/nb_iter

    print("Best for file {} :".format(file))
    sorted_x = sorted(res.items(), key=operator.itemgetter(1))

    print(sorted_x[-1])

  0%|          | 0/20 [00:00<?, ?it/s]

----- File 0 read ------


100%|██████████| 20/20 [00:45<00:00,  2.26s/it]
100%|██████████| 1/1 [00:00<00:00, 32.72it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

----- Kernel 0 computed in 45.1355938911438 sec ------
Best for file 0 :
(0.29126326549087367, [1.0, 1.0])
----- File 1 read ------


100%|██████████| 20/20 [00:46<00:00,  2.34s/it]
  0%|          | 0/1 [00:00<?, ?it/s]


----- Kernel 1 computed in 46.8635458946228 sec ------


AssertionError: y_true_values = [0], y_pred_values = [0 1]

## Submission

In [458]:
from sklearn.svm import SVC

y_pred_final = []
length = [11,11,8]
l = [0.1668, 0.0220, 1.3226]

for file in [0,1,2]:
    X_train = pd.read_csv("data/Xtr{}.csv".format(file), sep=' ',header=None)[0].values.tolist()
    Y_train = pd.read_csv("./data/Ytr{}.csv".format(file), index_col=0)['Bound'].values
    X_test = pd.read_csv("data/Xte{}.csv".format(file), sep=' ',header=None)[0].values.tolist()

    big_X = np.concatenate((X_train, X_test),axis=0)
    
    print("----- File {} read ------".format(file))

    res = []
    if kernels_computed:
        
    begin = time()
    big_K = embedding_mismatch_kernel(big_X,length[file],2)
    end = time()
    print("----- Kernel {} computed in {} sec ------".format(file, end - begin))

    K_train = big_K[:len(X_train),:len(X_train)]
    K_test = big_K[len(X_train):,:len(X_train)]
    
    clf = SVM(lmbd=l[file], loss='squared_hinge')
    clf.train(K_train, Y_train)
    y_pred = clf.predict(K_test)
    
    y_pred_final.append(y_pred)

    print("----- File {} predicted ------".format(file))


submitResults("SixthSubmission", y_pred_final)


----- File 0 read ------
----- Kernel 0 computed in 3.0994415283203125e-06 sec ------
----- File 0 predicted ------
----- File 1 read ------
----- Kernel 1 computed in 2.1457672119140625e-06 sec ------
----- File 1 predicted ------
----- File 2 read ------
----- Kernel 2 computed in 2.1457672119140625e-06 sec ------
----- File 2 predicted ------
----------------- Prediction written on SixthSubmission.csv ---------------


Travail sur le dataset 0:
Mismatch 6-0 : lambda = 0.02712 score = 0.7575
Mismatch 9-1 : lambda = 0.007742 score = 0.7692
Mismatch 11-2 : lambda = 0.1668 score = 0.7713

Travail sur le dataset 1:
Mismatch 6-0 : lambda = 0.05994 score = 0.8825
Mismatch 9-1 : lambda = 0.01584 score = 0.9050
Mismatch 11-2 : lambda = 0.0220 score = 0.9075

Travail sur le dataset 2:
Mismatch 5-0 : lambda = 0.00774 score = 0.64225
Mismatch 7-1 : lambda = 0.46415 score = 0.6652
Mismatch 8-2 : lambda = 1.3226 score = 0.67175

In [25]:
file = 0
X_train = pd.read_csv("data/Xtr{}.csv".format(file), sep=' ',header=None)[0].values.tolist()
Y_train = pd.read_csv("./data/Ytr{}.csv".format(file), index_col=0)['Bound'].values
X_train_c, X_test_c, Y_train_c, Y_test_c, K_train_c, K_test_c = K_train_test_split(X_train[:10],Y_train,K_train,test_size=0.2)
