# This is the notebook of our final approach

# Preliminaries

## Imports

In [1]:
import numpy as np

#to read/write csv
import pandas as pd

#for SVM
from SVM import fit_SVM_and_predict

#for other features
from tqdm import tqdm
from kernels import phi

## Constants

In [2]:
split=0.7 # Fraction of each training set used to fit the model (70%); the remaining 30% is held out for validation

## Load the data

In [3]:
# Load every input file and standardize the precomputed 'mat100' features
# (subtract the mean, divide by the standard deviation).

def _read_table(path):
    """Read a space-separated file without a header row into a DataFrame."""
    return pd.read_csv(path, sep=' ', header=None)


def StandardScaler(df):
    """Centre `df` on its own mean and scale it by its own standard deviation."""
    return (df - df.mean()) / df.std()


# Precomputed train features, still unscaled at this point
XTrain0, XTrain1, XTrain2 = [
    _read_table(path)
    for path in ('./data/Xtr0_mat100.csv', './data/Xtr1_mat100.csv', './data/Xtr2_mat100.csv')
]

# Test features are scaled with the mean/std of the matching TRAIN set,
# which is why this happens before the train frames are overwritten below.
XTest0 = (_read_table('./data/Xte0_mat100.csv') - XTrain0.mean()) / XTrain0.std()
XTest1 = (_read_table('./data/Xte1_mat100.csv') - XTrain1.mean()) / XTrain1.std()
XTest2 = (_read_table('./data/Xte2_mat100.csv') - XTrain2.mean()) / XTrain2.std()

# Train features scaled with their own statistics
XTrain0, XTrain1, XTrain2 = [StandardScaler(frame) for frame in (XTrain0, XTrain1, XTrain2)]

# Labels: only the 'Bound' column is kept
YTrain0, YTrain1, YTrain2 = [
    pd.read_csv(path, usecols=['Bound'])
    for path in ('./data/Ytr0.csv', './data/Ytr1.csv', './data/Ytr2.csv')
]

# Raw sequence files, read with the same reader as above (space separator,
# no header row), so the file's first line ends up as row 0 of the frame.
XTrain0_ATGC, XTrain1_ATGC, XTrain2_ATGC = [
    _read_table(path)
    for path in ('./data/Xtr0.csv', './data/Xtr1.csv', './data/Xtr2.csv')
]
XTest0_ATGC, XTest1_ATGC, XTest2_ATGC = [
    _read_table(path)
    for path in ('./data/Xte0.csv', './data/Xte1.csv', './data/Xte2.csv')
]

# Generate features with our own kernel from raw sequences

In [4]:
k = 8  # length of the subsequences considered

print('dimension of features:{}'.format(4**k))

# One row per sequence, one column per possible subsequence of length k.
# The "-1" accounts for row 0 of each raw frame, which is skipped below.
XTrain0KF = np.zeros([len(XTrain0_ATGC)-1, 4**k])
XTrain1KF = np.zeros([len(XTrain1_ATGC)-1, 4**k])
XTrain2KF = np.zeros([len(XTrain2_ATGC)-1, 4**k])

XTest0KF = np.zeros([len(XTest0_ATGC)-1, 4**k])
XTest1KF = np.zeros([len(XTest1_ATGC)-1, 4**k])
XTest2KF = np.zeros([len(XTest2_ATGC)-1, 4**k])

ATGC = [XTrain0_ATGC, XTrain1_ATGC, XTrain2_ATGC, XTest0_ATGC, XTest1_ATGC, XTest2_ATGC]
KF = [XTrain0KF, XTrain1KF, XTrain2KF, XTest0KF, XTest1KF, XTest2KF]  # KF stands for kernel_features

for data in tqdm(range(6)):
    for idx, sequence in enumerate(ATGC[data][0][1:]):  # [1:] skips the 'Id,seq' header row
        # Each row is a single 'id,sequence' string; keep the sequence part
        x = sequence.split(',')[1]
        # presumably phi returns the 4**k-long spectrum feature vector — TODO confirm in kernels.py
        KF[data][idx, :] = phi(x, k, kernel='spectrum_efficient')

# Standardize train AND test features with the statistics of the RAW train
# features. The statistics must be captured before the train matrices are
# overwritten: once standardized, their mean/std are ~0/~1 and would leave
# the test features effectively unscaled.
# These are numpy arrays, so .mean()/.std() are scalars over the whole matrix
# (the same scaling the train features previously received).
train_mean0, train_std0 = XTrain0KF.mean(), XTrain0KF.std()
train_mean1, train_std1 = XTrain1KF.mean(), XTrain1KF.std()
train_mean2, train_std2 = XTrain2KF.mean(), XTrain2KF.std()

XTrain0KF = (XTrain0KF - train_mean0) / train_std0
XTrain1KF = (XTrain1KF - train_mean1) / train_std1
XTrain2KF = (XTrain2KF - train_mean2) / train_std2

# Parentheses matter: without them only mean/std would be subtracted.
XTest0KF = (XTest0KF - train_mean0) / train_std0
XTest1KF = (XTest1KF - train_mean1) / train_std1
XTest2KF = (XTest2KF - train_mean2) / train_std2

  0%|          | 0/6 [00:00<?, ?it/s]

dimension of features:65536


100%|██████████| 6/6 [00:02<00:00,  2.12it/s]


# Prediction

## Assess the model

In [5]:
# Grid search over the SVM regularisation value, scored on a hold-out split:
# the first `split` fraction of each training set is used to fit, the rest to
# validate. The value with the best accuracy averaged over the three datasets
# is kept in `best_lambda` / `best_accuracy`.
best_lambda = 1
best_accuracy = 0

# Kernel matrices and labels do not depend on the regularisation value, so
# they are built once per dataset instead of once per grid point.
# NOTE(review): this assumes fit_SVM_and_predict does not modify K / gram / Y
# in place — confirm in SVM.py.
validation_sets = []
for features, labels in [(XTrain0KF, YTrain0), (XTrain1KF, YTrain1), (XTrain2KF, YTrain2)]:
    # The split index is derived from the dataset actually being sliced
    # (dataset 2 previously used the length of dataset 0 for one slice).
    n_fit = int(len(features)*split)
    fit_part, held_out = features[:n_fit], features[n_fit:]
    validation_sets.append((
        np.dot(fit_part, fit_part.T),          # K: linear kernel between fitting samples
        np.dot(held_out, fit_part.T),          # G: kernel between held-out and fitting samples
        np.squeeze(np.array(labels[:n_fit])),  # Y: labels of the fitting samples
        np.array(labels['Bound'][n_fit:]),     # ground truth of the held-out samples
    ))

for lambdaa in [1e-8,1e-7,1e-6,5e-6,8e-6,9e-6,1e-5,2e-5,5e-5,1e-4,1e-3]:

    accuracies = []
    for K, G, Y, y_true in validation_sets:
        # Raw predictions; only their sign is used below
        p = fit_SVM_and_predict(K=K, gram=G, Y=Y, C=lambdaa, get_proba=False)

        # Map the sign of the prediction to 0/1
        predictions = (1+np.sign(p))/2

        # Accuracy in percent on the held-out part
        accuracies.append(100*(1-np.mean(abs(y_true-predictions))))

    accuracy = sum(accuracies)/len(accuracies)

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_lambda = lambdaa

    for dataset_id, dataset_accuracy in enumerate(accuracies):
        print("{:.2f}% accuracy on dataset {} for lambda = {}".format(dataset_accuracy, dataset_id, lambdaa))
    print("--------------------------------------------")

59.33% accuracy on dataset 0 for lambda = 1e-08
58.17% accuracy on dataset 1 for lambda = 1e-08
70.00% accuracy on dataset 2 for lambda = 1e-08
--------------------------------------------
60.67% accuracy on dataset 0 for lambda = 1e-07
58.17% accuracy on dataset 1 for lambda = 1e-07
70.50% accuracy on dataset 2 for lambda = 1e-07
--------------------------------------------
61.33% accuracy on dataset 0 for lambda = 1e-06
58.50% accuracy on dataset 1 for lambda = 1e-06
70.50% accuracy on dataset 2 for lambda = 1e-06
--------------------------------------------
61.17% accuracy on dataset 0 for lambda = 5e-06
58.17% accuracy on dataset 1 for lambda = 5e-06
72.33% accuracy on dataset 2 for lambda = 5e-06
--------------------------------------------
62.00% accuracy on dataset 0 for lambda = 8e-06
58.33% accuracy on dataset 1 for lambda = 8e-06
73.17% accuracy on dataset 2 for lambda = 8e-06
--------------------------------------------
62.50% accuracy on dataset 0 for lambda = 9e-06
58.50% 

In [6]:
# Best validation accuracy (in %, averaged over the three datasets) found by the lambda search
best_accuracy

64.5

# Generate csv file to upload on Kaggle

In [7]:
def _svm_scores(train_features, test_features, labels):
    """Fit the SVM on a full training set and return its raw test predictions.

    Uses a linear kernel (dot products of the feature rows) and the
    regularisation value `best_lambda` selected earlier in the notebook.
    """
    train_kernel = np.dot(train_features, train_features.T)
    test_vs_train = np.dot(test_features, train_features.T)
    targets = np.squeeze(np.array(labels))
    return fit_SVM_and_predict(K=train_kernel, gram=test_vs_train, Y=targets,
                               C=best_lambda, get_proba=False)


# Raw predictions for each of the three datasets, trained on ALL labelled data
p0 = _svm_scores(XTrain0KF, XTest0KF, YTrain0)
p1 = _svm_scores(XTrain1KF, XTest1KF, YTrain1)
p2 = _svm_scores(XTrain2KF, XTest2KF, YTrain2)

# Map the sign of each prediction to 0/1
predictions0, predictions1, predictions2 = [(1+np.sign(scores))/2 for scores in (p0, p1, p2)]

# Build the Kaggle submission: ids 0-999, 1000-1999 and 2000-2999 belong to
# datasets 0, 1 and 2 respectively.
df0 = pd.DataFrame({'Id': np.arange(1000), 'Bound': predictions0.squeeze().astype(int)})
df1 = pd.DataFrame({'Id': np.arange(1000,2000), 'Bound': predictions1.squeeze().astype(int)})
df2 = pd.DataFrame({'Id': np.arange(2000,3000), 'Bound': predictions2.squeeze().astype(int)})

submission_parts = [df0, df1, df2]
dfResult = pd.concat(submission_parts)
dfResult.to_csv('./data/submissionSVM.csv', index=False)