# This is the notebook of our final approach

# Preliminaries

## Imports

In [1]:
import numpy as np

#to read/write csv
import pandas as pd

#for SVM
from cvxopt import matrix as cvxopt_matrix
from cvxopt import solvers as cvxopt_solvers

#for other features
from tqdm import tqdm
from kernels import phi

## Constants

In [2]:
split=0.7 # We will take 90% on data for train set

## Load the data

In [3]:
# Load and standardize features by removing the mean and scaling to unit variance
StandardScaler = lambda df: (df-df.mean())/df.std()

XTrain0 = pd.read_csv('./data/Xtr0_mat100.csv', sep=' ', header=None)
XTrain1 = pd.read_csv('./data/Xtr1_mat100.csv', sep=' ', header=None)
XTrain2 = pd.read_csv('./data/Xtr2_mat100.csv', sep=' ', header=None)

# Standardize the Test features using the mean and std of the TRAIN features
XTest0 = (pd.read_csv('./data/Xte0_mat100.csv', sep=' ', header=None)-XTrain0.mean())/XTrain0.std()
XTest1 = (pd.read_csv('./data/Xte1_mat100.csv', sep=' ', header=None)-XTrain1.mean())/XTrain1.std()
XTest2 = (pd.read_csv('./data/Xte2_mat100.csv', sep=' ', header=None)-XTrain2.mean())/XTrain2.std()

YTrain0 = pd.read_csv('./data/Ytr0.csv', usecols = ['Bound'])
YTrain1 = pd.read_csv('./data/Ytr1.csv', usecols = ['Bound'])
YTrain2 = pd.read_csv('./data/Ytr2.csv', usecols = ['Bound'])

XTrain0_ATGC = pd.read_csv('./data/Xtr0.csv', sep=' ', header=None)
XTrain1_ATGC = pd.read_csv('./data/Xtr1.csv', sep=' ', header=None)
XTrain2_ATGC = pd.read_csv('./data/Xtr2.csv', sep=' ', header=None)

XTest0_ATGC = pd.read_csv('./data/Xte0.csv', sep=' ', header=None)
XTest1_ATGC = pd.read_csv('./data/Xte1.csv', sep=' ', header=None)
XTest2_ATGC = pd.read_csv('./data/Xte2.csv', sep=' ', header=None)


# Standardize Train features
XTrain0 = StandardScaler(XTrain0)
XTrain1 = StandardScaler(XTrain1)
XTrain2 = StandardScaler(XTrain2)

# Generate features with our own kernel from raw sequences

In [8]:
k=8 #length of subsequences considered

print('dimension of features:{}'.format(4**k))
XTrain0KF=np.zeros([len(XTrain0_ATGC)-1,4**k]) 
XTrain1KF=np.zeros([len(XTrain1_ATGC)-1,4**k])
XTrain2KF=np.zeros([len(XTrain2_ATGC)-1,4**k])

XTest0KF=np.zeros([len(XTest0_ATGC)-1,4**k])
XTest1KF=np.zeros([len(XTest1_ATGC)-1,4**k])
XTest2KF=np.zeros([len(XTest2_ATGC)-1,4**k])

ATGC=[XTrain0_ATGC,XTrain1_ATGC,XTrain2_ATGC,XTest0_ATGC,XTest1_ATGC,XTest2_ATGC]
KF=[XTrain0KF,XTrain1KF,XTrain2KF,XTest0KF,XTest1KF,XTest2KF]  #KF stands for kernel_features

for data in tqdm(range(6)):
    for idx,sequence in enumerate(ATGC[data][0][1:]): #[1:] pour ne pas prendre la ligne 'Id,seq'
        x = sequence.split(',')[1]
        KF[data][idx,:]=phi(x,k,kernel='spectrum_efficient')

# Standardize Train features
XTrain0KF = StandardScaler(XTrain0KF)
XTrain1KF = StandardScaler(XTrain1KF)
XTrain2KF = StandardScaler(XTrain2KF)

XTest0 = XTest0-XTrain0.mean()/XTrain0.std()
XTest1 = XTest1-XTrain1.mean()/XTrain1.std()
XTest2 = XTest2-XTrain2.mean()/XTrain2.std()

  0%|          | 0/6 [00:00<?, ?it/s]

dimension of features:65536


100%|██████████| 6/6 [00:02<00:00,  2.06it/s]


# Prediction

## SVM function

In [5]:
def SVM(K,Y,lmbda=1):
    N = K.shape[0]

    P = K
    G = np.vstack([np.eye(N,N),-np.eye(N,N)])

    C = 1/2/N/lmbda
    h = np.concatenate([np.repeat(C,N),np.repeat(0,N)])
    label = -1.*np.logical_not(Y)+1.*Y
    A = np.matrix(Y.astype(np.double))
    b = np.zeros(1)
    q = -np.ones(N)

    P = cvxopt_matrix(P)
    G =cvxopt_matrix(G)
    h =cvxopt_matrix(h)
    b =cvxopt_matrix(b)
    A =cvxopt_matrix(A)
    q =cvxopt_matrix(q)

    sol = cvxopt_solvers.qp(P=P, q=q, G=G, h=h, A=A, b=b, )
    alphas = np.array(sol['x'])

    return alphas

## Asses the model

In [6]:
for lambdaa in [0.001,0.1,10]:

    #Load TRAIN data (%split of the labelled dataset)
    K0 = np.dot(XTrain0KF[:int(len(XTrain0)*split)],XTrain0KF[:int(len(XTrain0)*split)].T)  
    Y0 = np.squeeze(2*YTrain0[:int(len(YTrain0)*split)].to_numpy()-1)

    K1 = np.dot(XTrain1KF[:int(len(XTrain1)*split)],XTrain1KF[:int(len(XTrain1)*split)].T)  
    Y1 = np.squeeze(2*YTrain1[:int(len(YTrain1)*split)].to_numpy()-1)

    K2 = np.dot(XTrain2KF[:int(len(XTrain2)*split)],XTrain2KF[:int(len(XTrain2)*split)].T) 
    Y2 = np.squeeze(2*YTrain2[:int(len(YTrain2)*split)].to_numpy()-1)

    #Predict alphas
    alpha0 = SVM(K0,Y0,lambdaa)
    alpha1 = SVM(K1,Y1,lambdaa)
    alpha2 = SVM(K2,Y2,lambdaa)

    #Predictions -1/1 on VALIDATION DATA (%(1-split) of the labelled dataset)
    predictions0 = alpha0.T@np.dot(XTrain0KF[:int(len(XTrain0)*split)], XTrain0KF[int(len(XTrain0)*split):].T)
    predictions1 = alpha1.T@np.dot(XTrain1KF[:int(len(XTrain0)*split)], XTrain1KF[int(len(XTrain0)*split):].T)
    predictions2 = alpha2.T@np.dot(XTrain2KF[:int(len(XTrain0)*split)], XTrain2KF[int(len(XTrain0)*split):].T)

    #Predictions 0/1 on VALIDATION DATA (%(1-split) of the labelled dataset)
    predictions0 = np.squeeze((1+np.sign(predictions0))/2)
    predictions1 = np.squeeze((1+np.sign(predictions1))/2)
    predictions2 = np.squeeze((1+np.sign(predictions2))/2)
                             
    #Asses the predictions
    y_true0 = np.array(YTrain0['Bound'][int(len(XTrain0)*split):])
    y_true1 = np.array(YTrain1['Bound'][int(len(XTrain0)*split):])
    y_true2 = np.array(YTrain2['Bound'][int(len(XTrain0)*split):])

    print("{:.2f}% accuracy on dataset 0".format(100*(1-np.mean(abs(y_true0-predictions0)))))
    print("{:.2f}% accuracy on dataset 1".format(100*(1-np.mean(abs(y_true1-predictions1)))))
    print("{:.2f}% accuracy on dataset 2".format(100*(1-np.mean(abs(y_true2-predictions2)))))

     pcost       dcost       gap    pres   dres
 0: -3.5224e-03 -5.0002e+02  4e+03  4e+00  1e-15
 1: -1.3893e-03 -2.7424e+02  3e+02  4e-02  9e-16
 2:  3.2634e-03 -3.8699e+00  4e+00  5e-04  1e-15
 3:  1.2287e-03 -5.7850e-01  6e-01  8e-05  1e-15
 4: -1.0872e-04 -1.4699e-01  2e-01  2e-05  1e-15
 5: -3.4707e-03 -7.6645e-03  4e-03  2e-07  1e-15
 6: -3.8910e-03 -4.3087e-03  4e-04  1e-08  9e-16
 7: -3.9320e-03 -4.0538e-03  1e-04  3e-09  5e-16
 8: -3.9456e-03 -3.9705e-03  2e-05  4e-10  6e-16
 9: -3.9490e-03 -3.9519e-03  3e-06  2e-16  6e-16
10: -3.9493e-03 -3.9494e-03  7e-08  2e-16  5e-16
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -3.4880e-03 -5.0002e+02  4e+03  4e+00  1e-15
 1: -1.3756e-03 -2.7424e+02  3e+02  4e-02  1e-15
 2:  3.2417e-03 -3.7917e+00  4e+00  5e-04  1e-15
 3:  1.5863e-03 -6.8461e-01  7e-01  9e-05  1e-15
 4:  1.2879e-03 -1.9624e-01  2e-01  2e-05  4e-15
 5: -3.2711e-03 -9.0330e-03  6e-03  2e-07  2e-15
 6: -3.8993e-03 -4.2326e-03  3e-04  3e-09  9e-1

# Generate csv file to upload on Kaggle

In [7]:
#Load data
K0 = np.dot(XTrain0,XTrain0.T)  #linear kernel
Y0 = np.squeeze(2*YTrain0.to_numpy()-1)

K1 = np.dot(XTrain1,XTrain1.T)   #linear kernel
Y1 = np.squeeze(2*YTrain1.to_numpy()-1)

K2 = np.dot(XTrain2,XTrain2.T)   #linear kernel
Y2 = np.squeeze(2*YTrain2.to_numpy()-1)

#Predict alphas
alpha0 = SVM(K0,Y0)
alpha1 = SVM(K1,Y1)
alpha2 = SVM(K2,Y2)

#Predictions -1/1
predictions0 = alpha0.T@np.dot(XTrain0, XTest0.T)
predictions1 = alpha1.T@np.dot(XTrain1, XTest1.T)
predictions2 = alpha2.T@np.dot(XTrain2, XTest2.T)

#Predictions 0/1
predictions0 = (1+np.sign(predictions0))/2
predictions1 = (1+np.sign(predictions1))/2
predictions2 = (1+np.sign(predictions2))/2

# Creation of the Kaggle submission file
df0 = pd.DataFrame({'Id': np.arange(1000), 'Bound': predictions0.squeeze().astype(int)})
df1 = pd.DataFrame({'Id': np.arange(1000,2000), 'Bound': predictions1.squeeze().astype(int)})
df2 = pd.DataFrame({'Id': np.arange(2000,3000), 'Bound': predictions2.squeeze().astype(int)})
dfResult = pd.concat([df0,df1,df2])
dfResult.to_csv('./data/submissionSVM.csv', index=False)

     pcost       dcost       gap    pres   dres
 0: -9.9859e+02 -1.0209e+00  9e+03  1e+02  3e-13
 1: -1.0367e+01 -1.0208e+00  9e+01  1e+00  3e-13
 2: -4.7310e-01 -1.0108e+00  1e+00  7e-03  4e-15
 3: -4.3691e-01 -5.7537e-01  1e-01  9e-18  8e-16
 4: -4.8040e-01 -5.0202e-01  2e-02  3e-18  2e-15
 5: -4.8084e-01 -4.8257e-01  2e-03  2e-18  1e-15
 6: -4.8086e-01 -4.8145e-01  6e-04  5e-18  5e-16
 7: -4.8098e-01 -4.8106e-01  8e-05  2e-18  1e-15
 8: -4.8100e-01 -4.8100e-01  4e-06  1e-17  2e-15
 9: -4.8100e-01 -4.8100e-01  4e-08  5e-18  2e-15
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -1.0002e+03 -1.0005e+00  8e+03  9e+01  3e-13
 1: -1.0351e+01 -1.0004e+00  8e+01  9e-01  3e-13
 2: -4.5203e-01 -9.9054e-01  1e+00  5e-03  3e-15
 3: -4.4945e-01 -5.2947e-01  8e-02  3e-18  5e-16
 4: -4.9900e-01 -5.0077e-01  2e-03  3e-18  2e-15
 5: -4.9949e-01 -4.9968e-01  2e-04  5e-18  2e-15
 6: -4.9950e-01 -4.9950e-01  3e-06  3e-18  2e-15
 7: -4.9950e-01 -4.9950e-01  5e-07  2e-18  1e-1