## Kernel Method Challenge 

### Student Names: Lionel Nanguep Komen and Abduljaleel Adejumo




### Kernel classes

In [1]:
import numpy as np
import scipy.sparse as sparse
from tqdm import tqdm
import pickle
import pandas as pd

class Kernel():
    """Base class for kernels: subclasses override `similarity`, and
    `gram` builds the pairwise matrix from it."""

    def __init__(self):
        pass

    def similarity(self, x, y):
        """Placeholder similarity; the base implementation always returns -1."""
        return -1

    def gram(self, X1, X2=None):
        """ Build the gram matrix whose (i,j) entry is similarity(X1[i], X2[j])\\
        X1: data vector (n_samples_1 x n_features)
        X2: data vector (n_samples_2 x n_features), if None the matrix is computed for (X1,X1)
        """
        other = X1 if X2 is None else X2
        n_rows, n_cols = X1.shape[0], other.shape[0]
        G = np.zeros((n_rows, n_cols))
        # Progress bar over the rows; every entry is one similarity call.
        for row in tqdm(range(n_rows)):
            for col in range(n_cols):
                G[row, col] = self.similarity(X1[row], other[col])
        return G


class SumKernel(Kernel):
    """Weighted sum of several kernels: k(x,y) = sum_i weights[i] * kernels[i](x,y)."""

    def __init__(self, kernels, weights=None):
        """ kernels: list of kernels
        weights: one weight per kernel (same order); defaults to 1.0 for each
        Raises ValueError when the number of weights differs from the number of kernels.
        """
        self.kernels = kernels
        self.weights = weights
        if self.weights is None:
            self.weights = [1.0 for _ in kernels]
        if len(self.weights) != len(self.kernels):
            raise ValueError("SumKernel needs exactly one weight per kernel")
        super().__init__()

    def similarity(self, x, y):
        """ x, y: string
        Return the weighted sum of the similarities of all kernels.
        """
        s = self.kernels[0].similarity(x,y) * self.weights[0]
        # Pair kernel i with weight i (the slices keep both sequences aligned).
        for kernel, weight in zip(self.kernels[1:], self.weights[1:]):
            s += kernel.similarity(x,y) * weight
        return s

    def gram(self, X1, X2=None):
        """ Compute the weighted sum of the gram matrices of all kernels\\
        X1: array of string (n_samples_1,)
        X2: array of string (n_samples_2,), if None compute the gram matrix for (X1,X1)
        """
        G = self.kernels[0].gram(X1,X2) * self.weights[0]
        remaining = zip(self.kernels[1:], self.weights[1:])
        for kernel, weight in tqdm(remaining, total=len(self.kernels) - 1):
            G += kernel.gram(X1,X2) * weight
        return G


class LinearKernel(Kernel):
    """Inner-product kernel on feature vectors."""

    def __init__(self):
        super().__init__()

    def similarity(self, x, y):
        """ linear kernel : k(x,y) = <x,y> \\
        x, y: array (n_features,)
        """
        inner_product = np.dot(x, y)
        return inner_product


class GaussianKernel(Kernel):
    """Gaussian (RBF) kernel with bandwidth sigma, optionally divided by the
    Gaussian density normalisation constant."""

    def __init__(self, sigma,normalize=True):
        super().__init__()
        self.sigma = sigma
        self.normalize = normalize

    def similarity(self, x, y):
        """ gaussian kernel : k(x,y) = exp( - ||x-y||^2 / 2 sigma^2 ),
        divided by (sqrt(2 pi) sigma)^n when `normalize` is set\\
        x, y: array (n_features,)
        """
        squared_distance = np.linalg.norm(x - y) ** 2
        value = np.exp(-squared_distance / (2 * self.sigma**2))
        if not self.normalize:
            return value
        # n is taken from len(x).
        return value / ((np.sqrt(2 * np.pi) * self.sigma) ** len(x))


class PolynomialKernel(Kernel):
    """Polynomial kernel with scale gamma, offset coef0 and integer-style degree."""

    def __init__(self, gamma=1, coef0=1, degree=3):
        super().__init__()
        self.gamma = gamma
        self.coef0 = coef0
        self.degree = degree

    def similarity(self, x, y):
        """ polynomial kernel : k(x,y) = (gamma <x,y> + coef0)^degree \\
        x, y: array (n_features,)
        """
        affine = self.gamma * np.dot(x, y) + self.coef0
        return affine ** self.degree


class SpectrumKernel(Kernel):
    """k-spectrum kernel: inner product of the k-mer count vectors of two strings."""

    def __init__(self, k):
        super().__init__()
        self.k = k

    def similarity(self, x, y):
        """ Spectrum kernel \\
        x, y: string
        Return sum over k-mers u of count_x(u) * count_y(u), where counts
        include overlapping occurrences. Strings shorter than k contribute
        no k-mers, so the result is 0.
        """
        # Local import keeps this block self-contained.
        from collections import Counter

        k = self.k
        # Sliding-window counts see overlapping occurrences; str.count /
        # np.char.count only count non-overlapping ones (e.g. "AA" in "AAAA").
        counts_x = Counter(x[i:i + k] for i in range(len(x) - k + 1))
        counts_y = Counter(y[i:i + k] for i in range(len(y) - k + 1))
        return sum(count * counts_y[kmer] for kmer, count in counts_x.items())


class MismatchKernel(Kernel):
    """Mismatch kernel on strings.

    Each string is embedded as a sparse count vector: every k-mer of the
    string adds 1 to the coordinate of each of its pre-computed neighbours.
    The kernel is the inner product of two such vectors (optionally
    normalised by their Euclidean norms).
    """

    def __init__(self, k, m, neighbours, kmer_set, normalize=False):
        """
        k: length of the kmers
        m: number of mismatches (stored only; the neighbours are pre-computed)
        neighbours: mapping kmer -> list of neighbouring kmers
        kmer_set: mapping kmer -> integer index (coordinate in the embedding)
        normalize: if True, divide kernel values by the norms of both embeddings
        """
        super().__init__()
        self.k = k
        self.m = m
        self.kmer_set = kmer_set #kmer_set and neighbours have to be pre-computed (to save computational time when running multiple experiments)
        self.neighbours = neighbours
        self.normalize = normalize

    def neighbour_embed_kmer(self, x):
        """
        Embed one string as a dict {kmer index: count}.
        x: str
        Raises KeyError if a kmer of x is missing from the pre-computed tables.
        """
        x_emb = {}
        for start in range(len(x) - self.k + 1):
            for neigh in self.neighbours[x[start:start + self.k]]:
                idx_neigh = self.kmer_set[neigh]
                x_emb[idx_neigh] = x_emb.get(idx_neigh, 0) + 1
        return x_emb

    def neighbour_embed_data(self, X):
        """
        Embed every string of X (see neighbour_embed_kmer).
        X: array of string
        Return: list of dict, one per string, in order.
        """
        return [self.neighbour_embed_kmer(x) for x in X]

    def to_sparse(self, X_emb, n_features=None):
        """
        Convert embeddings to a sparse matrix with one column per sample.
        X_emb: list of dict.
        n_features: number of rows; if None, the largest index present + 1.
        The column count is always len(X_emb), so samples with an empty
        embedding still get their (all-zero) column.
        """
        data, row, col = [], [], []
        for sample_idx, emb in enumerate(X_emb):
            data.extend(emb.values())
            row.extend(emb.keys())
            col.extend([sample_idx] * len(emb))
        if n_features is None:
            n_features = max(row) + 1 if row else 0
        return sparse.coo_matrix((data, (row, col)), shape=(n_features, len(X_emb)))

    @staticmethod
    def _n_features(*embedded_sets):
        """Smallest row count able to hold every index used in the given embeddings."""
        largest = max(
            (max(emb) for embs in embedded_sets for emb in embs if emb),
            default=-1,
        )
        return largest + 1

    def similarity(self, x, y):
        """ Mismatch kernel \\
        x, y: string
        """
        x_emb = self.neighbour_embed_kmer(x)
        y_emb = self.neighbour_embed_kmer(y)
        sp = 0
        for idx_neigh, count in x_emb.items():
            if idx_neigh in y_emb:
                sp += count * y_emb[idx_neigh]
        if self.normalize:
            sp /= np.sqrt(np.sum(np.array(list(x_emb.values()))**2))
            sp /= np.sqrt(np.sum(np.array(list(y_emb.values()))**2))
        return sp

    def gram(self, X1, X2=None):
        """ Compute the gram matrix of a data vector X where the (i,j) entry is defined as <Xi,Xj>\\
        X1: array of string (n_samples_1,)
        X2: array of string (n_samples_2,), if None compute the gram matrix for (X1,X1)
        Return: dense float matrix (n_samples_1, n_samples_2)
        """
        X1_emb = self.neighbour_embed_data(X1)
        # Reuse the first embedding when the matrix is symmetric instead of
        # embedding the same data twice.
        X2_emb = X1_emb if X2 is None else self.neighbour_embed_data(X2)

        # Give both matrices the same explicit row count so the product is
        # always well defined (no shape inference, no padding afterwards).
        n_features = self._n_features(X1_emb, X2_emb)
        X1_sm = self.to_sparse(X1_emb, n_features)
        X2_sm = X1_sm if X2 is None else self.to_sparse(X2_emb, n_features)

        G = (X1_sm.T * X2_sm).todense().astype('float')

        if self.normalize:
            # Column norms of the embeddings; a sample with an empty embedding
            # has norm 0 and therefore yields NaN entries.
            G /= np.array(np.sqrt(X1_sm.power(2).sum(0)))[0,:,None]
            G /= np.array(np.sqrt(X2_sm.power(2).sum(0)))[0,None,:]

        return G






### Kmer

In [9]:
def create_kmer_set(X, k, kmer_set=None):
    """
    Return a dict mapping every kmer appearing in the dataset to an index.

    X: sequence of strings
    k: length of the kmers
    kmer_set: optional existing dict to extend (it is updated in place and
        returned); a fresh dict is created when omitted.
    New kmers receive consecutive indices starting at len(kmer_set), in order
    of first appearance.
    """
    # A `{}` default would be shared between calls and silently accumulate
    # kmers from unrelated datasets.
    if kmer_set is None:
        kmer_set = {}
    idx = len(kmer_set)
    for x in X:
        # Use each sequence's own length so shorter/longer sequences are
        # handled correctly and an empty X is fine.
        for start in range(len(x) - k + 1):
            kmer = x[start:start + k]
            if kmer not in kmer_set:
                kmer_set[kmer] = idx
                idx += 1
    return kmer_set


def m_neighbours(kmer, m, recurs=0):
    """
    Return a list (without duplicates, in no particular order) of the kmers
    obtained from `kmer` by substituting letters of 'GTAC' at up to m positions.
    `recurs` is the first position this recursion level may substitute.
    """
    if m == 0:
        return [kmer]

    alphabet = ('G', 'T', 'A', 'C')
    # Variants produced by the deeper levels (up to m-1 substitutions).
    seeds = m_neighbours(kmer, m - 1, recurs + 1)
    positions = range(recurs, len(kmer) - m + 1)

    variants = set(seeds)
    for seed in seeds:
        for pos in positions:
            head, tail = seed[:pos], seed[pos + 1:]
            variants.update(head + letter + tail for letter in alphabet)
    return list(variants)


def get_neighbours(kmer_set, m):
    """
    Find the neighbours given a set of kmers.

    kmer_set: dict whose keys are the kmers
    m: number of allowed mismatches
    Return: dict mapping each kmer to the list of its m-mismatch neighbours
    that are themselves keys of kmer_set. An empty kmer_set gives {}.
    """
    neighbours = {}
    # Iterate the keys directly; no character matrix is needed, so empty or
    # mixed-length kmer sets no longer fail.
    for kmer in tqdm(list(kmer_set)):
        neighbours[kmer] = [
            neighbour for neighbour in m_neighbours(kmer, m)
            if neighbour in kmer_set
        ]
    return neighbours


def load_neighbors(dataset, k, m):
    """
    Load pre-computed neighbours and kmer set from the pickle cache
    'neighbours_<dataset>_<k>_<m>.p' in the current directory.

    dataset: 0, 1 or 2\\
    k: len of the kmers
    m: number of possible mismatches
    Return: (neighbours, kmer_set)
    Raises FileNotFoundError (or another OSError) when the cache is missing.
    """
    file_name = 'neighbours_'+str(dataset)+'_'+str(k)+'_'+str(m)+'.p'
    # NOTE: pickle must only be used on trusted, locally generated caches.
    # The context manager guarantees the handle is closed.
    with open(file_name, 'rb') as cache:
        neighbours, kmer_set = pickle.load(cache)
    print('Neighbors correctly loaded!')
    return neighbours, kmer_set


def load_or_compute_neighbors(dataset,k,m):
    """
    Return (neighbours, kmer_set) for a dataset, from the pickle cache when it
    exists, otherwise computed from the train and test CSV files and cached.

    dataset: 0, 1 or 2\\
    k: len of the kmers
    m: number of possible mismatches
    Raises ValueError when no cache exists and dataset is not 0, 1 or 2.
    """
    try:
        #Load the neighbors
        return load_neighbors(dataset, k, m)
    except (OSError, pickle.UnpicklingError, EOFError):
        # Missing or unreadable cache: fall through and recompute. Other
        # errors are real bugs and are no longer silently swallowed.
        print('No file found, creating kmers neighbors')

    if dataset not in (0, 1, 2):
        raise ValueError("dataset must be 0, 1 or 2, got %r" % (dataset,))

    #Compute the neighbors
    file_name = 'neighbours_'+str(dataset)+'_'+str(k)+'_'+str(m)+'.p'
    data_dir = "/kaggle/input/bhnkkjkj/"
    X_train = pd.read_csv(data_dir + "Xtr" + str(dataset) + ".csv", sep=",", index_col=0).values
    X_test = pd.read_csv(data_dir + "Xte" + str(dataset) + ".csv", sep=",", index_col=0).values

    # The kmer set covers train AND test so every test kmer has an index.
    kmer_set = create_kmer_set(X_train[:,0], k, kmer_set={})
    kmer_set = create_kmer_set(X_test[:,0], k, kmer_set)
    neighbours = get_neighbours(kmer_set, m)
    with open(file_name, 'wb') as cache:
        pickle.dump([neighbours, kmer_set], cache)

    return neighbours, kmer_set

In [3]:
!pip install cvxopt

Collecting cvxopt
  Downloading cvxopt-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: cvxopt
Successfully installed cvxopt-1.3.1


### Classifiers

In [10]:
# Ridge Regression (RR)
def solveRR(y, X, lam):
    """
    Ridge regression: solve (X^T X + n * lam * I) beta = X^T y.

    y: targets (n,)
    X: design matrix (n, p); integer or float
    lam: regularization parameter
    Return: beta (p,)
    """
    n, p = X.shape
    assert (len(y) == n)

    # Build A out of place: an in-place `+=` of the float ridge term fails
    # with a casting error when X (hence X^T X) has an integer dtype.
    A = X.T.dot(X) + n * lam * np.eye(p)
    b = X.T.dot(y)
    # Finds solution to the linear system A beta = b
    beta = np.linalg.solve(A, b)
    return (beta)


# Weighted Ridge Regression (WRR)
def solveWRR(y, X, w, lam):
    """
    Weighted ridge regression, reduced to plain ridge regression by scaling
    every sample (row of X and entry of y) with the square root of its weight.

    y: targets (n,)
    X: design matrix (n, p)
    w: sample weights (n,)
    lam: regularization parameter
    """
    n, _ = X.shape
    assert (len(y) == len(w) == n)

    root_w = np.sqrt(w)
    # Row-wise scaling; equivalent to np.diag(root_w) @ X but cheaper.
    scaled_X = X * root_w[:, None]
    scaled_y = root_w * y
    return solveRR(scaled_y, scaled_X, lam)



# Logistic Ridge Regression (LRR) with gradient descent (GD)
def solveLRR_gradient(y, X, lam, h=0.01, max_iter=500, eps=1e-12):
    '''
    Logistic ridge regression by gradient descent.

    y: labels (n,); the update uses the product y_i * beta^T x_i
    X: design matrix (n, p)
    lam: Regularization parameter
    h: Step size
    max_iter: Max number of iterations of gradient descent
    eps: Tolerance for stopping criteria (squared norm of the last step)
    Return: beta (p,); the zero vector when max_iter is 0
    '''
    n, p = X.shape
    assert (len(y) == n)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    beta_old = np.zeros(p)
    # Defined up front so the function still returns when no iteration runs.
    beta_new = beta_old

    for _ in range(max_iter):
        # yi beta^T xi
        f = (X * y[:, None]).dot(beta_old)
        gradient = - 1 / n * (y * sigmoid(-f)).dot(X)
        gradient += 2 * lam * beta_old

        # Step
        beta_new = beta_old - h * gradient

        if np.sum((beta_new-beta_old)**2) < eps:
            break
        beta_old = beta_new

    return (beta_new)


# Logistic Ridge Regression with Newton-Raphson
def solveLRR_newton(y, X, lam, max_iter=500, eps=1e-3):
    '''
    Logistic ridge regression by iteratively reweighted least squares (IRLS):
    each iteration solves a weighted ridge regression via solveWRR.

    y: labels (n,)
    X: design matrix (n, p)
    lam: Regularization parameter
    max_iter: Max number of IRLS iterations
    eps: Tolerance on the squared norm of the update for early stopping
    Return: beta (p,)
    '''
    n, p = X.shape
    assert (len(y) == n)

    # max_iter and eps come from the arguments; they are not overridden here,
    # so callers' values are honoured.
    sigmoid = lambda a: 1/(1 + np.exp(-a))

    # Initialize
    beta = np.zeros(p)

    for _ in range(max_iter):
        beta_old = beta
        f = X.dot(beta_old)
        # Per-sample weights and working response of the current iteration.
        w = sigmoid(f) * sigmoid(-f)
        z = f + y / sigmoid(y*f)
        beta = solveWRR(z, X, w, 2*lam)
        # Break condition (achieved convergence)
        if np.sum((beta-beta_old)**2) < eps:
            break
    return (beta)

In [11]:
class KernelRidgeClassifier(KernelRidgeRegression):
    '''
    Kernel Ridge Classification: the sign of the parent class's prediction.

    The class was previously defined twice with identical bodies; a single
    definition is kept.
    NOTE(review): KernelRidgeRegression is not defined in the visible part of
    this file and must be in scope before this cell runs.
    '''
    def predict(self, X):
        """Return np.sign of the parent's predict(X) (values -1, 0 or 1)."""
        return np.sign(super().predict(X))

In [12]:
import cvxopt
from cvxopt import matrix

class SVM():
    """
    Kernel SVM trained by solving a quadratic program with cvxopt.

    Usage:
        svm = SVM(kernel=LinearKernel(), C=1)
        svm.fit(X_train, y_train)
        svm.predict_classes(X_test)
    """

    def __init__(self, kernel, C=1.0, tol_support_vectors=1e-4):
        """
        kernel: kernel object exposing gram(X1, X2=None)
        C: float > 0, default=1.0, regularization parameter
        tol_support_vectors: samples whose |alpha| exceeds this value are kept as support vectors
        """
        self.kernel = kernel
        self.C = C
        self.tol_support_vectors = tol_support_vectors

    def fit(self, X, y):
        """
        Fit on training samples X with labels y (used as -1 / 1).
        Solves  min 1/2 a^T K a - y^T a  subject to  0 <= y_i a_i <= C.
        Return: the coefficients of the retained support vectors.
        """
        self.X_train = X
        n_samples = X.shape[0]
        print("Computing the kernel...")
        self.X_train_gram = self.kernel.gram(X)
        print("Done!")

        # Box constraints written as G a <= h:  diag(y) a <= C  and  -diag(y) a <= 0.
        label_diag = np.diag(np.squeeze(y).astype('float'))
        upper_bounds = self.C * np.ones(n_samples)
        lower_bounds = np.zeros(n_samples)

        P = matrix(self.X_train_gram)
        q = matrix(-y.astype('float'))
        G = matrix(np.vstack((label_diag, -label_diag)))
        h = matrix(np.concatenate((upper_bounds, lower_bounds)))

        solution = cvxopt.solvers.qp(P=P, q=q, G=G, h=h)
        all_alphas = np.squeeze(np.array(solution['x']))

        # Keep only the samples with a non-negligible coefficient.
        self.support_vectors_indices = np.abs(all_alphas) > self.tol_support_vectors
        self.alphas = all_alphas[self.support_vectors_indices]
        self.support_vectors = self.X_train[self.support_vectors_indices]

        print(len(self.support_vectors), "support vectors out of",len(self.X_train), "training samples")

        return self.alphas

    def predict(self, X):
        """
        X: array (n_samples, n_features)\\
        Return: decision values K(X, support_vectors) . alphas
        """
        K = self.kernel.gram(X, self.support_vectors)
        return np.dot(K, self.alphas)

    def predict_classes(self, X, threshold=0):
        """
        X: array (n_samples, n_features)\\
        Return: array of 1 (decision value > threshold) and -1 (otherwise)
        """
        scores = self.predict(X)
        return np.where(scores > threshold, 1, -1)


### Training and Prediction

In [13]:
from sklearn.model_selection import train_test_split

In [15]:
"""
Individual mismatch try
"""

# Imports
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm


# Read csv files

# Training sequences of the three datasets, shape (2000,1): string
X0_train = pd.read_csv("/kaggle/input/bhnkkjkj/Xtr0.csv", sep=",", index_col=0).values
X1_train = pd.read_csv("/kaggle/input/bhnkkjkj/Xtr1.csv", sep=",", index_col=0).values
X2_train = pd.read_csv("/kaggle/input/bhnkkjkj/Xtr2.csv", sep=",", index_col=0).values

# Numeric training features, shape (2000,100): float
X0_mat100_train = pd.read_csv("/kaggle/input/bhnkkjkj/Xtr0_mat100.csv", sep=" ", header=None).values
X1_mat100_train = pd.read_csv("/kaggle/input/bhnkkjkj/Xtr1_mat100.csv", sep=" ", header=None).values
X2_mat100_train = pd.read_csv("/kaggle/input/bhnkkjkj/Xtr2_mat100.csv", sep=" ", header=None).values

# Test sequences: string, one column
X0_test = pd.read_csv("/kaggle/input/bhnkkjkj/Xte0.csv", sep=",", index_col=0).values
X1_test = pd.read_csv("/kaggle/input/bhnkkjkj/Xte1.csv", sep=",", index_col=0).values
X2_test = pd.read_csv("/kaggle/input/bhnkkjkj/Xte2.csv", sep=",", index_col=0).values

# Numeric test features: float
X0_mat100_test = pd.read_csv("/kaggle/input/bhnkkjkj/Xte0_mat100.csv", sep=" ", header=None).values
X1_mat100_test = pd.read_csv("/kaggle/input/bhnkkjkj/Xte1_mat100.csv", sep=" ", header=None).values
X2_mat100_test = pd.read_csv("/kaggle/input/bhnkkjkj/Xte2_mat100.csv", sep=" ", header=None).values

# Training labels, shape (2000,1): 0 or 1
Y0_train = pd.read_csv("/kaggle/input/bhnkkjkj/Ytr0.csv", sep=",", index_col=0).values
Y1_train = pd.read_csv("/kaggle/input/bhnkkjkj/Ytr1.csv", sep=",", index_col=0).values
Y2_train = pd.read_csv("/kaggle/input/bhnkkjkj/Ytr2.csv", sep=",", index_col=0).values





## Preprocessing

# NOTE: these four settings are not read by the split below.
fraction_of_data = 0.2 #Put a small value for faster tests
split_ratio = 0.8 #Ratio of data in train set
shuffle = True #Shuffle the data
rescale_y = True #Rescale labels to -1 and 1

#Creating validation set and prediction set (dataset 2, labels mapped to -1 / 1)
Y_train_full = np.where(Y2_train == 0, -1, 1)
X_train, X_val,Y_train, Y_val = train_test_split(X2_train, Y_train_full, test_size=0.2, random_state=42)
# Same labels and random_state as above, so Y_train / Y_val are unchanged; the
# numeric features must come from dataset 2 as well to match those labels.
X_mat_train, X_mat_val,Y_train, Y_val = train_test_split(X2_mat100_train, Y_train_full, test_size=0.2, random_state=42)




# Experiment switches: each flag enables one of the experiment branches.
# (test_spectrum is set here but no branch in this cell reads it.)
test_our_svm = False
test_spectrum = False
test_mismatch = True

if test_our_svm:

    ## Test our SVM implementation on the numeric (mat100) features

    #Parameters
    kernel = 'poly' # 'linear' 'rbf' or 'poly'
    C = 1.0
    # gamma = 1 / (n_features * variance of the training features)
    gamma = 1/(X_mat_train.shape[1] * X_mat_train.var())
    coef0 = 1.0
    degree = 3

    print("Kernel:", kernel)
    print("C:", C)
    if kernel != 'linear':
        print("Gamma:", gamma)
    if kernel == 'poly':
        print("Coef0:", coef0)
        print("Degree:", degree)
    print()

    #Our SVM
    print("Applying our SVM...")
    if kernel=='linear':
        our_svm = SVM(kernel=LinearKernel(),C=C)
    elif kernel=='rbf':
        # sigma is chosen so that 1 / (2 sigma^2) equals gamma
        our_svm = SVM(kernel=GaussianKernel(sigma=np.sqrt(0.5/gamma),normalize=False),C=C)
    elif kernel=='poly':
        our_svm = SVM(kernel=PolynomialKernel(gamma=gamma,coef0=coef0,degree=degree),C=C)
    our_svm.fit(X_mat_train, Y_train)
    our_svm_classes_train = our_svm.predict_classes(X_mat_train)
    our_svm_classes_val = our_svm.predict_classes(X_mat_val)

    # Accuracy = fraction of predicted labels equal to the true labels
    print("Accuracy on train (our SVM):", np.sum(np.squeeze(our_svm_classes_train)==np.squeeze(Y_train))/len(Y_train))    
    print("Accuracy on val (our SVM):", np.sum(np.squeeze(our_svm_classes_val)==np.squeeze(Y_val))/len(Y_val))



if test_mismatch:

    # Mismatch-kernel SVM on dataset 2 with a train/validation split.
    # create_kmer_set, m_neighbours and get_neighbours are the module-level
    # functions defined in the "Kmer" section; the verbatim copies that used
    # to be re-declared here were removed.

    k = 10
    m = 1

    # Cache of (neighbours, kmer_set); the name is kept unchanged so caches
    # written by earlier runs are still found.
    cache_file = 'neighbours_22'+str(k)+'_'+str(m)+'.p'

    try:
        # Load (pickle: only use on trusted, locally generated caches)
        with open(cache_file, 'rb') as cache:
            neighbours, kmer_set = pickle.load(cache)
        print('Neighbors correctly loaded')
    except (OSError, pickle.UnpicklingError, EOFError):
        # Missing or unreadable cache: recompute from train + test sequences.
        print('No file found, creating kmers neighbors')
        # Explicit fresh dict: never rely on a shared default argument.
        kmer_set = create_kmer_set(X2_train[:,0], k, {})
        kmer_set = create_kmer_set(X2_test[:,0], k, kmer_set)
        neighbours = get_neighbours(kmer_set, m)

        # Save neighbours and kmer set
        with open(cache_file, 'wb') as cache:
            pickle.dump([neighbours, kmer_set], cache)

    print('Doing SVM')
    C = 1
    svm = SVM(kernel=MismatchKernel(k=k, m=m, neighbours=neighbours, kmer_set=kmer_set), C=C)

    svm.fit(X_train[:,0], Y_train)

    # Training accuracy
    pred_train = svm.predict_classes(X_train[:,0])
    print( np.sum(np.squeeze(pred_train)==np.squeeze(Y_train)) / len(Y_train) )

    # Validation accuracy
    pred_val = svm.predict_classes(X_val[:,0])
    print( np.sum(np.squeeze(pred_val)==np.squeeze(Y_val)) / len(Y_val) )
    
    
    

Neighbors correctly loaded
Doing SVM
Computing the kernel...
Done!
     pcost       dcost       gap    pres   dres
 0: -6.6990e-03 -1.6108e+03  6e+03  1e+00  4e-15
 1:  1.2980e+00 -4.3057e+02  5e+02  2e-02  4e-15
 2:  9.1395e-01 -6.3230e+01  7e+01  3e-03  4e-15
 3: -2.3402e-01 -7.0133e+00  7e+00  2e-04  3e-15
 4: -6.1989e-01 -9.1297e-01  3e-01  2e-06  2e-15
 5: -6.4188e-01 -6.8678e-01  4e-02  3e-07  9e-16
 6: -6.4434e-01 -6.5172e-01  7e-03  3e-08  8e-16
 7: -6.4482e-01 -6.4587e-01  1e-03  4e-09  8e-16
 8: -6.4491e-01 -6.4499e-01  9e-05  3e-10  1e-15
 9: -6.4491e-01 -6.4492e-01  2e-06  3e-12  1e-15
10: -6.4492e-01 -6.4492e-01  7e-08  6e-14  1e-15
Optimal solution found.
1377 support vectors out of 1600 training samples
1.0
0.7625


In [None]:
# Predict classes for the three test sets with the currently bound `svm`.
# Despite the `pred_train*` names, these are predictions on the TEST sequences.
# NOTE(review): the same model is applied to all three test sets; if `svm`
# is the model fitted in the mismatch experiment above, it was trained on a
# split of dataset 2 only. Confirm this is intended.

pred_train1 = svm.predict_classes(X0_test[:,0])
pred_train2 = svm.predict_classes(X1_test[:,0])
pred_train3 = svm.predict_classes(X2_test[:,0])

In [None]:
# Build the submission frame: a single 'Id' column holding 0..2999.
X1= np.arange(3000).reshape(-1, 1)
sample = pd.DataFrame(data=X1, columns=['Id'])
sample.head()


In [None]:
# Join the three prediction arrays along axis 1 and flatten them to 1-D
# (this requires each prediction array to be 2-D).
y_pred_ = np.concatenate([pred_train1, pred_train2,pred_train3], axis=1).T.flatten()

print(len(y_pred_))
y_pred_

# Map the -1 / 1 classes to 0 / 1 labels.
y_pred_f = np.where(y_pred_==-1.0, 0, 1)
y_pred_f

# Attach the labels to the Id frame and write the submission CSV.
sample['Bound'] = y_pred_f
sample.tail()
sample.to_csv('subkernel3.csv', index=False)

### Final model

In [None]:

##### LOAD DATA #####

# Training sequences, shape (2000,1): string
X0_train = pd.read_csv("/kaggle/input/bhnkkjkj/Xtr0.csv", sep=",", index_col=0).values
X1_train = pd.read_csv("/kaggle/input/bhnkkjkj/Xtr1.csv", sep=",", index_col=0).values
X2_train = pd.read_csv("/kaggle/input/bhnkkjkj/Xtr2.csv", sep=",", index_col=0).values



# Test sequences: string, one column
X0_test = pd.read_csv("/kaggle/input/bhnkkjkj/Xte0.csv", sep=",", index_col=0).values
X1_test = pd.read_csv("/kaggle/input/bhnkkjkj/Xte1.csv", sep=",", index_col=0).values
X2_test = pd.read_csv("/kaggle/input/bhnkkjkj/Xte2.csv", sep=",", index_col=0).values



# Training labels, shape (2000,1): 0 or 1
Y0_train = pd.read_csv("/kaggle/input/bhnkkjkj/Ytr0.csv", sep=",", index_col=0).values
Y1_train = pd.read_csv("/kaggle/input/bhnkkjkj/Ytr1.csv", sep=",", index_col=0).values
Y2_train = pd.read_csv("/kaggle/input/bhnkkjkj/Ytr2.csv", sep=",", index_col=0).values


##### PREPROCESS DATA #####

#Rescaling labels: 0 becomes -1, everything else becomes 1
Y0_train = np.where(Y0_train == 0, -1, 1)
Y1_train = np.where(Y1_train == 0, -1, 1)
Y2_train = np.where(Y2_train == 0, -1, 1)

##### PARAMETERS #####

C = 1
# One mismatch kernel is built per (k, m) pair below; the three lists are
# read in parallel, so they must have the same length.
weights = [1.0,1.0,1.0,1.0,1.0,1.0] #List of weights for sum of mismatch kernels
list_k = [5,8,9,10,12,13] #List of parameters k for sum of mismatch kernels
list_m = [1,1,1,1,2,2]

#Shuffling
# Each dataset gets its own random permutation, applied to sequences and
# labels alike; the sequences are also reduced to their single column.
# No seed is set here, so the order differs between runs.


shuffling_0 = np.random.permutation(len(X0_train))
X0_train = X0_train[shuffling_0][:,0]

Y0_train = Y0_train[shuffling_0]

shuffling_1 = np.random.permutation(len(X1_train))
X1_train = X1_train[shuffling_1][:,0]

Y1_train = Y1_train[shuffling_1]

shuffling_2 = np.random.permutation(len(X2_train))
X2_train = X2_train[shuffling_2][:,0]

Y2_train = Y2_train[shuffling_2]


#Put test matrices into the right format (1-D arrays of strings)
X0_test = X0_test[:,0]
X1_test = X1_test[:,0]
X2_test = X2_test[:,0]


##### APPLY SVM ON DATASET 0 #####

print("Applying SVM on dataset 0...")


dataset_nbr = 0 
kernels = []
# Build one normalized mismatch kernel per (k, m), loading or computing the
# neighbour tables of this dataset.
for k,m in zip(list_k,list_m):
    neighbours, kmer_set = load_or_compute_neighbors(dataset_nbr, k, m)
    kernels.append(MismatchKernel(k=k, m=m, neighbours=neighbours, kmer_set=kmer_set, normalize = True))
svm = SVM(kernel=SumKernel(kernels=kernels, weights=weights), C=C)


svm.fit(X0_train, Y0_train)
pred_0 = svm.predict_classes(X0_test)

##### APPLY SVM ON DATASET 1 #####

print("Applying SVM on dataset 1...")


dataset_nbr = 1
kernels = []
# Same construction as for dataset 0, with the tables of dataset 1.
for k,m in zip(list_k,list_m):
    neighbours, kmer_set = load_or_compute_neighbors(dataset_nbr, k, m)
    kernels.append(MismatchKernel(k=k, m=m, neighbours=neighbours, kmer_set=kmer_set, normalize = True))
svm = SVM(kernel=SumKernel(kernels=kernels, weights=weights), C=C)

svm.fit(X1_train, Y1_train)
pred_1 = svm.predict_classes(X1_test)


##### APPLY SVM ON DATASET 2 #####

print("Applying SVM on dataset 2...")


dataset_nbr = 2
kernels = []
# Same construction again, with the tables of dataset 2. After this section
# `svm` refers to the dataset-2 model.
for k,m in zip(list_k,list_m):
    neighbours, kmer_set = load_or_compute_neighbors(dataset_nbr, k, m)
    kernels.append(MismatchKernel(k=k, m=m, neighbours=neighbours, kmer_set=kmer_set, normalize = True))
svm = SVM(kernel=SumKernel(kernels=kernels, weights=weights), C=C)

svm.fit(X2_train, Y2_train)
pred_2 = svm.predict_classes(X2_test)



In [None]:
##### CREATE SUBMISSION FILE #####

# Flatten the per-dataset predictions, join them in dataset order and map
# the -1 / 1 classes back to 0 / 1 labels.
pred = np.concatenate([part.squeeze() for part in (pred_0, pred_1, pred_2)])
pred = np.where(pred == -1, 0, 1)
print(pred.shape)
print(pred)

# One 'Bound' column; the default integer index is written as the 'Id' column.
pred_df = pd.DataFrame({'Bound': pred})
pred_df.index.name = 'Id'
pred_df.to_csv('prediction.csv', sep=',', header=True)

In [None]:
# Inspect the predictions of the currently bound `svm` on test set 0.
# X0_test is 1-D once the final-model cell has run and 2-D (n, 1) right after
# loading; the reshape accepts both, whereas a plain `[:,0]` raises
# IndexError on the 1-D array.
pred_train1 = svm.predict_classes(X0_test.reshape(len(X0_test), -1)[:, 0])
pred_train1