In [1]:
from sklearn.model_selection import train_test_split #just for crossvalidation 
import numpy as np
import pandas as pd
import cvxopt
import cvxopt.solvers
from collections import Counter
from itertools import combinations_with_replacement
from time import time

**Reading files**

In [2]:
xtr0 = pd.read_csv("Data/Xtr0.csv", " ", header=0)
xtr1 = pd.read_csv("Data/Xtr1.csv", " ", header=0)
xtr2 = pd.read_csv("Data/Xtr2.csv", " ", header=0)
xtrain_temp = np.append(np.append(xtr0, xtr1), xtr2)
xtrain = np.array(xtrain_temp)

xte0 = pd.read_csv("Data/Xte0.csv", " ", header=0)
xte1 = pd.read_csv("Data/Xte1.csv", " ", header=0)
xte2 = pd.read_csv("Data/Xte2.csv", " ", header=0)
xtest_temp = np.append(np.append(xte0, xte1), xte2)
xtest = np.array(xtest_temp)

ytr0 = pd.read_csv("Data/Ytr0.csv", index_col=0, header=0)
ytr1 = pd.read_csv("Data/Ytr1.csv", index_col=0, header=0)
ytr2 = pd.read_csv("Data/Ytr2.csv", index_col=0, header=0)
ytrain_temp = np.append(np.append(ytr0, ytr1), ytr2)
ytrain = np.array(ytrain_temp)
ytrain[ytrain[:] == 0] = -1

**Preparing features: kmers**

In [3]:
#All possible substrings of size 5
def create_subsequences(lenght):
    p = ['A','C','G','T','C','G','T','A','G','T','A','C','T','A','C','G']
    subseq = []
    for i in combinations_with_replacement(p, lenght):
        subseq.append(list(i))
    subseq = np.asarray(subseq)
    subseq= np.unique(subseq, axis = 0)    
    subseq =["".join(j) for j in subseq[:,:].astype(str)]
    return subseq

def create_kmers(x, y, test, k):
    subseq = create_subsequences(k)
    features = np.zeros((len(x), len(subseq)))   #To store the occurence of each string
    for i in range(0,len(x)):
        s = x[i]
        c = [ s[j:j+k] for j in range(len(s)-k+1) ]
        counter = Counter(c)
        j=0
        for m in subseq:
            features[i][j] = counter[m]
            j=j+1
            
    features0 = features[y[:] == 0]
    features0_sum = features0.sum(axis=0)
    index0 = np.argpartition(features0_sum, 0)[:]
    
    features1 = features[y[:] == 1]
    features1_sum = features1.sum(axis=0)
    print(features1_sum)
    index1 = np.argpartition(features1_sum, 0)[:]
    
    index = np.append(index0,index1)
    features_train = features[:,index]
    features_train = features_train / np.max(np.abs(features_train),axis=0)

    if test.size != 0:
        features_test = create_test_feature(test,subseq,index, k)
        return features_train , features_test
    else:
        return features_train    
    
def create_test_feature(testdata, subsequences, ind, k):
    features_test = np.zeros((len(testdata), len(subsequences)))
    for i in range(0,len(testdata)):
        s = testdata[i]
        c = [ s[j:j+k] for j in range(len(s)-k+1) ]
        counter = Counter(c)
        j = 0
        for m in subsequences:
            features_test[i][j] = counter[m]
            j = j+1
    features_test = features_test[:,[ind]]
    features_test = features_test / np.max(np.abs(features_test),axis=0)

    return features_test

**Different non linear kernels**

In [4]:
def polynomial_kernel(x, y, p = 3):
    return (1 + np.dot(x, y)) ** p

def rbf_kernel(x, y, sigma = 3):
    return np.exp(-np.linalg.norm(x-y)**2 / (2 * (sigma ** 2)))

In [5]:
class SVM(object):
    def __init__(self, kernel = polynomial_kernel, C = None):
        self.kernel = kernel
        self.C = C
        if self.C is not None: self.C = float(self.C)
            
    def fit_svm(self, x, y):
        num_obs, num_features = x.shape
        #Gram matrix
        Gram = np.zeros((num_obs, num_obs))
        for i in range(num_obs):
            if (i%100 == 0):
               print(i, "/", num_obs)
            for j in range(num_obs):
                Gram[i,j] = self.kernel(x[i], x[j])  
                
        #Components for quadratic program problem        
        P = cvxopt.matrix(np.outer(y,y) * Gram)
        q = cvxopt.matrix(-np.ones((num_obs, 1)))
        A = cvxopt.matrix(y, (1,num_obs), 'd')
        b = cvxopt.matrix(np.zeros(1))
        diag = np.eye(num_obs)
        G = cvxopt.matrix(np.vstack((diag, diag * -1)))
        h = cvxopt.matrix(np.hstack((np.zeros(num_obs), np.ones(num_obs) * self.C)))
        
        #Solving quadratic progam problem
        sol = cvxopt.solvers.qp(P, q, G, h, A, b)
        alphas = np.ravel(sol['x'])
        
        #Support vectors have non zero lagrange multipliers, cut off at 1e-5
        sup_vec = alphas > 1e-6
        ind = np.arange(len(alphas))[sup_vec]
        
        #Creating support vectors
        self.alphas = alphas[sup_vec]
        self.sup_vec = x[sup_vec]
        self.sup_vec_y = y[sup_vec]
        
        #Fitting support vectors with the intercept
        self.b = 0
        for i in range(len(self.alphas)):
            self.b += self.sup_vec_y[i]
            self.b -= np.sum(self.alphas * self.sup_vec_y * Gram[ind[i],sup_vec])
        self.b /= len(self.alphas)
        print(self.b)
        
        #Weight for non linear kernel(polynomial or rbf)
        self.w = None  
            
    #Predict the sign
    def predict(self, X):
        y_pred = np.zeros(len(X))
        for i in range(len(X)):
            s = 0
            for alphas, sup_vec_y, sup_vec in zip(self.alphas, self.sup_vec_y, self.sup_vec):
                s += alphas * sup_vec_y * self.kernel(X[i], sup_vec)
            y_pred[i] = s
        return np.sign(y_pred + self.b)

In [6]:
#Creating features
x_trainf,x_testf = create_kmers(xtrain, ytrain, xtest, k=5)

#splitting for testing
X_train2, X_test2, y_train2, y_test2 = train_test_split(x_trainf, ytrain, test_size=0.8)

#Initialization
init = SVM(polynomial_kernel, 0.1)
#SVM Fitting
start = time()
init.fit_svm(X_train2, y_train2)
#Prediction
print("Predicting")
pred1 = init.predict(X_test2)
end = time()
print("Training and prediction time is: ", end - start)
#Accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test2, pred1)

[ 710.  328.  347.  490.  469.  298.   86.  286.  333.  292.  326.  266.
  332.  212.  459.  314.  365.  358.  444.  290.  348.  288.  126.  304.
   51.  108.  100.   97.  167.  235.  338.  220.  388.  213.  339.  233.
  402.  350.  107.  275.  376.  321.  366.  242.  200.  168.  367.  243.
  287.  189.  212.  243.  316.  227.   55.  251.  382.  217.  297.  309.
  222.  218.  248.  323.  363.  258.  321.  300. 1523.  406.  328.  336.
  485.  407.  486.  267.  304.  240.  824.  315.  272.  433.  387.  304.
  441.  332.  134.  295.   74.  125.  117.   66.  164.  392.  473.  230.
   46.   49.   86.   46.  267.  180.  109.  107.  102.  123.  114.   93.
   84.  112.  394.   78.  175.  117.  146.  147.  329.  322.   66.  279.
  272.  370.  361.  324.  153.  237.  282.  259.  444.  265.  386.  311.
  394.  276.  133.  256.  444.  358.  481.  272.  230.  168.  335.  243.
  489.  353.  542.  311.  533.  455.  172.  454.  123.  139.  150.  124.
  183.  389.  537.  259.  478.  288.  549.  302.  5

0.5952083333333333

**Saving Prediction to file**

In [7]:
col2 = pred1
col2[col2 == -1] = 0
col2 = col2.astype(int)
col1 = range(0, col2.shape[0])
predictions = pd.DataFrame({'Id': col1, 'Bound': col2})
predictions.to_csv('submission_Test.csv', sep=',', encoding='utf-8',index=False)