# kmer-based models for comparison

Maps input sequences into kmer space, then trains standard learning models.

I've been running these locally and, with the exception of the SGDRegressor, they are very
slow to train. I have actually never trained the SVM.

Increasing k tends to help, but the memory requirements quickly get quite large, since each
sequence becomes a dense vector of $4^k$ kmer counts.
My machine runs out of memory with k=5.

In [8]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
from mrpa_data import MrpaData
# Project-local dataset wrapper; later cells read data.seqs, data.valid_keys
# and data.y_multitask() from this object.
# NOTE(review): presumably the constructor loads the dataset from disk - confirm.
data = MrpaData()

In [10]:
def kmerize(seq, k):
    """
    Count every length-k window of seq.

    Each base is written as two bits (A=00, C=01, G=10, T=11), so a kmer
    read as a binary number is its index into the returned vector.

    seq: string over the alphabet A/C/G/T; any other character raises
        KeyError.
    k: kmer length.

    Returns a 1-D np.array of length 4**k holding the kmer counts.
    """
    two_bit = {'A': '00', 'C': '01', 'G': '10', 'T': '11'}
    encoded = ''.join(two_bit[base] for base in seq)
    histogram = np.zeros((4**k,))
    width = 2 * k
    # Each base occupies two characters of `encoded`, so windows start at
    # every even offset.
    for start in range(0, 2 * (len(seq) - k + 1), 2):
        histogram[int(encoded[start:start + width], 2)] += 1
    return histogram

In [16]:
# kmer length; every sequence becomes a dense vector of 4**K counts, so the
# feature matrix grows by a factor of four with each increment.
K = 5
keys = list(data.valid_keys)
# Fill a preallocated matrix row by row instead of building a list of arrays
# and copying it with np.array(...), which briefly holds two full copies.
X = np.zeros((len(keys), 4**K))
for row, key in enumerate(keys):
    X[row] = kmerize(data.seqs[key], K)
y = data.y_multitask()

In [17]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for validation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [18]:
def print_errors(models, X=None, y=None):
    """
    Print each per-task model's mean squared error next to the variance of
    that task's targets.

    The variance is the MSE of always predicting the mean, so a model is
    only useful when its MSE is clearly below it.

    models: sequence of fitted regressors; models[task] is scored against
        column `task` of y.
    X: feature matrix passed to each model's predict(); defaults to the
        notebook-level X_valid.
    y: 2-D target array with one column per task; defaults to the
        notebook-level y_valid.
    """
    if X is None:
        X = X_valid
    if y is None:
        y = y_valid
    for task, model in enumerate(models):
        residuals = np.asarray(model.predict(X)) - y[:, task]
        mse = np.mean(residuals ** 2)
        var = np.var(y[:, task])
        # A single-argument print(...) behaves the same on Python 2 and 3.
        print("MSE is {} and var is {}".format(mse, var))
        
def plot_predictions(models, X=None, y=None):
    """
    Show one scatter plot per task of model predictions against targets.

    models: sequence of fitted regressors; models[task] is compared with
        column `task` of y.
    X: feature matrix passed to each model's predict(); defaults to the
        notebook-level X_valid.
    y: 2-D target array with one column per task; defaults to the
        notebook-level y_valid.
    """
    if X is None:
        X = X_valid
    if y is None:
        y = y_valid
    for task, model in enumerate(models):
        plt.scatter(model.predict(X), y[:, task])
        # Label the axes so each figure can be read on its own.
        plt.xlabel("predicted")
        plt.ylabel("observed")
        plt.title("Task {}".format(task))
        plt.show()

In [19]:
from sklearn.linear_model import SGDRegressor
# One independent linear model per task (column of y).  SGD shuffles the
# training rows, so random_state is pinned to make the reported errors
# reproducible from run to run.
sgd_models = [
    SGDRegressor(random_state=42).fit(X_train, y_train[:, task])
    for task in range(y.shape[1])
]
print_errors(sgd_models)

MSE is 1.19620762906 and var is 1.2999556736
MSE is 2.07573783149 and var is 2.2128594924
MSE is 2.34090237937 and var is 2.37297107557
MSE is 3.56821466468 and var is 3.84131700344


In [None]:
from sklearn.ensemble import RandomForestRegressor
# One 50-tree forest per task (column of y).  Forests are randomised, so
# random_state is pinned to make the reported errors reproducible.
rf_models = [
    RandomForestRegressor(n_estimators=50, n_jobs=2, random_state=42).fit(
        X_train, y_train[:, task]
    )
    for task in range(y.shape[1])
]
print_errors(rf_models)

In [None]:
from sklearn.svm import SVR
# Fit one default-parameter SVR per target column, then report the
# validation error of each.
svm_models = [SVR().fit(X_train, targets) for targets in y_train.T]
print_errors(svm_models)