In [1]:
# Show which `python` executable is found first on the shell PATH.
# NOTE(review): this is the shell's lookup, which is not necessarily the
# interpreter running this kernel -- confirm with `sys.executable` if it matters.
!which python

/home/users/rmovva/anaconda2/bin/python


In [2]:
import xgboost
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np

# k-mer XGBoost on Sharpr
Count every k-mer of length 1 through K in each sequence (K = 6, 7, or 8, depending on how large a feature matrix training can handle), and use those counts as features to train an XGBoost regressor on the Sharpr labels.

In [7]:
import gzip

def get_seqs_from_file(fasta):
    """Read a gzip-compressed FASTA file into a ``{name: sequence}`` dict.

    Args:
        fasta: Path to a gzipped FASTA file.

    Returns:
        dict mapping each record name (the header line without its leading
        '>', whitespace-stripped) to the record's sequence string. Sequence
        lines belonging to one record are concatenated, so both single-line
        and wrapped FASTA files are read correctly. If a name occurs more
        than once, the last record with sequence data wins. Headers that are
        not followed by any sequence line produce no entry.

    Raises:
        ValueError: if sequence data appears before the first header line.
    """
    seqs = {}
    name = None
    record_started = False
    # The context manager closes the handle even if parsing fails part-way.
    with gzip.open(fasta, 'rb') as seqfile:
        for line in seqfile:
            # A binary gzip handle yields bytes on Python 3 and str on
            # Python 2; normalise to the native str type before comparing.
            if not isinstance(line, str):
                line = line.decode('utf-8')
            line = line.strip()
            if not line:
                continue  # ignore blank lines instead of storing '' sequences
            if line[0] == '>':
                name = line[1:]
                record_started = False
            elif name is None:
                raise ValueError(
                    "sequence data found before the first '>' header in %r" % (fasta,))
            elif record_started:
                # Continuation line of a wrapped (multi-line) record.
                seqs[name] += line
            else:
                seqs[name] = line
                record_started = True
    return seqs

def get_labels_from_file(filename, first_label_col=13):
    """Read a gzipped, tab-separated label table into ``{name: [floats]}``.

    The first line of the file is treated as a header and skipped. In every
    following line, column 0 is the sequence name and the columns from
    ``first_label_col`` to the end of the line are parsed as floats.

    Args:
        filename: Path to the gzipped label file.
        first_label_col: Index of the first label column. The default of 13
            skips 1 name column plus 12 count columns.

    Returns:
        dict mapping each name to its list of float label values. If a name
        occurs more than once, the last row wins.

    Raises:
        ValueError: if a label field cannot be parsed as a float.
    """
    labels = {}
    # The context manager closes the handle even if a row fails to parse.
    with gzip.open(filename, 'rb') as labelfile:
        labelfile.readline()  # skip the header row
        for line in labelfile:
            # A binary gzip handle yields bytes on Python 3 and str on Python 2.
            if not isinstance(line, str):
                line = line.decode('utf-8')
            line = line.strip()
            if not line:
                continue  # a blank line would otherwise create a '' entry
            fields = line.split('\t')
            labels[fields[0]] = [float(lbl) for lbl in fields[first_label_col:]]
    return labels
        

def get_seqs_labels_from_split(filename, seqs, labels):
    """Collect the sequences and labels named in a gzipped split file.

    Args:
        filename: Path to a gzipped text file with one sequence name per line.
        seqs: dict mapping name -> sequence string.
        labels: dict mapping name -> list of label values.

    Returns:
        Tuple ``(seq_array, label_array)`` of numpy arrays whose rows follow
        the order of the names in the split file.

    Raises:
        KeyError: if a name in the split file is missing from ``seqs`` or
            ``labels``.
    """
    seqs_from_split = []
    labels_from_split = []
    # The context manager closes the handle even if a lookup fails.
    with gzip.open(filename, 'rb') as splitFile:
        for line in splitFile:
            # A binary gzip handle yields bytes on Python 3 and str on
            # Python 2; the dict keys are native str, so normalise first.
            if not isinstance(line, str):
                line = line.decode('utf-8')
            name = line.strip()
            if not name:
                continue  # tolerate blank lines rather than raising KeyError('')
            seqs_from_split.append(seqs[name])
            labels_from_split.append(labels[name])
    return np.array(seqs_from_split), np.array(labels_from_split)

# Functions from Joe Paggi (https://github.com/jpaggi/deepmpra/blob/master/models/kmer_model.py)

BASES = ['A', 'C', 'G', 'T']

def seqs_to_matrix(seqs):
    """Encode equal-length DNA strings as an integer matrix.

    Args:
        seqs: Non-empty iterable of strings over the alphabet in ``BASES``.

    Returns:
        2-D numpy array with one row per sequence, where each base is
        replaced by its index in ``BASES`` (A=0, C=1, G=2, T=3).

    Raises:
        ValueError: if a sequence contains a character not in ``BASES``
            (e.g. 'N'), if the sequences differ in length, or if ``seqs``
            is empty.
    """
    # Build real lists: `map` is lazy on Python 3 and cannot be stacked.
    rows = [[BASES.index(base) for base in seq] for seq in seqs]
    return np.vstack(rows)

def get_kmer_features(seqs, k):
    """Count every k-mer of length 1..k in each sequence.

    The feature columns are grouped by k-mer length: the first 4 columns hold
    the 1-mer counts, the next 16 the 2-mer counts, and so on up to the 4**k
    columns for length k (4 + 16 + ... + 4**k columns in total; 5460 for
    k = 6). Inside a group, a k-mer's column is the k-mer read as a base-4
    number with A=0, C=1, G=2, T=3 and the first base most significant.

    Args:
        seqs: Sequence of equal-length DNA strings over A/C/G/T.
        k: Longest k-mer length to count; must be >= 1.

    Returns:
        ``uint8`` numpy array of shape ``(len(seqs), 4 + 16 + ... + 4**k)``.
        Counts saturate at 255 instead of wrapping around.

    Raises:
        ValueError: if ``k < 1``, or if ``seqs_to_matrix`` rejects the input
            (unknown base or unequal lengths).
    """
    if k < 1:
        raise ValueError("k must be >= 1, got %r" % (k,))
    block_sizes = [4 ** length for length in range(1, k + 1)]
    n_features = sum(block_sizes)
    if len(seqs) == 0:
        return np.zeros((0, n_features), dtype=np.uint8)

    X = seqs_to_matrix(seqs)
    n_seqs, seq_len = X.shape
    max_count = np.iinfo(np.uint8).max
    # Preallocate the result so no per-sequence list has to be stacked later.
    counts = np.zeros((n_seqs, n_features), dtype=np.uint8)
    for row, seq in enumerate(X):
        # codes[i] is the base-4 value of the k-mer starting at position i;
        # it is extended by one base each time `length` grows.
        codes = np.zeros(seq_len, dtype=np.int64)
        offset = 0
        for length, n_kmers in enumerate(block_sizes, 1):
            n_windows = seq_len - length + 1
            if n_windows <= 0:
                break  # sequence is shorter than this k-mer length
            codes = codes[:n_windows] * 4 + seq[length - 1:]
            hist = np.bincount(codes, minlength=n_kmers)
            # Clip before storing so large counts do not overflow uint8.
            counts[row, offset:offset + n_kmers] = np.minimum(hist, max_count)
            offset += n_kmers
    return counts

In [4]:
# Gzipped inputs: a FASTA file of sequences and a tab-separated label table.
seqsPath = '../features/sequences_sharpr_znormed_jul23.fa.gz'
labelsPath = '../labels/labels_sharpr_znormed_jul23.txt.gz'

# One gzipped file of sequence names per data split.
# NOTE(review): testSplitPath is defined here but not loaded in the cells shown.
trainSplitPath = '../splits/sharpr_znormed_jul23/train_split.txt.gz'
valSplitPath = '../splits/sharpr_znormed_jul23/val_split.txt.gz'
testSplitPath = '../splits/sharpr_znormed_jul23/test_split.txt.gz'

# Load name -> sequence and name -> label-vector lookups for the whole dataset.
seqs = get_seqs_from_file(seqsPath)
labels = get_labels_from_file(labelsPath)

# Materialise the train and validation splits as parallel (sequences, labels) arrays.
trainSeqs, trainLabels = get_seqs_labels_from_split(trainSplitPath, seqs, labels)
valSeqs, valLabels = get_seqs_labels_from_split(valSplitPath, seqs, labels)

In [5]:
# Drop every sequence containing an 'N' together with its label row:
# seqs_to_matrix only knows A/C/G/T and raises on any other character.
train_idxs_without_N = [i for i in range(len(trainSeqs)) if 'N' not in trainSeqs[i]]
trainSeqs, trainLabels = trainSeqs[train_idxs_without_N], trainLabels[train_idxs_without_N]

# Same filtering for the validation split.
val_idxs_without_N = [i for i in range(len(valSeqs)) if 'N' not in valSeqs[i]]
valSeqs, valLabels = valSeqs[val_idxs_without_N], valLabels[val_idxs_without_N]

In [6]:
# Report the array shapes after filtering; sequences and label rows should agree
# in their first dimension. Uses the print(...) call form like the timing prints
# elsewhere in this notebook, and emits the same space-separated text as before.
print(' '.join(str(arr.shape) for arr in (trainSeqs, trainLabels, valSeqs, valLabels)))

(914336,) (914336, 12) (19833,) (19833, 12)


In [None]:
import time

# NOTE(review): ntest is assigned here but not used in this cell -- every
# training sequence is featurized below, while the later model.fit call only
# consumes the first `ntest` rows of X_train / y_train.
ntest = 20000
k = 6  # k-mer length argument passed to get_kmer_features
label_idx = 2 # k562_minp_norm_avg (column of the label matrix used as the regression target)

# Time the feature construction for both splits.
t0 = time.time()
X_train = get_kmer_features(trainSeqs, k)
y_train = trainLabels[:, label_idx]
X_val = get_kmer_features(valSeqs, k)
y_val = valLabels[:, label_idx]
print("Creating k-mer features for train/val set took %.3f s" % (time.time() - t0))

In [None]:
# Validation data handed to model.fit for evaluation / early stopping.
eval_set = [(X_val, y_val)]
# NOTE(review): this RandomState object is not passed to the model below,
# which receives the integer seed 0 directly.
random_state = np.random.RandomState(seed = 0)
# Gradient-boosted regressor. The GPU-related settings (updater, gpu_id)
# presumably target a GPU-enabled xgboost build and a specific device --
# confirm against the installed xgboost version.
model = xgboost.XGBRegressor(max_depth = 6, 
                             learning_rate = 0.05,
                             n_estimators = 250,
                             objective = 'reg:linear',
                             silent = False,
                             updater = 'grow_gpu',
                             random_state = 0,  # the comma here was missing (SyntaxError)
                             gpu_id = 2
                            )

In [None]:
# Show the parameters that will actually be passed to the booster.
xgb_param = model.get_xgb_params()
# print(...) call form, consistent with the timing prints in this notebook;
# with a single argument it prints the same text as the print statement did.
print(xgb_param)

In [None]:
from scipy.stats import spearmanr

# Fit on the first `ntest` training rows only; the validation set in
# `eval_set` is scored with MAE and drives early stopping.
ntest = 20000
fit_options = dict(eval_set = eval_set,
                   early_stopping_rounds = 8,
                   eval_metric = 'mae')

t0 = time.time()
model.fit(X_train[:ntest], y_train[:ntest], **fit_options)
elapsed = time.time() - t0
print("Fitting model took %.3f s" % elapsed)

In [None]:
# Score the fitted model on the validation split with Spearman rank correlation.
y_val_pred = model.predict(X_val)
# print(...) call form, consistent with the timing prints in this notebook;
# with a single argument it prints the same text as the print statement did.
print(spearmanr(y_val, y_val_pred))