In [29]:
from __future__ import division, absolute_import, print_function
#run the helper script that (presumably) fetches the simulated dataset; the recorded
#output shows it reports when sequences.simdata.gz is already present
!./grab_data.sh

File sequences.simdata.gz exists already


In [2]:
try:
    import simdna
except ImportError, e:
    print("installing simdna package")
    !pip install -e "git://github.com/kundajelab/simdna.git@0.4.0#egg=simdna"
    print("\n******************************************************************************")
    print("RESTART THE JUPYTER KERNEL TO PICK UP ON THE INSTALLATION!!!")
    print("******************************************************************************")

In [3]:
import simdna.synthetic as synthetic
#re-import the simdna modules so that changes made while this kernel is running
#are picked up (relies on the builtin reload, i.e. Python 2)
reload(synthetic)
reload(synthetic.core)
import gzip
#relative path of the simulated dataset file
data_filename = "sequences.simdata.gz"
#parse the simdata file; later cells read data.sequences and data.labels
data = synthetic.read_simdata_file(data_filename)

One-hot encode the sequence data into an array of shape (number of sequences, sequence length, 4), with one channel per base (A, C, G, T)

In [4]:
import numpy as np

def one_hot_encode_along_channel_axis(sequence):
    """One-hot encode a single sequence with the channels on the last axis.

    Allocates a (len(sequence), 4) int8 array of zeros and lets
    seq_to_one_hot_fill_in_array fill it in with one_hot_axis=1, so each
    row is a position and each column is one of the 4 channels.

    Returns the filled-in array.
    """
    encoded = np.zeros(shape=(len(sequence), 4), dtype=np.int8)
    seq_to_one_hot_fill_in_array(encoded, sequence, 1)
    return encoded

def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    """Write the one-hot encoding of ``sequence`` into ``zeros_array`` in place.

    zeros_array: 2D array that is mutated; the axis given by one_hot_axis
        indexes the letter (A=0, C=1, G=2, T=3, case-insensitive) and the
        other axis indexes the position, whose length must equal
        len(sequence) (checked with an assert).
    sequence: iterable of characters. "N"/"n" positions are left untouched
        (all 0's if the array started as zeros).
    one_hot_axis: 0 or 1 (checked with an assert).

    Raises RuntimeError for any character other than A/C/G/T/N (either case).
    Returns None.
    """
    assert one_hot_axis in (0, 1)
    #the position axis is whichever axis is not the one-hot axis
    position_axis = 1 if one_hot_axis == 0 else 0
    assert zeros_array.shape[position_axis] == len(sequence)

    letter_to_idx = {"A": 0, "a": 0,
                     "C": 1, "c": 1,
                     "G": 2, "g": 2,
                     "T": 3, "t": 3}
    for pos, letter in enumerate(sequence):
        letter_idx = letter_to_idx.get(letter)
        if letter_idx is None:
            if letter == "N" or letter == "n":
                continue #leave that pos as all 0's
            raise RuntimeError("Unsupported character: "+str(letter))
        if one_hot_axis == 0:
            zeros_array[letter_idx, pos] = 1
        else:
            zeros_array[pos, letter_idx] = 1
            
#one-hot encode every sequence, then stack the per-sequence arrays into one array
encoded_sequences = []
for seq in data.sequences:
    encoded_sequences.append(one_hot_encode_along_channel_axis(seq))
onehot_data = np.array(encoded_sequences)
print(onehot_data.shape)

(8000, 200, 4)


Compute the gapped kmer embeddings. First, obtain a function that will compute the embeddings:

In [5]:
import ssvmimp

In [110]:
import ssvmimp.train
#pick up any edits made to ssvmimp.train while this kernel is running
reload(ssvmimp.train)

#mismatch budget handed to the filter builder; reused later when scoring individual kmers
num_mismatches=1

#string_reps holds one string per filter; embedding_func is later called on the
#one-hot data to produce the embeddings
string_reps, embedding_func = ssvmimp.train.get_gapped_kmer_embedding_filters_and_func(
                                kmer_len=6, alphabet=['A','C','G','T'],
                                num_gaps=2, num_mismatches=num_mismatches)

'string_reps' stores string representations of the filters. Each character corresponds to one position of the filter: a letter (A, C, G or T) is the base expected at that position, and a space marks a gap position where any base is accepted.

In [66]:
#report how many filters there are, then preview both ends of the list
print(len(string_reps))
for heading, shown_filters in (("First ten filters:", string_reps[:10]),
                               ("Last ten filters:", string_reps[-10:])):
    print(heading)
    print("\n".join(shown_filters))

2560
First ten filters:
AAAA  
AAAC  
AAAG  
AAAT  
AACA  
AACC  
AACG  
AACT  
AAGA  
AAGC  
Last ten filters:
T  TCG
T  TCT
T  TGA
T  TGC
T  TGG
T  TGT
T  TTA
T  TTC
T  TTG
T  TTT


Compute the embeddings, which are the total number of matches to each filter per sequence, accounting for the desired number of mismatches

In [33]:
#embed all one-hot sequences; batch_size and progress_update are passed straight to
#embedding_func (the recorded output shows a "Done N" line every 500 sequences)
embeddings = embedding_func(onehot=onehot_data, batch_size=20, progress_update=500)
#one row per sequence, one column per filter
print(embeddings.shape)

Done 0
Done 500
Done 1000
Done 1500
Done 2000
Done 2500
Done 3000
Done 3500
Done 4000
Done 4500
Done 5000
Done 5500
Done 6000
Done 6500
Done 7000
Done 7500
(8000, 2560)


Train a classifier on task 1 ("GATA present") and task 2 ("TAL present")

In [61]:
import sklearn.svm
import sys

train_set_num = 6000 #6000 examples will be used in the training set
#LinearSVC's default (dual) solver shuffles the data with a random number generator,
#so fix the seed to get the same weights on every Restart & Run All
svm_random_state = 0

#the first train_set_num rows are the training set; the remaining rows are held out
print("Training t1")
sys.stdout.flush() #make the progress message appear before the fit starts
#column 1 of data.labels is task 1 ("GATA present")
t1_classifier = sklearn.svm.LinearSVC(random_state=svm_random_state).fit(
    X=embeddings[:train_set_num], y=data.labels[:train_set_num,1])
print("Training t2")
sys.stdout.flush()
#column 2 of data.labels is task 2 ("TAL present")
t2_classifier = sklearn.svm.LinearSVC(random_state=svm_random_state).fit(
    X=embeddings[:train_set_num], y=data.labels[:train_set_num,2])

Training t1
Training t2


In [62]:
#fraction of held-out examples (rows train_set_num onward) that each task's
#classifier labels correctly; the task number is also the column of data.labels
for task, classifier in enumerate((t1_classifier, t2_classifier), start=1):
    preds = classifier.predict(embeddings[train_set_num:])
    num_correct = np.sum(preds == data.labels[train_set_num:,task])
    print(num_correct/len(preds))

0.8735
0.9235


Inspect the top-ranked filters for the two tasks by eye. coef_[0] stores the weights on the filters. The GATA PWM used was GATA_disc1 and the TAL motif used was TAL1_known1 from here: http://compbio.mit.edu/encode-motifs/

In [67]:
#pair each filter with its weight and order the pairs from largest to smallest weight
print("t1 filters:")
assert len(t1_classifier.coef_[0])==len(string_reps)
t1_sorted_filters = sorted(zip(t1_classifier.coef_[0], string_reps),
                           key=lambda weight_and_filter: weight_and_filter[0],
                           reverse=True)
print(t1_sorted_filters[:10])

print("t2 filters:")
assert len(t2_classifier.coef_[0])==len(string_reps)
t2_sorted_filters = sorted(zip(t2_classifier.coef_[0], string_reps),
                           key=lambda weight_and_filter: weight_and_filter[0],
                           reverse=True)
print(t2_sorted_filters[:10])

t1 filters:
[(0.072067056160371512, 'C A TA'), (0.069113433440973732, 'GCT A '), (0.067823727133263609, 'T GAT '), (0.067592827570667258, 'G TCA '), (0.067471113890326895, 'G T AT'), (0.064120598074550811, 'TAT  G'), (0.062947203147330932, 'G TGA '), (0.058946593636511939, 'G TC G'), (0.058413859538917604, 'GC  AG'), (0.057370484628632257, 'TA AA ')]
t2 filters:
[(0.061417155777389619, 'C T TG'), (0.054530675145669651, 'C G GG'), (0.053834522908509777, 'AGTT  '), (0.051887418943053785, 'CA C G'), (0.049335179785569727, 'AT CT '), (0.048619322668626619, 'A CTG '), (0.047997753788824607, 'C GG G'), (0.047607954786425488, 'AA  AA'), (0.046646509260457297, 'TAT  T'), (0.045815000471540031, 'G AAA ')]


As a sanity check, we will look at the scores for the GATAAG kmer (the GATA motif) and the CAGATG kmer (the TAL motif) for task 1 and task 2. First, we define functions to help us compute the total score for a kmer:

In [107]:
from collections import OrderedDict

def get_filter_matches(kmer_string, filter_strings, num_mismatches):
    """Return the filters (in their original order) that match ``kmer_string``.

    The kmer and a filter are compared position by position (the comparison
    stops at the end of the shorter string). A space in the filter accepts
    any letter; any other filter character that differs from the kmer's
    letter counts as one mismatch. A filter is rejected as soon as its
    mismatch count exceeds ``num_mismatches``.
    """
    def within_mismatch_budget(filter_string):
        mismatches = 0
        for kmer_letter, filter_letter in zip(kmer_string, filter_string):
            if filter_letter == " " or kmer_letter == filter_letter:
                continue
            mismatches += 1
            if mismatches > num_mismatches:
                return False
        return True

    return [candidate for candidate in filter_strings
            if within_mismatch_budget(candidate)]

def get_total_kmer_score(kmer_string, filter_to_score, num_mismatches):
    """Add up the scores of every filter that matches ``kmer_string``.

    filter_to_score: OrderedDict mapping filter string -> score (the type
        is enforced with an assert).
    num_mismatches: mismatch budget forwarded to get_filter_matches.

    Returns the summed score; 0 when no filter matches.
    """
    assert isinstance(filter_to_score, OrderedDict)
    matched_filters = get_filter_matches(kmer_string=kmer_string,
                                         filter_strings=filter_to_score.keys(),
                                         num_mismatches=num_mismatches)
    return sum(filter_to_score[matched] for matched in matched_filters)

We find that GATAAG scores pretty high for task 1 but not task 2, and CAGATG scores pretty high for task 2 but not task 1:

In [112]:
from collections import OrderedDict

#map each filter string to its learned weight, separately for each task
t1_filter_to_score = OrderedDict(zip(string_reps, t1_classifier.coef_[0]))
t2_filter_to_score = OrderedDict(zip(string_reps, t2_classifier.coef_[0]))

#for each motif kmer, print its total score under task 1 and then under task 2
for heading, kmer in (("Scores for GATAAG for task 1 and task 2:", 'GATAAG'),
                      ("Scores for CAGATG for task 1 and task 2:", 'CAGATG')):
    print(heading)
    for task_filter_to_score in (t1_filter_to_score, t2_filter_to_score):
        print(get_total_kmer_score(kmer_string=kmer, filter_to_score=task_filter_to_score,
                                   num_mismatches=num_mismatches))

Scores for GATAAG for task 1 and task 2:
2.43825497038
0.232850659289
Scores for CAGATG for task 1 and task 2:
0.405037627412
2.02005926243
