In [1]:
from __future__ import print_function
import os
# Expose only GPU index 3 to CUDA-aware libraries. This is set here, in the
# first cell, before deeplift (and its backend) is imported further down.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [2]:
import gzip
import re
from collections import OrderedDict

def load_sequences(seqfile):
    """
    Load a gzipped, two-line-per-record FASTA-like file.

    The file is expected to alternate between a label line (``>label``) and a
    sequence line made only of word characters. Lines that do not fit the
    expected kind are reported on stdout: an unexpected line where a label was
    expected is skipped, and a bad sequence line discards the pending label.

    Args:
        seqfile: path to the gzip-compressed file.

    Returns:
        OrderedDict mapping label -> sequence, in file order.
    """
    label_pattern = re.compile(r">(.*)$")
    sequence_pattern = re.compile(r"(\w+)$")
    seqs = OrderedDict()
    # The context manager closes the handle even if reading/parsing raises.
    with gzip.open(seqfile, "rb") as fp:
        print("#Loading " + seqfile + " ...")
        label = None  # None means "the next line should be a label"
        for line in fp:
            # On Python 3 gzip yields bytes; the patterns above are text.
            if not isinstance(line, str):
                line = line.decode("utf-8")
            if label is None:
                match = label_pattern.match(line)
                if match:
                    label = match.group(1)
                else:
                    print("Expecting LABEL but found (!!): " + line)
                continue
            match = sequence_pattern.match(line)
            if match:
                seqs[label] = match.group(1)
            else:
                print("Expecting SEQUENCE but found (!!): " + line)
            # Whether or not the sequence parsed, a label is expected next.
            label = None
    print("#Loaded " + str(len(seqs.keys())) + " sequences from " + seqfile)
    return seqs

In [3]:
def load_motif_matches(motif_match_file, doprint=False):
    """
    Load a Homer motif match file.

    Returns an ordered dictionary keyed by sequence name. Each value is a list
    of dictionaries, one per motif match on that sequence, with the keys:

      motif    - motif name
      sequence - sequence name
      begin    - 0-indexed inclusive begin index of the motif
      end      - 0-indexed exclusive end index of the motif
      strand   - '+' or '-'
      seqval   - last whitespace-separated field of the line

    Lines that do not match the expected layout are silently ignored.

    Args:
        motif_match_file: path to the plain-text Homer match file.
        doprint: when true, print progress messages to stdout.
    """
    line_pattern = re.compile(
        r"((\w|\-)+)\s+((\w|\:|\-)+)\s+(\d+)\s+(\d+)\s+(\+|\-)\s+.+\s+(\w+)$")
    motif_matches = OrderedDict()
    numlines = 0
    # The context manager closes the file even if parsing raises.
    with open(motif_match_file, "r") as fp:
        if doprint:
            print("#Loading " + motif_match_file + " ...")
        for line in fp:
            match = line_pattern.match(line)
            if not match:
                continue
            numlines += 1
            sequence = match.group(3)
            entry = dict()
            entry['motif'] = match.group(1)
            entry['sequence'] = sequence
            # Homer motif match file is 1 indexed, convert to 0
            entry['begin'] = int(match.group(5)) - 1
            # Homer end is 1 indexed AND inclusive, which equals the
            # 0 indexed exclusive end, so the number is kept as is.
            entry['end'] = int(match.group(6))
            entry['strand'] = match.group(7)
            entry['seqval'] = match.group(8)
            motif_matches.setdefault(sequence, []).append(entry)
    if doprint:
        print("#Loaded " + str(numlines) + " motif matches in " + str(len(motif_matches.keys())) + " sequences")
    return motif_matches

In [4]:
def rename(label):
    """Return the part of ``label`` starting at the last ``_chr`` (without the
    underscore), or an empty string when the label has no such part."""
    found = re.match('.*_(chr.*)$', label)
    return found.group(1) if found else ""

In [5]:
def load_sequences_from_bedfile(seqfile):
    """
    Load a gzipped two-column file of ``label  sequence`` lines.

    Args:
        seqfile: path to the gzip-compressed file. Each non-blank line must
            hold exactly two whitespace-separated fields.

    Returns:
        OrderedDict mapping label -> sequence, in file order (a repeated
        label keeps its first position but takes the last sequence).

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    seqs = OrderedDict()
    # The context manager closes the handle even when a bad line raises.
    with gzip.open(seqfile, "rb") as fp:
        print("#Loading " + seqfile + " ...")
        for lineno, line in enumerate(fp, 1):
            # On Python 3 gzip yields bytes; decode so keys/values are text.
            if not isinstance(line, str):
                line = line.decode("utf-8")
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            if len(fields) != 2:
                raise ValueError(
                    "%s line %d: expected 2 fields (label, sequence), found %d"
                    % (seqfile, lineno, len(fields)))
            label, sequence = fields
            seqs[label] = sequence
    print("#Loaded " + str(len(seqs.keys())) + " sequences from " + seqfile)
    return seqs

In [6]:
def get_value(label):
    """Classify a label by its prefix.

    Returns 0 for labels starting with ``dinuc_shuff_``, 1 for labels starting
    with ``dinuc_shuffled_`` (each followed by at least one more character),
    and -1 for anything else.
    """
    prefix_found = re.match("((dinuc_shuff_|dinuc_shuffled_).+)$", label)
    if not prefix_found:
        return -1
    return 0 if prefix_found.group(2) == 'dinuc_shuff_' else 1

In [7]:
import random
def get_random_set(seqdict, num, sort=True):
    """
    Draw ``num`` distinct entries from ``seqdict`` uniformly at random.

    Args:
        seqdict: mapping to sample from.
        num: number of entries to draw; must not exceed ``len(seqdict)``.
        sort: when true, the drawn entries keep their relative order from
            ``seqdict``; otherwise they appear in the order they were drawn.

    Returns:
        OrderedDict (a dict subclass) holding the drawn entries. An ordered
        mapping is returned so the ordering requested via ``sort`` is not
        discarded by the container.

    Raises:
        ValueError: from ``random.sample`` if ``num`` is larger than the
            number of entries or negative.
    """
    # Materialise the items: on Python 3 ``items()`` is a view and cannot be
    # indexed.
    items = list(seqdict.items())
    picked = random.sample(range(len(items)), num)
    if sort:
        picked.sort()
    return OrderedDict(items[i] for i in picked)

In [8]:
def remove_labels_not_in_motif_matches(sequences, motif_matches):
    """Keep only the sequences whose renamed label is a key of ``motif_matches``.

    Each label is passed through ``rename`` before the membership test. The
    result is a new OrderedDict with the surviving entries in their original
    order; the inputs are not modified.
    """
    matched_names = set(motif_matches.keys())
    return OrderedDict(
        (label, sequences[label])
        for label in sequences.keys()
        if rename(label) in matched_names
    )

In [9]:
import numpy as np

data_filename_positive = "/users/eprakash/projects/benchmarking/newdata/A549/A549.summits.400bp.implanted.5Ksubsample.bed.gz"
#data_filename_negative = "/users/eprakash/projects/benchmarking/newdata/GM12878/400bp/universal_dnase.matched.GM12878.summits.400bp.hg38.implanted.bed.gz"
positives=load_sequences_from_bedfile(data_filename_positive)
# Start from an independent copy of the positives instead of decompressing and
# parsing the same file a second time. Later cells delete entries from
# labeled_sequences; the copy keeps `positives` untouched, as before.
labeled_sequences = OrderedDict(positives)
#print("Initially got %d positive sequences" % len(labeled_sequences))
#motif_matches=load_motif_matches('/users/eprakash/projects/benchmarking/newdata/GM12878/400bp/GM12878.motif.matches.txt', True)
#labeled_sequences = remove_labels_not_in_motif_matches(labeled_sequences, motif_matches)
#del labeled_sequences['chr1:203649072-203650072']
print("Got %d positive sequences" % len(labeled_sequences))
positive_labels = list(labeled_sequences.keys())

#Temporarily not including negative seqs
#neg_seqs = load_sequences_from_bedfile(data_filename_negative)

#del neg_seqs['chr1:203649072-203650072']
#neg_seqs = get_random_set(neg_seqs, 200000)

#print("Got %d negative sequences" % len(neg_seqs))
#negative_labels = neg_seqs.keys()
#print("Number of labels common to both sets of sequences is %d " % len(set(positive_labels).intersection(set(negative_labels))))
#labeled_sequences.update(neg_seqs)
#labeled_sequences=get_random_set(labeled_sequences, 419730, sort=False)
#labeled_sequences=get_random_set(labeled_sequences, 68407, sort=False)

# Explicit lists: the following cells call list methods on these, which dict
# views (Python 3) do not provide. The two lists are index-aligned.
labels = list(labeled_sequences.keys())
sequences = list(labeled_sequences.values())
#values = np.array([get_value(label) for label in labels])
#check_negatives = np.nonzero(values == -1)[0]
#assert (len(check_negatives) == 0)
#print("Labels length: ", len(labels))
print("Sequences length: ", len(sequences))
#print("Values length: ", len(values))
#print("Values length: ", len(values))

#Loading /users/eprakash/projects/benchmarking/newdata/A549/A549.summits.400bp.implanted.5Ksubsample.bed.gz ...
#Loaded 5000 sequences from /users/eprakash/projects/benchmarking/newdata/A549/A549.summits.400bp.implanted.5Ksubsample.bed.gz
#Loading /users/eprakash/projects/benchmarking/newdata/A549/A549.summits.400bp.implanted.5Ksubsample.bed.gz ...
#Loaded 5000 sequences from /users/eprakash/projects/benchmarking/newdata/A549/A549.summits.400bp.implanted.5Ksubsample.bed.gz
Got 5000 positive sequences
Sequences length:  5000


In [10]:
# Nucleotide codes other than A/C/G/T (upper case only) that disqualify a
# sequence from scoring.
chars=['R','Y','S','W','K','M','B','D','H','V','N']
print(len(sequences))
# Partition into two new lists rather than calling sequences.remove() while
# iterating over `sequences`: removing during iteration shifts the remaining
# items left and makes the loop skip the element after every removal.
removed=[]
kept=[]
for seq in sequences:
    if any((c in chars) for c in seq):
        removed.append(seq)
    else:
        kept.append(seq)
sequences = kept
print(len(sequences))

5000
5000


In [11]:
# For every removed sequence, drop one label that carries it (the earliest
# remaining one in labeled_sequences order, matching a first-match lookup).
# The sequence -> labels index is built once so each lookup is a dict access
# instead of a scan over all values, and nothing indexes keys()/values()
# directly (those are views on Python 3).
labels_by_sequence = {}
for key, seq in labeled_sequences.items():
    labels_by_sequence.setdefault(seq, []).append(key)
dropped_keys = set()
for i in removed:
    candidates = labels_by_sequence.get(i)
    if not candidates:
        raise ValueError("removed sequence has no remaining label in labeled_sequences")
    key = candidates.pop(0)
    print(key)
    del labeled_sequences[key]
    dropped_keys.add(key)
labels = [label for label in labels if label not in dropped_keys]
print (len(labels))
print(len(labeled_sequences))

5000
5000


In [12]:
print(type(labeled_sequences))
# Write the surviving labels, one per line, in labeled_sequences order.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("/users/eprakash/projects/benchmarking/newdata/A549/models/deepseabeluga/results/ISM_deepseabeluga_A549_positive.labels.txt", "w") as ff:
    for ss in labeled_sequences.keys():
        ff.write(str(ss) +"\n")

<class 'collections.OrderedDict'>


In [13]:
import numpy as np

def one_hot_encode_along_channel_axis(sequence):
    """One-hot encode ``sequence`` into a new ``(len(sequence), 4)`` int8 array.

    The layout is meant for 1d convolutions, where an example has dimensions
    (len, num_channels): the channel axis (axis 1) carries the one-hot code.
    """
    encoded = np.zeros((len(sequence), 4), dtype=np.int8)
    seq_to_one_hot_fill_in_array(encoded, sequence, one_hot_axis=1)
    return encoded

def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    """Write the one-hot code of ``sequence`` into ``zeros_array`` in place.

    Args:
        zeros_array: 2-D array to fill; the axis that is not ``one_hot_axis``
            must have length ``len(sequence)``.
        sequence: iterable of characters. A/C/G/T (either case) select
            channels 0/1/2/3; N/n leaves the position untouched.
        one_hot_axis: 0 or 1, the axis of ``zeros_array`` indexed by channel.

    Raises:
        AssertionError: on an invalid axis or a length mismatch.
        RuntimeError: on any other character; positions before it have
            already been written.
    """
    assert one_hot_axis in (0, 1)
    if one_hot_axis == 0:
        assert zeros_array.shape[1] == len(sequence)
    elif one_hot_axis == 1:
        assert zeros_array.shape[0] == len(sequence)
    channel_of = {"A": 0, "a": 0,
                  "C": 1, "c": 1,
                  "G": 2, "g": 2,
                  "T": 3, "t": 3}
    for position, base in enumerate(sequence):
        if base in ("N", "n"):
            continue  # unknown base: the position stays all zeros
        channel = channel_of.get(base)
        if channel is None:
            raise RuntimeError("Unsupported character: "+str(base))
        if one_hot_axis == 0:
            zeros_array[channel, position] = 1
        elif one_hot_axis == 1:
            zeros_array[position, channel] = 1
#sequences = sequences[-5000:]
# Encode every remaining sequence and collect the results into one array.
onehot_data = np.array(list(map(one_hot_encode_along_channel_axis, sequences)))

In [14]:
# Sanity check: (number of sequences, sequence length, 4 channels).
print(onehot_data.shape)

(5000, 400, 4)


In [15]:
import deeplift
from deeplift.conversion import kerasapi_conversion as kc

In [16]:
#model_id = "record_4_model_4W8mu"
# Saved model to score: architecture (JSON) and weights (HDF5).
model_json = "/users/eprakash/projects/benchmarking/newdata/A549/models/deepseabeluga/momma_dragonn/examples/fasta_sequential_model/model_files/record_1_model_SkPDS_modelJson.json"
model_weights = "/users/eprakash/projects/benchmarking/newdata/A549/models/deepseabeluga/momma_dragonn/examples/fasta_sequential_model/model_files/record_1_model_SkPDS_modelWeights.h5"
#model_json = "model_files/model_UqOJX_modelJson.json"
#model_weights = "model_files/model_UqOJX_modelWeights.h5"
# Convert the same saved model twice, once per nonlinear_mxts_mode.
# NOTE(review): only deeplift_rescale_model is referenced by the later cells
# of this notebook; deeplift_genomicsdefault_model appears unused here.
deeplift_genomicsdefault_model =\
    kc.convert_model_from_saved_files(
        json_file=model_json,
        h5_file=model_weights,
        nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode.DeepLIFT_GenomicsDefault) 
deeplift_rescale_model =\
    kc.convert_model_from_saved_files(
        json_file=model_json,
        h5_file=model_weights,
        nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode.Rescale)

nonlinear_mxts_mode is set to: DeepLIFT_GenomicsDefault
For layer 1 the preceding linear layer is 0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
For layer 3 the preceding linear layer is 2 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer 7 the preceding linear layer is 6 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
For layer 9 the preceding linear layer is 8 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer 13 the preceding lin

In [17]:
def list_wrapper(func):
    """Decorator that lets ``func`` be called with a bare input or a list.

    The wrapped function takes ``input_data_list`` plus keyword arguments. A
    value that is not already a ``list`` is wrapped in a one-element list
    before being forwarded to ``func`` as the ``input_data_list`` keyword.
    The return value of ``func`` is passed back unchanged.
    """
    def wrapped_func(input_data_list, **kwargs):
        as_list = (input_data_list if isinstance(input_data_list, list)
                   else [input_data_list])
        return func(input_data_list=as_list, **kwargs)
    return wrapped_func

def empty_ism_buffer(results_arr,
                     input_data_onehot,
                     perturbed_inputs_preds,
                     perturbed_inputs_info):
    """Store a batch of predictions into ``results_arr`` (modified in place).

    ``perturbed_inputs_info`` holds one ``(example_idx, change)`` entry per
    prediction. When ``change`` is the string "original", the prediction
    times the example's one-hot array is added to that example's slice of
    ``results_arr``. Otherwise ``change`` is a ``(pos_idx, base_idx)`` pair
    and the prediction is written to that single cell.
    """
    for pred, info in zip(perturbed_inputs_preds, perturbed_inputs_info):
        example_idx, change = info[0], info[1]
        if change == "original":
            results_arr[example_idx] += pred * input_data_onehot[example_idx]
            continue
        pos_idx, base_idx = change
        results_arr[example_idx, pos_idx, base_idx] = pred

def make_ism_func(prediction_func,
                  flank_around_middle_to_perturb,
                  batch_size=200):
    """Build an in-silico mutagenesis (ISM) scoring function.

    Args:
        prediction_func: callable invoked as ``prediction_func([batch])``,
            where ``batch`` is a list of 2-D one-hot arrays. Its result is
            iterated to obtain one prediction per array in the batch.
        flank_around_middle_to_perturb: number of positions on each side of
            the midpoint ``int(len(seq)/2)`` whose bases are substituted.
        batch_size: the buffer of pending inputs is sent to
            ``prediction_func`` once it holds at least this many entries.

    Returns:
        ``ism_func(input_data_list, progress_update=10000, **kwargs)``,
        decorated with ``list_wrapper``. ``input_data_list`` must contain
        exactly one array of one-hot examples. The result has the same shape
        as that array: per position, the predictions for all four bases are
        mean-centred and then masked by the one-hot input, so only the entry
        at the observed base can be non-zero. ``progress_update`` is the
        interval (in perturbations) between "Done" prints; ``None`` disables
        them. Extra ``kwargs`` are accepted but not used.
    """
    @list_wrapper
    def ism_func(input_data_list, progress_update=10000, **kwargs):
        assert len(input_data_list)==1
        input_data_onehot=input_data_list[0]
        
        # One float slot per (example, position, base), filled in batches.
        results_arr = np.zeros_like(input_data_onehot).astype("float64")
        
        # perturbed_inputs_info[k] describes perturbed_onehot_seqs[k]:
        # (example index, "original") or (example index, (pos, base_idx)).
        perturbed_inputs_info = []
        perturbed_onehot_seqs = []
        perturbed_inputs_preds = []  # never read below
        num_done = 0
        for i,onehot_seq in enumerate(input_data_onehot):
            # Queue the unmodified sequence so its prediction is recorded at
            # the observed bases.
            perturbed_onehot_seqs.append(onehot_seq)
            perturbed_inputs_info.append((i,"original"))
            # NOTE(review): if the flank exceeds half the sequence length the
            # range starts below zero / ends past the last index — confirm
            # callers never pass such a value.
            for pos in range(int(len(onehot_seq)/2)-flank_around_middle_to_perturb,
                             int(len(onehot_seq)/2)+flank_around_middle_to_perturb):
                for base_idx in range(4):
                    # Only bases that are not already set at this position.
                    if onehot_seq[pos,base_idx]==0:
                        assert len(onehot_seq.shape)==2
                        # Copy of the sequence with position `pos` replaced
                        # by base `base_idx`.
                        new_onehot = np.zeros_like(onehot_seq) + onehot_seq
                        new_onehot[pos,:] = 0
                        new_onehot[pos,base_idx] = 1
                        perturbed_onehot_seqs.append(new_onehot)
                        perturbed_inputs_info.append((i,(pos,base_idx)))
                        num_done += 1
                        if ((progress_update is not None)
                            and num_done%progress_update==0):
                            print("Done",num_done)
                        # Flush a full buffer. This check only runs right
                        # after a perturbation has been queued.
                        if (len(perturbed_inputs_info)>=batch_size):
                            empty_ism_buffer(
                                 results_arr=results_arr,
                                 input_data_onehot=input_data_onehot,
                                 perturbed_inputs_preds=
                                  prediction_func([perturbed_onehot_seqs]),
                                 perturbed_inputs_info=perturbed_inputs_info)
                            perturbed_inputs_info = []
                            perturbed_onehot_seqs = []
        # Flush whatever is left after the last example.
        if (len(perturbed_inputs_info)>0):
            empty_ism_buffer(
                 results_arr=results_arr,
                 input_data_onehot=input_data_onehot,
                 perturbed_inputs_preds=
                  prediction_func([perturbed_onehot_seqs]),
                 perturbed_inputs_info=perturbed_inputs_info)
        # These two resets do not affect the returned value.
        perturbed_inputs_info = []
        perturbed_onehot_seqs = []
        # Centre each position on the mean over its four base slots, then keep
        # only the entry of the base actually present in the input.
        # NOTE(review): positions outside the perturbed window still hold the
        # original prediction at the observed base (and zeros elsewhere), so
        # after centring they are not zero — verify this is intended when the
        # flank does not cover the whole sequence.
        results_arr = results_arr - np.mean(results_arr,axis=-1)[:,:,None]
        return input_data_onehot*results_arr
    return ism_func

In [18]:
# Compile a function from the first layer's activation variables to column 0
# of the second-to-last layer's activation variables of the Rescale model.
# NOTE(review): presumably layer -2 is the pre-output (logit) layer and column
# 0 the only task of interest — confirm against the model definition.
pred_func = deeplift.util.compile_func(
    inputs=[deeplift_rescale_model.get_layers()[0].get_activation_vars()],
    outputs=deeplift_rescale_model.get_layers()[-2].get_activation_vars()[:,0])

In [19]:
# Build the ISM scorer around pred_func. A flank of 200 on each side of the
# midpoint spans positions 0..399 of the 400-long sequences encoded above.
ism_func = make_ism_func(prediction_func=pred_func,
                         flank_around_middle_to_perturb=200,
                         batch_size=200)

In [20]:
# Score every one-hot encoded sequence; a "Done <n>" line is printed after
# every 10000 perturbations.
scores_ism = np.array(ism_func(input_data_list=[onehot_data],
                                       progress_update=10000))

Using TensorFlow backend.


Done 10000
Done 20000
Done 30000
Done 40000
Done 50000
Done 60000
Done 70000
Done 80000
Done 90000
Done 100000
Done 110000
Done 120000
Done 130000
Done 140000
Done 150000
Done 160000
Done 170000
Done 180000
Done 190000
Done 200000
Done 210000
Done 220000
Done 230000
Done 240000
Done 250000
Done 260000
Done 270000
Done 280000
Done 290000
Done 300000
Done 310000
Done 320000
Done 330000
Done 340000
Done 350000
Done 360000
Done 370000
Done 380000
Done 390000
Done 400000
Done 410000
Done 420000
Done 430000
Done 440000
Done 450000
Done 460000
Done 470000
Done 480000
Done 490000
Done 500000
Done 510000
Done 520000
Done 530000
Done 540000
Done 550000
Done 560000
Done 570000
Done 580000
Done 590000
Done 600000
Done 610000
Done 620000
Done 630000
Done 640000
Done 650000
Done 660000
Done 670000
Done 680000
Done 690000
Done 700000
Done 710000
Done 720000
Done 730000
Done 740000
Done 750000
Done 760000
Done 770000
Done 780000
Done 790000
Done 800000
Done 810000
Done 820000
Done 830000
Done 840000
D

In [21]:
# The scores should have the same shape as onehot_data.
print(scores_ism.shape)

(5000, 400, 4)


In [22]:
# Spot-check the scores of a single example.
# NOTE(review): index 2144 looks arbitrary — confirm whether it is meaningful.
print(scores_ism[2144])

[[ 0.00000000e+00 -0.00000000e+00  0.00000000e+00 -4.92358208e-03]
 [ 9.63139534e-03 -0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 4.46605682e-03  0.00000000e+00 -0.00000000e+00 -0.00000000e+00]
 ...
 [ 0.00000000e+00 -0.00000000e+00 -0.00000000e+00  4.76837158e-07]
 [ 0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00 -7.15255737e-07  0.00000000e+00]]


In [23]:
# Persist the ISM scores as a .npy file; rows follow the order of `sequences`.
np.save('/users/eprakash/projects/benchmarking/newdata/A549/models/deepseabeluga/results/A549.deepseabeluga.ISM.scores.5Ksubsample.npy', scores_ism)