In [1]:
from __future__ import print_function
import h5py
import numpy as np
import deeplift
import evautils
from evautils import sequtils
from evautils import kerasutils
from evautils import dirutils
from evautils import windowscoringutils
from evautils import impscoringutils
from collections import OrderedDict, defaultdict
import os

Using TensorFlow backend.


In [2]:
# Region length forwarded to sequtils.get_relevant_scores and
# windowscoringutils.computeAndAccumulateEmbeddings further down.
# NOTE(review): presumably the per-sequence length in bp -- confirm.
REGION_SIZE = 400

# NOTE(review): the three paths below are absolute, user-specific locations;
# the notebook will only run where this directory layout exists.
# Gzipped text file read by load_labels_from_bedfile (first field of each line is kept).
POS_LABELS = '/users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz'
# HDF5 file opened read-only with h5py; one dataset per scoring method plus "seqids".
SCORES_FILE='/users/eprakash/git/interpret-benchmark/scripts/basset/A549/sim_pos_and_neg.h5'
# FIMO output handed to sequtils.load_fimo_motif_matches.
MOTIF_MATCHES='/users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/fimo_out/fimo.txt'

# Directory name passed to dirutils.createDir and windowscoringutils.displayResults.
RESULTS_DIRECTORY='A549auROCauPRCResults'

In [3]:
# Set up the results directory that is later passed to windowscoringutils.displayResults.
# NOTE(review): mustcreate=True presumably makes this fail when the directory
# already exists, which would break a second run -- confirm against dirutils.
dirutils.createDir(RESULTS_DIRECTORY, mustcreate=True)

In [4]:
from collections import OrderedDict
import gzip
import numpy as np
def load_labels_from_bedfile(seqfile):
    """Read sequence names from a gzip-compressed, whitespace-delimited file.

    Only the first whitespace-separated field of each line is kept; any
    remaining fields are ignored. Blank or whitespace-only lines are skipped
    instead of raising ``IndexError``.

    Parameters
    ----------
    seqfile : str
        Path to the gzip-compressed text file (UTF-8 encoded).

    Returns
    -------
    numpy.ndarray
        1-D array holding the sequence names in file order.
    """
    seqs = []
    # The context manager guarantees the handle is released even when
    # decoding or splitting a line raises part-way through the file.
    with gzip.open(seqfile, "rb") as fp:
        print("#Loading " + seqfile + " ...")
        for raw_line in fp:
            fields = raw_line.decode('utf8').split()
            if not fields:
                # Nothing to index on an empty line (e.g. a trailing newline).
                continue
            seqs.append(fields[0])
    print("#Loaded " + str(len(seqs)) + " seqnames from " + seqfile)
    return np.array(seqs)

In [5]:
# Open the saved importance scores read-only. The handle is left open on
# purpose: the next cell reads the individual datasets and closes it.
h5f = h5py.File(SCORES_FILE, mode='r')

# Names of the positive-set sequences, used later to filter the motif hits.
pos_labels = load_labels_from_bedfile(POS_LABELS)

# Iterating the file yields its top-level dataset names.
print("\nImp scoring methods: " + str(list(h5f)))

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz ...
#Loaded 10000 seqnames from /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz

Imp scoring methods: ['deeplift-rcrs_ref:allzeros', 'deeplift-rcrs_ref:avgc', 'deeplift-rcrs_ref:shuff-10', 'deeplift-rcrs_ref:shuff-20', 'deeplift-rs_ref:allzeros', 'deeplift-rs_ref:avgc', 'deeplift-rs_ref:shuff-10', 'deeplift-rs_ref:shuff-20', 'gradtimesinp_ref:allzeros', 'ig-10_ref:shuff-10', 'ig-10_ref:shuff-20', 'ig-20_ref:shuff-10', 'ig-20_ref:shuff-20', 'ism', 'seqids']


In [6]:
# Copy every saved score dataset out of the open HDF5 file into memory.
#
# Datasets are looked up with h5f[key] rather than h5f.get(key): .get()
# returns None for a missing or misspelled name, which np.array() would
# silently turn into a 0-d object array that only fails much later.
# Indexing raises KeyError at the offending line instead.
#
# The try/finally makes sure the file handle is released even when one of
# the reads fails.
try:
    # DeepLIFT, "rcrs" variant, one array per reference.
    deeplift_rcrs_allzeros_scores = np.array(h5f["deeplift-rcrs_ref:allzeros"])
    deeplift_rcrs_avgc_scores = np.array(h5f["deeplift-rcrs_ref:avgc"])
    deeplift_rcrs_shuff10_scores = np.array(h5f["deeplift-rcrs_ref:shuff-10"])
    deeplift_rcrs_shuff20_scores = np.array(h5f["deeplift-rcrs_ref:shuff-20"])

    # DeepLIFT, "rs" variant, one array per reference.
    deeplift_rs_allzeros_scores = np.array(h5f["deeplift-rs_ref:allzeros"])
    deeplift_rs_avgc_scores = np.array(h5f["deeplift-rs_ref:avgc"])
    deeplift_rs_shuff10_scores = np.array(h5f["deeplift-rs_ref:shuff-10"])
    deeplift_rs_shuff20_scores = np.array(h5f["deeplift-rs_ref:shuff-20"])

    grad_times_input_allzeros_scores = np.array(h5f["gradtimesinp_ref:allzeros"])
    ism_scores = np.array(h5f["ism"])

    # "ig-10" / "ig-20" datasets, each with two shuffled references.
    ig10_shuff10_scores = np.array(h5f["ig-10_ref:shuff-10"])
    ig10_shuff20_scores = np.array(h5f["ig-10_ref:shuff-20"])
    ig20_shuff10_scores = np.array(h5f["ig-20_ref:shuff-10"])
    ig20_shuff20_scores = np.array(h5f["ig-20_ref:shuff-20"])

    # Sequence ids are stored as bytes; decode them to text.
    # NOTE(review): presumably these ids line up row-for-row with the score
    # arrays above -- confirm against the script that wrote the file.
    seqids = [x.decode('utf-8') for x in np.array(h5f["seqids"])]
finally:
    h5f.close()

In [7]:
# Load all FIMO motif hits, then keep only those on positive-set sequences.
motif_matches = sequtils.load_fimo_motif_matches(MOTIF_MATCHES, True)

# `key in <numpy array>` compares against every element on each lookup, so
# filtering the full hit table against the label array is quadratic. A set
# built once gives the same membership answer in constant time per key.
pos_label_set = set(np.asarray(pos_labels).tolist())

# Iterate over a snapshot of the keys because entries are deleted in place
# (this keeps whatever mapping type the loader returned).
for key in list(motif_matches.keys()):
    if key not in pos_label_set:
        del motif_matches[key]
print("Motif matches len: " + str(len(motif_matches)))
seq_ids_of_interest = list(motif_matches.keys())
print("Seq ids of interest len: " + str(len(seq_ids_of_interest)))

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/fimo_out/fimo.txt ...
#Loaded 2102956 motif matches in 744982 sequences
Motif matches len: 10000
Seq ids of interest len: 10000


In [8]:
# Restrict every score array to the sequences that have motif matches.
#
# The earlier `seq_ids_of_interest_set = set(seq_ids_of_interest)` was a dead
# store (unconditionally overwritten below before any use) and was removed.
#
# NOTE(review): each *_scores name is rebound to the output of
# get_relevant_scores applied to itself, so this cell is presumably not safe
# to run twice in one kernel session -- re-run from the loading cell instead.
relevant_indices_list, relevant_labels_list = sequtils.get_relevant_labels_in_order_of_scores(seqids, motif_matches)
seq_ids_of_interest = relevant_labels_list
seq_ids_of_interest_set = set(relevant_labels_list)

# DeepLIFT "rcrs" variants.
deeplift_rcrs_allzeros_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_allzeros_scores, REGION_SIZE)
deeplift_rcrs_avgc_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_avgc_scores, REGION_SIZE)
deeplift_rcrs_shuff10_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_shuff10_scores, REGION_SIZE)
deeplift_rcrs_shuff20_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_shuff20_scores, REGION_SIZE)

# DeepLIFT "rs" variants.
deeplift_rs_allzeros_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_allzeros_scores, REGION_SIZE)
deeplift_rs_avgc_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_avgc_scores, REGION_SIZE)
deeplift_rs_shuff10_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_shuff10_scores, REGION_SIZE)
deeplift_rs_shuff20_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_shuff20_scores, REGION_SIZE)

grad_times_input_allzeros_scores = sequtils.get_relevant_scores(relevant_indices_list, grad_times_input_allzeros_scores, REGION_SIZE)
ism_scores = sequtils.get_relevant_scores(relevant_indices_list, ism_scores, REGION_SIZE)

# "ig-10" / "ig-20" variants.
ig10_shuff10_scores = sequtils.get_relevant_scores(relevant_indices_list, ig10_shuff10_scores, REGION_SIZE)
ig10_shuff20_scores = sequtils.get_relevant_scores(relevant_indices_list, ig10_shuff20_scores, REGION_SIZE)
ig20_shuff10_scores = sequtils.get_relevant_scores(relevant_indices_list, ig20_shuff10_scores, REGION_SIZE)
ig20_shuff20_scores = sequtils.get_relevant_scores(relevant_indices_list, ig20_shuff20_scores, REGION_SIZE)


Motif matches sequences are 10000
Supplied labels are 10000


In [9]:
# Map each display name to its score array. Insertion order is preserved by
# the OrderedDict, so entries are added in the order they should be visited.
method_to_saved_scores = OrderedDict()
method_to_saved_scores['grad_times_input'] = grad_times_input_allzeros_scores
method_to_saved_scores['deeplift_rcrs_allzeros'] = deeplift_rcrs_allzeros_scores
method_to_saved_scores['deeplift_rcrs_avgc'] = deeplift_rcrs_avgc_scores
method_to_saved_scores['deeplift_rcrs_shuff10'] = deeplift_rcrs_shuff10_scores
method_to_saved_scores['deeplift_rcrs_shuff20'] = deeplift_rcrs_shuff20_scores
method_to_saved_scores['deeplift_rs_allzeros'] = deeplift_rs_allzeros_scores
method_to_saved_scores['deeplift_rs_avgc'] = deeplift_rs_avgc_scores
method_to_saved_scores['deeplift_rs_shuff10'] = deeplift_rs_shuff10_scores
method_to_saved_scores['deeplift_rs_shuff20'] = deeplift_rs_shuff20_scores
method_to_saved_scores['ig10_shuff10'] = ig10_shuff10_scores
method_to_saved_scores['ig10_shuff20'] = ig10_shuff20_scores
method_to_saved_scores['ig20_shuff10'] = ig20_shuff10_scores
method_to_saved_scores['ig20_shuff20'] = ig20_shuff20_scores
method_to_saved_scores['ism'] = ism_scores

# Output mapping handed to the helper below.
# NOTE(review): the helper's return value is unused, so it presumably fills
# this dict in place; seq_ids_of_interest is deliberately passed in three
# argument slots, exactly as before -- confirm against windowscoringutils.
method_to_seq_id_to_scores = {}
windowscoringutils.collectSeqIdToScoresForAllMethods(
    method_to_saved_scores,
    seq_ids_of_interest,
    relevant_labels_list,
    seq_ids_of_interest,
    method_to_seq_id_to_scores,
    seq_ids_of_interest
)

In [10]:
# Each container below starts empty and is passed to a windowscoringutils
# helper whose return value is ignored.
# NOTE(review): the helpers presumably populate these in place -- confirm.

# Step 1: record motif hit positions per sequence and per motif.
seq_id_to_covered_positions = {}
motif_id_to_motif_length = {}
motif_id_to_hit_locations = defaultdict(list)
windowscoringutils.computeAndAccumulateEmbeddings(
    REGION_SIZE,
    seq_ids_of_interest,
    motif_matches,
    seq_id_to_covered_positions,
    motif_id_to_hit_locations,
    motif_id_to_motif_length
)

# Step 2: derive the negative set, keyed by motif length.
motif_len_to_negatives = defaultdict(list)
windowscoringutils.computeMotifLenToNegatives(
    seq_id_to_covered_positions,
    motif_id_to_motif_length,
    motif_len_to_negatives
)

# Step 3: collect positive locations per motif, then pick the motifs to report.
motif_id_to_pos_locs = {}
windowscoringutils.motifToPosLocs(
    motif_id_to_pos_locs,
    motif_id_to_hit_locations,
    motif_id_to_motif_length
)
top_motif_ids = windowscoringutils.topEnrichedMotifs(motif_id_to_pos_locs)

Saw 10000 new sequences
Motif positions: 842372.0, total positions: 4000000


In [11]:
# Scoring methods to report, in display order.
method_list = [
    'grad_times_input',
    'deeplift_rcrs_allzeros',
    'deeplift_rcrs_avgc',
    'deeplift_rcrs_shuff10',
    'deeplift_rcrs_shuff20',
    'deeplift_rs_allzeros',
    'deeplift_rs_avgc',
    'deeplift_rs_shuff10',
    'deeplift_rs_shuff20',
    'ig10_shuff10',
    'ig10_shuff20',
    'ig20_shuff10',
    'ig20_shuff20',
    'ism',
]

# One single-element group per method.
methods_to_plot_list = [[name] for name in method_list]

windowscoringutils.displayResults(
    RESULTS_DIRECTORY,
    method_list,
    top_motif_ids,
    methods_to_plot_list,
    motif_id_to_motif_length,
    motif_id_to_hit_locations,
    motif_len_to_negatives,
    method_to_seq_id_to_scores,
    motif_id_to_pos_locs
)

Number of motifs is 29


grad_times_input

TGCGCABGCGCV: [0.9042121231142148, 0.3813302649245715, 13426.0]
CCACTAGRKGGC: [0.861581458878292, 0.23799726148280717, 7813.0]
CKCGCGAG: [0.8287181940363931, 0.09292022160789287, 6921.0]
AGGTCCCGCCCC: [0.75896339539527, 0.06858372933739842, 21298.0]
AGGGGGCGCTGT: [0.7604860811183201, 0.051342062271098686, 9086.0]
ACGCGCGTCGCA: [0.7413505023082717, 0.04084889331300056, 14586.0]
CAAGATGGCGGC: [0.75817709127509, 0.03897166236194413, 2014.0]
GCCAATGR: [0.8346538051961809, 0.027266050127620768, 1177.0]
GGGARTTGTAGT: [0.739304225039802, 0.019515111000195626, 1711.0]
AGCCGGGTCTCG: [0.6898779407070804, 0.016508540495172933, 7901.0]
GCGGGCGGACGT: [0.6935487876725708, 0.0075389633410805085, 2463.0]
MGCGARCG: [0.7054262678437271, 0.004862412054752475, 2096.0]
CGGVGBDACS: [0.694425121485188, 0.004068483641522969, 2898.0]
CGCRVRDTCGGC: [0.6557764478371804, 0.003803441784994707, 2364.0]
WTTCCGCG: [0.7467592297684202, 0.002406835381464158, 1336.0]
TYCCTMGG: 