In [1]:
from __future__ import print_function
import h5py
import numpy as np
import deeplift
import evautils
from scipy import stats
from evautils import sequtils
from evautils import kerasutils
from evautils import dirutils
from evautils import windowscoringutils
from evautils import impscoringutils
from collections import OrderedDict, defaultdict
import os
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    roc_curve, precision_recall_curve)

Using TensorFlow backend.


In [2]:
REGION_SIZE = 400
SCORES = '/users/eprakash/git/interpret-benchmark/scripts/deepsea_beluga/A549/sim_pos_and_neg.h5'
POS_LABELS = '/users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz'
MOTIF_MATCHES='/users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/fimo_out/fimo.txt'

In [3]:
motif_matches=sequtils.load_fimo_motif_matches(MOTIF_MATCHES, True)

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/fimo_out/fimo.txt ...
#Loaded 2102956 motif matches in 744982 sequences


In [4]:
from collections import OrderedDict
import gzip
import numpy as np
def load_labels_from_bedfile(seqfile):
    seqs = []
    fp = gzip.open(seqfile, "rb")
    print("#Loading " + seqfile + " ...")
    for line in fp:
        line=line.decode('utf8').split()
        seqs.append(line[0])
    fp.close()
    print("#Loaded " + str(len(seqs)) + " seqnames from " + seqfile)
    return np.array(seqs)

In [5]:
h5f = h5py.File(SCORES,'r')
pos_labels = load_labels_from_bedfile(POS_LABELS)
print("\nImp scoring methods: " + str(list(h5f.keys())))

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz ...
#Loaded 10000 seqnames from /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz

Imp scoring methods: ['deeplift-rcrs_ref:allzeros', 'deeplift-rcrs_ref:avgc', 'deeplift-rcrs_ref:shuff-10', 'deeplift-rcrs_ref:shuff-20', 'deeplift-rs_ref:allzeros', 'deeplift-rs_ref:avgc', 'deeplift-rs_ref:shuff-10', 'deeplift-rs_ref:shuff-20', 'gradtimesinp_ref:allzeros', 'ig-10_ref:shuff-10', 'ig-10_ref:shuff-20', 'ig-20_ref:shuff-10', 'ig-20_ref:shuff-20', 'ism', 'seqids']


In [6]:
deeplift_rcrs_allzeros_scores=np.array(h5f.get("deeplift-rcrs_ref:allzeros"))
deeplift_rcrs_avgc_scores=np.array(h5f.get("deeplift-rcrs_ref:avgc"))
deeplift_rcrs_shuff10_scores=np.array(h5f.get("deeplift-rcrs_ref:shuff-10"))
deeplift_rcrs_shuff20_scores=np.array(h5f.get("deeplift-rcrs_ref:shuff-20"))

deeplift_rs_allzeros_scores=np.array(h5f.get("deeplift-rs_ref:allzeros"))
deeplift_rs_avgc_scores=np.array(h5f.get("deeplift-rs_ref:avgc"))
deeplift_rs_shuff10_scores=np.array(h5f.get("deeplift-rs_ref:shuff-10"))
deeplift_rs_shuff20_scores=np.array(h5f.get("deeplift-rs_ref:shuff-20"))

grad_times_input_allzeros_scores = np.array(h5f.get("gradtimesinp_ref:allzeros"))
ism_scores = np.array(h5f.get("ism"))

ig10_shuff10_scores = np.array(h5f.get("ig-10_ref:shuff-10"))
ig10_shuff20_scores =np.array(h5f.get("ig-10_ref:shuff-20"))
ig20_shuff10_scores =np.array(h5f.get("ig-20_ref:shuff-10"))
ig20_shuff20_scores =np.array(h5f.get("ig-20_ref:shuff-20"))

seqids = [x.decode('utf-8') for x in np.array(h5f.get("seqids"))]

h5f.close()

In [7]:
motif_matches=sequtils.load_fimo_motif_matches(MOTIF_MATCHES, True)
for key in list(motif_matches.keys()):
    if key not in pos_labels:
        del motif_matches[key]
print("Motif matches len: " + str(len(motif_matches)))
seq_ids_of_interest = list(motif_matches.keys())
print("Seq ids of interest len: " + str(len(seq_ids_of_interest)))

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/fimo_out/fimo.txt ...
#Loaded 2102956 motif matches in 744982 sequences
Motif matches len: 10000
Seq ids of interest len: 10000


In [8]:
seq_ids_of_interest_set = set(seq_ids_of_interest)
relevant_indices_list, relevant_labels_list=sequtils.get_relevant_labels_in_order_of_scores(seqids, motif_matches)
seq_ids_of_interest = relevant_labels_list
seq_ids_of_interest_set = set(relevant_labels_list)

deeplift_rcrs_allzeros_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_allzeros_scores, REGION_SIZE)
deeplift_rcrs_avgc_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_avgc_scores, REGION_SIZE)
deeplift_rcrs_shuff10_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_shuff10_scores, REGION_SIZE)
deeplift_rcrs_shuff20_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_shuff20_scores, REGION_SIZE)

deeplift_rs_allzeros_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_allzeros_scores, REGION_SIZE)
deeplift_rs_avgc_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_avgc_scores, REGION_SIZE)
deeplift_rs_shuff10_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_shuff10_scores, REGION_SIZE)
deeplift_rs_shuff20_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_shuff20_scores, REGION_SIZE)

grad_times_input_allzeros_scores = sequtils.get_relevant_scores(relevant_indices_list, grad_times_input_allzeros_scores, REGION_SIZE)
ism_scores = sequtils.get_relevant_scores(relevant_indices_list, ism_scores, REGION_SIZE)

ig10_shuff10_scores = sequtils.get_relevant_scores(relevant_indices_list, ig10_shuff10_scores, REGION_SIZE)
ig10_shuff20_scores = sequtils.get_relevant_scores(relevant_indices_list, ig10_shuff20_scores, REGION_SIZE)
ig20_shuff10_scores = sequtils.get_relevant_scores(relevant_indices_list, ig20_shuff10_scores, REGION_SIZE)
ig20_shuff20_scores = sequtils.get_relevant_scores(relevant_indices_list, ig20_shuff20_scores, REGION_SIZE)

Motif matches sequences are 10000
Supplied labels are 10000


In [9]:
method_to_saved_scores = OrderedDict([('grad_times_input', grad_times_input_allzeros_scores),
                                      ('deeplift_rcrs_allzeros', deeplift_rcrs_allzeros_scores),
                                      ('deeplift_rcrs_avgc', deeplift_rcrs_avgc_scores),
                                      ('deeplift_rcrs_shuff10', deeplift_rcrs_shuff10_scores),
                                      ('deeplift_rcrs_shuff20', deeplift_rcrs_shuff20_scores),
                                      ('deeplift_rs_allzeros', deeplift_rs_allzeros_scores),
                                      ('deeplift_rs_avgc', deeplift_rs_avgc_scores),
                                      ('deeplift_rs_shuff10', deeplift_rs_shuff10_scores),
                                      ('deeplift_rs_shuff20', deeplift_rs_shuff20_scores),
                                      ('ig10_shuff10', ig10_shuff10_scores),
                                      ('ig10_shuff20', ig10_shuff20_scores),
                                      ('ig20_shuff10', ig20_shuff10_scores),
                                      ('ig20_shuff20', ig20_shuff20_scores),
                                      ('ism', ism_scores)])

In [10]:
#Count all motifs percentage
num_labels = len(relevant_labels_list)
print("Num labels is " + str(num_labels))
per_seq_percentages = []
for method in method_to_saved_scores:
    total_sum_importance=0
    motif_sum_importance=0
    for seq in range(num_labels):
        motif_positions = set({})
        label = relevant_labels_list[seq]
        index = relevant_indices_list[seq]
        total_per_seq_importance = np.sum(np.abs(method_to_saved_scores[method][index]))
        total_sum_importance = total_sum_importance + total_per_seq_importance
        seqentries = motif_matches[label]
        motif_per_seq_importance = 0
        for entry in seqentries:
            motif_positions.update(range(entry['begin'], entry['end']))
        for i in motif_positions:
            motif_per_seq_importance = motif_per_seq_importance + np.abs(method_to_saved_scores[method][index][i])
            motif_sum_importance = motif_sum_importance + np.abs(method_to_saved_scores[method][index][i])
        per_seq_percentages.append(float(motif_per_seq_importance)/float(total_per_seq_importance))
    percentage = float(motif_sum_importance)/float(total_sum_importance)
    print("Method is " + str(method))
    print("Total summed importance " + str(total_sum_importance))
    print("Motif region summed importance  " + str(motif_sum_importance))
    print("Total percentage " + str(percentage*100))
    print("Mean per sequence percentage " + str(np.mean(np.array(per_seq_percentages))))
    print("Standard error " + str(stats.sem(per_seq_percentages)))
    print("\n")
    assert(len(per_seq_percentages) == 10000)
    per_seq_percentages.clear()

Num labels is 10000
Method is grad_times_input
Total summed importance 169839.99684220672
Motif region summed importance  95412.85545420412
Total percentage 56.178083624700825
Mean per sequence percentage 0.5784653115154808
Standard error 0.0011034914908408585


Method is deeplift_rcrs_allzeros
Total summed importance 74676.68767095532
Motif region summed importance  47443.69821904831
Total percentage 63.53214061675773
Mean per sequence percentage 0.6485710765140347
Standard error 0.0010375482648181749


Method is deeplift_rcrs_avgc
Total summed importance 61463.9703856084
Motif region summed importance  40697.59671595938
Total percentage 66.21374515937323
Mean per sequence percentage 0.6737226232746232
Standard error 0.0010491298728318743


Method is deeplift_rcrs_shuff10
Total summed importance 52945.73212771377
Motif region summed importance  38406.84641899806
Total percentage 72.5400232947095
Mean per sequence percentage 0.7411332859923319
Standard error 0.0013396018102309728


Met

In [12]:
num_labels = len(relevant_labels_list)
print("Num labels is " + str(num_labels))
per_seq_diffs = []
bad_ig = []
for method in method_to_saved_scores:
    total_sum_importance=0
    motif_sum_importance=0
    for seq in range(num_labels):
        motif_positions = set({})
        label = relevant_labels_list[seq]
        index = relevant_indices_list[seq]
        total_per_seq_abs_importance = np.sum(np.abs(method_to_saved_scores[method][index]))
        total_per_seq_importance = np.sum(method_to_saved_scores[method][index])
        total_sum_importance = total_sum_importance + (total_per_seq_importance/total_per_seq_abs_importance)
        seqentries = motif_matches[label]
        motif_per_seq_importance = 0
        for entry in seqentries:
            motif_positions.update(range(entry['begin'], entry['end']))
        for i in motif_positions:
            motif_per_seq_importance = motif_per_seq_importance + (method_to_saved_scores[method][index][i])/total_per_seq_abs_importance
            motif_sum_importance = motif_sum_importance + (method_to_saved_scores[method][index][i])/total_per_seq_abs_importance
        seq_diff = motif_per_seq_importance*2-(total_per_seq_importance/total_per_seq_abs_importance)
        if ('ig' in method and seq_diff < 0):
            bad_ig.append(seq)
        per_seq_diffs.append(seq_diff)
    diff = np.sum(per_seq_diffs)
    print("Method is " + str(method))
    print("Non-motif region summed importance " + str(total_sum_importance - motif_sum_importance))
    print("Motif region summed importance  " + str(motif_sum_importance))
    print("Total difference " + str(diff))
    print("Mean per sequence difference " + str(np.mean(np.array(per_seq_diffs))))
    print("Standard error " + str(stats.sem(per_seq_diffs)))
    print("\n")
    assert(len(per_seq_diffs) == 10000)
    per_seq_diffs.clear()

Num labels is 10000
Method is grad_times_input
Non-motif region summed importance -68.69399935063029
Motif region summed importance  3123.932383735596
Total difference 3192.6263830859225
Mean per sequence difference 0.31926263830859225
Standard error 0.0012999304627972982


Method is deeplift_rcrs_allzeros
Non-motif region summed importance 79.31411829728677
Motif region summed importance  4938.260511608291
Total difference 4858.94639331115
Mean per sequence difference 0.485894639331115
Standard error 0.0014568829319417504


Method is deeplift_rcrs_avgc
Non-motif region summed importance 317.0238390030636
Motif region summed importance  5821.637117926732
Total difference 5504.613278923595
Mean per sequence difference 0.5504613278923596
Standard error 0.0016055616607123019


Method is deeplift_rcrs_shuff10
Non-motif region summed importance 853.5968993144079
Motif region summed importance  7028.517340643074
Total difference 6174.920441328355
Mean per sequence difference 0.61749204413283

In [13]:
print(bad_ig)

[6, 227, 300, 304, 314, 351, 366, 470, 680, 690, 746, 784, 918, 969, 973, 993, 1029, 1044, 1154, 1725, 1897, 2151, 2158, 2535, 2593, 2642, 2677, 2994, 3004, 3021, 3067, 3095, 3443, 3528, 3605, 3996, 4410, 4430, 4533, 4885, 5045, 5176, 5335, 5712, 5716, 5787, 5917, 5989, 5991, 6591, 6957, 7074, 7146, 8608, 9004, 227, 300, 304, 314, 351, 470, 670, 680, 690, 746, 784, 918, 969, 973, 993, 1029, 1044, 1123, 1154, 1316, 1725, 1889, 2151, 2158, 2535, 2593, 2642, 2677, 2994, 3021, 3067, 3095, 3443, 3460, 3528, 3605, 3996, 4410, 4533, 4885, 5045, 5176, 5335, 5712, 5716, 5787, 5917, 5989, 6591, 6957, 7074, 7146, 8608, 9004, 6, 227, 300, 304, 314, 351, 470, 680, 690, 746, 784, 918, 969, 973, 993, 1029, 1044, 1154, 1725, 1889, 1897, 2151, 2158, 2535, 2593, 2642, 2677, 2994, 3004, 3021, 3067, 3095, 3225, 3443, 3460, 3528, 3605, 3996, 4410, 4430, 4533, 4885, 4923, 5045, 5176, 5335, 5712, 5716, 5787, 5917, 5989, 5991, 6591, 6939, 6957, 7074, 7146, 8608, 9004, 227, 300, 304, 314, 351, 470, 670, 680, 6

In [14]:
num_labels = len(relevant_labels_list)
print("Num labels is " + str(num_labels))
per_seq_aurocs = []
per_seq_auprcs = []
#fp = open('full_vs_filtered/pos_and_neg_position_abs_auroc_auprc.txt', 'w+')
for method in method_to_saved_scores:
    total_labels = np.array([])
    total_scores = np.array([])
    for seq in range(num_labels):
        motif_positions = set({})
        seqname = relevant_labels_list[seq]
        index = relevant_indices_list[seq]
        scores = np.abs(np.array(method_to_saved_scores[method][index]))
        labels = np.zeros(REGION_SIZE)
        seqentries = motif_matches[seqname]
        for entry in seqentries:
                motif_positions.update(range(entry['begin'], entry['end']))
        for i in motif_positions:
            labels[i] = 1
        total_scores = np.concatenate((total_scores, scores))
        total_labels = np.concatenate((total_labels, labels))
        per_seq_aurocs.append(roc_auc_score(y_true=labels, y_score=scores))
        per_seq_auprcs.append(average_precision_score(y_true=labels, y_score=scores))
    print("Method is " + str(method))
    print("Total auroc " + str(roc_auc_score(y_true=total_labels, y_score=total_scores)))
    print("Total auprc " + str(average_precision_score(y_true=total_labels, y_score=total_scores)))
    print("Mean per sequence auroc " + str(np.mean(np.array(per_seq_aurocs))))
    print("Mean per sequence auprc " + str(np.mean(np.array(per_seq_auprcs))))
    print("Per sequence auroc stderr " + str(np.std(np.array(per_seq_aurocs))/np.sqrt(num_labels)))
    print("Per sequence auprc stderr " + str(np.std(np.array(per_seq_auprcs))/np.sqrt(num_labels)))
    print("\n")
    assert(len(per_seq_aurocs) == 10000)
    per_seq_aurocs.clear()
    per_seq_auprcs.clear()

Num labels is 10000
Method is grad_times_input
Total auroc 0.8020677908909363
Total auprc 0.5759893825228712
Mean per sequence auroc 0.7840759688978766
Mean per sequence auprc 0.6299598604035951
Per sequence auroc stderr 0.001277234266378499
Per sequence auprc stderr 0.0015357149873311629


Method is deeplift_rcrs_allzeros
Total auroc 0.8222832132281839
Total auprc 0.6425561387593423
Mean per sequence auroc 0.8084736960741666
Mean per sequence auprc 0.6872814056054792
Per sequence auroc stderr 0.0011870487283495686
Per sequence auprc stderr 0.0015265827943152402


Method is deeplift_rcrs_avgc
Total auroc 0.8245483769868152
Total auprc 0.6592336737826685
Mean per sequence auroc 0.812018384572475
Mean per sequence auprc 0.7024128302584577
Per sequence auroc stderr 0.001187805816264399
Per sequence auprc stderr 0.0015631656981987029


Method is deeplift_rcrs_shuff10
Total auroc 0.8416847537525775
Total auprc 0.6790457224867086
Mean per sequence auroc 0.8206840812091212
Mean per sequence a