In [1]:
from __future__ import print_function
import h5py
import numpy as np
import deeplift
import evautils
from scipy import stats
from evautils import sequtils
from evautils import kerasutils
from evautils import dirutils
from evautils import windowscoringutils
from evautils import impscoringutils
from collections import OrderedDict, defaultdict
import os
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    roc_curve, precision_recall_curve)

Using TensorFlow backend.


In [2]:
# Number of positions per sequence; used as the width of the label and score arrays.
REGION_SIZE = 400

# Both sequence-level inputs live in the same A549 directory, so the prefix is spelled once.
_A549_SEQUENCES_DIR = '/users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences'
# Gzipped text file whose first column holds the positive sequence names.
POS_LABELS = _A549_SEQUENCES_DIR + '/top_10k_sim_positives.txt.gz'
# FIMO output file handed to sequtils.load_fimo_motif_matches.
MOTIF_MATCHES = _A549_SEQUENCES_DIR + '/reduced_sim_fimo_out/fimo.txt'

# HDF5 file holding one dataset per scoring method plus 'seqids'.
SCORES = '/users/eprakash/git/interpret-benchmark/scripts/deepsea_beluga/A549/sim_pos_and_neg.h5'

In [3]:
# Parse the FIMO file at MOTIF_MATCHES into `motif_matches` using the project helper.
# NOTE(review): the meaning of the second positional argument (True) is not visible
# in this notebook - confirm against sequtils.load_fimo_motif_matches.
# NOTE(review): the identical call is issued again in a later cell right before the
# matches are filtered, so this first load looks redundant.
motif_matches=sequtils.load_fimo_motif_matches(MOTIF_MATCHES, True)

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/reduced_sim_fimo_out/fimo.txt ...
#Loaded 4623218 motif matches in 879553 sequences


In [4]:
from collections import OrderedDict
import gzip
import numpy as np
def load_labels_from_bedfile(seqfile):
    """Read the first whitespace-separated column of a gzipped text file.

    Args:
        seqfile: path to a gzip-compressed, UTF-8 encoded text file with one
            record per line; the first column is taken as the sequence name.

    Returns:
        numpy array holding the first column of every non-blank line, in
        file order.

    Blank (or whitespace-only) lines are skipped instead of raising
    IndexError, and the file handle is released even if decoding fails.
    """
    seqs = []
    # `with` guarantees the gzip handle is closed on any exit path.
    with gzip.open(seqfile, "rb") as fp:
        print("#Loading " + seqfile + " ...")
        for raw_line in fp:
            fields = raw_line.decode('utf8').split()
            if not fields:
                # A trailing or stray empty line has no name column.
                continue
            seqs.append(fields[0])
    print("#Loaded " + str(len(seqs)) + " seqnames from " + seqfile)
    return np.array(seqs)

In [5]:
# Open the scores HDF5 file read-only. The handle is kept open on purpose:
# the following cell reads the datasets from it and then calls h5f.close().
h5f = h5py.File(SCORES,'r')
# Names of the positive sequences (first column of POS_LABELS) as a numpy array.
pos_labels = load_labels_from_bedfile(POS_LABELS)
# Show which dataset names the file actually contains.
print(h5f.keys())

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz ...
#Loaded 10000 seqnames from /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz
<KeysViewHDF5 ['deeplift-rcrs_ref:allzeros', 'deeplift-rcrs_ref:avgc', 'deeplift-rcrs_ref:shuff-20', 'deeplift-rs_ref:allzeros', 'deeplift-rs_ref:avgc', 'deeplift-rs_ref:shuff-20', 'gradtimesinp_ref:allzeros', 'ig-20_ref:shuff-20', 'ism', 'seqids']>


In [6]:
# Pull one array per scoring method out of the open HDF5 handle `h5f`.
# NOTE(review): the key listing printed by the previous cell contains only
# deeplift-rcrs/rs with refs allzeros, avgc and shuff-20, gradtimesinp_ref:allzeros,
# ig-20_ref:shuff-20, ism and seqids. Every other name requested below is absent.
# h5py's Group.get returns its default (None) for a missing name, so np.array(...)
# of those yields a 0-d object array rather than scores - confirm and avoid using them.
deeplift_rcrs_allzeros_scores=np.array(h5f.get("deeplift-rcrs_ref:allzeros"))
deeplift_rcrs_avgc_scores=np.array(h5f.get("deeplift-rcrs_ref:avgc"))
deeplift_rcrs_shuff10_scores=np.array(h5f.get("deeplift-rcrs_ref:shuff-10"))  # not in the printed key listing
deeplift_rcrs_shuff20_scores=np.array(h5f.get("deeplift-rcrs_ref:shuff-20"))

deeplift_rs_allzeros_scores=np.array(h5f.get("deeplift-rs_ref:allzeros"))
deeplift_rs_avgc_scores=np.array(h5f.get("deeplift-rs_ref:avgc"))
deeplift_rs_shuff10_scores=np.array(h5f.get("deeplift-rs_ref:shuff-10"))  # not in the printed key listing
deeplift_rs_shuff20_scores=np.array(h5f.get("deeplift-rs_ref:shuff-20"))

gradcam_scores = np.array(h5f.get("gradcam"))  # not in the printed key listing
gradtimesact_direct_scores = np.array(h5f.get("gradtimesact-direct"))  # not in the printed key listing


grad_times_input_allzeros_scores = np.array(h5f.get("gradtimesinp_ref:allzeros"))
grad_times_input_avgc_scores = np.array(h5f.get("gradtimesinp_ref:avgc"))  # not in the printed key listing
grad_times_input_shuff10_scores = np.array(h5f.get("gradtimesinp_ref:shuff-10"))  # not in the printed key listing
grad_times_input_shuff20_scores = np.array(h5f.get("gradtimesinp_ref:shuff-20"))  # not in the printed key listing


ism_scores = np.array(h5f.get("ism"))

ig10_shuff10_scores = np.array(h5f.get("ig-10_ref:shuff-10"))  # not in the printed key listing
ig10_shuff20_scores =np.array(h5f.get("ig-10_ref:shuff-20"))  # not in the printed key listing
ig20_shuff10_scores =np.array(h5f.get("ig-20_ref:shuff-10"))  # not in the printed key listing
ig20_shuff20_scores =np.array(h5f.get("ig-20_ref:shuff-20"))

# Sequence identifiers are stored as bytes; decode them to str so they can be
# compared with the names used elsewhere in the notebook.
seqids = [x.decode('utf-8') for x in np.array(h5f.get("seqids"))]
# Everything needed has been copied into memory, so release the file handle.
h5f.close()

In [None]:
from scipy.interpolate import interp1d

def interpolate_scores(method_scores, region_size=None, leftover_sequence_length=10):
    """Linearly resample each row of coarse scores onto per-position scores.

    For a row with n values, value k (1-based) is placed at x = k * L / (n + 1),
    where L = region_size - leftover_sequence_length. The row is then evaluated
    by linear interpolation (with linear extrapolation beyond the first and
    last knot) at the position centres 0.5, 1.5, ..., L - 0.5. The final
    `leftover_sequence_length` positions of every output row are zero.

    Args:
        method_scores: 2-D numpy array, one row of scores per sequence.
        region_size: width of each output row. Defaults to the notebook-level
            REGION_SIZE, which was the only supported value before.
        leftover_sequence_length: number of trailing positions left at zero.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        Array of shape (method_scores.shape[0], region_size).
    """
    if region_size is None:
        # Resolved at call time so existing single-argument calls behave as before.
        region_size = REGION_SIZE
    effective_input_length = region_size - leftover_sequence_length
    ret = np.zeros((method_scores.shape[0], region_size))
    print(ret.shape)
    # The evaluation grid is identical for every row, so build it once.
    query_positions = 0.5 + np.arange(effective_input_length)
    for i in range(len(method_scores)):
        scores = method_scores[i]
        multiplication_factor = float(effective_input_length)/(len(scores)+1)
        knot_positions = (np.arange(len(scores))+1)*multiplication_factor
        interpolator = interp1d(x=knot_positions, y=scores, kind="linear",
                                fill_value="extrapolate", bounds_error=False)
        # Positions beyond effective_input_length keep the zeros from np.zeros,
        # which matches right-padding the interpolated row with zeros.
        ret[i, :effective_input_length] = interpolator(query_positions)
    return ret

In [None]:
# Resample every loaded score array to REGION_SIZE positions per sequence.
# NOTE(review): each line overwrites its own input, so this cell is not idempotent -
# running it twice interpolates already-interpolated scores.
# NOTE(review): several of these variables come from HDF5 names that are not in the
# key listing printed earlier (shuff10 variants, gradcam, gradtimesact-direct,
# gradtimesinp avgc/shuff20); interpolate_scores reads `.shape[0]`, which presumably
# fails for those - confirm before running this cell.
deeplift_rcrs_allzeros_scores = interpolate_scores(deeplift_rcrs_allzeros_scores)
deeplift_rcrs_avgc_scores = interpolate_scores(deeplift_rcrs_avgc_scores)
deeplift_rcrs_shuff10_scores = interpolate_scores(deeplift_rcrs_shuff10_scores)
deeplift_rcrs_shuff20_scores = interpolate_scores(deeplift_rcrs_shuff20_scores)

deeplift_rs_allzeros_scores = interpolate_scores(deeplift_rs_allzeros_scores)
deeplift_rs_avgc_scores = interpolate_scores(deeplift_rs_avgc_scores)
deeplift_rs_shuff10_scores = interpolate_scores(deeplift_rs_shuff10_scores)
deeplift_rs_shuff20_scores = interpolate_scores(deeplift_rs_shuff20_scores)

gradcam_scores = interpolate_scores(gradcam_scores)
gradtimesact_direct_scores = interpolate_scores(gradtimesact_direct_scores)


grad_times_input_allzeros_scores = interpolate_scores(grad_times_input_allzeros_scores)
grad_times_input_avgc_scores = interpolate_scores(grad_times_input_avgc_scores)
grad_times_input_shuff10_scores = interpolate_scores(grad_times_input_shuff10_scores)
grad_times_input_shuff20_scores = interpolate_scores(grad_times_input_shuff20_scores)

In [7]:
# Reload the motif matches and keep only the sequences named in pos_labels.
# Reloading here makes the cell safe to re-run, because the filtering below
# deletes entries from the mapping in place.
motif_matches=sequtils.load_fimo_motif_matches(MOTIF_MATCHES, True)
print(len(motif_matches))
# Membership tests against a numpy array scan the whole array on every lookup;
# a set gives the same answer with a constant-time lookup per key.
pos_label_set = set(pos_labels)
# Iterate over a snapshot of the keys because entries are deleted inside the loop.
for key in list(motif_matches.keys()):
    if key not in pos_label_set:
        del motif_matches[key]
print(len(motif_matches))
seq_ids_of_interest = list(motif_matches.keys())
print(len(seq_ids_of_interest))

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/reduced_sim_fimo_out/fimo.txt ...
#Loaded 4623218 motif matches in 879553 sequences
879553
10000
10000


In [8]:
# Align the score arrays with the sequences that have motif matches.
# NOTE(review): this first set is rebound three lines below before anything
# in this cell reads it, so the assignment appears to have no effect.
seq_ids_of_interest_set = set(seq_ids_of_interest)
# Project helper returning two parallel lists: indices and labels.
# NOTE(review): presumably the indices point into `seqids` / the score rows - confirm in sequtils.
relevant_indices_list, relevant_labels_list=sequtils.get_relevant_labels_in_order_of_scores(seqids, motif_matches)
seq_ids_of_interest = relevant_labels_list
seq_ids_of_interest_set = set(relevant_labels_list)

# Each array is replaced by the helper's selection for relevant_indices_list.
# The commented-out lines belong to arrays whose HDF5 names are missing from the
# key listing printed when the file was opened.
# NOTE(review): every active line overwrites its own input, so re-running this
# cell applies the selection a second time.
deeplift_rcrs_allzeros_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_allzeros_scores, REGION_SIZE)
deeplift_rcrs_avgc_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_avgc_scores, REGION_SIZE)
#deeplift_rcrs_shuff10_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_shuff10_scores, REGION_SIZE)
deeplift_rcrs_shuff20_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_shuff20_scores, REGION_SIZE)

deeplift_rs_allzeros_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_allzeros_scores, REGION_SIZE)
deeplift_rs_avgc_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_avgc_scores, REGION_SIZE)
#deeplift_rs_shuff10_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_shuff10_scores, REGION_SIZE)
deeplift_rs_shuff20_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_shuff20_scores, REGION_SIZE)

grad_times_input_allzeros_scores = sequtils.get_relevant_scores(relevant_indices_list, grad_times_input_allzeros_scores, REGION_SIZE)
#grad_times_input_avgc_scores = sequtils.get_relevant_scores(relevant_indices_list, grad_times_input_avgc_scores, REGION_SIZE)
#grad_times_input_shuff10_scores = sequtils.get_relevant_scores(relevant_indices_list, grad_times_input_shuff10_scores, REGION_SIZE)
#grad_times_input_shuff20_scores = sequtils.get_relevant_scores(relevant_indices_list, grad_times_input_shuff20_scores, REGION_SIZE)

#gradcam_scores = sequtils.get_relevant_scores(relevant_indices_list, gradcam_scores, REGION_SIZE)
#gradtimesact_direct_scores = sequtils.get_relevant_scores(relevant_indices_list, gradtimesact_direct_scores, REGION_SIZE)

Motif matches sequences are 10000
Supplied labels are 10000


In [9]:
# Methods evaluated by the cells below, in reporting order: name -> score array.
method_to_saved_scores = OrderedDict()
method_to_saved_scores['deeplift_rcrs_shuff20'] = deeplift_rcrs_shuff20_scores
method_to_saved_scores['deeplift_rs_shuff20'] = deeplift_rs_shuff20_scores

In [None]:
#Count all motifs percentage
# For every method: what fraction of the summed absolute importance falls on
# positions covered by at least one motif match, pooled over all sequences and
# averaged per sequence. One "<method>: [pooled, per-seq mean]" line is written
# to no_neg_implant_percents.txt.
# NOTE(review): rows are looked up as scores[relevant_indices_list[seq]] although the
# arrays were already passed through sequtils.get_relevant_scores - confirm this
# does not select twice.
num_labels = len(relevant_labels_list)
print("Num labels is " + str(num_labels))
per_seq_percentages = []
# The context manager closes the results file even if a sequence raises mid-run.
with open('no_neg_implant_percents.txt', 'w+') as fp:
    for method in method_to_saved_scores:
        total_sum_importance=0
        motif_sum_importance=0
        for seq in range(num_labels):
            motif_positions = set({})
            label = relevant_labels_list[seq]
            index = relevant_indices_list[seq]
            seq_scores = method_to_saved_scores[method][index]
            total_per_seq_importance = np.sum(np.abs(seq_scores))
            total_sum_importance = total_sum_importance + total_per_seq_importance
            seqentries = motif_matches[label]
            motif_per_seq_importance = 0
            # Union of all match intervals, so overlapping motifs count each position once.
            for entry in seqentries:
                motif_positions.update(range(entry['begin'], entry['end']))
            for i in motif_positions:
                position_importance = np.abs(seq_scores[i])
                motif_per_seq_importance = motif_per_seq_importance + position_importance
                motif_sum_importance = motif_sum_importance + position_importance
            per_seq_percentages.append(float(motif_per_seq_importance)/float(total_per_seq_importance))
        percentage = float(motif_sum_importance)/float(total_sum_importance)
        # Computed once, while the list is still populated, and reused for both
        # the printout and the file. Previously the list was cleared before the
        # file write, so the file received the mean of an empty list (nan).
        mean_per_seq_percentage = np.mean(np.array(per_seq_percentages))
        print("Method is " + str(method))
        print("Total summed importance " + str(total_sum_importance))
        print("Motif region summed importance  " + str(motif_sum_importance))
        print("Total percentage " + str(percentage*100))
        # NOTE(review): printed as a fraction, unlike the line above which is scaled by 100.
        print("Mean per sequence percentage " + str(mean_per_seq_percentage))
        print("Standard error " + str(stats.sem(per_seq_percentages)))
        print("\n")
        # Every sequence must have contributed exactly one entry for this method.
        assert(len(per_seq_percentages) == num_labels)
        fp.write(method + ": [" + str(percentage) + ", " + str(mean_per_seq_percentage) + "]\n")
        # Reset only after all consumers of the list are done.
        per_seq_percentages.clear()

In [None]:
# For every method: signed importance inside motif positions minus signed
# importance outside them, with each sequence normalised by its total absolute
# importance. Sequence positions with a negative difference under
# 'ig20_shuff20' are collected in bad_ig.
# NOTE(review): rows are looked up as scores[relevant_indices_list[seq]] although the
# arrays were already passed through sequtils.get_relevant_scores - confirm this
# does not select twice.
num_labels = len(relevant_labels_list)
print("Num labels is " + str(num_labels))
per_seq_diffs = []
bad_ig = []
#fp = open('sim_pos_and_neg_diffs.txt', 'w+')
for method in method_to_saved_scores:
    total_sum_importance=0
    motif_sum_importance=0
    for seq in range(num_labels):
        motif_positions = set({})
        label = relevant_labels_list[seq]
        index = relevant_indices_list[seq]
        seq_scores = method_to_saved_scores[method][index]
        total_per_seq_abs_importance = np.sum(np.abs(seq_scores))
        total_per_seq_importance = np.sum(seq_scores)
        normalized_total = total_per_seq_importance/total_per_seq_abs_importance
        total_sum_importance = total_sum_importance + normalized_total
        seqentries = motif_matches[label]
        motif_per_seq_importance = 0
        # Union of all match intervals, so overlapping motifs count each position once.
        for entry in seqentries:
            motif_positions.update(range(entry['begin'], entry['end']))
        for i in motif_positions:
            normalized_position_score = (seq_scores[i])/total_per_seq_abs_importance
            motif_per_seq_importance = motif_per_seq_importance + normalized_position_score
            motif_sum_importance = motif_sum_importance + normalized_position_score
        # motif - (total - motif): motif share minus non-motif share.
        seq_diff = motif_per_seq_importance*2-normalized_total
        if ('ig20_shuff20' == method and (seq_diff < 0)):
            bad_ig.append(seq)
        #if(seq == 2724):
        #    print(method + " " + str(seq_diff))
        per_seq_diffs.append(seq_diff)
    diff = np.sum(per_seq_diffs)
    print("Method is " + str(method))
    print("Non-motif region summed importance " + str(total_sum_importance - motif_sum_importance))
    print("Motif region summed importance  " + str(motif_sum_importance))
    print("Total difference " + str(diff))
    print("Mean per sequence difference " + str(np.mean(np.array(per_seq_diffs))))
    print("Standard error " + str(stats.sem(per_seq_diffs)))
    print("\n")
    # Every sequence must have contributed exactly one entry for this method.
    assert(len(per_seq_diffs) == num_labels)
    #fp.write(method + ": [" + str(diff) + ", " + str(np.mean(np.array(per_seq_diffs))) + "]\n")
    per_seq_diffs.clear()
#fp.close()

In [None]:
# Show the current contents of bad_ig. After the difference cell these are
# sequence positions with a negative difference under 'ig20_shuff20'.
# NOTE(review): the AUROC/AUPRC cells rebind bad_ig to a list of per-sequence
# AUPRCs, so what is printed depends on which cell ran last.
print(bad_ig)

In [None]:
# Print the sequence labels at three hand-picked positions.
# NOTE(review): no reason for these particular positions is recorded here;
# 2724 also appears in a commented-out debug print in the difference cell.
print(relevant_labels_list[156])
print(relevant_labels_list[213])
print(relevant_labels_list[2724])

In [None]:
# For every method: how well absolute importance ranks motif positions above
# non-motif positions, measured by AUROC and AUPRC, both pooled over all
# positions of all sequences and averaged per sequence.
# NOTE(review): this cell appears three times in the notebook with identical code.
num_labels = len(relevant_labels_list)
print("Num labels is " + str(num_labels))
per_seq_aurocs = []
per_seq_auprcs = []
bad_ig = []
good_deeplift = []
#fp = open('gradcam_second_last_auroc_auprc.txt', 'w+')
for method in list(method_to_saved_scores.keys()):
    # Per-sequence arrays are collected and joined once after the loop.
    # Concatenating inside the loop re-copies everything gathered so far on
    # every iteration. The leading empty float array keeps the pooled dtype the
    # same as when the pooled arrays were grown from np.array([]).
    score_chunks = [np.array([])]
    label_chunks = [np.array([])]
    for seq in range(num_labels):
        motif_positions = set({})
        seqname = relevant_labels_list[seq]
        index = relevant_indices_list[seq]
        scores = np.abs(np.array(method_to_saved_scores[method][index]))
        # 1 where the position is covered by at least one motif match, else 0.
        labels = np.zeros(REGION_SIZE)
        seqentries = motif_matches[seqname]
        for entry in seqentries:
            motif_positions.update(range(entry['begin'], entry['end']))
        for i in motif_positions:
            labels[i] = 1
        score_chunks.append(scores)
        label_chunks.append(labels)
        per_seq_aurocs.append(roc_auc_score(y_true=labels, y_score=scores))
        per_seq_auprcs.append(average_precision_score(y_true=labels, y_score=scores))
    total_scores = np.concatenate(score_chunks)
    total_labels = np.concatenate(label_chunks)

    print("Method is " + str(method))
    print("Total auroc " + str(roc_auc_score(y_true=total_labels, y_score=total_scores)))
    print("Total auprc " + str(average_precision_score(y_true=total_labels, y_score=total_scores)))
    print("Mean per sequence auroc " + str(np.mean(np.array(per_seq_aurocs))))
    print("Mean per sequence auprc " + str(np.mean(np.array(per_seq_auprcs))))
    # NOTE(review): np.std defaults to the population std (ddof=0), while the other
    # cells use stats.sem (ddof=1 by default); left unchanged to keep reported numbers.
    print("Per sequence auroc stderr " + str(np.std(np.array(per_seq_aurocs))/np.sqrt(num_labels)))
    print("Per sequence auprc stderr " + str(np.std(np.array(per_seq_auprcs))/np.sqrt(num_labels)))
    print("\n")
    # Keep per-sequence AUPRCs of two methods for the side-by-side comparison cell.
    if ('ig20_shuff20' == method):
        bad_ig = per_seq_auprcs.copy()
    if ('deeplift_rs_shuff20' == method):
        good_deeplift = per_seq_auprcs.copy()

    # Every sequence must have contributed exactly one entry for this method.
    assert(len(per_seq_aurocs) == num_labels)

    #fp.write(method + ": [" + str(np.mean(np.array(per_seq_aurocs))) + ", " + str(np.mean(np.array(per_seq_auprcs))) + ", "
    #+ str(np.std(np.array(per_seq_aurocs))/np.sqrt(num_labels)) + ", " + str(np.std(np.array(per_seq_auprcs))/np.sqrt(num_labels)) + "]\n")

    per_seq_aurocs.clear()
    per_seq_auprcs.clear()

#fp.close()

In [None]:
# For every method: how well absolute importance ranks motif positions above
# non-motif positions, measured by AUROC and AUPRC, both pooled over all
# positions of all sequences and averaged per sequence.
# NOTE(review): this cell appears three times in the notebook with identical code.
num_labels = len(relevant_labels_list)
print("Num labels is " + str(num_labels))
per_seq_aurocs = []
per_seq_auprcs = []
bad_ig = []
good_deeplift = []
#fp = open('gradcam_second_last_auroc_auprc.txt', 'w+')
for method in list(method_to_saved_scores.keys()):
    # Per-sequence arrays are collected and joined once after the loop.
    # Concatenating inside the loop re-copies everything gathered so far on
    # every iteration. The leading empty float array keeps the pooled dtype the
    # same as when the pooled arrays were grown from np.array([]).
    score_chunks = [np.array([])]
    label_chunks = [np.array([])]
    for seq in range(num_labels):
        motif_positions = set({})
        seqname = relevant_labels_list[seq]
        index = relevant_indices_list[seq]
        scores = np.abs(np.array(method_to_saved_scores[method][index]))
        # 1 where the position is covered by at least one motif match, else 0.
        labels = np.zeros(REGION_SIZE)
        seqentries = motif_matches[seqname]
        for entry in seqentries:
            motif_positions.update(range(entry['begin'], entry['end']))
        for i in motif_positions:
            labels[i] = 1
        score_chunks.append(scores)
        label_chunks.append(labels)
        per_seq_aurocs.append(roc_auc_score(y_true=labels, y_score=scores))
        per_seq_auprcs.append(average_precision_score(y_true=labels, y_score=scores))
    total_scores = np.concatenate(score_chunks)
    total_labels = np.concatenate(label_chunks)

    print("Method is " + str(method))
    print("Total auroc " + str(roc_auc_score(y_true=total_labels, y_score=total_scores)))
    print("Total auprc " + str(average_precision_score(y_true=total_labels, y_score=total_scores)))
    print("Mean per sequence auroc " + str(np.mean(np.array(per_seq_aurocs))))
    print("Mean per sequence auprc " + str(np.mean(np.array(per_seq_auprcs))))
    # NOTE(review): np.std defaults to the population std (ddof=0), while the other
    # cells use stats.sem (ddof=1 by default); left unchanged to keep reported numbers.
    print("Per sequence auroc stderr " + str(np.std(np.array(per_seq_aurocs))/np.sqrt(num_labels)))
    print("Per sequence auprc stderr " + str(np.std(np.array(per_seq_auprcs))/np.sqrt(num_labels)))
    print("\n")
    # Keep per-sequence AUPRCs of two methods for the side-by-side comparison cell.
    if ('ig20_shuff20' == method):
        bad_ig = per_seq_auprcs.copy()
    if ('deeplift_rs_shuff20' == method):
        good_deeplift = per_seq_auprcs.copy()

    # Every sequence must have contributed exactly one entry for this method.
    assert(len(per_seq_aurocs) == num_labels)

    #fp.write(method + ": [" + str(np.mean(np.array(per_seq_aurocs))) + ", " + str(np.mean(np.array(per_seq_auprcs))) + ", "
    #+ str(np.std(np.array(per_seq_aurocs))/np.sqrt(num_labels)) + ", " + str(np.std(np.array(per_seq_auprcs))/np.sqrt(num_labels)) + "]\n")

    per_seq_aurocs.clear()
    per_seq_auprcs.clear()

#fp.close()

In [10]:
# For every method: how well absolute importance ranks motif positions above
# non-motif positions, measured by AUROC and AUPRC, both pooled over all
# positions of all sequences and averaged per sequence.
# NOTE(review): this cell appears three times in the notebook with identical code.
num_labels = len(relevant_labels_list)
print("Num labels is " + str(num_labels))
per_seq_aurocs = []
per_seq_auprcs = []
bad_ig = []
good_deeplift = []
#fp = open('gradcam_second_last_auroc_auprc.txt', 'w+')
for method in list(method_to_saved_scores.keys()):
    # Per-sequence arrays are collected and joined once after the loop.
    # Concatenating inside the loop re-copies everything gathered so far on
    # every iteration. The leading empty float array keeps the pooled dtype the
    # same as when the pooled arrays were grown from np.array([]).
    score_chunks = [np.array([])]
    label_chunks = [np.array([])]
    for seq in range(num_labels):
        motif_positions = set({})
        seqname = relevant_labels_list[seq]
        index = relevant_indices_list[seq]
        scores = np.abs(np.array(method_to_saved_scores[method][index]))
        # 1 where the position is covered by at least one motif match, else 0.
        labels = np.zeros(REGION_SIZE)
        seqentries = motif_matches[seqname]
        for entry in seqentries:
            motif_positions.update(range(entry['begin'], entry['end']))
        for i in motif_positions:
            labels[i] = 1
        score_chunks.append(scores)
        label_chunks.append(labels)
        per_seq_aurocs.append(roc_auc_score(y_true=labels, y_score=scores))
        per_seq_auprcs.append(average_precision_score(y_true=labels, y_score=scores))
    total_scores = np.concatenate(score_chunks)
    total_labels = np.concatenate(label_chunks)

    print("Method is " + str(method))
    print("Total auroc " + str(roc_auc_score(y_true=total_labels, y_score=total_scores)))
    print("Total auprc " + str(average_precision_score(y_true=total_labels, y_score=total_scores)))
    print("Mean per sequence auroc " + str(np.mean(np.array(per_seq_aurocs))))
    print("Mean per sequence auprc " + str(np.mean(np.array(per_seq_auprcs))))
    # NOTE(review): np.std defaults to the population std (ddof=0), while the other
    # cells use stats.sem (ddof=1 by default); left unchanged to keep reported numbers.
    print("Per sequence auroc stderr " + str(np.std(np.array(per_seq_aurocs))/np.sqrt(num_labels)))
    print("Per sequence auprc stderr " + str(np.std(np.array(per_seq_auprcs))/np.sqrt(num_labels)))
    print("\n")
    # Keep per-sequence AUPRCs of two methods for the side-by-side comparison cell.
    if ('ig20_shuff20' == method):
        bad_ig = per_seq_auprcs.copy()
    if ('deeplift_rs_shuff20' == method):
        good_deeplift = per_seq_auprcs.copy()

    # Every sequence must have contributed exactly one entry for this method.
    assert(len(per_seq_aurocs) == num_labels)

    #fp.write(method + ": [" + str(np.mean(np.array(per_seq_aurocs))) + ", " + str(np.mean(np.array(per_seq_auprcs))) + ", "
    #+ str(np.std(np.array(per_seq_aurocs))/np.sqrt(num_labels)) + ", " + str(np.std(np.array(per_seq_auprcs))/np.sqrt(num_labels)) + "]\n")

    per_seq_aurocs.clear()
    per_seq_auprcs.clear()

#fp.close()

Num labels is 10000
Method is deeplift_rcrs_shuff20
Total auroc 0.8644083377332255
Total auprc 0.8080419276890749
Mean per sequence auroc 0.8392018955117514
Mean per sequence auprc 0.8062041347218181
Per sequence auroc stderr 0.0009726599136625381
Per sequence auprc stderr 0.0012614499611400605


Method is deeplift_rs_shuff20
Total auroc 0.8620843733991302
Total auprc 0.8065082912273422
Mean per sequence auroc 0.8374015736449127
Mean per sequence auprc 0.8043050340589459
Per sequence auroc stderr 0.0009818282226773496
Per sequence auprc stderr 0.0012902395815216237




In [None]:
# List every sequence position where the value saved in good_deeplift exceeds
# the one saved in bad_ig by more than 0.2.
for i, ig_auprc in enumerate(bad_ig):
    dl_auprc = good_deeplift[i]
    if dl_auprc - ig_auprc > 0.2:
        print(dl_auprc, ig_auprc, i)

In [None]:
# Sanity check: number of entries currently held in bad_ig.
len(bad_ig)

In [None]:
# Sanity check: number of sequences being evaluated.
len(relevant_labels_list)

In [None]:
# Look up the sequence label at position 910 of the relevant-label list.
relevant_labels_list[910]

In [None]:
# Same position in the seqids list read from the HDF5 file, for comparison with
# relevant_labels_list[910].
seqids[910]

In [None]:
# Print every position in seqids that holds this particular region name.
for i, seqid in enumerate(seqids):
    if seqid == 'chr14:64937697-64938097':
        print(i)

In [None]:
# Sequence identifier stored at position 4 of seqids.
seqids[4]

In [None]:
# Per-sequence AUPRC saved for 'deeplift_rs_shuff20' at position 4.
good_deeplift[4]

In [None]:
# Entry at position 4 of bad_ig.
# NOTE(review): the AUROC/AUPRC cell only fills bad_ig when 'ig20_shuff20' is a key of
# method_to_saved_scores; if only the two deeplift methods are registered, bad_ig
# stays [] and this lookup raises IndexError.
bad_ig[4]