In [1]:
from __future__ import print_function
import h5py
import numpy as np
import deeplift
import evautils
from scipy import stats
from evautils import sequtils
from evautils import kerasutils
from evautils import dirutils
from evautils import windowscoringutils
from evautils import impscoringutils
from collections import OrderedDict, defaultdict
import os
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    roc_curve, precision_recall_curve)

Using TensorFlow backend.


In [2]:
# Length of each scored input region; used below as the default sequence_length
# for interpolate_scores and as the size of every per-sequence label vector.
REGION_SIZE = 400
# Gzipped text file whose first whitespace-separated field on each line is a sequence name.
# NOTE(review): these are absolute, user-specific paths; the notebook will not run elsewhere
# without editing them.
POS_LABELS = '/users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz'
# FIMO output file passed to sequtils.load_fimo_motif_matches.
MOTIF_MATCHES='/users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/reduced_sim_fimo_out/fimo.txt'
# HDF5 file holding one dataset per attribution method plus a "seqids" dataset.
SCORES = '/users/eprakash/git/interpret-benchmark/scripts/gradcam/deepsea_beluga/A549/deeplift_conv_contribs.h5'

In [3]:
# Load the FIMO motif matches; the result is later used as a mapping keyed by sequence name.
# NOTE(review): the meaning of the second positional argument (True) is not visible here —
# confirm against sequtils. The same load is repeated in a later cell right before the
# matches are filtered, so this first call appears redundant.
motif_matches=sequtils.load_fimo_motif_matches(MOTIF_MATCHES, True)

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/reduced_sim_fimo_out/fimo.txt ...
#Loaded 4623218 motif matches in 879553 sequences


In [4]:
from collections import OrderedDict
import gzip
import numpy as np
def load_labels_from_bedfile(seqfile):
    """Read the sequence names stored in a gzipped, whitespace-delimited text file.

    Only the first whitespace-separated field of every line is kept. Lines that
    contain nothing but whitespace are skipped instead of raising IndexError.

    Args:
        seqfile: path to the gzip-compressed file (UTF-8 encoded text).

    Returns:
        A NumPy array with one sequence name per non-blank line, in file order.
    """
    seqs = []
    # The context manager guarantees the handle is released even if a line
    # fails to decode part-way through the file.
    with gzip.open(seqfile, "rb") as fp:
        print("#Loading " + seqfile + " ...")
        for raw_line in fp:
            fields = raw_line.decode('utf8').split()
            if not fields:
                # Blank (or whitespace-only) line: there is no name to record.
                continue
            seqs.append(fields[0])
    print("#Loaded " + str(len(seqs)) + " seqnames from " + seqfile)
    return np.array(seqs)

In [5]:
# Open the attribution-score file read-only. The handle stays open across cells:
# the datasets are read, and the file is closed, in the next cell.
h5f = h5py.File(SCORES,'r')
# Names of the positive sequences (first field of each line of POS_LABELS).
pos_labels = load_labels_from_bedfile(POS_LABELS)
# List the available datasets so the keys requested in the next cell can be checked by eye.
print(h5f.keys())

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz ...
#Loaded 10000 seqnames from /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz
<KeysViewHDF5 ['deeplift-rcrs_ref:allzeros', 'deeplift-rcrs_ref:avgc', 'deeplift-rcrs_ref:shuff-20', 'deeplift-rs_ref:allzeros', 'deeplift-rs_ref:avgc', 'deeplift-rs_ref:shuff-20', 'gradcam', 'gradtimesact-direct', 'gradtimesinp_ref:allzeros', 'seqids']>


In [6]:
def _load_scores(h5_file, key):
    """Read dataset `key` from an open HDF5 file into a NumPy array.

    `h5_file.get(key)` yields None when the dataset does not exist, and
    np.array(None) is a 0-d object array rather than an error, so a missing
    dataset would otherwise go unnoticed until the array is used. The returned
    value is unchanged from the plain np.array(h5_file.get(key)) form; a
    warning line is printed so the absence is visible.
    """
    dataset = h5_file.get(key)
    if dataset is None:
        print("#WARNING: dataset '" + key + "' not found in " + SCORES)
    return np.array(dataset)


# Close the file even if one of the reads below raises.
try:
    deeplift_rcrs_allzeros_scores = _load_scores(h5f, "deeplift-rcrs_ref:allzeros")
    deeplift_rcrs_avgc_scores = _load_scores(h5f, "deeplift-rcrs_ref:avgc")
    deeplift_rcrs_shuff10_scores = _load_scores(h5f, "deeplift-rcrs_ref:shuff-10")
    deeplift_rcrs_shuff20_scores = _load_scores(h5f, "deeplift-rcrs_ref:shuff-20")

    deeplift_rs_allzeros_scores = _load_scores(h5f, "deeplift-rs_ref:allzeros")
    deeplift_rs_avgc_scores = _load_scores(h5f, "deeplift-rs_ref:avgc")
    deeplift_rs_shuff10_scores = _load_scores(h5f, "deeplift-rs_ref:shuff-10")
    deeplift_rs_shuff20_scores = _load_scores(h5f, "deeplift-rs_ref:shuff-20")

    gradcam_scores = _load_scores(h5f, "gradcam")
    gradtimesact_direct_scores = _load_scores(h5f, "gradtimesact-direct")

    grad_times_input_allzeros_scores = _load_scores(h5f, "gradtimesinp_ref:allzeros")
    grad_times_input_avgc_scores = _load_scores(h5f, "gradtimesinp_ref:avgc")
    grad_times_input_shuff10_scores = _load_scores(h5f, "gradtimesinp_ref:shuff-10")
    grad_times_input_shuff20_scores = _load_scores(h5f, "gradtimesinp_ref:shuff-20")

    # Sequence ids are stored as bytes; decode them to str so they can be
    # compared with the motif-match sequence names.
    seqids = [x.decode('utf-8') for x in np.array(h5f.get("seqids"))]
finally:
    h5f.close()

In [7]:
def interpolate_scores(conv_scores, sequence_length=REGION_SIZE):
    """Project per-conv-position scores back onto input positions by averaging.

    Conv position i is treated as covering input positions i .. i+kernel_len-1,
    where kernel_len is inferred as sequence_length - conv_length + 1. Each
    input position receives the mean score of every conv position covering it.

    Args:
        conv_scores: 2-D array of shape (examples, conv_layer_length).
        sequence_length: length of the input sequence to project onto.

    Returns:
        float64 array of shape (examples, sequence_length).

    Raises:
        AssertionError: if conv_scores is not 2-D.
        ValueError: if the conv axis is empty or longer than sequence_length;
            in both cases some positions would have no covering conv position
            and the averaging below would silently produce NaN/garbage.
    """
    print(conv_scores.shape)
    assert len(conv_scores.shape) == 2, "Expecting conv_scores to have dims of examples x conv_layer_length"
    conv_len = conv_scores.shape[1]
    kernel_len = (sequence_length - conv_len) + 1
    print("Inferred the kernel length to be",kernel_len,"please ensure this is right")
    if conv_len < 1 or kernel_len < 1:
        raise ValueError(
            "Cannot interpolate conv scores of length " + str(conv_len)
            + " onto a sequence of length " + str(sequence_length))
    # Sum of the scores of the conv positions whose receptive field overlaps each input position.
    summed_scores = np.zeros((len(conv_scores), sequence_length), dtype=np.float64)
    # Number of conv positions whose receptive field overlaps each input position.
    overlap_count = np.zeros((len(conv_scores), sequence_length))
    for i in range(conv_len):
        # Broadcast the single score of conv position i over its receptive field.
        summed_scores[:, i:i+kernel_len] += (conv_scores[:, i])[:, None]
        overlap_count[:, i:i+kernel_len] += 1
    return summed_scores / overlap_count
# Disabled alternative implementation (linear interpolation via scipy's interp1d), kept as a
# bare string literal so it is never executed. Because the string is the last expression of
# the cell, the notebook echoes its repr as the cell output.
# NOTE(review): consider deleting it or moving it to a markdown cell.
'''
from scipy.interpolate import interp1d

def interpolate_scores(method_scores):
    leftover_sequence_length = 10
    effective_input_length = REGION_SIZE - leftover_sequence_length
    ret = np.zeros((method_scores.shape[0], REGION_SIZE))
    print(ret.shape)
    for i in range(len(method_scores)):
        scores = method_scores[i]
        multiplication_factor = float(effective_input_length)/(len(scores)+1)
        interpolated_scores = interp1d(x=(np.arange(len(scores))+1)*multiplication_factor, y=scores, kind="linear", fill_value="extrapolate", bounds_error=False)(0.5+np.arange(effective_input_length))
        interpolated_scores = np.pad(interpolated_scores, (0,leftover_sequence_length), 'constant')
        ret[i] = interpolated_scores
    return ret
'''

'\nfrom scipy.interpolate import interp1d\n\ndef interpolate_scores(method_scores):\n    leftover_sequence_length = 10\n    effective_input_length = REGION_SIZE - leftover_sequence_length\n    ret = np.zeros((method_scores.shape[0], REGION_SIZE))\n    print(ret.shape)\n    for i in range(len(method_scores)):\n        scores = method_scores[i]\n        multiplication_factor = float(effective_input_length)/(len(scores)+1)\n        interpolated_scores = interp1d(x=(np.arange(len(scores))+1)*multiplication_factor, y=scores, kind="linear", fill_value="extrapolate", bounds_error=False)(0.5+np.arange(effective_input_length))\n        interpolated_scores = np.pad(interpolated_scores, (0,leftover_sequence_length), \'constant\')\n        ret[i] = interpolated_scores\n    return ret\n'

In [8]:
# Project every conv-layer score array used downstream onto input positions.
# The calls run in the same order as before; the names are rebound afterwards.
(
    deeplift_rcrs_shuff20_scores,
    deeplift_rs_shuff20_scores,
    gradcam_scores,
    gradtimesact_direct_scores,
    grad_times_input_allzeros_scores,
) = [
    interpolate_scores(conv_layer_scores)
    for conv_layer_scores in (
        deeplift_rcrs_shuff20_scores,
        deeplift_rs_shuff20_scores,
        gradcam_scores,
        gradtimesact_direct_scores,
        grad_times_input_allzeros_scores,
    )
]

# Element-wise product of the interpolated grad-CAM and grad*input scores.
gradcam_times_inputgradients_scores = np.multiply(
    gradcam_scores, grad_times_input_allzeros_scores
)

(10000, 393)
Inferred the kernel length to be 8 please ensure this is right
(10000, 393)
Inferred the kernel length to be 8 please ensure this is right
(10000, 393)
Inferred the kernel length to be 8 please ensure this is right
(10000, 393)
Inferred the kernel length to be 8 please ensure this is right
(10000, 393)
Inferred the kernel length to be 8 please ensure this is right


In [9]:
# Reload the full set of matches so this cell can be re-run safely: the loop
# below deletes entries from the mapping in place.
motif_matches=sequtils.load_fimo_motif_matches(MOTIF_MATCHES, True)
print(len(motif_matches))
# Membership tests against a NumPy array scan the whole array for every key;
# a set of plain Python values gives the same answer in constant time per key.
pos_label_set = set(np.asarray(pos_labels).tolist())
# Iterate over a snapshot of the keys because entries are deleted while looping.
for key in list(motif_matches.keys()):
    if key not in pos_label_set:
        del motif_matches[key]
print(len(motif_matches))
seq_ids_of_interest = list(motif_matches.keys())
print(len(seq_ids_of_interest))

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/reduced_sim_fimo_out/fimo.txt ...
#Loaded 4623218 motif matches in 879553 sequences
879553
10000
10000


In [10]:
# Match the filtered motif-match sequences against the row order of the saved scores.
# presumably returns (row indices into `seqids`, the matching sequence names) — TODO confirm
# against sequtils. The earlier set built from the pre-alignment id list was overwritten
# before ever being read, so only the post-alignment assignments are kept.
relevant_indices_list, relevant_labels_list=sequtils.get_relevant_labels_in_order_of_scores(seqids, motif_matches)
seq_ids_of_interest = relevant_labels_list
seq_ids_of_interest_set = set(relevant_labels_list)

# Restrict every score array used downstream to the relevant sequences.
# NOTE(review): the evaluation cell indexes these arrays with relevant_indices_list again;
# confirm that get_relevant_scores keeps the original row numbering, otherwise rows and
# labels would be misaligned.
deeplift_rcrs_shuff20_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_shuff20_scores, REGION_SIZE)
deeplift_rs_shuff20_scores=sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_shuff20_scores, REGION_SIZE)
grad_times_input_allzeros_scores = sequtils.get_relevant_scores(relevant_indices_list, grad_times_input_allzeros_scores, REGION_SIZE)
gradcam_scores = sequtils.get_relevant_scores(relevant_indices_list, gradcam_scores, REGION_SIZE)
gradtimesact_direct_scores = sequtils.get_relevant_scores(relevant_indices_list, gradtimesact_direct_scores, REGION_SIZE)

gradcam_times_inputgradients_scores = sequtils.get_relevant_scores(relevant_indices_list, gradcam_times_inputgradients_scores, REGION_SIZE)

Motif matches sequences are 10000
Supplied labels are 10000


In [11]:
# Ordered registry of the attribution methods to evaluate; insertion order is
# the order in which the methods are reported.
method_to_saved_scores = OrderedDict()
method_to_saved_scores['grad_times_input'] = grad_times_input_allzeros_scores
method_to_saved_scores['gradcam'] = gradcam_scores
method_to_saved_scores['deeplift_rs_shuff20'] = deeplift_rs_shuff20_scores
method_to_saved_scores['deeplift_rcrs_shuff20'] = deeplift_rcrs_shuff20_scores
method_to_saved_scores['gradcam-times-inputgradients'] = gradcam_times_inputgradients_scores
method_to_saved_scores['gradtimesact_direct'] = gradtimesact_direct_scores

In [12]:
# Evaluate how well each attribution method's absolute scores rank motif
# positions above background positions, pooled over all sequences and per sequence.
num_labels = len(relevant_labels_list)
print("Num labels is " + str(num_labels))
per_seq_aurocs = []
per_seq_auprcs = []
bad_ig = []
good_deeplift = []

# The ground-truth labels depend only on the sequence, not on the scoring
# method, so build them once instead of once per method.
labels_per_seq = []
for seqname in relevant_labels_list:
    labels = np.zeros(REGION_SIZE)
    for entry in motif_matches[seqname]:
        # Mark every position in [begin, end) of a motif match as positive.
        for pos in range(entry['begin'], entry['end']):
            labels[pos] = 1
    labels_per_seq.append(labels)
total_labels = np.concatenate(labels_per_seq) if labels_per_seq else np.array([])

# The context manager closes the results file even if a metric call raises
# (e.g. a sequence whose labels contain a single class).
with open('gradcam_first_auroc_auprc_zheng_interp.txt', 'w+') as fp:
    for method, method_scores in method_to_saved_scores.items():
        # Collect the per-sequence score vectors and concatenate once; growing
        # an array with np.concatenate inside the loop copies it every iteration.
        per_seq_scores = []
        for seq in range(num_labels):
            index = relevant_indices_list[seq]
            scores = np.abs(np.array(method_scores[index]))
            labels = labels_per_seq[seq]
            per_seq_scores.append(scores)
            per_seq_aurocs.append(roc_auc_score(y_true=labels, y_score=scores))
            per_seq_auprcs.append(average_precision_score(y_true=labels, y_score=scores))
        total_scores = np.concatenate(per_seq_scores) if per_seq_scores else np.array([])

        mean_auroc = np.mean(np.array(per_seq_aurocs))
        mean_auprc = np.mean(np.array(per_seq_auprcs))
        # Standard error of the mean, using the population standard deviation.
        auroc_stderr = np.std(np.array(per_seq_aurocs))/np.sqrt(num_labels)
        auprc_stderr = np.std(np.array(per_seq_auprcs))/np.sqrt(num_labels)

        print("Method is " + str(method))
        print("Total auroc " + str(roc_auc_score(y_true=total_labels, y_score=total_scores)))
        print("Total auprc " + str(average_precision_score(y_true=total_labels, y_score=total_scores)))
        print("Mean per sequence auroc " + str(mean_auroc))
        print("Mean per sequence auprc " + str(mean_auprc))
        print("Per sequence auroc stderr " + str(auroc_stderr))
        print("Per sequence auprc stderr " + str(auprc_stderr))
        print("\n")
        # NOTE(review): no method named 'ig20_shuff20' is registered in
        # method_to_saved_scores in the visible cells, so bad_ig stays empty.
        if ('ig20_shuff20' == method):
            bad_ig = per_seq_auprcs.copy()
        if ('deeplift_rs_shuff20' == method):
            good_deeplift = per_seq_auprcs.copy()

        # One metric per evaluated sequence; compare against the actual count
        # rather than a hard-coded dataset size.
        assert len(per_seq_aurocs) == num_labels

        fp.write(method + ": [" + str(mean_auroc) + ", " + str(mean_auprc) + ", "
                 + str(auroc_stderr) + ", " + str(auprc_stderr) + "]\n")

        # Reset the shared accumulators before the next method.
        per_seq_aurocs.clear()
        per_seq_auprcs.clear()

Num labels is 10000
Method is grad_times_input
Total auroc 0.847069612426276
Total auprc 0.783978810950118
Mean per sequence auroc 0.8314011873271621
Mean per sequence auprc 0.7996371240380944
Per sequence auroc stderr 0.00102978725587366
Per sequence auprc stderr 0.001243783061960048


Method is gradcam
Total auroc 0.4093091918819139
Total auprc 0.2978820994603735
Mean per sequence auroc 0.3671526899339433
Mean per sequence auprc 0.3127943735869096
Per sequence auroc stderr 0.0011530470161448903
Per sequence auprc stderr 0.0018880363742564798


Method is deeplift_rs_shuff20
Total auroc 0.8693087014907444
Total auprc 0.815062279684393
Mean per sequence auroc 0.8493890599525455
Mean per sequence auprc 0.8221648339017728
Per sequence auroc stderr 0.0010017508134224917
Per sequence auprc stderr 0.001255028800948049


Method is deeplift_rcrs_shuff20
Total auroc 0.8706775989924185
Total auprc 0.816101381128214
Mean per sequence auroc 0.8506011996072688
Mean per sequence auprc 0.823819485497