In [1]:
from __future__ import print_function
import h5py
import numpy as np
import deeplift
import evautils
from scipy import stats
from collections import OrderedDict, defaultdict
import os
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    roc_curve, precision_recall_curve)

In [2]:
# Number of positions per sequence; used below as the row width of the score
# and label arrays.
REGION_SIZE = 400
# When True, a subset of the loaded score arrays is passed through
# interpolate_scores() before evaluation.
INTERPOLATE = False
# Gzipped text file; the first whitespace-separated field of every line is read
# as a sequence name (see load_labels_from_bedfile).
POS_LABELS = '/users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz'
# FIMO motif-match table parsed by load_fimo_motif_matches.
MOTIF_MATCHES='/users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/reduced_sim_fimo_out/fimo.txt'
# HDF5 file with one dataset per attribution method plus a 'seqids' dataset.
# NOTE(review): all three paths are absolute and user-specific; consider
# deriving them from a configurable base directory.
SCORES = '/users/eprakash/git/interpret-benchmark/scripts/deepsea_beluga/A549/sim_pos_and_neg.h5'

In [3]:
def load_fimo_motif_matches(motif_match_file, doprint=False):
    """Parse a FIMO text output file into per-sequence lists of motif matches.

    The first line of the file is treated as a header and discarded. Every
    following non-empty line is split on whitespace and read positionally:
    field 0 is the motif, 1 the sequence name, 2 the start, 3 the stop,
    4 the strand and 8 the matched sequence.

    Args:
        motif_match_file: Path to the FIMO text file.
        doprint: When True, print progress messages before and after loading.

    Returns:
        An OrderedDict mapping each sequence name (in order of first
        appearance) to a list of dicts with the keys 'motif', 'sequence',
        'begin', 'end', 'strand' and 'seqval'. 'begin' is 0-based inclusive
        and 'end' is 0-based exclusive.

    Raises:
        IOError/OSError: If the file cannot be opened.
        IndexError, ValueError: If a non-empty line has fewer than nine
            fields or non-integer coordinates.
    """
    motif_matches = OrderedDict()
    numlines = 0
    # The context manager guarantees the handle is released even if a
    # malformed line raises part-way through the file.
    with open(motif_match_file, "r") as fp:
        if doprint:
            print("#Loading " + motif_match_file + " ...")
        fp.readline()  # discard the header line
        for line in fp:
            fields = line.split()
            if not fields:
                # Blank (e.g. trailing) lines carry no match; skip them
                # instead of failing on fields[0].
                continue
            numlines += 1
            sequence = fields[1]
            entry = {
                'motif': fields[0],
                'sequence': sequence,
                # FIMO coordinates are 1-indexed; convert start to 0-indexed.
                'begin': int(fields[2]) - 1,
                # FIMO stop is 1-indexed AND inclusive, which equals the
                # 0-indexed exclusive end without any adjustment.
                'end': int(fields[3]),
                'strand': fields[4],
                'seqval': fields[8],
            }
            motif_matches.setdefault(sequence, []).append(entry)
    if doprint:
        print("#Loaded " + str(numlines) + " motif matches in " + str(len(motif_matches.keys())) + " sequences")
    return motif_matches

In [4]:
from collections import OrderedDict
import gzip
import numpy as np
def load_labels_from_bedfile(seqfile):
    """Read sequence names from a gzipped, whitespace-delimited text file.

    Each line is decoded as UTF-8 and split on whitespace; the first field is
    kept as the sequence name. Blank lines are ignored.

    Args:
        seqfile: Path to the gzip-compressed file.

    Returns:
        A NumPy array holding the first field of every non-empty line, in
        file order.
    """
    seqs = []
    # The context manager closes the gzip handle even if decoding fails.
    with gzip.open(seqfile, "rb") as fp:
        print("#Loading " + seqfile + " ...")
        for raw_line in fp:
            fields = raw_line.decode('utf8').split()
            if fields:  # skip blank lines rather than failing on fields[0]
                seqs.append(fields[0])
    print("#Loaded " + str(len(seqs)) + " seqnames from " + seqfile)
    return np.array(seqs)

In [5]:
# Open the score file read-only; the handle stays open across cells and is
# closed after the datasets have been copied into memory.
h5f = h5py.File(SCORES,'r')
# Names of the positive sequences, used later to filter the motif matches.
pos_labels = load_labels_from_bedfile(POS_LABELS)
# List the datasets present in the file, to check which methods are available.
print(h5f.keys())

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz ...
#Loaded 10000 seqnames from /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz
<KeysViewHDF5 ['deeplift-rcrs_ref:allzeros', 'deeplift-rcrs_ref:avgc', 'deeplift-rcrs_ref:shuff-10', 'deeplift-rcrs_ref:shuff-20', 'deeplift-rs_ref:allzeros', 'deeplift-rs_ref:avgc', 'deeplift-rs_ref:shuff-10', 'deeplift-rs_ref:shuff-20', 'gradtimesinp_ref:allzeros', 'ig-10_ref:shuff-10', 'ig-10_ref:shuff-20', 'ig-20_ref:shuff-10', 'ig-20_ref:shuff-20', 'ism', 'seqids']>


In [6]:
def _read_scores(h5_file, key):
    """Return dataset `key` from `h5_file` as a NumPy array.

    `Group.get` returns None for a missing key, and `np.array(None)` would
    silently produce a 0-d object array. Report the missing dataset and return
    None instead, so that an unavailable method is easy to detect downstream.
    """
    dataset = h5_file.get(key)
    if dataset is None:
        print("#Warning: dataset '" + key + "' not found in " + SCORES)
        return None
    return np.array(dataset)


# DeepLIFT (rescale + revealcancel variants) with different references.
deeplift_rcrs_allzeros_scores = _read_scores(h5f, "deeplift-rcrs_ref:allzeros")
deeplift_rcrs_avgc_scores = _read_scores(h5f, "deeplift-rcrs_ref:avgc")
deeplift_rcrs_shuff10_scores = _read_scores(h5f, "deeplift-rcrs_ref:shuff-10")
deeplift_rcrs_shuff20_scores = _read_scores(h5f, "deeplift-rcrs_ref:shuff-20")

deeplift_rs_allzeros_scores = _read_scores(h5f, "deeplift-rs_ref:allzeros")
deeplift_rs_avgc_scores = _read_scores(h5f, "deeplift-rs_ref:avgc")
deeplift_rs_shuff10_scores = _read_scores(h5f, "deeplift-rs_ref:shuff-10")
deeplift_rs_shuff20_scores = _read_scores(h5f, "deeplift-rs_ref:shuff-20")

# These two keys are not in the key listing printed when the file was opened;
# the variables will be None when the datasets are absent.
gradcam_scores = _read_scores(h5f, "gradcam")
gradtimesact_direct_scores = _read_scores(h5f, "gradtimesact-direct")

# Only the all-zeros reference appears in the printed key listing for
# gradient*input; the other three will be None when absent.
grad_times_input_allzeros_scores = _read_scores(h5f, "gradtimesinp_ref:allzeros")
grad_times_input_avgc_scores = _read_scores(h5f, "gradtimesinp_ref:avgc")
grad_times_input_shuff10_scores = _read_scores(h5f, "gradtimesinp_ref:shuff-10")
grad_times_input_shuff20_scores = _read_scores(h5f, "gradtimesinp_ref:shuff-20")

ism_scores = _read_scores(h5f, "ism")

ig10_shuff10_scores = _read_scores(h5f, "ig-10_ref:shuff-10")
ig10_shuff20_scores = _read_scores(h5f, "ig-10_ref:shuff-20")
ig20_shuff10_scores = _read_scores(h5f, "ig-20_ref:shuff-10")
ig20_shuff20_scores = _read_scores(h5f, "ig-20_ref:shuff-20")

# Sequence ids are stored as bytes; decode them so they compare equal to the
# str keys used for the motif matches.
seqids = [x.decode('utf-8') for x in np.array(h5f.get("seqids"))]
h5f.close()

In [7]:
from scipy.interpolate import interp1d

def interpolate_scores(method_scores, region_size=None, leftover_sequence_length=10):
    """Linearly resample each row of scores onto a fixed-width grid.

    Every row of length n is treated as samples located at
    (1..n) * effective_length / (n + 1), where effective_length is
    region_size - leftover_sequence_length. The row is linearly interpolated
    (and linearly extrapolated at both ends) at the positions
    0.5, 1.5, ..., effective_length - 0.5. The last
    `leftover_sequence_length` columns of each output row are left at zero.

    Args:
        method_scores: Sequence of 1-D score rows (e.g. a 2-D array).
        region_size: Output width. Defaults to the module-level REGION_SIZE.
        leftover_sequence_length: Number of trailing zero-filled columns.

    Returns:
        A float array of shape (len(method_scores), region_size).
    """
    if region_size is None:
        region_size = REGION_SIZE
    effective_input_length = region_size - leftover_sequence_length
    ret = np.zeros((len(method_scores), region_size))
    print(ret.shape)
    # The query grid is identical for every row, so build it once.
    query_positions = 0.5 + np.arange(effective_input_length)
    for i, scores in enumerate(method_scores):
        multiplication_factor = float(effective_input_length) / (len(scores) + 1)
        sample_positions = (np.arange(len(scores)) + 1) * multiplication_factor
        interpolator = interp1d(
            x=sample_positions,
            y=scores,
            kind="linear",
            fill_value="extrapolate",
            bounds_error=False,
        )
        # Columns past effective_input_length keep their initial zeros, which
        # is equivalent to zero-padding the interpolated row on the right.
        ret[i, :effective_input_length] = interpolator(query_positions)
    return ret

In [8]:
def _interpolate_if_loaded(scores):
    """Interpolate `scores` unless the method's scores were never loaded.

    A dataset missing from the HDF5 file shows up here either as None or as a
    0-d array wrapping None; interpolate_scores would fail on both, so such
    values are returned unchanged.
    """
    if scores is None or np.ndim(scores) < 2:
        return scores
    return interpolate_scores(scores)


if INTERPOLATE:
    deeplift_rcrs_allzeros_scores = _interpolate_if_loaded(deeplift_rcrs_allzeros_scores)
    deeplift_rcrs_avgc_scores = _interpolate_if_loaded(deeplift_rcrs_avgc_scores)
    deeplift_rcrs_shuff10_scores = _interpolate_if_loaded(deeplift_rcrs_shuff10_scores)
    deeplift_rcrs_shuff20_scores = _interpolate_if_loaded(deeplift_rcrs_shuff20_scores)

    deeplift_rs_allzeros_scores = _interpolate_if_loaded(deeplift_rs_allzeros_scores)
    deeplift_rs_avgc_scores = _interpolate_if_loaded(deeplift_rs_avgc_scores)
    deeplift_rs_shuff10_scores = _interpolate_if_loaded(deeplift_rs_shuff10_scores)
    deeplift_rs_shuff20_scores = _interpolate_if_loaded(deeplift_rs_shuff20_scores)

    gradcam_scores = _interpolate_if_loaded(gradcam_scores)
    gradtimesact_direct_scores = _interpolate_if_loaded(gradtimesact_direct_scores)

    grad_times_input_allzeros_scores = _interpolate_if_loaded(grad_times_input_allzeros_scores)
    grad_times_input_avgc_scores = _interpolate_if_loaded(grad_times_input_avgc_scores)
    grad_times_input_shuff10_scores = _interpolate_if_loaded(grad_times_input_shuff10_scores)
    grad_times_input_shuff20_scores = _interpolate_if_loaded(grad_times_input_shuff20_scores)

In [9]:
motif_matches = load_fimo_motif_matches(MOTIF_MATCHES, True)
# Membership tests against a NumPy array scan the whole array for every key;
# a set of plain Python strings gives the same answer in constant time.
pos_label_set = set(np.asarray(pos_labels).tolist())
# Iterate over a snapshot of the keys because entries are deleted in the loop.
for key in list(motif_matches.keys()):
    if key not in pos_label_set:
        del motif_matches[key]
seq_ids_of_interest = list(motif_matches.keys())
print("#Filtered motif matches to ", len(seq_ids_of_interest), " seqs")

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/reduced_sim_fimo_out/fimo.txt ...
#Loaded 4623218 motif matches in 879553 sequences
#Filtered motif matches to  10000  seqs


In [10]:
def get_relevant_labels_in_order_of_scores(labels, motif_matches):
    """Select the labels that have motif matches, keeping the order of `labels`.

    Args:
        labels: Sequence of sequence names, in the order of the score rows.
        motif_matches: Mapping whose keys are the sequence names with matches.

    Returns:
        A pair (indices, names): the positions within `labels` of the entries
        that are keys of `motif_matches`, and those entries themselves.
        Labels that are not found are reported on stdout and skipped.
    """
    known_sequences = set(motif_matches.keys())
    print("Motif matches sequences are " + str(len(known_sequences)))
    print("Supplied labels are " + str(len(labels)))

    kept_indices = []
    kept_names = []
    for position, name in enumerate(labels):
        if name not in known_sequences:
            print("Did not find this label in motif matches: " + name)
            continue
        kept_indices.append(position)
        kept_names.append(name)
    return kept_indices, kept_names

def get_relevant_scores(relevant_indices_list, scores, seq_len=400):
    """Copy the selected rows of `scores` into a new float array.

    Args:
        relevant_indices_list: Row indices into `scores`, in output order.
        scores: Indexable collection of score rows, each of width `seq_len`.
        seq_len: Width of every output row.

    Returns:
        An array of shape (len(relevant_indices_list), seq_len) whose k-th row
        is scores[relevant_indices_list[k]].
    """
    selected = np.zeros((len(relevant_indices_list), seq_len))
    for row, source_row in enumerate(relevant_indices_list):
        selected[row] = scores[source_row]
    return selected

In [14]:
# NOTE(review): this first assignment is overwritten four lines below without
# being read in between.
seq_ids_of_interest_set = set(seq_ids_of_interest)
# Positions (rows of the HDF5 score arrays) and names of the sequences that
# have motif matches, in the order of `seqids`.
relevant_indices_list, relevant_labels_list=get_relevant_labels_in_order_of_scores(seqids, motif_matches)
seq_ids_of_interest = relevant_labels_list
seq_ids_of_interest_set = set(relevant_labels_list)

# Each score array is rebound to its filtered version: afterwards row k holds
# the scores of relevant_labels_list[k], NOT of HDF5 row k.
# NOTE(review): because the names are rebound in place, re-running this cell
# filters the already-filtered arrays again.
deeplift_rcrs_allzeros_scores = get_relevant_scores(relevant_indices_list, deeplift_rcrs_allzeros_scores, REGION_SIZE)
deeplift_rcrs_avgc_scores = get_relevant_scores(relevant_indices_list, deeplift_rcrs_avgc_scores, REGION_SIZE)
deeplift_rcrs_shuff10_scores = get_relevant_scores(relevant_indices_list, deeplift_rcrs_shuff10_scores, REGION_SIZE)
deeplift_rcrs_shuff20_scores = get_relevant_scores(relevant_indices_list, deeplift_rcrs_shuff20_scores, REGION_SIZE)

deeplift_rs_allzeros_scores = get_relevant_scores(relevant_indices_list, deeplift_rs_allzeros_scores, REGION_SIZE)
deeplift_rs_avgc_scores = get_relevant_scores(relevant_indices_list, deeplift_rs_avgc_scores, REGION_SIZE)
deeplift_rs_shuff10_scores = get_relevant_scores(relevant_indices_list, deeplift_rs_shuff10_scores, REGION_SIZE)
deeplift_rs_shuff20_scores=get_relevant_scores(relevant_indices_list, deeplift_rs_shuff20_scores, REGION_SIZE)

grad_times_input_allzeros_scores = get_relevant_scores(relevant_indices_list, grad_times_input_allzeros_scores, REGION_SIZE)
# The calls below are disabled; the corresponding datasets do not appear in the
# key listing printed when the HDF5 file was opened.
#grad_times_input_avgc_scores = get_relevant_scores(relevant_indices_list, grad_times_input_avgc_scores, REGION_SIZE)
#grad_times_input_shuff10_scores = get_relevant_scores(relevant_indices_list, grad_times_input_shuff10_scores, REGION_SIZE)
#grad_times_input_shuff20_scores = get_relevant_scores(relevant_indices_list, grad_times_input_shuff20_scores, REGION_SIZE)

#gradcam_scores = get_relevant_scores(relevant_indices_list, gradcam_scores, REGION_SIZE)
#gradtimesact_direct_scores = get_relevant_scores(relevant_indices_list, gradtimesact_direct_scores, REGION_SIZE)

Motif matches sequences are 10000
Supplied labels are 10000


In [15]:
# Methods to evaluate, mapped to their (already filtered) score arrays; the
# evaluation loop reports them in this order.
method_to_saved_scores = OrderedDict([('deeplift_rcrs_shuff20', deeplift_rcrs_shuff20_scores),
                                      ('deeplift_rs_shuff20', deeplift_rs_shuff20_scores)])

In [16]:
# Evaluate how well each method's absolute per-position scores separate motif
# positions (label 1) from non-motif positions (label 0), both pooled over all
# sequences and averaged per sequence.
num_labels = len(relevant_labels_list)
print("Num labels is " + str(num_labels))

for method, method_scores in method_to_saved_scores.items():
    # Fresh accumulators per method, so no state leaks between iterations.
    per_seq_aurocs = []
    per_seq_auprcs = []
    per_seq_labels = []
    per_seq_scores = []
    for seq in range(num_labels):
        seqname = relevant_labels_list[seq]
        # The saved score arrays were already reduced to the relevant
        # sequences, so row `seq` belongs to relevant_labels_list[seq].
        # Indexing with the original HDF5 row (relevant_indices_list[seq])
        # would pick the wrong row whenever any sequence was filtered out.
        scores = np.abs(np.array(method_scores[seq]))
        labels = np.zeros(REGION_SIZE)
        motif_positions = set()
        for entry in motif_matches[seqname]:
            motif_positions.update(range(entry['begin'], entry['end']))
        for i in motif_positions:
            labels[i] = 1
        per_seq_scores.append(scores)
        per_seq_labels.append(labels)
        per_seq_aurocs.append(roc_auc_score(y_true=labels, y_score=scores))
        per_seq_auprcs.append(average_precision_score(y_true=labels, y_score=scores))

    # Concatenate once instead of growing the arrays inside the loop, which
    # copies all previously collected data on every iteration.
    total_scores = np.concatenate(per_seq_scores)
    total_labels = np.concatenate(per_seq_labels)

    print("Method is " + str(method))
    print("Total auroc " + str(roc_auc_score(y_true=total_labels, y_score=total_scores)))
    print("Total auprc " + str(average_precision_score(y_true=total_labels, y_score=total_scores)))
    print("Mean per sequence auroc " + str(np.mean(np.array(per_seq_aurocs))))
    print("Mean per sequence auprc " + str(np.mean(np.array(per_seq_auprcs))))
    print("Per sequence auroc stderr " + str(np.std(np.array(per_seq_aurocs))/np.sqrt(num_labels)))
    print("Per sequence auprc stderr " + str(np.std(np.array(per_seq_auprcs))/np.sqrt(num_labels)))
    print("\n")

    # Every sequence must have contributed exactly one per-sequence metric.
    assert len(per_seq_aurocs) == num_labels

Num labels is 10000
Method is deeplift_rcrs_shuff20
Total auroc 0.8585416710368289
Total auprc 0.799208011837059
Mean per sequence auroc 0.8335708659621869
Mean per sequence auprc 0.802858084324112
Per sequence auroc stderr 0.0010064248991613913
Per sequence auprc stderr 0.001228215095757082


Method is deeplift_rs_shuff20
Total auroc 0.8571701835925243
Total auprc 0.7982526752247363
Mean per sequence auroc 0.8326249469858991
Mean per sequence auprc 0.8013814264287471
Per sequence auroc stderr 0.0010091804902958258
Per sequence auprc stderr 0.0012469886292110109


