In [1]:
from __future__ import print_function
import h5py
import numpy as np
import deeplift
import evautils
from scipy import stats
from evautils import sequtils
from evautils import kerasutils
from evautils import dirutils
from evautils import windowscoringutils
from evautils import impscoringutils
from collections import OrderedDict, defaultdict
import os
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    roc_curve, precision_recall_curve)

Using TensorFlow backend.


In [2]:
# Number of positions in each per-sequence label vector built further down.
REGION_SIZE = 400

# Root of the interpret-benchmark checkout. The default is the original
# absolute location; set INTERPRET_BENCHMARK_ROOT to run from another checkout.
BENCHMARK_ROOT = os.environ.get(
    'INTERPRET_BENCHMARK_ROOT',
    '/users/eprakash/git/interpret-benchmark')

# Directory shared by the positive-label list and the FIMO output.
A549_SEQUENCES_DIR = BENCHMARK_ROOT + '/data/dnase_positives/common_scripts/A549/sequences'

# Plain string concatenation (not os.path.join) keeps the default values
# byte-identical to the paths that were previously hard-coded here.
POS_LABELS = A549_SEQUENCES_DIR + '/top_10k_sim_positives.txt.gz'
SCORES_FILE = BENCHMARK_ROOT + '/scripts/deeplift_refs/deepsea_beluga/A549/sim_pos_and_neg.h5'
MOTIF_MATCHES = A549_SEQUENCES_DIR + '/reduced_sim_fimo_out/fimo.txt'

In [3]:
# Load the FIMO motif matches from MOTIF_MATCHES.
# NOTE(review): the meaning of the second positional argument (True) is defined
# in evautils.sequtils and is not visible here -- confirm before changing it.
# The same load is repeated later, right before the matches are filtered.
motif_matches=sequtils.load_fimo_motif_matches(MOTIF_MATCHES, True)

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/reduced_sim_fimo_out/fimo.txt ...
#Loaded 4623218 motif matches in 879553 sequences


In [4]:
from collections import OrderedDict
import gzip
import numpy as np
def load_labels_from_bedfile(seqfile):
    """Read sequence names from a gzip-compressed, whitespace-delimited text file.

    The first whitespace-separated token of every non-blank line is taken as
    the sequence name. Progress messages are printed before and after reading.

    Parameters
    ----------
    seqfile : str
        Path to the gzip-compressed file.

    Returns
    -------
    numpy.ndarray
        One name per non-blank line, in file order.
    """
    seqs = []
    # The context manager closes the handle even if decoding fails part-way.
    with gzip.open(seqfile, "rb") as fp:
        print("#Loading " + seqfile + " ...")
        for raw_line in fp:
            fields = raw_line.decode('utf8').split()
            # Blank or whitespace-only lines (e.g. a trailing newline) have no
            # fields; skip them instead of failing on fields[0].
            if fields:
                seqs.append(fields[0])
    print("#Loaded " + str(len(seqs)) + " seqnames from " + seqfile)
    return np.array(seqs)

In [5]:
# Open the score file read-only. The handle is intentionally left open here:
# the next cell reads from `h5f` and then closes it.
h5f = h5py.File(SCORES_FILE,'r')
# Names of the positive sequences (first token of each line in POS_LABELS).
pos_labels = load_labels_from_bedfile(POS_LABELS)
# Show the top-level keys available in the score file.
print(h5f.keys())

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz ...
#Loaded 10000 seqnames from /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz
<KeysViewHDF5 ['deeplift-rcrs_ref:shuff-30', 'deeplift-rcrs_ref:shuff-40', 'deeplift-rcrs_ref:shuff-50', 'deeplift-rs_ref:shuff-30', 'deeplift-rs_ref:shuff-40', 'deeplift-rs_ref:shuff-50', 'seqids']>


In [6]:
# Read every score dataset fully into memory, then release the HDF5 handle.
#
# h5f[name] is used instead of h5f.get(name): a missing dataset then raises
# KeyError immediately rather than silently producing an array wrapping None.
# The try/finally guarantees the handle is closed even if a read fails.
#
# NOTE: this cell consumes the handle opened in the previous cell, so that cell
# must be re-run before this one can be executed again.
try:
    deeplift_rcrs_shuff30_scores = h5f["deeplift-rcrs_ref:shuff-30"][...]
    deeplift_rcrs_shuff40_scores = h5f["deeplift-rcrs_ref:shuff-40"][...]
    deeplift_rcrs_shuff50_scores = h5f["deeplift-rcrs_ref:shuff-50"][...]

    deeplift_rs_shuff30_scores = h5f["deeplift-rs_ref:shuff-30"][...]
    deeplift_rs_shuff40_scores = h5f["deeplift-rs_ref:shuff-40"][...]
    deeplift_rs_shuff50_scores = h5f["deeplift-rs_ref:shuff-50"][...]

    # Sequence ids are stored as bytes; decode them to str so they can be
    # compared with the names used elsewhere in the notebook.
    seqids = [x.decode('utf-8') for x in h5f["seqids"][...]]
finally:
    h5f.close()

In [7]:
# Reload the full match table so this cell always starts from unfiltered data;
# the in-place deletion below would otherwise make re-runs depend on prior state.
motif_matches = sequtils.load_fimo_motif_matches(MOTIF_MATCHES, True)
print(len(motif_matches))

# `key in <ndarray>` scans the whole array on every test. With hundreds of
# thousands of keys that is needlessly slow, so look names up in a set instead.
positive_names = {str(name) for name in pos_labels}

# Collect the keys first: a dict must not change size while being iterated.
non_positive_keys = [key for key in motif_matches if key not in positive_names]
for key in non_positive_keys:
    del motif_matches[key]
print(len(motif_matches))

seq_ids_of_interest = list(motif_matches.keys())
print(len(seq_ids_of_interest))

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/reduced_sim_fimo_out/fimo.txt ...
#Loaded 4623218 motif matches in 879553 sequences
879553
10000
10000


In [8]:
# Match the motif-match sequences against the ids stored with the scores.
# NOTE(review): presumably relevant_indices_list holds positions into `seqids`
# and relevant_labels_list the corresponding names -- confirm in evautils.sequtils.
relevant_indices_list, relevant_labels_list = sequtils.get_relevant_labels_in_order_of_scores(seqids, motif_matches)
seq_ids_of_interest = relevant_labels_list
# Built once from the final list; an earlier assignment from the unordered ids
# was overwritten here before ever being read, so it has been dropped.
seq_ids_of_interest_set = set(relevant_labels_list)

# NOTE: each array is replaced under its own name, so re-running this cell
# without re-running the loading cell applies get_relevant_scores to arrays
# that have already been processed.
deeplift_rcrs_shuff30_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_shuff30_scores, REGION_SIZE)
deeplift_rcrs_shuff40_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_shuff40_scores, REGION_SIZE)
deeplift_rcrs_shuff50_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rcrs_shuff50_scores, REGION_SIZE)

deeplift_rs_shuff30_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_shuff30_scores, REGION_SIZE)
deeplift_rs_shuff40_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_shuff40_scores, REGION_SIZE)
deeplift_rs_shuff50_scores = sequtils.get_relevant_scores(relevant_indices_list, deeplift_rs_shuff50_scores, REGION_SIZE)

Motif matches sequences are 10000
Supplied labels are 10000


In [9]:
# Score arrays keyed by method name. Insertion order is the order in which the
# methods are iterated (and reported) later on.
method_to_saved_scores = OrderedDict()
method_to_saved_scores['deeplift_rcrs_shuff30'] = deeplift_rcrs_shuff30_scores
method_to_saved_scores['deeplift_rcrs_shuff40'] = deeplift_rcrs_shuff40_scores
method_to_saved_scores['deeplift_rcrs_shuff50'] = deeplift_rcrs_shuff50_scores
method_to_saved_scores['deeplift_rs_shuff30'] = deeplift_rs_shuff30_scores
method_to_saved_scores['deeplift_rs_shuff40'] = deeplift_rs_shuff40_scores
method_to_saved_scores['deeplift_rs_shuff50'] = deeplift_rs_shuff50_scores

# The helper's return value is discarded; presumably it populates
# method_to_seq_id_to_scores in place -- confirm in evautils.windowscoringutils.
method_to_seq_id_to_scores = {}
windowscoringutils.collectSeqIdToScoresForAllMethods(
    method_to_saved_scores,
    seq_ids_of_interest,
    relevant_labels_list,
    seq_ids_of_interest,
    method_to_seq_id_to_scores,
    seq_ids_of_interest,
)

In [11]:
# For every method, score each position by |attribution| and evaluate how well
# that ranks motif positions (label 1) above the rest (label 0): once pooled
# over all sequences and once per sequence (mean and standard error).
num_labels = len(relevant_labels_list)
print("Num labels is " + str(num_labels))

results_path = 'full_vs_filtered/sim_pos_and_neg_position_abs_auroc_auprc.txt'
# Create the output directory up front so a fresh checkout does not fail here.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# The context manager closes the results file even if a metric call raises.
with open(results_path, 'w+') as fp:
    for method in method_to_saved_scores:
        # Fresh accumulators per method; nothing carries over between methods.
        per_seq_aurocs = []
        per_seq_auprcs = []
        score_chunks = []
        label_chunks = []
        for seq in range(num_labels):
            seqname = relevant_labels_list[seq]
            # NOTE(review): `index` comes from get_relevant_labels_in_order_of_scores,
            # while the arrays in method_to_saved_scores are the outputs of
            # get_relevant_scores. Confirm those outputs are still addressable
            # by this index rather than by `seq`.
            index = relevant_indices_list[seq]
            scores = np.abs(np.array(method_to_saved_scores[method][index]))

            # Mark every position covered by at least one motif match.
            labels = np.zeros(REGION_SIZE)
            for entry in motif_matches[seqname]:
                for pos in range(entry['begin'], entry['end']):
                    labels[pos] = 1

            score_chunks.append(scores)
            label_chunks.append(labels)
            per_seq_aurocs.append(roc_auc_score(y_true=labels, y_score=scores))
            per_seq_auprcs.append(average_precision_score(y_true=labels, y_score=scores))

        # Concatenate once per method; growing an array inside the loop copies
        # everything accumulated so far on every iteration.
        total_scores = np.concatenate(score_chunks)
        total_labels = np.concatenate(label_chunks)
        assert len(per_seq_aurocs) == num_labels

        total_auroc = roc_auc_score(y_true=total_labels, y_score=total_scores)
        total_auprc = average_precision_score(y_true=total_labels, y_score=total_scores)
        mean_auroc = np.mean(np.array(per_seq_aurocs))
        mean_auprc = np.mean(np.array(per_seq_auprcs))
        # Standard error as population std (ddof=0) over sqrt(n), as before.
        stderr_auroc = np.std(np.array(per_seq_aurocs)) / np.sqrt(num_labels)
        stderr_auprc = np.std(np.array(per_seq_auprcs)) / np.sqrt(num_labels)

        print("Method is " + str(method))
        print("Total auroc " + str(total_auroc))
        print("Total auprc " + str(total_auprc))
        print("Mean per sequence auroc " + str(mean_auroc))
        print("Mean per sequence auprc " + str(mean_auprc))
        print("Per sequence auroc stderr " + str(stderr_auroc))
        print("Per sequence auprc stderr " + str(stderr_auprc))
        print("\n")

        fp.write(method + ": [" + str(mean_auroc) + ", " + str(mean_auprc) + ", "
                 + str(stderr_auroc) + ", " + str(stderr_auprc) + "]\n")

Num labels is 10000
Method is deeplift_rcrs_shuff30
Total auroc 0.8597292730073394
Total auprc 0.8015140861266064
Mean per sequence auroc 0.8351737821817252
Mean per sequence auprc 0.8047417892039619
Per sequence auroc stderr 0.000999351067818638
Per sequence auprc stderr 0.001225645137379767


Method is deeplift_rcrs_shuff40
Total auroc 0.8603983615756234
Total auprc 0.8027032714931716
Mean per sequence auroc 0.8358655686043142
Mean per sequence auprc 0.8056655143029744
Per sequence auroc stderr 0.000997892916112205
Per sequence auprc stderr 0.0012261296244201855


Method is deeplift_rcrs_shuff50
Total auroc 0.860819277059532
Total auprc 0.8034287023497878
Mean per sequence auroc 0.8362745789647643
Mean per sequence auprc 0.8060675337102734
Per sequence auroc stderr 0.0009963504205190146
Per sequence auprc stderr 0.0012256130069386302


Method is deeplift_rs_shuff30
Total auroc 0.85831637585939
Total auprc 0.8005113305786993
Mean per sequence auroc 0.8339998895313351
Mean per sequence