In [1]:
from __future__ import print_function
import h5py
import numpy as np
import deeplift
import evautils
from scipy import stats
from evautils import sequtils
from evautils import kerasutils
from evautils import dirutils
from evautils import windowscoringutils
from evautils import impscoringutils
from collections import OrderedDict, defaultdict
import os
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    roc_curve, precision_recall_curve)

Using TensorFlow backend.


In [2]:
# Number of positions per scored sequence; used as the length of the per-sequence label vector
# and as the expected first dimension of each importance-score matrix.
REGION_SIZE = 400

# Single place to repoint the notebook at another checkout or dataset directory.
SEQUENCES_DIR = '/users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences'

# Gzipped text file; the first whitespace-separated token of each line is read as a sequence name.
POS_LABELS = SEQUENCES_DIR + '/top_10k_sim_positives.txt.gz'
# Tab-separated text file; the third column holds ';'-separated rows of ','-separated floats.
SCORES_FILE = SEQUENCES_DIR + '/top_10k_sim_positives.gkmexplain.txt'
# FIMO output file passed to sequtils.load_fimo_motif_matches.
MOTIF_MATCHES = SEQUENCES_DIR + '/reduced_sim_fimo_out/fimo.txt'

In [3]:
# Parse the FIMO results at MOTIF_MATCHES; the result is later indexed by sequence name.
# NOTE(review): the meaning of the second positional argument (True) is defined inside
# sequtils.load_fimo_motif_matches — confirm against that helper.
# NOTE(review): a later cell repeats this exact load before filtering, so this call may be
# redundant — confirm before removing.
motif_matches=sequtils.load_fimo_motif_matches(MOTIF_MATCHES, True)

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/reduced_sim_fimo_out/fimo.txt ...
#Loaded 4623218 motif matches in 879553 sequences


In [4]:
from collections import OrderedDict
import gzip
import numpy as np
def load_labels_from_bedfile(seqfile):
    """Read sequence names from a gzip-compressed, whitespace-delimited text file.

    Each non-blank line is decoded as UTF-8 and split on whitespace; the first
    token is taken as the sequence name. Blank (or whitespace-only) lines are
    skipped.

    Args:
        seqfile: Path to the gzip-compressed file.

    Returns:
        A list of sequence names (str), in file order.
    """
    seqs = []
    # The context manager guarantees the handle is closed even if decoding or parsing raises.
    with gzip.open(seqfile, "rb") as fp:
        print("#Loading " + seqfile + " ...")
        for raw_line in fp:
            fields = raw_line.decode('utf8').split()
            if not fields:
                # A blank line carries no name; skipping it avoids an IndexError on fields[0].
                continue
            seqs.append(fields[0])
    print("#Loaded " + str(len(seqs)) + " seqnames from " + seqfile)
    return seqs

In [5]:
# Parse SCORES_FILE into one 2-D float array per line: the third tab-separated column is split
# on ';' into rows, and each row on ',' into floats. The file is opened in a `with` block so the
# handle is closed once parsing finishes (or fails).
with open(SCORES_FILE) as scores_fp:
    impscores = [
        np.array([[float(value) for value in row.split(",")]
                  for row in line.rstrip().split("\t")[2].split(";")])
        for line in scores_fp
    ]
# Sequence names from POS_LABELS; later cells rely on these being in the same order as impscores.
pos_labels = load_labels_from_bedfile(POS_LABELS)

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz ...
#Loaded 10000 seqnames from /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/top_10k_sim_positives.txt.gz


In [6]:
# Keep only the sequences whose importance-score matrix has the expected (REGION_SIZE, 4) shape,
# and keep pos_labels aligned with the rows that survive.
#
# Labels are rebuilt into a new list rather than popped in place: popping by index while walking
# the original indices shifts every later element, so a second malformed entry would remove the
# wrong label. The score array is sized from the kept entries so no all-zero padding rows remain.
if len(impscores) != len(pos_labels):
    raise ValueError(
        "impscores (" + str(len(impscores)) + ") and pos_labels (" + str(len(pos_labels))
        + ") must have the same length to stay aligned")

expected_shape = (REGION_SIZE, 4)
kept_scores = []
kept_labels = []
for i, (seq_scores, seq_label) in enumerate(zip(impscores, pos_labels)):
    if seq_scores.shape != expected_shape:
        # Report the original index of every dropped sequence.
        print(i)
    else:
        kept_scores.append(seq_scores)
        kept_labels.append(seq_label)

# reshape(-1, ...) keeps the result 3-D even when nothing survives the filter.
gkmexplain_scores = np.array(kept_scores, dtype=float).reshape((-1,) + expected_shape)
pos_labels = kept_labels

In [7]:
# Turn the label list into an ndarray, then report the shapes of the scores and the labels
# (scores first, labels second).
pos_labels = np.array(pos_labels)
for shape_checked in (gkmexplain_scores, pos_labels):
    print(shape_checked.shape)

(10000, 400, 4)
(10000,)


In [8]:
# Reload the FIMO matches and drop every sequence that is not among the positive labels.
motif_matches = sequtils.load_fimo_motif_matches(MOTIF_MATCHES, True)
print(len(motif_matches))

# Membership against a set is a hash lookup; `key in <ndarray>` would rescan the whole label
# array for every one of the loaded keys.
pos_label_set = set(pos_labels)
# Snapshot the keys first so the mapping is not resized while it is being iterated.
for key in list(motif_matches.keys()):
    if key not in pos_label_set:
        del motif_matches[key]
print(len(motif_matches))

seq_ids_of_interest = list(motif_matches.keys())
print(len(seq_ids_of_interest))

#Loading /users/eprakash/git/interpret-benchmark/data/dnase_positives/common_scripts/A549/sequences/reduced_sim_fimo_out/fimo.txt ...
#Loaded 4623218 motif matches in 879553 sequences
879553
10000
10000


In [9]:
# Collapse the third axis by summation, leaving one value per (sequence, position).
# Note: this rebinds gkmexplain_scores, so the cell cannot be re-run on its own output
# (a 2-D array has no axis 2).
gkmexplain_scores = gkmexplain_scores.sum(axis=2)
print(gkmexplain_scores.shape)

(10000, 400)


In [10]:
# Ask sequtils for the indices and labels to evaluate, given the positive labels and the
# filtered motif matches.
# NOTE(review): the ordering and exact contents of the two returned lists are defined by
# sequtils.get_relevant_labels_in_order_of_scores — confirm against that helper.
relevant_indices_list, relevant_labels_list = sequtils.get_relevant_labels_in_order_of_scores(
    pos_labels, motif_matches)

# The ids of interest are exactly the labels the helper returned. The earlier set built from the
# previous seq_ids_of_interest was overwritten before any use, so it is no longer computed.
seq_ids_of_interest = relevant_labels_list
seq_ids_of_interest_set = set(relevant_labels_list)

# NOTE(review): whether get_relevant_scores reorders/subsets rows or only trims each row to
# REGION_SIZE is not visible here — confirm, since a later cell indexes the result by the
# entries of relevant_indices_list.
gkmexplain_scores = sequtils.get_relevant_scores(relevant_indices_list, gkmexplain_scores, REGION_SIZE)

Motif matches sequences are 10000
Supplied labels are 10000


In [11]:
# Registry of attribution methods to evaluate, keyed by method name; insertion order is the
# order in which the evaluation loop visits them.
method_to_saved_scores = OrderedDict()
method_to_saved_scores['gkmexplain'] = gkmexplain_scores

In [14]:
# For every method, score each position by |importance| and compare against a binary mask of
# positions covered by a motif match. Reports pooled and per-sequence AUROC/AUPRC, and writes
# the per-sequence means and standard errors to a text file.
num_labels = len(relevant_labels_list)
print("Num labels is " + str(num_labels))

# `with` closes the output file even if a metric computation raises part-way through.
with open('full_vs_filtered/sim_pos_and_neg_position_abs_auroc_auprc.txt', 'w+') as fp:
    for method in method_to_saved_scores:
        # Fresh accumulators per method, so nothing leaks from one method to the next.
        per_seq_aurocs = []
        per_seq_auprcs = []
        score_chunks = []
        label_chunks = []
        for seq in range(num_labels):
            seqname = relevant_labels_list[seq]
            # NOTE(review): rows are looked up by relevant_indices_list[seq], not by seq. If
            # sequtils.get_relevant_scores already reordered/subset the rows, this would index
            # twice — confirm against that helper.
            index = relevant_indices_list[seq]
            scores = np.abs(np.array(method_to_saved_scores[method][index]))

            # Binary mask: 1 at every position in [begin, end) of any motif match.
            labels = np.zeros(REGION_SIZE)
            motif_positions = set()
            for entry in motif_matches[seqname]:
                motif_positions.update(range(entry['begin'], entry['end']))
            for i in motif_positions:
                labels[i] = 1

            # Collect chunks and concatenate once after the loop; concatenating inside the
            # loop re-copies the whole growing array on every iteration.
            score_chunks.append(scores)
            label_chunks.append(labels)
            per_seq_aurocs.append(roc_auc_score(y_true=labels, y_score=scores))
            per_seq_auprcs.append(average_precision_score(y_true=labels, y_score=scores))

        total_scores = np.concatenate(score_chunks) if score_chunks else np.array([])
        total_labels = np.concatenate(label_chunks) if label_chunks else np.array([])

        mean_auroc = np.mean(np.array(per_seq_aurocs))
        mean_auprc = np.mean(np.array(per_seq_auprcs))
        # Standard error of the mean across sequences.
        stderr_auroc = np.std(np.array(per_seq_aurocs)) / np.sqrt(num_labels)
        stderr_auprc = np.std(np.array(per_seq_auprcs)) / np.sqrt(num_labels)

        print("Method is " + str(method))
        print("Total auroc " + str(roc_auc_score(y_true=total_labels, y_score=total_scores)))
        print("Total auprc " + str(average_precision_score(y_true=total_labels, y_score=total_scores)))
        print("Mean per sequence auroc " + str(mean_auroc))
        print("Mean per sequence auprc " + str(mean_auprc))
        print("Per sequence auroc stderr " + str(stderr_auroc))
        print("Per sequence auprc stderr " + str(stderr_auprc))
        print("\n")

        # One AUROC per evaluated sequence; compared against num_labels, not a fixed count.
        assert len(per_seq_aurocs) == num_labels
        fp.write(method + ": [" + str(mean_auroc) + ", " + str(mean_auprc) + ", "
                 + str(stderr_auroc) + ", " + str(stderr_auprc) + "]\n")

Num labels is 10000
Method is gkmexplain
Total auroc 0.7361339326388987
Total auprc 0.637224551723955
Mean per sequence auroc 0.7745930202593155
Mean per sequence auprc 0.7225030281877441
Per sequence auroc stderr 0.0011875649679340089
Per sequence auprc stderr 0.0013985247440767338


