In [129]:
import numpy as np
import h5py
import keras
from collections import OrderedDict

# Registry of input files, keyed first by cell type and then by file role.
# Roles, as consumed by the loader functions in this notebook:
#   ism_scores_npy_file         - array loaded with np.load (ISM scores)
#   ism_scores_seqnames_file    - text file, one sequence name per line
#   backpropscores_h5file       - HDF5 file read for 'labels' and the score datasets
#   positive_sequences_file     - gzipped, tab-separated: column 0 name, column 1 sequence
#   seqnames_used_for_ig_h5file - HDF5 file whose 'labels' dataset is read
#   motifmatches_file           - not referenced by the loaders visible here
#   variedrefs_h5file           - optional; loaders check for its presence
celltype_to_corefiles = {
    "GM12878": {
        "ism_scores_npy_file": "/users/eprakash/projects/benchmarking/notebooks/momma_dragonn/ISM_adaptDeepSea_gm12878_pos.npy",
        "ism_scores_seqnames_file": "/users/eprakash/projects/benchmarking/notebooks/momma_dragonn/ISM_scores_adaptDeepSea_GM12878_positive.labels.txt",
        "backpropscores_h5file": "/users/eprakash/projects/benchmarking/notebooks/momma_dragonn/GM12878.adaptDeepSea.posIG.h5",
        "positive_sequences_file": "/users/eprakash/projects/benchmarking/newdata/GM12878/400bp/GM12878.400bp.summits.implanted.bed.gz",
        "seqnames_used_for_ig_h5file": "/users/eprakash/projects/benchmarking/notebooks/momma_dragonn/top1kposlabels_GM12878_1kb",
        "motifmatches_file": "/users/eprakash/projects/benchmarking/newdata/GM12878/400bp/GM12878.motif.matches.txt",
    },
    "A549": {
        "ism_scores_npy_file": "/users/eprakash/projects/benchmarking/newdata/A549/models/deepseabeluga/results/A549.deepseabeluga.ISM.scores.5Ksubsample.npy",
        "ism_scores_seqnames_file": "/users/eprakash/projects/benchmarking/newdata/A549/models/deepseabeluga/results/ISM_deepseabeluga_A549_positive.labels.txt",
        "backpropscores_h5file": "/users/eprakash/projects/benchmarking/newdata/A549/models/deepseabeluga/results/A549.deepseabeluga.scores.5Ksubsample.h5",
        "positive_sequences_file": "/users/eprakash/projects/benchmarking/newdata/A549/A549.summits.400bp.implanted.5Ksubsample.bed.gz",
        "seqnames_used_for_ig_h5file": "/users/eprakash/projects/benchmarking/newdata/A549/models/deepseabeluga/results/top1kposlabels_A549_1kb",
        "motifmatches_file": "/users/eprakash/projects/benchmarking/newdata/A549/A549.motif.matches.txt",
        "variedrefs_h5file": "/users/eprakash/projects/benchmarking/newdata/A549/models/deepseabeluga/results/A549.deepseabeluga.scores.variedrefs.5Ksubsample.h5",
    },
}


In [130]:
import h5py
import numpy as np
import gzip

def onehot_encode(seqs):
    """One-hot encode an iterable of nucleotide strings.

    Each character is upper-cased and mapped to a 4-element row in
    A, C, G, T order; 'N' maps to an all-zero row. Any other character
    raises KeyError.

    Returns:
        np.ndarray built from the nested per-sequence, per-base rows.
    """
    base_to_row = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1],
        'N': [0, 0, 0, 0],
    }
    encoded = []
    for seq in seqs:
        rows = [base_to_row[base] for base in seq.upper()]
        encoded.append(rows)
    return np.array(encoded)


def get_indices_of_subset(superset_seqnames, subset_seqnames):
    """Return, for each name in `subset_seqnames`, its index in `superset_seqnames`.

    If a name occurs more than once in the superset, the index of its last
    occurrence is used. A subset name absent from the superset raises KeyError.
    """
    position_of = {}
    for position, name in enumerate(superset_seqnames):
        # Later duplicates overwrite earlier ones.
        position_of[name] = position
    return [position_of[name] for name in subset_seqnames]


def load_posseqs(corefiles, pos_idx_ordering):
    """Load the positive sequences selected by `pos_idx_ordering`.

    Args:
        corefiles: dict whose 'positive_sequences_file' entry is the path of a
            gzipped, tab-separated file; the second column of each line is
            taken as the sequence.
        pos_idx_ordering: row indices into that file, in the desired output
            order.

    Returns:
        (onehot, posseqs): the one-hot encoding produced by `onehot_encode`
        and the list of selected sequence strings.
    """
    # Use a context manager so the gzip handle is closed as soon as the file
    # has been read instead of lingering until garbage collection.
    with gzip.open(corefiles['positive_sequences_file']) as seq_file:
        all_posseqs = [line.decode("utf-8").rstrip().split("\t")[1]
                       for line in seq_file]
    posseqs = [all_posseqs[idx] for idx in pos_idx_ordering]
    return onehot_encode(posseqs), posseqs


def load_ism_scores(method_to_scores, corefiles, onehot, ism_idx_ordering):
    """Load ISM scores and store their per-position totals under 'ism'.

    Args:
        method_to_scores: dict updated in place with the key 'ism'.
        corefiles: dict whose 'ism_scores_npy_file' entry is loaded with
            np.load.
        onehot: one-hot encoding of the sequences, in the same order as
            `ism_idx_ordering` selects them.
        ism_idx_ordering: indices used to select/reorder rows of the loaded
            array.

    Raises:
        AssertionError: if summing the scores over the last axis differs from
            summing them after masking with `onehot`.
    """
    selected_scores = np.load(corefiles['ism_scores_npy_file'])[ism_idx_ordering]
    per_position_total = np.sum(selected_scores, axis=-1)
    masked_total = np.sum(selected_scores * onehot, axis=-1)
    # Sanity check: the scores must already be zero everywhere the one-hot
    # encoding is zero, so masking must not change the totals.
    largest_discrepancy = np.max(np.abs(masked_total - per_position_total))
    assert largest_discrepancy == 0.0
    method_to_scores['ism'] = per_position_total


def load_ig_scores(method_to_scores, corefiles, ig_idx_ordering):
    """Load integrated-gradients scores and store them under 'ig10_multiref10'.

    Args:
        method_to_scores: dict updated in place.
        corefiles: dict whose 'backpropscores_h5file' entry is the HDF5 file
            holding the 'integrated_gradients10_multiref_10' dataset.
        ig_idx_ordering: indices used to select/reorder rows of that dataset.
    """
    # Open read-only inside a context manager so the HDF5 handle is released
    # once the dataset has been copied into memory.
    with h5py.File(corefiles['backpropscores_h5file'], "r") as scores_file:
        # [:] materialises the whole dataset before the file is closed; the
        # reordering below then happens on the in-memory copy.
        all_scores = scores_file['integrated_gradients10_multiref_10'][:]
    method_to_scores['ig10_multiref10'] = np.array(all_scores[ig_idx_ordering])


def load_nonig_scores(method_to_scores, corefiles, nonig_idx_ordering):
    """Load the gradient*input and DeepLIFT multi-reference score datasets.

    Adds the keys 'gradtimesinp', 'deeplift-RS_multiref10' and
    'deeplift-RC_multiref10' to `method_to_scores`, in that order.

    Args:
        method_to_scores: dict updated in place.
        corefiles: dict whose 'backpropscores_h5file' entry is the HDF5 file
            holding the score datasets.
        nonig_idx_ordering: indices used to select/reorder rows of each
            dataset.
    """
    # (key in method_to_scores, dataset name in the HDF5 file)
    method_and_dataset = (
        ('gradtimesinp', 'grad_times_inp'),
        ('deeplift-RS_multiref10', 'rescale_all_layers_multiref_10'),
        ('deeplift-RC_multiref10', 'rescale_conv_revealcancel_fc_multiref_10'),
    )
    # The context manager closes the HDF5 handle once every dataset has been
    # copied into memory.
    with h5py.File(corefiles['backpropscores_h5file'], "r") as scores_file:
        for method, dataset in method_and_dataset:
            method_to_scores[method] = scores_file[dataset][:][nonig_idx_ordering]


def load_variedrefs_scores(method_to_scores, corefiles, variedrefs_idx_ordering):
    """Load the scores computed with all-zeros and average-GC references.

    Adds the keys 'ig10_zeroref', 'ig10_gcref', 'deeplift-RS_zeroref' and
    'deeplift-RS_gcref' to `method_to_scores`, in that order.

    Args:
        method_to_scores: dict updated in place.
        corefiles: dict whose 'variedrefs_h5file' entry is the HDF5 file
            holding the score datasets.
        variedrefs_idx_ordering: indices used to select/reorder rows of each
            dataset.
    """
    # (key in method_to_scores, dataset name in the HDF5 file)
    method_and_dataset = (
        ('ig10_zeroref', 'integrated_gradients10_all_zeros_ref'),
        ('ig10_gcref', 'integrated_gradients10_avg_gc_ref'),
        ('deeplift-RS_zeroref', 'rescale_all_layers_all_zeros_ref'),
        ('deeplift-RS_gcref', 'rescale_all_layers_avg_gc_ref'),
    )
    # The context manager closes the HDF5 handle once every dataset has been
    # copied into memory.
    with h5py.File(corefiles['variedrefs_h5file'], "r") as scores_file:
        for method, dataset in method_and_dataset:
            method_to_scores[method] = scores_file[dataset][:][variedrefs_idx_ordering]
    

def _decode_h5_labels(h5_file):
    """Return the 'labels' dataset of an open HDF5 file as a list of str."""
    return [x.decode("utf-8") for x in h5_file['labels'][:]]


def _read_h5_labels(h5_path):
    """Open `h5_path` read-only, return its decoded 'labels', and close it."""
    with h5py.File(h5_path, "r") as h5_file:
        return _decode_h5_labels(h5_file)


def _read_variedrefs_labels(corefiles):
    """Return the sequence names that index the varied-reference scores."""
    # Prefer the labels stored next to the varied-reference scores, since
    # those are the rows the resulting indices are applied to.
    with h5py.File(corefiles['variedrefs_h5file'], "r") as h5_file:
        if 'labels' in h5_file:
            return _decode_h5_labels(h5_file)
    # NOTE(review): fallback for a varied-refs file without a 'labels'
    # dataset; this assumes its rows follow the backprop file's order —
    # confirm against how the file was produced.
    return _read_h5_labels(corefiles['backpropscores_h5file'])


def get_scores_for_common_sequences(corefiles):
    """Load every method's scores restricted to the sequences all files share.

    Only the `corefiles` argument is consulted; no module-level cell-type
    variable is read.

    Args:
        corefiles: dict of input file paths for one cell type (for example an
            entry of `celltype_to_corefiles`). 'variedrefs_h5file' is
            optional; when present the varied-reference scores are loaded too.

    Returns:
        (method_to_scores, onehot_posseqs, posseqs): a dict of score arrays
        keyed by method name, the one-hot encoded sequences, and the raw
        sequence strings, all ordered by sorted common sequence name.

    Raises:
        AssertionError: if the number of common sequence names is smaller
            than the shortest individual name list.
        KeyError: if a common sequence name is missing from the positive
            sequences file.
    """
    has_variedrefs = 'variedrefs_h5file' in corefiles

    #######
    #Load all the seqnames
    with gzip.open(corefiles['positive_sequences_file']) as positives_file:
        positives_seqnames = [line.decode("utf-8").rstrip().split("\t")[0]
                              for line in positives_file]
    ig_seqnames = _read_h5_labels(corefiles['seqnames_used_for_ig_h5file'])
    nonig_backprop_seqnames = _read_h5_labels(corefiles['backpropscores_h5file'])
    with open(corefiles['ism_scores_seqnames_file']) as ism_names_file:
        ism_seqnames = [line.rstrip() for line in ism_names_file]
    all_seqnames = [ism_seqnames, nonig_backprop_seqnames, ig_seqnames]
    if has_variedrefs:
        variedrefs_seqnames = _read_variedrefs_labels(corefiles)
        all_seqnames.append(variedrefs_seqnames)

    ########
    #Figure out the common seqnames
    common_seqnames = sorted(
        set(all_seqnames[0]).intersection(*all_seqnames[1:]))
    assert len(common_seqnames) == min(len(x) for x in all_seqnames), (len(common_seqnames), [len(x) for x in all_seqnames])
    print("Number of common seqnames:",len(common_seqnames))

    ########
    #Figure out the mapping from sequence to indices for the common seqnames
    positives_idx_ordering = get_indices_of_subset(superset_seqnames=positives_seqnames,
                                                   subset_seqnames=common_seqnames)
    ism_idx_ordering = get_indices_of_subset(superset_seqnames=ism_seqnames,
                                             subset_seqnames=common_seqnames)
    ig_idx_ordering = get_indices_of_subset(superset_seqnames=ig_seqnames,
                                            subset_seqnames=common_seqnames)
    nonig_idx_ordering = get_indices_of_subset(superset_seqnames=nonig_backprop_seqnames,
                                               subset_seqnames=common_seqnames)
    if has_variedrefs:
        variedrefs_idx_ordering = get_indices_of_subset(superset_seqnames=variedrefs_seqnames,
                                                        subset_seqnames=common_seqnames)

    ########
    #Load the data using the idx ordering
    onehot_posseqs, posseqs = load_posseqs(corefiles=corefiles,
                                           pos_idx_ordering=positives_idx_ordering)
    method_to_scores = {}
    load_ism_scores(method_to_scores=method_to_scores,
                    corefiles=corefiles, onehot=onehot_posseqs,
                    ism_idx_ordering=ism_idx_ordering)
    load_ig_scores(method_to_scores=method_to_scores,
                   corefiles=corefiles,
                   ig_idx_ordering=ig_idx_ordering)
    load_nonig_scores(method_to_scores=method_to_scores,
                      corefiles=corefiles,
                      nonig_idx_ordering=nonig_idx_ordering)
    if has_variedrefs:
        load_variedrefs_scores(method_to_scores=method_to_scores,
                               corefiles=corefiles,
                               variedrefs_idx_ordering=variedrefs_idx_ordering)

    return method_to_scores, onehot_posseqs, posseqs

In [131]:
celltype = 'A549'
# Load each attribution method's scores for the sequences shared by all of
# this cell type's score files.
method_to_scores, onehot_seqs, seqs = get_scores_for_common_sequences(
    celltype_to_corefiles[celltype]
)


Number of common seqnames: 1000


In [132]:
# Show which attribution methods were loaded (the keys of method_to_scores).
method_to_scores.keys()

dict_keys(['ism', 'ig10_multiref10', 'gradtimesinp', 'deeplift-RS_multiref10', 'deeplift-RC_multiref10', 'ig10_zeroref', 'ig10_gcref', 'deeplift-RS_zeroref', 'deeplift-RS_gcref'])

In [5]:
import deeplift
from deeplift.visualization import viz_sequence