In [20]:
import pandas as pd
import numpy as np
import pandas as pd
import h5py
import pysam
import os
from modisco.visualization import viz_sequence
from modisco import util
from matplotlib import pyplot as plt
import pybedtools

# Raise pandas' display limits so the wide TOMTOM match table (one column per
# match and q-value) is shown without truncation.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

In [54]:
# Load the tab-separated TOMTOM annotation table; the first row holds the
# column names.
data = pd.read_csv(
    "imr90.counts.tomtom.tsv",
    sep="\t",
    header=0,
)

In [55]:
# Drop every pattern whose label contains "DIMER" or "IGNORE".
#
# The earlier form of this filter ended in `... | contains("IGNORE")==True`;
# because `|` binds tighter than `==`, that compared the whole OR-ed mask with
# True rather than just the second term. Building each mask with `na=False`
# makes the intent explicit: a missing Label never matches, so such rows are
# kept.
labels = data["Label"]
is_excluded = (
    labels.str.contains("DIMER", na=False)
    | labels.str.contains("IGNORE", na=False)
)

# `.copy()` detaches the result from the original frame so that columns added
# to `data` in later cells do not trigger SettingWithCopyWarning.
data = data[~is_excluded].copy()
data.head()


Unnamed: 0,Pattern,Num_Seqlets,Label,Match_1,q-value,Match_2,q-value.1,Match_3,q-value.2,Match_4,q-value.3,Match_5,q-value.4,Match_6,q-value.5,Match_7,q-value.6,Match_8,q-value.7,Match_9,q-value.8,Match_10,q-value.9
0,metacluster_0.pattern_0,12878,FOS,FOS_HUMAN.H11MO.0.A,0.0010223,FOSL2_MOUSE.H11MO.0.A,0.0010223,FOSL1_HUMAN.H11MO.0.A,0.0010223,FOSL1_MOUSE.H11MO.0.A,0.001022,FOSL2_HUMAN.H11MO.0.A,0.001022,ATF3_MOUSE.H11MO.0.A,0.004283,FOSB_HUMAN.H11MO.0.A,0.004283,JUNB_MOUSE.H11MO.0.A,0.004283,JUN_MOUSE.H11MO.0.A,0.004393,JUNB_HUMAN.H11MO.0.A,0.004393
1,metacluster_0.pattern_1,9160,CTCF,CTCF_MA0139.1,1.49319e-12,CTCF_HUMAN.H11MO.0.A,8.4965e-10,CTCF_MOUSE.H11MO.0.A,3.10842e-08,CTCF_C2H2_1,5e-06,CTCFL_HUMAN.H11MO.0.A,1.1e-05,CTCFL_MOUSE.H11MO.0.A,2.4e-05,CTCFL_MA1102.1,0.000237,ZIC2_MOUSE.H11MO.0.C,0.139479,RARA_nuclearreceptor_6,0.166742,ZIC3_HUMAN.H11MO.0.B,0.166742
3,metacluster_0.pattern_3,6646,FOXC1,Foxc1.mouse_forkhead_2,0.00363342,FOXG1_forkhead_1,0.00601003,FOXJ3_forkhead_2,0.00601003,FOXL1_MA0033.2,0.00601,FOXL1_forkhead_1,0.00601,Foxj2_MA0614.1,0.006467,Foxj3.mouse_forkhead_3,0.006467,FOXJ2_forkhead_2,0.006791,FOXO4_HUMAN.H11MO.0.C,0.006791,FOXO4_MOUSE.H11MO.0.C,0.006791
4,metacluster_0.pattern_4,4786,FOSB+JUNB,FOSB+JUNB_MA1135.1,0.336541,FOSL1+JUND_MA1142.1,0.336541,FOSL2+JUNB_MA1138.1,0.336541,FOSL2+JUND_MA1144.1,0.336541,Jdp2.mouse_bZIP_1,0.336541,FOS+JUN_MA0099.3,0.336541,JDP2_MA0655.1,0.336541,JDP2_bZIP_3,0.336541,JDP2_bZIP_1,0.336541,FOSL1+JUNB_MA1137.1,0.336541
5,metacluster_0.pattern_5,2157,SP1/KLF,KLF12_HUMAN.H11MO.0.C,9.17264e-05,SP3_HUMAN.H11MO.0.B,9.17264e-05,SP3_MOUSE.H11MO.0.B,9.17264e-05,SP1_HUMAN.H11MO.0.A,0.000238,SP1_MA0079.3,0.000238,SP1_MOUSE.H11MO.0.A,0.000545,KLF3_HUMAN.H11MO.0.B,0.000545,KLF3_MOUSE.H11MO.0.A,0.000545,SP4_HUMAN.H11MO.0.A,0.000663,SP4_MOUSE.H11MO.0.B,0.000663


In [27]:
def trim_motif_new(cwm, motif, trim_threshold=0.20):
    """
    Trims the low-scoring flanks off `motif`, using `cwm` to decide where the
    flanks end.

    Every position (row) of `cwm` is scored by the sum of the absolute values
    in that row. Positions scoring at least `trim_threshold` times the maximum
    score "pass"; `motif` is cut down to the span from the first passing
    position to the last passing position, inclusive.

    Arguments:
        `cwm`: 2-D array of scores with one row per position (L x 4 in this
            notebook), used only to pick the trim boundaries
        `motif`: array with one row per position that is actually sliced and
            returned (may be `cwm` itself)
        `trim_threshold`: fraction of the maximum position score a position
            must reach to be kept as a boundary (default 0.20, i.e. 20%)
    Returns the trimmed slice of `motif`. If `cwm` has no positions, if no
    position passes the threshold (e.g. `trim_threshold` > 1 or NaN scores),
    or if the resulting slice would be empty, `motif` is returned untrimmed.
    """
    score = np.sum(np.abs(cwm), axis=1)

    # Nothing to score: np.max would raise on an empty array.
    if score.size == 0:
        return motif

    # Cut off anything scoring below `trim_threshold` of the max score.
    trim_thresh = np.max(score) * trim_threshold
    pass_inds = np.where(score >= trim_thresh)[0]

    # This check must come before taking the first/last passing index;
    # otherwise an empty `pass_inds` raises instead of falling back to the
    # untrimmed motif.
    if pass_inds.size == 0:
        return motif

    # np.where returns indices in ascending order, so the first and last
    # entries are the minimum and maximum passing positions.
    trimmed = motif[pass_inds[0]: pass_inds[-1] + 1]

    # Can still be empty if `motif` has fewer rows than `cwm`.
    if not trimmed.size:
        return motif

    return trimmed

def import_tfmodisco_motifs(tfm_results_path, trim=True, only_pos=True):
    """
    Reads every pattern from a TF-MoDISco results HDF5 file into a dictionary
    keyed by the string "x_y", where `x` is the position of the metacluster in
    the file's key iteration order and `y` is the position of the pattern in
    that metacluster's `all_pattern_names` list.

    Arguments:
        `tfm_results_path`: path to HDF5 containing TF-MoDISco results
        `trim`: if True, the stored value is the pattern's forward
            `task0_contrib_scores` array with its flanks trimmed by
            `trim_motif_new`; if False, the stored value is the pattern's
            untrimmed forward `sequence` array
        `only_pos`: accepted for backward compatibility but currently
            ignored; patterns are returned regardless of the sign of their
            contribution scores
    Returns the dictionary of motifs. Note that with the default `trim=True`
    the values are contribution-score matrices, not PFMs.
    """
    pfms = {}
    with h5py.File(tfm_results_path, "r") as f:
        metaclusters = f["metacluster_idx_to_submetacluster_results"]
        # NOTE(review): `metacluster_i` is the enumeration index, not the
        # number embedded in the key name -- confirm these agree if a file
        # ever holds ten or more metaclusters.
        for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
            seqlets_to_patterns = metaclusters[metacluster_key][
                "seqlets_to_patterns_result"
            ]
            # Metaclusters with no patterns are skipped but still consume an
            # index.
            if "patterns" not in seqlets_to_patterns:
                continue
            patterns = seqlets_to_patterns["patterns"]

            for pattern_i, pattern_name in enumerate(patterns["all_pattern_names"][:]):
                # Names may come back as bytes or as str; only bytes need
                # decoding.
                if isinstance(pattern_name, bytes):
                    pattern_name = pattern_name.decode()
                pattern = patterns[pattern_name]
                pfm = pattern["sequence"]["fwd"][:]
                cwm = pattern["task0_contrib_scores"]["fwd"][:]

                if trim:
                    # Both the trim boundaries and the returned values come
                    # from the contribution scores.
                    pfm = trim_motif_new(cwm, cwm)

                pfms["%d_%d" % (metacluster_i, pattern_i)] = pfm
    return pfms

In [28]:
# Run selectors: the assay type and which model head's TF-MoDISco results to
# load. `mode` is spliced into the results file name below.
mode = 'counts'
dttype = 'atac'

# Output directory name for PPMs.
ppm_dir = '/ppms'

# Full path to the TF-MoDISco results HDF5 for the selected `mode`.
modisco_path = (
    '/oak/stanford/groups/akundaje/projects/chrombpnet_paper_new/modisco_jun_30/modisco/ATAC/IMR90/modisco_crop_500_100K_seqs_1/modisco_results_allChroms_'
    + mode
    + '.hdf5'
)


In [29]:
# Load all patterns from the TF-MoDISco results file, using the function's
# default options.
pfms = import_tfmodisco_motifs(tfm_results_path=modisco_path)

In [77]:
import one_hot

# Patterns from metacluster 1 that get a consensus string; their scores are
# sign-flipped first so that argmax picks the most negative base per position.
NEGATED_PATTERN_KEYS = {"1_0", "1_1", "1_3"}


def _consensus_string(motif):
    """
    Converts a motif matrix (one row per position, one column per base) into
    a consensus sequence: the highest-scoring column at each position is
    one-hot encoded into a (1, L, 4) array and handed to
    `one_hot.one_hot_to_dna`, whose return value is passed back unchanged.
    """
    motif_idx = motif.argmax(axis=1)
    # Row i of the 4x4 identity is the one-hot vector for column index i.
    motif_one_hot = np.eye(4)[motif_idx][np.newaxis]
    return one_hot.one_hot_to_dna(motif_one_hot)


# Placeholder for rows that do not receive a consensus string below.
data["string"] = "None"

for key in pfms:
    # Keys are matched exactly / by prefix. A substring test such as
    # `"1_1" in key` would also match "1_10".."1_19", and `"0_" in key` would
    # match any metacluster index ending in 0.
    if key.startswith("0_"):
        print(key)
        motif = pfms[key]
    elif key in NEGATED_PATTERN_KEYS:
        motif = -1 * pfms[key]
    else:
        continue

    # "0_3" -> "metacluster_0.pattern_3", the naming used in the Pattern column.
    pattern_id = "metacluster_" + key.replace("_", ".pattern_")
    data.loc[data["Pattern"] == pattern_id, "string"] = _consensus_string(motif)
        
    
    
    


0_0
0_1
0_2
0_3
0_4
0_5
0_6
0_7
0_8
0_9
0_10
0_11
0_12
0_13
0_14
0_15
0_16
0_17
0_18
0_19
0_20
0_21
0_22
0_23
0_24
0_25
0_26
0_27
0_28
0_29
0_30
0_31
0_32
0_33
0_34
0_35
0_36
0_37
0_38
0_39
0_40


In [80]:
# Save the annotated table (including the new "string" column) as a
# tab-separated file with a header row and without the DataFrame index.
data.to_csv(
    "imr90.counts.tomtom.motifs_string.tsv",
    sep="\t",
    header=True,
    index=False,
)