In [4]:
import numpy as np
import pandas as pd
import h5py
import pysam
import os
from modisco.visualization import viz_sequence
from modisco import util
from matplotlib import pyplot as plt
import pybedtools
from bs4 import BeautifulSoup

# Let pandas render up to 500 rows and 500 columns before truncating.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

In [5]:
# --- Run configuration -------------------------------------------------
# `dttype`, `celltype` and `mode` together select one run directory; both
# the TF-MoDISco HDF5 and the motifs HTML report are read from that same
# directory so they can never refer to different assays.
dttype = 'ATAC'
mode = "counts"
celltype="HEPG2"

run_dir = os.path.join(
    '/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds',
    dttype,
    celltype,
    'merge_folds_new_may_05_24',
    mode,
)
modisco_path = os.path.join(run_dir, 'modisco_old_format.h5')

# Output directory for the files written below; the empty string makes
# os.path.join(ppm_dir, name) resolve relative to the working directory.
ppm_dir = ''

# pd.read_html returns a list with one DataFrame per <table> in the report.
htmld = os.path.join(run_dir, 'motifs.html')
tomtom = pd.read_html(htmld)

# Four equal weights; not referenced by the cells visible in this notebook
# (presumably a uniform A/C/G/T background -- confirm before relying on it).
background=[0.25, 0.25, 0.25, 0.25]

In [6]:
def trim_motif_new(cwm, motif, trim_threshold=0.20):
    """
    Trims low-scoring flanks off `motif`, using `cwm` to score positions.

    Each position (row) of `cwm` is scored by the sum of the absolute
    values across that row. A position "passes" if its score is at least
    `trim_threshold` times the maximum score. `motif` is then cut down to
    the span from the first passing position to the last passing position
    (inclusive); positions inside that span are kept even if they do not
    pass themselves.

    Arguments:
        `cwm`: array with positions along axis 0 (L x 4 in this notebook),
            used only for scoring
        `motif`: array indexed by position along axis 0; this is what gets
            sliced (it may be `cwm` itself)
        `trim_threshold`: fraction of the maximum score a position must
            reach to pass (default 0.20, i.e. 20% of the max)
    Returns the trimmed slice of `motif`. If no position passes (empty
    input, NaN scores, or a threshold above 1) or the slice comes out
    empty, `motif` is returned untrimmed.
    """
    score = np.sum(np.abs(cwm), axis=1)

    # np.max raises on a zero-length array, so bail out before calling it.
    if score.size == 0:
        return motif

    trim_thresh = np.max(score) * trim_threshold  # Cut off anything below this fraction of the max score
    pass_inds = np.where(score >= trim_thresh)[0]

    # This check must come before indexing: taking min/max (or [0]/[-1])
    # of an empty index array raises instead of falling back to `motif`.
    if pass_inds.size == 0:
        return motif

    # np.where returns indices in ascending order, so the first and last
    # entries are the minimum and maximum passing positions.
    trimmed = motif[pass_inds[0]: pass_inds[-1] + 1]

    # `motif` can be shorter than `cwm`, in which case the slice may be empty.
    if not trimmed.size:
        return motif

    return trimmed

def import_tfmodisco_motifs(tfm_results_path, trim=True, only_pos=True):
    """
    Imports the motifs of a TF-MoDISco results file into a dictionary,
    mapping the string "x_y" to the motif, where `x` is the metacluster
    index and `y` is the pattern index (both taken from enumeration order
    in the file).
    Note that the stored arrays are the forward-strand `task0_contrib_scores`
    (contribution weight matrices), not the `sequence` PFMs, even though
    the dictionary is called `pfms`.
    Arguments:
        `tfm_results_path`: path to HDF5 containing TF-MoDISco results
        `trim`: if True, trim the motif flanks with `trim_motif_new`,
            scoring positions on the contribution scores themselves
        `only_pos`: if True, only return motifs whose contribution scores
            sum to a non-negative value
    Returns the dictionary of motifs.
    """
    pfms = {}
    with h5py.File(tfm_results_path, "r") as f:
        metaclusters = f["metacluster_idx_to_submetacluster_results"]
        for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
            seqlets_to_patterns = metaclusters[metacluster_key]["seqlets_to_patterns_result"]
            if "patterns" not in seqlets_to_patterns:
                continue
            patterns = seqlets_to_patterns["patterns"]
            for pattern_i, pattern_name in enumerate(patterns["all_pattern_names"][:]):
                # Depending on the h5py version and how the names were
                # stored, they can come back as bytes; normalise to str.
                if isinstance(pattern_name, bytes):
                    pattern_name = pattern_name.decode()

                cwm = patterns[pattern_name]["task0_contrib_scores"]["fwd"][:]

                # Check that the contribution scores are overall positive.
                # Skipped patterns still consume a `pattern_i`, so the keys
                # stay aligned with the pattern order in the file.
                if only_pos and np.sum(cwm) < 0:
                    continue

                if trim:
                    motif = trim_motif_new(cwm, cwm)
                else:
                    motif = cwm

                pfms["%d_%d" % (metacluster_i, pattern_i)] = motif
    return pfms

In [7]:
# Load every positive-contribution motif at full length (no flank trimming).
pfms = import_tfmodisco_motifs(tfm_results_path=modisco_path, trim=False)

In [8]:
# Write each motif to "<mode>_<key>.pfm" inside `ppm_dir`, one row per
# position, values formatted with '%f'.
for key in pfms:
    out_path = os.path.join(ppm_dir, mode + "_" + key + ".pfm")
    # The context manager closes the file even if np.savetxt raises.
    with open(out_path, "w") as out_file:
        np.savetxt(out_file, pfms[key], fmt='%f')
    

In [9]:
# Display the first table parsed from the motifs HTML report.
tomtom[0]

Unnamed: 0,pattern,num_seqlets,modisco_cwm_fwd,modisco_cwm_rev,match0,qval0,match0_logo,match1,qval1,match1_logo,match2,qval2,match2_logo
0,pos_patterns.pattern_0,39695,,,CTCF_MA0139.1,4.59865e-16,,CTCF_HUMAN.H11MO.0.A,2.05048e-12,,CTCF_MOUSE.H11MO.0.A,1.13801e-10,
1,pos_patterns.pattern_1,34273,,,HNF4G_MA0484.1,2.89122e-09,,HNF4G_HUMAN.H11MO.0.B,3.26683e-07,,HNF4G_MOUSE.H11MO.0.C,3.26683e-07,
2,pos_patterns.pattern_2,10841,,,FOXM1_HUMAN.H11MO.0.A,0.00151191,,FOXM1_MOUSE.H11MO.0.B,0.00151191,,Foxd3_MA0041.1,0.0553071,
3,pos_patterns.pattern_3,10650,,,KLF12_HUMAN.H11MO.0.C,0.00019207,,SP1_HUMAN.H11MO.0.A,0.00019207,,KLF3_HUMAN.H11MO.0.B,0.000517787,
4,pos_patterns.pattern_4,10045,,,FOXO6_MA0849.1,0.000703126,,FOXO6_forkhead_2,0.000703126,,FOXO3_HUMAN.H11MO.0.B,0.00095991,
5,pos_patterns.pattern_5,8953,,,HNF1B_MA0153.2,2.2516e-08,,HNF1B_homeodomain_1,2.2516e-08,,HNF1B_homeodomain_2,9.80389e-07,
6,pos_patterns.pattern_6,6915,,,ATF3_MOUSE.H11MO.0.A,0.00391319,,FOSL2_MOUSE.H11MO.0.A,0.00391319,,JUNB_HUMAN.H11MO.0.A,0.00391319,
7,pos_patterns.pattern_7,5571,,,NFYB_HUMAN.H11MO.0.A,0.00343236,,NFYB_MOUSE.H11MO.0.A,0.00343236,,NFYA_HUMAN.H11MO.0.A,0.00414486,
8,pos_patterns.pattern_8,5513,,,ELK4_MA0076.2,0.00017118,,ELK1_MOUSE.H11MO.0.B,0.00017118,,Gabpa_MA0062.2,0.00017118,
9,pos_patterns.pattern_9,5038,,,CEBPA_HUMAN.H11MO.0.A,8.89186e-06,,CEBPA_MA0102.3,8.89186e-06,,CEBPA_MOUSE.H11MO.0.A,8.89186e-06,


In [10]:

# Shorten the report's pattern labels to the "<x>_<y>" form used as keys in
# `pfms` ("pos_patterns.pattern_3" -> "0_3", "neg_patterns.pattern_3" ->
# "1_3"). regex=False makes the "." a literal dot on every pandas version
# (the default for `regex` changed between pandas releases) and avoids the
# FutureWarning older versions emit for patterns containing regex
# metacharacters.
# NOTE(review): this presumes metacluster index 0 holds the positive
# patterns and index 1 the negative ones -- confirm against the HDF5.
tomtom[0]["pattern"] = (
    tomtom[0]["pattern"]
    .str.replace("pos_patterns.pattern", "0", regex=False)
    .str.replace("neg_patterns.pattern", "1", regex=False)
)


  """Entry point for launching an IPython kernel.


In [11]:
# Re-display the table to check the relabelled `pattern` column.
tomtom[0]

Unnamed: 0,pattern,num_seqlets,modisco_cwm_fwd,modisco_cwm_rev,match0,qval0,match0_logo,match1,qval1,match1_logo,match2,qval2,match2_logo
0,0_0,39695,,,CTCF_MA0139.1,4.59865e-16,,CTCF_HUMAN.H11MO.0.A,2.05048e-12,,CTCF_MOUSE.H11MO.0.A,1.13801e-10,
1,0_1,34273,,,HNF4G_MA0484.1,2.89122e-09,,HNF4G_HUMAN.H11MO.0.B,3.26683e-07,,HNF4G_MOUSE.H11MO.0.C,3.26683e-07,
2,0_2,10841,,,FOXM1_HUMAN.H11MO.0.A,0.00151191,,FOXM1_MOUSE.H11MO.0.B,0.00151191,,Foxd3_MA0041.1,0.0553071,
3,0_3,10650,,,KLF12_HUMAN.H11MO.0.C,0.00019207,,SP1_HUMAN.H11MO.0.A,0.00019207,,KLF3_HUMAN.H11MO.0.B,0.000517787,
4,0_4,10045,,,FOXO6_MA0849.1,0.000703126,,FOXO6_forkhead_2,0.000703126,,FOXO3_HUMAN.H11MO.0.B,0.00095991,
5,0_5,8953,,,HNF1B_MA0153.2,2.2516e-08,,HNF1B_homeodomain_1,2.2516e-08,,HNF1B_homeodomain_2,9.80389e-07,
6,0_6,6915,,,ATF3_MOUSE.H11MO.0.A,0.00391319,,FOSL2_MOUSE.H11MO.0.A,0.00391319,,JUNB_HUMAN.H11MO.0.A,0.00391319,
7,0_7,5571,,,NFYB_HUMAN.H11MO.0.A,0.00343236,,NFYB_MOUSE.H11MO.0.A,0.00343236,,NFYA_HUMAN.H11MO.0.A,0.00414486,
8,0_8,5513,,,ELK4_MA0076.2,0.00017118,,ELK1_MOUSE.H11MO.0.B,0.00017118,,Gabpa_MA0062.2,0.00017118,
9,0_9,5038,,,CEBPA_HUMAN.H11MO.0.A,8.89186e-06,,CEBPA_MA0102.3,8.89186e-06,,CEBPA_MOUSE.H11MO.0.A,8.89186e-06,


In [12]:
# Save (pattern label, seqlet count) pairs as a headerless, index-free CSV
# named "<mode>_counts.csv" inside `ppm_dir`.
tomtom[0].loc[:, ["pattern", "num_seqlets"]].to_csv(
    os.path.join(ppm_dir, mode + "_counts.csv"),
    sep=",",
    header=False,
    index=False,
)