In [1]:
import numpy as np
import pandas as pd
import h5py
import pysam
import os
from modisco.visualization import viz_sequence
from modisco import util
from matplotlib import pyplot as plt
import pybedtools

# Raise the pandas display limits so wide/long frames are shown without truncation.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 500)

In [2]:
# ---------------------------------------------------------------------------
# Run configuration: edit `celltype` and `model` together to switch datasets.
# ---------------------------------------------------------------------------
dttype = 'atac'
mode = 'profile'

# Cell-type label used as a path component below. Other values that were
# used with this notebook:
#   "GM12878", "GM12878_250M", "GM12878_100M", "GM12878_50M", "GM12878_25M"
celltype = "GM12878_5M"

# Model directory name used as a path component of `ppm_dir`. Other values
# that were used with this notebook (one per `celltype` above, same order):
#   "GM12878_03.01.2022_bias_128_4_1234_0.4_fold_0"
#   "GM12878_250M_07.19.2022_bias_transfer_1234_fold_0_data_type_ATAC_PE"
#   "GM12878_100M_07.19.2022_bias_transfer_1234_fold_0_data_type_ATAC_PE"
#   "GM12878_50M_07.18.2022_bias_transfer_1234_fold_0_data_type_ATAC_PE"
#   "GM12878_25M_07.18.2022_bias_transfer_1234_fold_0_data_type_ATAC_PE"
model = "GM12878_5M_07.18.2022_bias_transfer_1234_fold_0_data_type_ATAC_PE"

# Directory shared by the TF-MoDISco HDF5 results and the TOMTOM table.
modisco_dir = '/oak/stanford/groups/akundaje/projects/chrombpnet_paper_new/modisco_jun_30/modisco/ATAC/' + celltype + '/modisco_crop_500_100K_seqs_1/'

# Input: TF-MoDISco results for the selected `mode`.
modisco_path = modisco_dir + 'modisco_results_allChroms_' + mode + '.hdf5'

# Output directory for the motif files and the seqlet-count table.
ppm_dir = '/mnt/lab_data3/anusri/chrombpnet/results/chrombpnet/ATAC_PE/' + celltype + '/' + model + '/09_06_2022_motif_scanning/'

# Input: tab-separated TOMTOM match table for the selected `mode`.
tomtom = pd.read_csv(modisco_dir + mode + ".tomtom.tsv", sep="\t")

# NOTE(review): presumably uniform A/C/G/T background frequencies; not used
# in the cells visible here -- confirm against the rest of the notebook.
background = [0.25, 0.25, 0.25, 0.25]

In [3]:
def trim_motif_new(cwm, motif, trim_threshold=0.20):
    """
    Trims low-scoring flanks off `motif`, using `cwm` to decide where to cut.

    Each position is scored by the sum of absolute values of its row in
    `cwm`. The first and last positions whose score is at least
    `trim_threshold` times the maximum score mark the kept window; every
    position between them is kept, whether or not it passes on its own.

    Arguments:
        `cwm`: L x 4 array used for scoring positions
        `motif`: array indexed along its first axis by the same positions
            as `cwm` (this may be `cwm` itself); this is what gets sliced
        `trim_threshold`: fraction of the maximum position score that a
            position must reach to anchor the kept window
    Returns the trimmed slice of `motif`. If `cwm` has no positions, no
    position passes the threshold, or the slice would be empty, `motif` is
    returned unchanged.
    """
    score = np.sum(np.abs(cwm), axis=1)

    # Nothing to score: np.max would raise on an empty array.
    if score.size == 0:
        return motif

    # Cut off anything scoring below `trim_threshold` (default 20%) of the max score
    trim_thresh = np.max(score) * trim_threshold
    pass_inds = np.where(score >= trim_thresh)[0]

    # No position passes (e.g. trim_threshold > 1 or NaN scores): skip
    # trimming instead of letting np.min/np.max raise on an empty array.
    if pass_inds.size == 0:
        return motif

    # np.where returns indices in ascending order, so the ends are min and max.
    trimmed = motif[pass_inds[0]: pass_inds[-1] + 1]

    # Can only be empty when `motif` is shorter than `cwm`; fall back to the input.
    if not trimmed.size:
        return motif

    return trimmed

def import_tfmodisco_motifs(tfm_results_path, trim=True, only_pos=True):
    """
    Reads the motifs of a TF-MoDISco results HDF5 file into a dictionary.

    Keys are strings of the form "x_y", where `x` is the position of the
    metacluster in the file's key listing and `y` is the position of the
    pattern in that metacluster's `all_pattern_names`.
    Arguments:
        `tfm_results_path`: path to HDF5 containing TF-MoDISco results
        `trim`: if True, store the pattern's forward `task0_contrib_scores`
            (CWM) with its flanks trimmed by `trim_motif_new`; if False,
            store the pattern's untrimmed forward `sequence` array (PFM)
        `only_pos`: if True, skip patterns whose contribution scores sum
            to a negative value
    Returns the dictionary of motif arrays.
    """
    pfms = {}
    with h5py.File(tfm_results_path, "r") as f:
        metaclusters = f["metacluster_idx_to_submetacluster_results"]
        # NOTE(review): `metacluster_i` is the enumeration position, not an
        # index parsed from the key name -- confirm the key order matches the
        # metacluster numbering when there are many metaclusters.
        for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
            seqlets_to_patterns = metaclusters[metacluster_key]["seqlets_to_patterns_result"]
            if "patterns" not in seqlets_to_patterns:
                continue
            patterns = seqlets_to_patterns["patterns"]
            for pattern_i, pattern_name in enumerate(patterns["all_pattern_names"][:]):
                # Pattern names are stored as bytes; decode to index the group.
                pattern = patterns[pattern_name.decode()]
                pfm = pattern["sequence"]["fwd"][:]
                cwm = pattern["task0_contrib_scores"]["fwd"][:]

                # Check that the contribution scores are overall positive
                if only_pos and np.sum(cwm) < 0:
                    continue

                if trim:
                    # NOTE(review): this stores the trimmed CWM, not a trimmed
                    # PFM, even though the result is named `pfm` -- kept as in
                    # the original; confirm this is what downstream expects.
                    pfm = trim_motif_new(cwm, cwm)

                pfms["%d_%d" % (metacluster_i, pattern_i)] = pfm
    return pfms

In [4]:
# Load the motifs from the TF-MoDISco results, using the function's defaults
# (trim=True, only_pos=True).
pfms = import_tfmodisco_motifs(modisco_path)

In [5]:
# Write each motif array to <ppm_dir>/<mode>_<key>.pfm as plain text.
for key, motif_array in pfms.items():
    pfm_path = os.path.join(ppm_dir, mode + "_" + key + ".pfm")
    # `with` closes the handle even if np.savetxt raises part-way through.
    with open(pfm_path, "w") as pfm_file:
        np.savetxt(pfm_file, motif_array, fmt='%f')
    

In [6]:
# Preview the first rows of the TOMTOM match table as loaded from disk.
tomtom.head()

Unnamed: 0,Pattern,Num_Seqlets,Match_1,q-value,Match_2,q-value.1,Match_3,q-value.2,Match_4,q-value.3,Match_5,q-value.4,Match_6,q-value.5,Match_7,q-value.6,Match_8,q-value.7,Match_9,q-value.8,Match_10,q-value.9
0,metacluster_0.pattern_0,12156,CTCF_MA0139.1,4.8443e-14,CTCF_HUMAN.H11MO.0.A,6.51165e-10,CTCF_MOUSE.H11MO.0.A,6.66203e-09,CTCF_C2H2_1,5e-06,CTCFL_HUMAN.H11MO.0.A,6e-06,CTCFL_MOUSE.H11MO.0.A,1.2e-05,CTCFL_MA1102.1,0.000162,ZIC2_MOUSE.H11MO.0.C,0.113881,ZIC3_HUMAN.H11MO.0.B,0.121723,ZIC3_MOUSE.H11MO.0.A,0.121723
1,metacluster_0.pattern_1,11001,IRF4_MOUSE.H11MO.0.A,0.00517109,IRF4_HUMAN.H11MO.0.A,0.0058125,SIX2_MA1119.1,0.00963501,IRF1_MOUSE.H11MO.0.A,0.009635,STAT2_HUMAN.H11MO.0.A,0.009635,STAT2_MOUSE.H11MO.0.A,0.009635,IRF8_HUMAN.H11MO.0.B,0.009635,IRF8_MOUSE.H11MO.0.A,0.009635,IRF1_HUMAN.H11MO.0.A,0.009712,STAT1_MOUSE.H11MO.0.A,0.017139
2,metacluster_0.pattern_2,8604,ELF5_HUMAN.H11MO.0.A,0.000282115,SPIB_MOUSE.H11MO.0.A,0.00181492,SPI1_HUMAN.H11MO.0.A,0.00181492,SPI1_MOUSE.H11MO.0.A,0.001815,SPIB_HUMAN.H11MO.0.A,0.001867,BC11A_HUMAN.H11MO.0.A,0.004158,ERG_HUMAN.H11MO.0.A,0.004573,ETV5_HUMAN.H11MO.0.C,0.004757,EHF_HUMAN.H11MO.0.B,0.004757,EHF_MOUSE.H11MO.0.B,0.004757
3,metacluster_0.pattern_3,8417,FOS+JUND_MA1141.1,0.00302037,FOSL1+JUN_MA1128.1,0.00302037,JDP2_MA0655.1,0.00302037,JDP2_bZIP_3,0.00302,NF2L2_HUMAN.H11MO.0.A,0.00302,FOS+JUN_MA0099.3,0.00302,FOSL2+JUN_MA1130.1,0.00302,NFE2_MA0841.1,0.00302,NFE2_bZIP_1,0.00302,Jdp2.mouse_bZIP_1,0.00302
4,metacluster_0.pattern_4,4671,NFKB1_HUMAN.H11MO.1.B,8.58484e-05,NFKB1_MOUSE.H11MO.0.A,8.58484e-05,TF65_HUMAN.H11MO.0.A,8.58484e-05,RELA_MA0107.1,8.8e-05,TF65_MOUSE.H11MO.0.A,0.000375,REL_MA0101.1,0.000375,RELB_HUMAN.H11MO.0.C,0.000375,RELB_MOUSE.H11MO.0.C,0.000375,NFKB2_HUMAN.H11MO.0.B,0.001616,NFKB2_MOUSE.H11MO.0.C,0.001616


In [7]:

# Rewrite "metacluster_X.pattern_Y" as "X_Y".
# regex=False forces literal matching: the default of `regex` differs between
# pandas versions, and under regex=True the "." in ".pattern_" would match any
# character (and older pandas emits a FutureWarning for it).
tomtom["Pattern"] = (
    tomtom["Pattern"]
    .str.replace("metacluster_", "", regex=False)
    .str.replace(".pattern_", "_", regex=False)
)

  """Entry point for launching an IPython kernel.


In [8]:
# Preview the table again to check the rewritten values in the Pattern column.
tomtom.head()

Unnamed: 0,Pattern,Num_Seqlets,Match_1,q-value,Match_2,q-value.1,Match_3,q-value.2,Match_4,q-value.3,Match_5,q-value.4,Match_6,q-value.5,Match_7,q-value.6,Match_8,q-value.7,Match_9,q-value.8,Match_10,q-value.9
0,0_0,12156,CTCF_MA0139.1,4.8443e-14,CTCF_HUMAN.H11MO.0.A,6.51165e-10,CTCF_MOUSE.H11MO.0.A,6.66203e-09,CTCF_C2H2_1,5e-06,CTCFL_HUMAN.H11MO.0.A,6e-06,CTCFL_MOUSE.H11MO.0.A,1.2e-05,CTCFL_MA1102.1,0.000162,ZIC2_MOUSE.H11MO.0.C,0.113881,ZIC3_HUMAN.H11MO.0.B,0.121723,ZIC3_MOUSE.H11MO.0.A,0.121723
1,0_1,11001,IRF4_MOUSE.H11MO.0.A,0.00517109,IRF4_HUMAN.H11MO.0.A,0.0058125,SIX2_MA1119.1,0.00963501,IRF1_MOUSE.H11MO.0.A,0.009635,STAT2_HUMAN.H11MO.0.A,0.009635,STAT2_MOUSE.H11MO.0.A,0.009635,IRF8_HUMAN.H11MO.0.B,0.009635,IRF8_MOUSE.H11MO.0.A,0.009635,IRF1_HUMAN.H11MO.0.A,0.009712,STAT1_MOUSE.H11MO.0.A,0.017139
2,0_2,8604,ELF5_HUMAN.H11MO.0.A,0.000282115,SPIB_MOUSE.H11MO.0.A,0.00181492,SPI1_HUMAN.H11MO.0.A,0.00181492,SPI1_MOUSE.H11MO.0.A,0.001815,SPIB_HUMAN.H11MO.0.A,0.001867,BC11A_HUMAN.H11MO.0.A,0.004158,ERG_HUMAN.H11MO.0.A,0.004573,ETV5_HUMAN.H11MO.0.C,0.004757,EHF_HUMAN.H11MO.0.B,0.004757,EHF_MOUSE.H11MO.0.B,0.004757
3,0_3,8417,FOS+JUND_MA1141.1,0.00302037,FOSL1+JUN_MA1128.1,0.00302037,JDP2_MA0655.1,0.00302037,JDP2_bZIP_3,0.00302,NF2L2_HUMAN.H11MO.0.A,0.00302,FOS+JUN_MA0099.3,0.00302,FOSL2+JUN_MA1130.1,0.00302,NFE2_MA0841.1,0.00302,NFE2_bZIP_1,0.00302,Jdp2.mouse_bZIP_1,0.00302
4,0_4,4671,NFKB1_HUMAN.H11MO.1.B,8.58484e-05,NFKB1_MOUSE.H11MO.0.A,8.58484e-05,TF65_HUMAN.H11MO.0.A,8.58484e-05,RELA_MA0107.1,8.8e-05,TF65_MOUSE.H11MO.0.A,0.000375,REL_MA0101.1,0.000375,RELB_HUMAN.H11MO.0.C,0.000375,RELB_MOUSE.H11MO.0.C,0.000375,NFKB2_HUMAN.H11MO.0.B,0.001616,NFKB2_MOUSE.H11MO.0.C,0.001616


In [9]:
# Save the (Pattern, Num_Seqlets) pairs to <ppm_dir>/<mode>_counts.csv,
# comma-separated, without the index column and without a header row.
counts_path = os.path.join(ppm_dir, mode + "_counts.csv")
pattern_counts = tomtom.loc[:, ["Pattern", "Num_Seqlets"]]
pattern_counts.to_csv(counts_path, sep=",", index=False, header=False)