In [1]:
import numpy as np
import pandas as pd
import h5py
import pysam
import os
from modisco.visualization import viz_sequence
from modisco import util
from matplotlib import pyplot as plt
import pybedtools

# Let pandas render up to 500 rows / 500 columns before truncating a displayed frame.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

In [14]:
# `mode` selects which TF-MoDISco results file is loaded and is also used as the
# prefix of every file written below. `dttype` is not referenced by the cells
# visible in this notebook.
dttype = "atac"
mode = "profile"

# Directory that receives the exported motif matrices and the count tables.
ppm_dir = "subfigs/bias_motifs_new"

# ATAC bias model: TF-MoDISco results (HDF5) and the per-pattern TSV read alongside it.
modisco_path_atac = f"/oak/stanford/groups/akundaje/projects/chrombpnet_paper_new/ATAC_PE/GM12878/GM12878_03.01.2022_bias_128_4_1234_0.4_fold_0/BIAS/modisco/modisco_results_allChroms_{mode}.hdf5"
tomtom_path_atac = "/oak/stanford/groups/akundaje/projects/chrombpnet_paper_new/ATAC_PE/GM12878/GM12878_03.01.2022_bias_128_4_1234_0.4_fold_0/BIAS/modisco/profile.tomtom.tsv"

# DNase bias model: same two inputs.
modisco_path_dnase = f"/oak/stanford/groups/akundaje/projects/chrombpnet_paper_new/DNASE_PE/HEPG2/HEPG2_06.08.2022_bias_128_4_1234_0.8_fold_0/BIAS/modisco_crop_500/modisco_results_allChroms_{mode}.hdf5"
tomtom_path_dnase = "/oak/stanford/groups/akundaje/projects/chrombpnet_paper_new/DNASE_PE/HEPG2/HEPG2_06.08.2022_bias_128_4_1234_0.8_fold_0/BIAS/modisco_crop_500/profile.tomtom.tsv"

# Four equal entries of 0.25 (presumably a uniform A/C/G/T background; not used
# by the cells visible here).
background = [0.25] * 4

In [24]:
def trim_motif_new(cwm, motif, trim_threshold=0.25):
    """
    Trims the low-scoring flanks of `motif`, using `cwm` to score positions.

    The score of a position is the sum of the absolute values in the
    corresponding row of `cwm`. A position "passes" when its score is at
    least `trim_threshold` times the maximum score. The result is the slice
    of `motif` running from the first passing position to the last passing
    position (inclusive); low-scoring positions between two passing ones
    are therefore kept.

    Arguments:
        `cwm`: L x 4 array of scores used to decide where to cut
        `motif`: array to trim along its first axis (may be `cwm` itself)
        `trim_threshold`: fraction of the maximum score a position must
            reach to pass (default 0.25, i.e. 25% of the maximum)
    Returns the trimmed slice of `motif`. `motif` is returned unchanged when
    `cwm` has no positions, when no position passes (e.g. `trim_threshold`
    greater than 1, or NaN scores), or when the slice would be empty.
    """
    score = np.sum(np.abs(cwm), axis=1)
    if score.size == 0:
        # np.max would raise on an empty array; nothing to trim anyway.
        return motif

    trim_thresh = np.max(score) * trim_threshold
    pass_inds = np.where(score >= trim_thresh)[0]
    if pass_inds.size == 0:
        # Guard before min/max: they raise ValueError on an empty index array.
        return motif

    trimmed = motif[np.min(pass_inds): np.max(pass_inds) + 1]

    # Still possible if `motif` is shorter than `cwm`.
    if not trimmed.size:
        return motif

    return trimmed

def import_tfmodisco_motifs(tfm_results_path, trim=True, only_pos=True):
    """
    Reads the forward-strand `task0_contrib_scores` matrix (the CWM) of
    every pattern stored in a TF-MoDISco results file.

    Arguments:
        `tfm_results_path`: path to HDF5 containing TF-MoDISco results
        `trim`: accepted for backward compatibility; trimming is currently
            disabled, so this flag has no effect and matrices are returned
            at full length
        `only_pos`: if True, skip patterns whose contribution scores sum to
            a negative value
    Returns a dictionary mapping the string "x_y" to the pattern's CWM,
    where `x` is the position of the metacluster in the file's key order and
    `y` is the position of the pattern in that metacluster's
    `all_pattern_names` list. Patterns skipped by `only_pos` still consume
    their index, so `y` values may have gaps.
    """
    cwms = {}
    with h5py.File(tfm_results_path, "r") as f:
        metaclusters = f["metacluster_idx_to_submetacluster_results"]
        for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
            seqlets_to_patterns = metaclusters[metacluster_key]["seqlets_to_patterns_result"]
            if "patterns" not in seqlets_to_patterns:
                continue
            patterns = seqlets_to_patterns["patterns"]
            for pattern_i, pattern_name in enumerate(patterns["all_pattern_names"][:]):
                # Names may be read back as bytes or as str depending on how
                # the strings were stored / the h5py version; only decode bytes.
                if isinstance(pattern_name, bytes):
                    pattern_name = pattern_name.decode()
                cwm = patterns[pattern_name]["task0_contrib_scores"]["fwd"][:]

                # Check that the contribution scores are overall positive
                if only_pos and np.sum(cwm) < 0:
                    continue

                cwms["%d_%d" % (metacluster_i, pattern_i)] = cwm
    return cwms

In [25]:
# Load the ATAC bias-model pattern matrices and the per-pattern table.
pfms = import_tfmodisco_motifs(modisco_path_atac)

atac_data = pd.read_csv(tomtom_path_atac, sep="\t")

# Replace the raw seqlet counts with each pattern's fraction of all seqlets.
atac_data["Num_Seqlets"] = atac_data["Num_Seqlets"] / atac_data["Num_Seqlets"].sum()

# Shorten the pattern names (presumably "metacluster_X.pattern_Y") to "X_Y" so
# they can be compared against the keys of `pfms`.
# regex=False makes the "." literal on every pandas version; under the old
# regex=True default it was a wildcard and pandas emitted a FutureWarning.
atac_data["Pattern"] = (
    atac_data["Pattern"]
    .str.replace("metacluster_", "", regex=False)
    .str.replace(".pattern_", "_", regex=False)
)

  


In [26]:
atac_counts = []
# Make sure the output directory exists so a fresh run does not fail on open().
os.makedirs(ppm_dir, exist_ok=True)
for key in pfms:
    # Keys are "<metacluster>_<pattern>"; only pattern indices below 8 are exported.
    if int(key.split("_")[1]) >= 8:
        continue
    # Look the fraction up first: if the pattern is missing from the table this
    # raises before any output file has been created.
    seqlet_frac = atac_data.loc[atac_data["Pattern"] == key, "Num_Seqlets"].values[0]
    atac_counts.append([key, seqlet_frac])
    # `with` closes the file even if savetxt fails.
    with open(os.path.join(ppm_dir, mode + "_" + key + "_tn5.pfm"), "w") as out_file:
        np.savetxt(out_file, pfms[key], fmt="%f")
    

In [27]:
# One row per exported Tn5 pattern: its key and its fraction of seqlets.
atac_df = pd.DataFrame(data=atac_counts, columns=["key", "coc_frac"])


In [28]:
# Save the Tn5 table alongside the exported matrices (comma-separated, header row, no index column).
atac_df.to_csv(
    os.path.join(ppm_dir, f"{mode}_tn5.counts.csv"),
    sep=",",
    header=True,
    index=False,
)

In [29]:
# Load the DNase bias-model pattern matrices and the per-pattern table.
# NOTE: this rebinds `pfms`, which previously held the ATAC matrices.
pfms = import_tfmodisco_motifs(modisco_path_dnase)

dnase_data = pd.read_csv(tomtom_path_dnase, sep="\t")

# Replace the raw seqlet counts with each pattern's fraction of all seqlets.
dnase_data["Num_Seqlets"] = dnase_data["Num_Seqlets"] / dnase_data["Num_Seqlets"].sum()

# Shorten the pattern names (presumably "metacluster_X.pattern_Y") to "X_Y" so
# they can be compared against the keys of `pfms`.
# regex=False makes the "." literal on every pandas version; under the old
# regex=True default it was a wildcard and pandas emitted a FutureWarning.
dnase_data["Pattern"] = (
    dnase_data["Pattern"]
    .str.replace("metacluster_", "", regex=False)
    .str.replace(".pattern_", "_", regex=False)
)

  import sys


In [30]:
dnase_counts = []

# Hand-picked pattern keys to export for DNase.
dnase_keys_to_export = ["0_0", "0_1", "0_2", "0_7", "0_12", "0_17"]

# Make sure the output directory exists so a fresh run does not fail on open().
os.makedirs(ppm_dir, exist_ok=True)
for key in pfms:
    print(key)
    if key not in dnase_keys_to_export:
        continue
    # Look the fraction up first: if the pattern is missing from the table this
    # raises before any output file has been created.
    seqlet_frac = dnase_data.loc[dnase_data["Pattern"] == key, "Num_Seqlets"].values[0]
    dnase_counts.append([key, seqlet_frac])
    # `with` closes the file even if savetxt fails.
    with open(os.path.join(ppm_dir, mode + "_" + key + "_dnase1.pfm"), "w") as out_file:
        np.savetxt(out_file, pfms[key], fmt="%f")
    


0_0
0_1
0_2
0_3
0_4
0_5
0_6
0_7
0_8
0_9
0_10
0_11
0_12
0_13
0_14
0_15
0_16
0_17
0_18
0_19
0_20
0_21
0_22
0_23
0_24
0_25


In [31]:
# One row per exported DNase pattern: its key and its fraction of seqlets.
dnase_df = pd.DataFrame(data=dnase_counts, columns=["key", "coc_frac"])


In [32]:
# Save the DNase table alongside the exported matrices (comma-separated, header row, no index column).
dnase_df.to_csv(
    os.path.join(ppm_dir, f"{mode}_dnase.counts.csv"),
    sep=",",
    header=True,
    index=False,
)