In [2]:
import os
import pyfaidx
import pyBigWig
import h5py
import numpy as np
import one_hot
import scipy.stats
from scipy import signal

In [3]:
def trim_motif_new(cwm, motif, trim_threshold=0.3):
    """
    Trims the flanks of `motif` based on the per-position magnitude of `cwm`.

    The score of a position is the sum of the absolute values of `cwm`
    along axis 1. A position "passes" if its score is at least
    `trim_threshold` times the maximum score. The result spans from the
    first passing position to the last passing position (inclusive), so
    low-scoring positions in the interior of the motif are kept.

    Arguments:
        `cwm`: L x 4 array of contribution scores used to decide where to
            trim
        `motif`: L x 4 array that is actually sliced (this could be `cwm`
            itself)
        `trim_threshold`: fraction of the maximum position score that a
            position must reach to pass
    Returns the trimmed slice of `motif`. If no position passes the
    threshold (for example `cwm` is empty or its scores contain NaN), or
    the slice would be empty, `motif` is returned untrimmed.
    """
    score = np.sum(np.abs(cwm), axis=1)

    # np.max raises on an empty array, so bail out before computing it.
    if score.size == 0:
        return motif

    # Cut off anything scoring below `trim_threshold` of the max score
    trim_thresh = np.max(score) * trim_threshold
    pass_inds = np.flatnonzero(score >= trim_thresh)

    # A NaN threshold makes every comparison False; without this guard the
    # index lookup below would fail instead of falling back to `motif`.
    if pass_inds.size == 0:
        return motif

    # flatnonzero returns ascending indices: first is the min, last the max
    trimmed = motif[pass_inds[0]: pass_inds[-1] + 1]

    if not trimmed.size:
        return motif

    return trimmed

In [4]:
def import_tfmodisco_motifs(tfm_results_path, trim=True, only_pos=True):
    """
    Imports the PFMs into a dictionary, mapping the string `"x_y"` to the
    PFM, where `x` is the position of the metacluster among the
    metacluster keys and `y` is the position of the pattern in that
    metacluster's `all_pattern_names` list.
    Arguments:
        `tfm_results_path`: path to HDF5 containing TF-MoDISco results
        `trim`: if True, trim the PFM flanks with `trim_motif_new`, using
            the magnitude of the pattern's `task0_contrib_scores`
        `only_pos`: if True, skip patterns whose contribution scores sum
            to a negative value
    Returns the dictionary of PFMs.
    """
    pfms = {}
    with h5py.File(tfm_results_path, "r") as f:
        metaclusters = f["metacluster_idx_to_submetacluster_results"]
        for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
            patterns_result = metaclusters[metacluster_key]["seqlets_to_patterns_result"]
            # Metaclusters without any patterns have nothing to import
            if "patterns" not in patterns_result:
                continue
            patterns = patterns_result["patterns"]

            for pattern_i, pattern_name in enumerate(patterns["all_pattern_names"][:]):
                # Names may be read back as bytes or as str; only bytes
                # need decoding before being used as a group key.
                if isinstance(pattern_name, bytes):
                    pattern_name = pattern_name.decode()
                pattern = patterns[pattern_name]
                pfm = pattern["sequence"]["fwd"][:]
                cwm = pattern["task0_contrib_scores"]["fwd"][:]

                # Check that the contribution scores are overall positive
                if only_pos and np.sum(cwm) < 0:
                    continue

                if trim:
                    pfm = trim_motif_new(cwm, pfm)

                # pattern_i counts skipped patterns too, so keys are stable
                # regardless of `only_pos`.
                pfms["%d_%d" % (metacluster_i, pattern_i)] = pfm
    return pfms



In [5]:
# Absolute path to the TF-MoDISco results HDF5 analysed by this notebook.
# NOTE(review): hard-coded path; the notebook only runs where this file is
# readable -- consider making it configurable.
tfm_path="/oak/stanford/groups/akundaje/projects/chrombpnet_paper_new/modisco_jun_30/modisco/ATAC/K562/modisco_crop_500_100K_seqs_1/modisco_results_allChroms_profile.hdf5"
# Load the motifs using the function defaults (trim=True, only_pos=True).
pfms = import_tfmodisco_motifs(tfm_path)

In [12]:
          
#tfm_path="/oak/stanford/groups/akundaje/projects/chrombpnet_paper_new/ATAC_PE/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_1_data_type_ATAC_PE/BIAS/K562.profile_scores.h5"

#tn5_pfms = import_tfmodisco_motifs(tfm_path)
    
    
  

In [13]:
# One-hot encode the two reference sequences (16 bp and 12 bp) that the
# imported motifs are compared against below.
# NOTE(review): presumably these are the Tn5 and DNase-I sequence-bias
# motifs, and dna_to_one_hot returns one L x 4 array per input sequence
# (later cells index the result with [0]) -- confirm against `one_hot`.
tn5_motif = one_hot.dna_to_one_hot(["GCACAGTACAGAGCTG"])
dnase_motif = one_hot.dna_to_one_hot(["TTTACAAGTCCA"])


In [29]:
import scipy.signal

# For every imported motif, record its best 2D-convolution score against
# each reference motif, divided by the number of positions in the motif.
# NOTE(review): convolve2d flips its second argument along both axes and
# defaults to mode="full", so offsets along the base axis are scored too --
# confirm this is intended rather than correlate2d / an aligned comparison.
values = []
for key, pfm in pfms.items():
    n_positions = pfm.shape[0]
    tn5_best = np.max(scipy.signal.convolve2d(pfm, tn5_motif[0]))
    dnase_best = np.max(scipy.signal.convolve2d(pfm, dnase_motif[0]))
    values.append([key, tn5_best / n_positions, dnase_best / n_positions])

In [30]:
import pandas as pd

# One row per motif: its key plus the two length-normalised similarities.
df = pd.DataFrame(
    values,
    columns=["key", "Tn5-sim", "DNASE-sim"],
)


In [31]:
#import matplotlib.pyplot as plt
#plt.scatter(df['Tn5-sim'],df['DNASE-sim'],)

In [32]:
# Write the similarity table to the working directory as a comma-separated
# file with a header row and no index column.
df.to_csv(
    "k562_profile_cwm_enzyme_sim.csv",
    sep=",",
    header=True,
    index=False,
)