In [None]:
%%capture --no-display
%load_ext autoreload
%autoreload 2
import sys
import os
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
import motif_bench.read_motifs as read_motifs
import motif_bench.match_motifs as match_motifs
import modisco
from modisco.visualization import viz_sequence
import numpy as np

In [None]:
# Define parameters/fetch arguments
# The TF name and fold are required environment variables (a missing one
# raises KeyError); the task index is optional
tf_name = os.environ["TFM_RESULTS_TF_NAME"]
fold = int(os.environ["TFM_RESULTS_FOLD"])
task_index_env = os.environ.get("TFM_RESULTS_TASK_INDEX")
# `task_index` stays None when the variable is not set at all
task_index = None if task_index_env is None else int(task_index_env)

print("TF name: %s" % tf_name)
print("Fold: %s" % fold)
print("Task index: %s" % task_index)

[Skip to results](#results)

In [None]:
# Define paths and constants
# Uniform background frequency for each of the 4 bases
background_freqs = np.array([0.25, 0.25, 0.25, 0.25])
# Every fold in 1..10 other than the one being analyzed
other_folds = [f for f in range(1, 11) if f != fold]

bench_base_path = "/users/amtseng/tfmodisco/results/motif_benchmarks"
tfm_base_path = "/users/amtseng/tfmodisco/results/tfmodisco"

# Per-TF directories under each results root
tf_bench_path = os.path.join(bench_base_path, tf_name)
tf_tfm_path = os.path.join(tfm_base_path, tf_name)

if task_index is not None:
    # A task index was given: directory and file names carry a `_task<index>`
    # suffix, and no other-fold result paths are defined
    bench_cond_path = os.path.join(tf_bench_path, "%s_fold%d_task%d" % (tf_name, fold, task_index))
    tfm_results_path = os.path.join(tf_tfm_path, "%s_tfm_fold%d_task%d.h5" % (tf_name, fold, task_index))
else:
    bench_cond_path = os.path.join(tf_bench_path, "%s_fold%d" % (tf_name, fold))
    tfm_results_path = os.path.join(tf_tfm_path, "%s_tfm_fold%d.h5" % (tf_name, fold))
    # Same file naming scheme, one path per entry of `other_folds` (same order)
    tfm_other_fold_results_paths = [
        os.path.join(tf_tfm_path, "%s_tfm_fold%d.h5" % (tf_name, f)) for f in other_folds
    ]

# Benchmark results are split by input type ("peaks" vs. "seqlets"), then by tool
peak_results_path = os.path.join(bench_cond_path, "peaks")
seqlet_results_path = os.path.join(bench_cond_path, "seqlets")

dichipmunk_peak_results_path = os.path.join(peak_results_path, "dichipmunk")
homer_peak_results_path = os.path.join(peak_results_path, "homer")
meme_peak_results_path = os.path.join(peak_results_path, "meme")
dichipmunk_seqlet_results_path = os.path.join(seqlet_results_path, "dichipmunk")
homer_seqlet_results_path = os.path.join(seqlet_results_path, "homer")
meme_seqlet_results_path = os.path.join(seqlet_results_path, "meme")

### Helper functions

In [None]:
def info_content(track, pseudocount=0.001, background=None):
    """
    Given an L x 4 track, computes information content for each base and
    returns it as an L-array.
    Arguments:
        `track`: an L x B array-like of per-position base values (B = 4 in
            this notebook); each row is normalized to sum to 1
        `pseudocount`: value added to every entry before normalization, so
            that zero entries do not produce log(0)
        `background`: optional B-array of background base frequencies; if
            None, the notebook-level `background_freqs` is used
    Returns an L-array where entry i is the sum over bases of
    p * log2(p / background) for the normalized row i.
    """
    if background is None:
        # Fall back to the notebook-wide background distribution
        background = background_freqs
    background = np.asarray(background)
    track = np.asarray(track)

    num_bases = track.shape[1]
    # Normalize track to probabilities along base axis
    track_norm = (track + pseudocount) / (np.sum(track, axis=1, keepdims=True) + (num_bases * pseudocount))
    # Per-base contribution relative to the background; the background is
    # broadcast across all L positions
    ic = track_norm * np.log2(track_norm / np.expand_dims(background, axis=0))
    return np.sum(ic, axis=1)

In [None]:
def show_motif(pfm):
    """
    Plots a single motif, scaling the values at each position of `pfm` by
    that position's information content (as computed by `info_content`).
    """
    position_ic = info_content(pfm)
    # Turn the L-array into an L x 1 column so it broadcasts across bases
    ic_scaled_pfm = pfm * position_ic[:, np.newaxis]
    viz_sequence.plot_weights(ic_scaled_pfm)
def show_motifs(pfms):
    """
    Plots every motif in the iterable `pfms`, in order, by calling
    `show_motif` on each one.
    """
    for motif in pfms:
        show_motif(motif)

### Import motifs

In [None]:
# Benchmark motifs, per tool, from both the peak-based and seqlet-based runs.
# Each importer returns the PFMs together with a second tool-specific value
# (named here as number of sequences, enrichments, or E-values).
dichipmunk_peak_pfms, dichipmunk_peak_num_seqs = read_motifs.import_dichipmunk_pfms(dichipmunk_peak_results_path)
dichipmunk_seqlet_pfms, dichipmunk_seqlet_num_seqs = read_motifs.import_dichipmunk_pfms(dichipmunk_seqlet_results_path)
homer_peak_pfms, homer_peak_enrichments = read_motifs.import_homer_pfms(homer_peak_results_path)
homer_seqlet_pfms, homer_seqlet_enrichments = read_motifs.import_homer_pfms(homer_seqlet_results_path)
meme_peak_pfms, meme_peak_evalues = read_motifs.import_meme_pfms(meme_peak_results_path)
meme_seqlet_pfms, meme_seqlet_evalues = read_motifs.import_meme_pfms(meme_seqlet_results_path)

# TF-MoDISco motifs for this fold (and task, if one was given)
tfm_pfms, tfm_cwms, tfm_num_seqlets = read_motifs.import_tfmodisco_motifs(tfm_results_path)
if task_index is None:
    # Other-fold result paths only exist when no task index was given.
    # `zip(*...)` transposes the per-fold (pfms, cwms, num_seqlets) triples
    # into three parallel tuples, ordered like `other_folds`.
    tfm_other_fold_pfms, tfm_other_fold_cwms, tfm_other_fold_num_seqlets = zip(
        *[read_motifs.import_tfmodisco_motifs(path) for path in tfm_other_fold_results_paths]
    )
    # Backward-compatible alias for the previous, misspelled variable name
    tf_other_fold_num_seqlets = tfm_other_fold_num_seqlets

<a id="results"></a>
### Match benchmarked motifs to TF-MoDISco motifs

In [None]:
def _match_to_tfm(bench_pfms):
    """
    Runs `match_motifs.match_motifs` with this run's TF-MoDISco PFMs as the
    first argument and the given benchmark PFMs as the second.
    """
    return match_motifs.match_motifs(tfm_pfms, bench_pfms)

# One match result per benchmark tool and input type
dichipmunk_peak_matches = _match_to_tfm(dichipmunk_peak_pfms)
dichipmunk_seqlet_matches = _match_to_tfm(dichipmunk_seqlet_pfms)
homer_peak_matches = _match_to_tfm(homer_peak_pfms)
homer_seqlet_matches = _match_to_tfm(homer_seqlet_pfms)
meme_peak_matches = _match_to_tfm(meme_peak_pfms)
meme_seqlet_matches = _match_to_tfm(meme_seqlet_pfms)

In [None]:
# Show benchmark motifs matched to TF-MoDISco motifs

# One entry per benchmark set, in display order:
# (message template, match array, benchmark PFMs).
# Each match array is compared against a TF-MoDISco motif index, and the
# positions that match are used to index into the benchmark PFMs.
matched_benchmark_sets = [
    ("DiChIPMunk (peak) matches: %s", dichipmunk_peak_matches, dichipmunk_peak_pfms),
    ("DiChIPMunk (seqlet) matches: %s", dichipmunk_seqlet_matches, dichipmunk_seqlet_pfms),
    ("HOMER (peak) matches: %s", homer_peak_matches, homer_peak_pfms),
    ("HOMER (seqlet) matches: %s", homer_seqlet_matches, homer_seqlet_pfms),
    ("MEME (peak) matches: %s", meme_peak_matches, meme_peak_pfms),
    ("MEME (seqlet) matches: %s", meme_seqlet_matches, meme_seqlet_pfms),
]

for tfm_pfm_i, tfm_pfm in enumerate(tfm_pfms):
    print("=======================================")
    print("TF-MoDISco motif %d" % tfm_pfm_i)
    show_motif(tfm_pfm)
    for message, bench_matches, bench_pfms in matched_benchmark_sets:
        print("---------------------------------------")
        # Indices of the benchmark motifs assigned to this TF-MoDISco motif
        bench_inds = np.where(bench_matches == tfm_pfm_i)[0]
        # Prints "None" when no benchmark motif matched
        print(message % (" ".join(bench_inds.astype(str)) if bench_inds.size else None))
        show_motifs([bench_pfms[i] for i in bench_inds])

In [None]:
# Show leftover benchmark motifs not matched to any TF-MoDISco motifs

# One entry per benchmark set, in display order:
# (message template, match array, benchmark PFMs)
leftover_benchmark_sets = [
    ("DiChIPMunk (peak) leftovers: %s", dichipmunk_peak_matches, dichipmunk_peak_pfms),
    ("DiChIPMunk (seqlet) leftovers: %s", dichipmunk_seqlet_matches, dichipmunk_seqlet_pfms),
    ("HOMER (peak) leftovers: %s", homer_peak_matches, homer_peak_pfms),
    ("HOMER (seqlet) leftovers: %s", homer_seqlet_matches, homer_seqlet_pfms),
    ("MEME (peak) leftovers: %s", meme_peak_matches, meme_peak_pfms),
    ("MEME (seqlet) leftovers: %s", meme_seqlet_matches, meme_seqlet_pfms),
]

for set_i, (message, bench_matches, bench_pfms) in enumerate(leftover_benchmark_sets):
    if set_i > 0:
        # Separator goes between sets only, not before the first one
        print("---------------------------------------")
    # A match value of -1 marks a benchmark motif as a leftover
    bench_inds = np.where(bench_matches == -1)[0]
    # Prints "None" when every benchmark motif in the set was matched
    print(message % (" ".join(bench_inds.astype(str)) if bench_inds.size else None))
    show_motifs([bench_pfms[i] for i in bench_inds])

### Match TF-MoDISco motifs across folds
This is only done if `task_index` is `None`, since TF-MoDISco results from the other folds are only loaded in that case.

In [None]:
if task_index is None:
    # One match result per other fold, in the same order as `other_folds`;
    # this fold's TF-MoDISco PFMs are always the first argument
    tfm_other_fold_matches = []
    for other_fold_pfms in tfm_other_fold_pfms:
        tfm_other_fold_matches.append(
            match_motifs.match_motifs(tfm_pfms, other_fold_pfms)
        )

In [None]:
# Show other-fold TF-MoDISco motifs matched to this fold's TF-MoDISco motifs
if task_index is None:
    # Pair every other fold with its match result and its PFMs up front
    other_fold_results = list(zip(other_folds, tfm_other_fold_matches, tfm_other_fold_pfms))
    for tfm_pfm_i, tfm_pfm in enumerate(tfm_pfms):
        print("=======================================")
        print("TF-MoDISco fold %d motif %d" % (fold, tfm_pfm_i))
        show_motif(tfm_pfm)
        for other_fold, fold_matches, fold_pfms in other_fold_results:
            print("---------------------------------------")
            # Indices of the other fold's motifs assigned to this motif
            inds = np.where(fold_matches == tfm_pfm_i)[0]
            # Printed as "None" when the other fold has no match
            inds_str = " ".join(inds.astype(str)) if inds.size else None
            print("TF-MoDISco fold %d matches: %s" % (other_fold, inds_str))
            show_motifs([fold_pfms[i] for i in inds])

In [None]:
# Show leftover other-fold TF-MoDISco motifs not matched to any of this fold's TF-MoDISco motifs
if task_index is None:
    for other_fold, fold_matches, fold_pfms in zip(other_folds, tfm_other_fold_matches, tfm_other_fold_pfms):
        print("---------------------------------------")
        # A match value of -1 marks a motif as a leftover
        inds = np.where(fold_matches == -1)[0]
        # Printed as "None" when the other fold has no leftovers
        inds_str = " ".join(inds.astype(str)) if inds.size else None
        print("TF-MoDISco fold %d leftovers: %s" % (other_fold, inds_str))
        show_motifs([fold_pfms[i] for i in inds])