In [None]:
%%capture --no-display
import modisco
from modisco.visualization import viz_sequence
import re
import numpy as np
from xml.etree import ElementTree
import os

In [None]:
# Read the run configuration from environment variables
tf_name = os.environ["TFM_RESULTS_TF_NAME"]
fold = int(os.environ["TFM_RESULTS_FOLD"])
# The task index is optional; it stays None when the variable is not set
task_index = (
    int(os.environ["TFM_RESULTS_TASK_INDEX"])
    if "TFM_RESULTS_TASK_INDEX" in os.environ
    else None
)

# Echo the configuration so the rendered notebook records what was run
print("TF name: %s" % tf_name)
print("Fold: %s" % fold)
print("Task index: %s" % task_index)

[Skip to results](#results)

In [None]:
# Nucleotide alphabet, background frequencies, and derived lookup tables
background_freqs = np.array([0.27, 0.23, 0.23, 0.27])
bases = ["A", "C", "G", "T"]
base_ind_dict = dict(zip(bases, range(len(bases))))
dinucs = [first + second for first in bases for second in bases]
# Line prefixes (dinucleotide followed by a pipe) used to recognize
# dinucleotide rows when parsing diChIPMunk results
dichipmunk_dinuc_prefixes = [pair + "|" for pair in dinucs]

base_path = "/users/amtseng/tfmodisco/results/motif_benchmarks"

# Results for a run live under <base_path>/<tf_name>/<condition name>; the
# condition name carries the task index only when one was specified
if task_index is not None:
    cond_path = os.path.join(base_path, tf_name, "%s_fold%d_task%d" % (tf_name, fold, task_index))
else:
    cond_path = os.path.join(base_path, tf_name, "%s_fold%d" % (tf_name, fold))

# One results directory per tool, for runs on peaks and for runs on seqlets
dichipmunk_peak_results_path = os.path.join(cond_path, "peaks", "dichipmunk")
dichipmunk_seqlet_results_path = os.path.join(cond_path, "seqlets", "dichipmunk")
homer_peak_results_path = os.path.join(cond_path, "peaks", "homer")
homer_seqlet_results_path = os.path.join(cond_path, "seqlets", "homer")
meme_peak_results_path = os.path.join(cond_path, "peaks", "meme")
meme_seqlet_results_path = os.path.join(cond_path, "seqlets", "meme")

### Helper functions

In [None]:
def dinuc_to_mononuc_pfm(dinuc_dict):
    """
    Collapses per-position dinucleotide counts into a mononucleotide PFM.
    `dinuc_dict` must have exactly the 16 dinucleotides as keys, each mapping
    to an array of counts with one entry per dinucleotide position. The
    result has one more row than there are dinucleotide positions, and 4
    columns ordered as in `bases`.
    Each dinucleotide adds its counts to its first base at the same position
    and to its second base at the following position, so interior rows
    accumulate counts from two overlapping dinucleotides while the first and
    last rows accumulate from only one.
    """
    assert sorted(list(dinuc_dict.keys())) == sorted(dinucs)
    num_positions = len(dinuc_dict["AA"]) + 1
    mono_pfm = np.zeros((num_positions, 4))
    for pair, pair_counts in dinuc_dict.items():
        first_col = base_ind_dict[pair[0]]
        second_col = base_ind_dict[pair[1]]
        # The first base occupies positions 0..L-2, the second 1..L-1
        mono_pfm[:-1, first_col] += pair_counts
        mono_pfm[1:, second_col] += pair_counts
    return mono_pfm

In [None]:
def import_dichipmunk_pfms(dichipmunk_results_path):
    """
    Imports the set of motif PFMs from a diChIPMunk results directory.
    Reads `results.txt` in that directory, ignoring everything before the
    first "MOTF|" line. Returns a list of PFMs, and a list of the number of
    supporting sequences (taken from "SEQS|" lines in the motif sections).
    If the file contains no "MOTF|" line at all (e.g. it is empty), two
    empty lists are returned.
    """
    results_path = os.path.join(dichipmunk_results_path, "results.txt")
    motif_dinuc_dicts, num_seqs = [], []
    with open(results_path, "r") as f:
        dinuc_dict = {}
        # Becomes True at the first "MOTF|" line; a single pass with a flag
        # avoids calling next() on an exhausted file, which would leak a
        # StopIteration when there is no motif section
        in_motif_section = False

        for line in f:
            if line.startswith("MOTF|"):
                if in_motif_section:
                    # A later "MOTF|" closes the motif that came before it
                    motif_dinuc_dicts.append(dinuc_dict)
                    dinuc_dict = {}
                in_motif_section = True
            elif not in_motif_section:
                # Still in the preamble before the first motif
                continue
            elif line[:3] in dichipmunk_dinuc_prefixes:
                dinuc = line[:2]
                dinuc_dict[dinuc] = np.array([float(x) for x in line[3:].strip().split()])
            elif line.startswith("SEQS|"):
                num_seqs.append(int(line[5:].strip()))
        if dinuc_dict:
            motif_dinuc_dicts.append(dinuc_dict)  # Append the last filled dict

    # Convert each dinucleotide dict to a mononucleotide PFM
    pfms = [dinuc_to_mononuc_pfm(dinuc_dict) for dinuc_dict in motif_dinuc_dicts]

    return pfms, num_seqs

In [None]:
def import_homer_pfms(homer_results_path):
    """
    Imports the set of motif PFMs from a HOMER results directory.
    Reads every `motif<N>.motif` file in the `homerResults` subdirectory, in
    increasing order of N. Returns a list of PFMs (one array per motif, one
    row per position), and a parallel list of enrichment values.
    Each enrichment value is the fourth tab-separated field of the motif
    file's header line, parsed as a float.
    NOTE(review): HOMER's motif-file documentation describes that field as
    the log p-value of enrichment rather than a log-odds score; confirm
    against the HOMER version used.
    """
    results_dir = os.path.join(homer_results_path, "homerResults")
    # Only the primary motif files; other names in the directory don't match
    pattern = re.compile(r"^motif\d+\.motif$")
    pfm_files = [item for item in os.listdir(results_dir) if pattern.match(item)]
    pfms, enrichments = [], []
    # Sort numerically on N so that e.g. motif10 comes after motif2
    for pfm_file in sorted(pfm_files, key=lambda x: int(x.split(".")[0][5:])):
        pfm = []
        with open(os.path.join(results_dir, pfm_file), "r") as f:
            header = next(f)
            enrichment = float(header.strip().split("\t")[3])
            enrichments.append(enrichment)
            for line in f:
                fields = line.split()
                if not fields:
                    # A blank line would add an empty row and make the
                    # matrix ragged, so skip it
                    continue
                pfm.append(np.array([float(x) for x in fields]))
        pfms.append(np.array(pfm))
    return pfms, enrichments

In [None]:
def import_meme_pfms(meme_results_path):
    """
    Loads motif PFMs from the `meme.xml` file in a MEME results directory.
    Returns a list of PFMs (one array per motif, one row per entry of the
    motif's probability matrix), and a parallel list of e-values. The
    e-values are returned exactly as they appear in the `e_value` attribute
    of each motif, i.e. as strings.
    """
    xml_root = ElementTree.parse(os.path.join(meme_results_path, "meme.xml")).getroot()
    pfms, evalues = [], []
    for motif_elem in xml_root.find("motifs"):
        matrix_elem = motif_elem.find("probabilities").find("alphabet_matrix")
        evalues.append(motif_elem.get("e_value"))
        # Each child of the matrix is one position; its children hold the
        # per-base probabilities as text
        position_rows = [
            np.array([float(value_elem.text) for value_elem in row_elem])
            for row_elem in matrix_elem
        ]
        pfms.append(np.array(position_rows))
    return pfms, evalues

In [None]:
def info_content(track, pseudocount=0.001):
    """
    Computes the information content of an L x 4 track relative to
    `background_freqs`. Every position is smoothed with `pseudocount` and
    normalized to probabilities over its bases; the per-base terms
    p * log2(p / background) are then summed over the bases. Returns an
    L-array with one value per position.
    """
    base_count = track.shape[1]
    # Smooth and normalize each position so that its bases sum to 1
    smoothed = track + pseudocount
    position_totals = np.sum(track, axis=1, keepdims=True) + (base_count * pseudocount)
    probs = smoothed / position_totals
    background = np.expand_dims(background_freqs, axis=0)
    per_base_terms = probs * np.log2(probs / background)
    return np.sum(per_base_terms, axis=1)

In [None]:
def show_motif(pfm):
    """
    Plots a motif as a sequence logo scaled by information content.
    `pfm` is an L x 4 array of per-position base counts or probabilities.
    Each position is normalized to sum to 1 before being scaled by its
    information content, so that count matrices (such as those returned by
    `dinuc_to_mononuc_pfm`, whose rows do not all share the same total) are
    drawn on the same scale as probability matrices.
    """
    pfm = np.asarray(pfm, dtype=float)
    ic = info_content(pfm)
    position_totals = np.sum(pfm, axis=1, keepdims=True)
    # Positions that sum to 0 are left at 0 rather than divided by 0
    probs = np.divide(
        pfm, position_totals, out=np.zeros_like(pfm), where=(position_totals != 0)
    )
    viz_sequence.plot_weights(probs * np.expand_dims(ic, axis=1))

<a id="results"></a>
### Show benchmark motifs

**DiChIPMunk on peaks**

In [None]:
# Load the diChIPMunk motifs found on peaks, then report and draw each one
dichipmunk_peak_pfms, dichipmunk_peak_num_seqs = import_dichipmunk_pfms(dichipmunk_peak_results_path)
num_motifs = len(dichipmunk_peak_pfms)
for motif_num, motif_pfm in enumerate(dichipmunk_peak_pfms, start=1):
    support = dichipmunk_peak_num_seqs[motif_num - 1]
    print("Motif %d/%d: supporting sequences = %d" % (motif_num, num_motifs, support))
    show_motif(motif_pfm)

**DiChIPMunk on seqlets**

In [None]:
# Load the diChIPMunk motifs found on seqlets, then report and draw each one
dichipmunk_seqlet_pfms, dichipmunk_seqlet_num_seqs = import_dichipmunk_pfms(dichipmunk_seqlet_results_path)
num_motifs = len(dichipmunk_seqlet_pfms)
for motif_num, motif_pfm in enumerate(dichipmunk_seqlet_pfms, start=1):
    support = dichipmunk_seqlet_num_seqs[motif_num - 1]
    print("Motif %d/%d: supporting sequences = %d" % (motif_num, num_motifs, support))
    show_motif(motif_pfm)

**HOMER on peaks**

In [None]:
# Load the HOMER motifs found on peaks, then report and draw each one
homer_peak_pfms, homer_peak_enrichments = import_homer_pfms(homer_peak_results_path)
num_motifs = len(homer_peak_pfms)
for i in range(num_motifs):
    # The enrichment value is a float, so it is formatted as one; "%d" would
    # silently drop the fractional part and fails on non-finite values
    print("Motif %d/%d: log enrichment = %.2f" % (i + 1, num_motifs, homer_peak_enrichments[i]))
    show_motif(homer_peak_pfms[i])

**HOMER on seqlets**

In [None]:
# Load the HOMER motifs found on seqlets, then report and draw each one
homer_seqlet_pfms, homer_seqlet_enrichments = import_homer_pfms(homer_seqlet_results_path)
num_motifs = len(homer_seqlet_pfms)
for i in range(num_motifs):
    # The enrichment value is a float, so it is formatted as one; "%d" would
    # silently drop the fractional part and fails on non-finite values
    print("Motif %d/%d: log enrichment = %.2f" % (i + 1, num_motifs, homer_seqlet_enrichments[i]))
    show_motif(homer_seqlet_pfms[i])

**MEME on peaks**

In [None]:
# Load the MEME motifs found on peaks, then report and draw each one
meme_peak_pfms, meme_peak_evalues = import_meme_pfms(meme_peak_results_path)
num_motifs = len(meme_peak_pfms)
for motif_num, motif_pfm in enumerate(meme_peak_pfms, start=1):
    evalue = meme_peak_evalues[motif_num - 1]
    print("Motif %d/%d: E-value = %s" % (motif_num, num_motifs, evalue))
    show_motif(motif_pfm)

**MEME on seqlets**

In [None]:
# Load the MEME motifs found on seqlets, then report and draw each one
meme_seqlet_pfms, meme_seqlet_evalues = import_meme_pfms(meme_seqlet_results_path)
num_motifs = len(meme_seqlet_pfms)
for motif_num, motif_pfm in enumerate(meme_seqlet_pfms, start=1):
    evalue = meme_seqlet_evalues[motif_num - 1]
    print("Motif %d/%d: E-value = %s" % (motif_num, num_motifs, evalue))
    show_motif(motif_pfm)