[Skip to results](#results)

In [None]:
import sys
import os
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
import motif_bench.read_motifs as read_motifs
import motif_bench.match_motifs as match_motifs
import deeplift.visualization.viz_sequence as viz_sequence
import numpy as np

In [None]:
# Define parameters/fetch arguments
# The TF-MoDISco results path is taken from the TFM_TFM_PATH environment
# variable; indexing `os.environ` directly means a KeyError is raised here if
# the variable is not set
tfm_results_path = os.environ["TFM_TFM_PATH"]
print("TF-MoDISco results path: %s" % tfm_results_path)

In [None]:
# Constants
# Background base frequencies that `info_content` compares each position's
# base distribution against; uniform over the 4 bases (the base order is
# immaterial while all four entries are equal)
background_freqs = np.array([0.25, 0.25, 0.25, 0.25])

### Helper functions

In [None]:
def info_content(track, pseudocount=0.001):
    """
    Computes the per-position information content of an L x 4 track, measured
    in bits relative to the module-level `background_freqs`. Each position is
    first turned into a probability distribution over the bases, with
    `pseudocount` added to every entry so that zero entries do not produce
    log(0). Returns an L-array with one value per position.
    """
    alphabet_size = track.shape[1]

    # Smoothed per-position totals; adding the pseudocount once per base keeps
    # each normalized row summing to 1
    row_totals = track.sum(axis=1, keepdims=True) + (alphabet_size * pseudocount)
    probs = (track + pseudocount) / row_totals

    # Broadcast the background over positions as a 1 x 4 row
    background = np.asarray(background_freqs)[np.newaxis, :]

    # Sum p * log2(p / background) over the base axis
    return (probs * np.log2(probs / background)).sum(axis=1)

In [None]:
def show_motif(pfm):
    """
    Plots a single motif, with the values at each position scaled by that
    position's information content (as computed by `info_content`).
    """
    position_ic = info_content(pfm)
    # Add a trailing axis so the per-position values broadcast across bases
    ic_scaled = pfm * position_ic[:, np.newaxis]
    viz_sequence.plot_weights(ic_scaled)
def show_motifs(pfms):
    """
    Plots every motif in `pfms`, one at a time and in the given order, by
    passing each to `show_motif`.
    """
    for motif in pfms:
        show_motif(motif)

In [None]:
def dna_to_one_hot(seqs):
    """
    Converts a list of DNA ("ACGT") sequences to one-hot encodings, where the
    position of 1s is ordered alphabetically by "ACGT". `seqs` must be a list
    of N strings, where every string is the same length L. Returns an N x L x 4
    NumPy array of one-hot encodings (as floats), in the same order as the
    input sequences. Bases are matched case-insensitively. Any character that
    is not one of "ACGT" (in either case) is given an encoding of all 0s.
    An empty list returns an empty array of shape 0 x 0 x 4. Raises an
    AssertionError if the sequences are not all the same length.
    """
    if len(seqs) == 0:
        return np.zeros((0, 0, 4))

    seq_len = len(seqs[0])
    assert all(len(s) == seq_len for s in seqs), \
        "All sequences must be the same length"

    # Rows 0-3 are the one-hot vectors for A, C, G, T; row 4 is all 0s and is
    # used for every other character
    one_hot_map = np.identity(5)[:, :-1]

    # Fixed byte-value -> row table. The row of a base must not depend on
    # which other bases appear in the input: ranking only the codes that are
    # present (e.g. with `np.unique`) mis-encodes any input that lacks one of
    # A/C/G/T (for instance, "GGG" would come out as all A)
    base_to_row = np.full(256, 4, dtype=np.intp)
    for row, base in enumerate("ACGT"):
        base_to_row[ord(base)] = row
        base_to_row[ord(base.lower())] = row

    # Encode to exactly one byte per character; anything outside ASCII becomes
    # "?" and so maps to the all-0s row, keeping the N x L layout intact
    seq_bytes = "".join(seqs).encode("ascii", errors="replace")
    base_vals = np.frombuffer(seq_bytes, dtype=np.uint8)

    base_inds = base_to_row[base_vals]
    return one_hot_map[base_inds].reshape((len(seqs), seq_len, 4))

<a id="results"></a>
### Match TF-MoDISco motifs to known motifs

In [None]:
# Load the TF-MoDISco motifs from the results path. The importer's return
# value is unpacked into three names: PFMs, CWMs, and seqlet counts (meanings
# inferred from the variable names -- confirm against `read_motifs`)
tfm_pfms, tfm_cwms, tfm_num_seqlets = read_motifs.import_tfmodisco_motifs(tfm_results_path)

In [None]:
# Match the TF-MoDISco PFMs against a motif database. The result is indexed by
# the motif's position in `tfm_pfms`, and each entry is iterated as
# (name, sequence, q-value) triples by the display loop
pfm_matches = match_motifs.match_motifs_to_database(tfm_pfms)

In [None]:
# Show benchmark motifs matched to TF-MoDISco motifs
for tfm_pfm_i, tfm_pfm in enumerate(tfm_pfms):
    print("=======================================")
    print("TF-MoDISco motif %d" % tfm_pfm_i)
    show_motif(tfm_pfm)
    print("---------------------------------------")
    print("Top TOMTOM matches")
    for match_name, match_seq, match_qval in pfm_matches[tfm_pfm_i]:
        print("%s: q-val = %f" % (match_name, match_qval))
        show_motif(dna_to_one_hot([match_seq])[0])