### Links to results
[Proportion of peaks with hits](#peaks-with-hits)

[Co-occurrence of motifs in peaks](#co-occurrence)

[Distance between co-occurring motifs](#distance)

In [None]:
%load_ext autoreload
%autoreload 2
import os
import util
import moods
import h5py
import viz_sequence
import numpy as np
import pandas as pd
import sklearn.cluster
import scipy.cluster.hierarchy
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import subprocess
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook()

In [None]:
# Plotting defaults
# Register the fonts found in the custom font directory so that the "Roboto"
# family requested below can be resolved by Matplotlib
font_paths = font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
if hasattr(font_manager.fontManager, "addfont"):
    # `createFontList` was deprecated in Matplotlib 3.2 and later removed;
    # `addfont` is its replacement for registering a font file by path
    for font_path in font_paths:
        font_manager.fontManager.addfont(font_path)
else:
    # Older Matplotlib without `FontManager.addfont`
    font_manager.fontManager.ttflist.extend(
        font_manager.createFontList(font_paths)
    )

# Global Matplotlib settings applied to every figure in this notebook
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "font.size": 13,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

### Define constants and paths

In [None]:
# Inputs to the notebook are supplied through environment variables
tfm_results_path = os.environ["TFM_TFM_PATH"]
peak_bed_paths = [os.environ["TFM_PEAKS_PATH"]]

# Optional directory; stays None when the variable is not set
moods_dir = os.environ.get("TFM_MOODS_DIR")

print("TF-MoDISco results path: %s" % tfm_results_path)
print("Peaks path: %s" % peak_bed_paths[0])

In [None]:
# Constants
# Path to the reference genome FASTA (hg38, going by the file name); it is
# handed to `moods.get_moods_hits` when the motif scan is run
reference_fasta = "/users/amtseng/genomes/hg38.fasta"

### Helper functions
Functions for importing the TF-MoDISco motifs, matching motif hits to peaks, and plotting motif occurrence and co-occurrence

In [None]:
def import_tfmodisco_motifs(tfm_results_path, trim=True, only_pos=True):
    """
    Imports the PFMs of the TF-MoDISco patterns into a dictionary mapping the
    string key `"x_y"` to the PFM, where `x` is the position of the
    metacluster among the metaclusters stored in the file and `y` is the
    position of the pattern within that metacluster's list of pattern names.
    Arguments:
        `tfm_results_path`: path to HDF5 containing TF-MoDISco results
        `trim`: if True, trim the motif flanks based on total importance:
            only the span between the first and last position whose summed
            absolute contribution is at least 20% of the maximum is kept
        `only_pos`: if True, only return motifs with positive contributions
            (patterns whose contribution scores sum to a negative value are
            skipped)
    Returns the dictionary of PFMs.
    """
    pfms = {}
    with h5py.File(tfm_results_path, "r") as f:
        metaclusters = f["metacluster_idx_to_submetacluster_results"]
        # NOTE(review): `metacluster_i` is the enumeration position of the
        # key, not a number parsed from the key name -- confirm this matches
        # the intended metacluster index when there are many metaclusters
        for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
            metacluster = metaclusters[metacluster_key]
            patterns = metacluster["seqlets_to_patterns_result"]["patterns"]

            # Read the list of pattern names from the file only once
            pattern_names = patterns["all_pattern_names"][:]
            for pattern_i, pattern_name in enumerate(pattern_names):
                pattern = patterns[pattern_name.decode()]
                pfm = pattern["sequence"]["fwd"][:]
                cwm = pattern["task0_contrib_scores"]["fwd"][:]

                # Check that the contribution scores are overall positive
                if only_pos and np.sum(cwm) < 0:
                    continue

                if trim:
                    # Per-position importance is the total absolute contribution
                    score = np.sum(np.abs(cwm), axis=1)
                    # Cut off flanks with less than 20% of the max score
                    trim_thresh = np.max(score) * 0.2
                    pass_inds = np.where(score >= trim_thresh)[0]
                    pfm = pfm[np.min(pass_inds): np.max(pass_inds) + 1]

                pfms["%d_%d" % (metacluster_i, pattern_i)] = pfm
    return pfms

In [None]:
def get_peak_hits(peak_table, hit_table):
    """
    For each peak, extracts the set of motif hits that fall in that peak.
    A hit falls in a peak if it is on the same chromosome and its start
    coordinate lies in the half-open interval [peak_start, peak_end).
    Arguments:
        `peak_table`: table of peaks with columns `chrom`, `peak_start`, and
            `peak_end`
        `hit_table`: table of motif hits with columns `chrom` and `start`
    Returns a list mapping peak index to a subtable of `hit_table`. The index
    of the list is the index of the peak table.
    """
    # Split the hits by chromosome once, rather than re-scanning the whole
    # hit table for every peak
    hits_by_chrom = {
        chrom: chrom_hits
        for chrom, chrom_hits in hit_table.groupby("chrom", sort=False)
    }
    # Empty subtable (same columns) for peaks on chromosomes without any hits
    no_hits = hit_table.iloc[:0]

    peak_hits = []
    for _, row in tqdm.notebook.tqdm(peak_table.iterrows(), total=len(peak_table)):
        chrom, start, end = row["chrom"], row["peak_start"], row["peak_end"]
        chrom_hits = hits_by_chrom.get(chrom, no_hits)
        in_peak = (chrom_hits["start"] >= start) & (chrom_hits["start"] < end)
        peak_hits.append(chrom_hits[in_peak])
    return peak_hits

In [None]:
def get_peak_motif_indicators(peak_hits, motif_keys):
    """
    Builds a binary N x M array from the peak hits (as returned by
    `get_peak_hits`), where N is the number of peaks and M is the number of
    motifs. Entry (n, m) is 1 if motif m has at least one hit in peak n, and
    0 otherwise. `motif_keys` is a list of motif keys as they appear in the
    `key` column of the tables in `peak_hits`; column m of the result
    corresponds to `motif_keys[m]`.
    """
    # Column of the indicator array that belongs to each motif key
    column_of = {key: col for col, key in enumerate(motif_keys)}

    num_peaks = len(peak_hits)
    indicators = np.zeros((num_peaks, len(motif_keys)), dtype=int)
    for peak_i in tqdm.notebook.trange(num_peaks):
        # Each distinct motif key seen in this peak switches on its column
        for key in np.unique(peak_hits[peak_i]["key"]):
            indicators[peak_i][column_of[key]] = 1
    return indicators

In [None]:
def plot_peak_motif_indicator_heatmap(peak_hit_indicators, motif_keys):
    """
    Shows the peak-by-motif indicator array as a heatmap. The peaks (rows)
    are grouped by k-means cluster, and the clusters are arranged in the
    order given by hierarchical clustering of the cluster centers. The
    motifs (columns) are labeled with `motif_keys`.
    """
    num_peaks = len(peak_hit_indicators)

    # k-means over the peaks: one cluster per 50 peaks, but never fewer than 5
    num_clusters = max(5, num_peaks // 50)
    kmeans = sklearn.cluster.KMeans(n_clusters=num_clusters)
    labels = kmeans.fit_predict(peak_hit_indicators)

    # Hierarchically cluster the k-means centers and take the optimal leaf
    # ordering as the order in which the clusters are drawn
    centers = kmeans.cluster_centers_
    linkage = scipy.cluster.hierarchy.linkage(centers, method="centroid")
    ordered_linkage = scipy.cluster.hierarchy.optimal_leaf_ordering(linkage, centers)
    cluster_order = scipy.cluster.hierarchy.leaves_list(ordered_linkage)

    # Row indices of the peaks, cluster by cluster in that order
    row_order = np.concatenate(
        [np.where(labels == cluster_id)[0] for cluster_id in cluster_order]
    )
    ordered_indicators = peak_hit_indicators[row_order]

    # Figure height grows with the number of peaks, capped at 8
    height = min(num_peaks * 0.004, 8)
    fig, ax = plt.subplots(figsize=(16, height))

    ax.imshow(ordered_indicators, interpolation="nearest", aspect="auto", cmap="Greens")

    # No ticks for the individual peaks; one labeled tick per motif
    ax.set_yticks([])
    ax.set_yticklabels([])
    ax.set_xticks(np.arange(len(motif_keys)))
    ax.set_xticklabels(motif_keys)
    ax.set_xlabel("Motif")

    fig.tight_layout()
    plt.show()

In [None]:
def plot_motif_cooccurrence_heatmap(peak_hit_indicators, motif_keys):
    """
    Plots an annotated heatmap in which cell (i, j) is the number of peaks
    whose indicator entries for motif i and motif j sum to 2, i.e. peaks
    containing both motifs. The matrix is symmetric and its diagonal is left
    at 0. `motif_keys` supplies the tick labels.
    """
    num_motifs = peak_hit_indicators.shape[1]

    # Fill the lower triangle and mirror each count into the upper triangle
    count_matrix = np.zeros((num_motifs, num_motifs), dtype=int)
    for first in range(1, num_motifs):
        for second in range(first):
            pair_total = np.sum(peak_hit_indicators[:, [first, second]], axis=1)
            both = np.sum(pair_total == 2)
            count_matrix[first, second] = both
            count_matrix[second, first] = both

    # Square figure that grows with the number of motifs
    side = max(5, num_motifs)
    fig, ax = plt.subplots(figsize=(side, side))
    hmap = ax.imshow(count_matrix)

    tick_positions = np.arange(num_motifs)
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(motif_keys, rotation=45)
    ax.set_yticklabels(motif_keys)

    # Write the count into every cell (x is the column, y is the row)
    for row, col in np.ndindex(num_motifs, num_motifs):
        ax.text(col, row, count_matrix[row, col], ha="center", va="center")
    fig.colorbar(hmap, orientation="horizontal")

    ax.set_title("Number of peaks with both motifs")
    fig.tight_layout()
    plt.show()

## Import TF-MoDISco results

In [None]:
# Import the PFMs
# (with the function defaults: flanks trimmed, positive-contribution motifs only)
pfms = import_tfmodisco_motifs(tfm_results_path)
# Motif order used for the columns and labels of the analyses below
motif_keys = list(pfms.keys())

In [None]:
# Run MOODS
# Presumably scans the peak sequences for hits of each PFM -- see
# `moods.get_moods_hits`; `moods_dir` (None if unset) is passed as its
# temporary directory
hit_table = moods.get_moods_hits(pfms, reference_fasta, peak_bed_paths[0], temp_dir=moods_dir)

In [None]:
# Import peaks
# `peak_bed_paths` is a one-element list; the helper is given the whole list
peak_table = util.import_peak_table(peak_bed_paths)

In [None]:
# Match peaks to motif hits
# (one subtable of `hit_table` per peak, in the order of `peak_table`)
peak_hits = get_peak_hits(peak_table, hit_table)

In [None]:
# Construct indicator array of peaks and hits
# (binary, peaks x motifs, with columns in the order of `motif_keys`)
peak_hit_indicators = get_peak_motif_indicators(peak_hits, motif_keys)

<a id="peaks-with-hits"></a>
### Proportion of peaks with hits

In [None]:
# Number of motif hits (rows of the hit subtable) in each peak
motifs_per_peak = [len(hits) for hits in peak_hits]
max_hits = 5

fig, ax = plt.subplots(figsize=(10, 10))
# Unit-width bins starting at 0, 1, ..., max_hits - 1, followed by a single
# open-ended bin for peaks with max_hits or more hits
bins = np.concatenate([np.arange(max_hits + 1), [np.inf]])
ax.hist(motifs_per_peak, bins=bins, density=True, histtype="step", cumulative=True)
ax.set_title("Cumulative distribution of number of motif hits per peak")
ax.set_xlabel("Number of motifs k in peak")
# `cumulative=True` accumulates from the smallest bin upward, so the height at
# k is the proportion of peaks with k or fewer hits (not "at least k")
ax.set_ylabel("Proportion of peaks with at most k motifs")
plt.show()

<a id="co-occurrence"></a>
### Co-occurrence of motifs
Which motifs appear in each peak, and the number of peaks in which each pair of motifs co-occurs

In [None]:
# Heatmap of which motifs (columns) have a hit in each peak (rows)
plot_peak_motif_indicator_heatmap(peak_hit_indicators, motif_keys)

In [None]:
# Motif-by-motif heatmap of the number of peaks that contain both motifs
plot_motif_cooccurrence_heatmap(peak_hit_indicators, motif_keys)

<a id="distance"></a>
### Distribution of distances between motifs
When motifs co-occur, show the distance between the instances

In [None]:
def plot_distance_distribution(peak_hits, motif_keys):
    """
    For each pair of motifs (including each motif paired with itself), plots
    a histogram of signed distances between instances of the two motifs that
    occur in the same peak.
    For every instance of the first motif, the distance is the start of the
    nearest instance of the second motif (ignoring instances at the very
    same start) minus the instance's own start. The distance is only kept if
    it exceeds the length of whichever motif lies upstream.
    Arguments:
        `peak_hits`: list of per-peak hit tables, as returned by
            `get_peak_hits`, with columns `key`, `start`, and `end`
        `motif_keys`: list of motif keys as they appear in `peak_hits`
    Pairs of motifs without any such distance produce no plot.
    """
    for i in range(len(motif_keys)):
        for j in range(i, len(motif_keys)):
            dists = []
            for k in tqdm.notebook.trange(len(peak_hits)):
                hits = peak_hits[k]

                hits_1 = hits[hits["key"] == motif_keys[i]]
                hits_2 = hits[hits["key"] == motif_keys[j]]

                if hits_1.empty or hits_2.empty:
                    continue

                pos_1 = np.array(hits_1["start"])
                pos_2 = np.array(hits_2["start"])

                # The length of the first hit is taken as the motif length
                len_1 = (hits_1["end"] - hits_1["start"]).values[0]
                len_2 = (hits_2["end"] - hits_2["start"]).values[0]

                # Differences between all pairs of positions; row r holds the
                # offsets of every motif 2 instance from motif 1 instance r
                diffs = pos_2[None] - pos_1[:, None]
                # Take the minimum distance for each instance of motif 1, but
                # only if the distance is an appropriate length
                for row in diffs:
                    # Drop zero offsets (e.g. an instance paired with itself)
                    row = row[row != 0]
                    if not row.size:
                        continue
                    dist = row[np.argmin(np.abs(row))]
                    # Motif 2 upstream: must be more than its own length away;
                    # motif 2 downstream: more than the length of motif 1 away
                    if (dist < 0 and dist < -len_2) or (dist > 0 and dist > len_1):
                        dists.append(dist)

            dists = np.array(dists)
            if not dists.size:
                continue
            fig, ax = plt.subplots(figsize=(10, 6))

            # One bin edge per 100 distances, capped at 10. At least 2 edges
            # are needed to form a bin; without the floor, fewer than 200
            # distances would give 0 or 1 edges and an empty histogram
            num_edges = max(2, min(10, len(dists) // 100))
            low, high = np.min(dists), np.max(dists)
            if low == high:
                # All distances are equal: evenly spaced edges would collapse
                # to zero width, so let Matplotlib build a single bin
                bins = 1
            else:
                bins = np.linspace(low, high, num=num_edges)
            ax.hist(dists, bins=bins, density=True)

            ax.set_xlabel("Signed distance from %s to %s" % (motif_keys[i], motif_keys[j]))
            plt.show()

In [None]:
# One histogram of signed distances per pair of motifs that co-occur in peaks
plot_distance_distribution(peak_hits, motif_keys)