### Link to results
[Within-motif heterogeneity](#motif-subclusters)

[Clustering of peaks](#peaks)

In [None]:
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook()

In [None]:
# Plotting defaults
# Register the custom fonts with Matplotlib. `createFontList` was deprecated in
# Matplotlib 3.2 and later removed, so prefer `FontManager.addfont` when it
# exists and only fall back to the legacy call on older versions.
font_files = font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
if hasattr(font_manager.fontManager, "addfont"):
    for font_file in font_files:
        font_manager.fontManager.addfont(font_file)
else:
    font_manager.fontManager.ttflist.extend(font_manager.createFontList(font_files))

# Global Matplotlib settings (font sizes/family/weight) applied to every figure
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

### Define constants and paths

In [None]:
# Define parameters/fetch arguments
# NOTE(review): `tf_name` is required to build `labels_path` further down but
# was never defined in this notebook; the variable name `TFM_TF_NAME` follows
# the naming of the other arguments -- confirm against whatever sets them
tf_name = os.environ["TFM_TF_NAME"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
# The task index is optional: if it is unset or empty, `task_index` is None,
# which the peak-path selection further down treats as "use all peak files"
task_index = (
    int(os.environ["TFM_TASK_INDEX"]) if os.environ.get("TFM_TASK_INDEX") else None
)
embeddings_path = os.environ["TFM_EMB_PATH"]
moods_dir = os.environ["TFM_MOODS_DIR"]

print("TF name: %s" % tf_name)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("Importance score key: %s" % hyp_score_key)
# `%s` rather than `%d` so that a missing task index (None) can be printed
print("Task index: %s" % task_index)
print("Embeddings path: %s" % embeddings_path)
print("MOODS directory: %s" % moods_dir)

In [None]:
# Define constants
# Width to which peak and embedding coordinates are expanded further down
input_length = 2114
# Central cut size passed along when importing the SHAP scores/TF-MoDISco results
shap_score_center_size = 400

base_path = "/users/amtseng/tfmodisco/"
data_path = os.path.join(base_path, "data/processed/ENCODE/")
labels_path = os.path.join(data_path, "labels/%s" % tf_name)

# Paths to original called peaks: every gzipped BED file in the labels
# directory (sorted by name), or only the one at `task_index` if it is given
all_peak_beds = sorted(
    name for name in os.listdir(labels_path) if name.endswith(".bed.gz")
)
peak_bed_paths = [
    os.path.join(labels_path, name)
    for name in (all_peak_beds if task_index is None else [all_peak_beds[task_index]])
]

### Helper functions
For plotting and organizing things

In [None]:
def compute_tfmodisco_motif_subclusters(tfm_results):
    """
    From an imported TF-MoDISco results object, computes the subclustering
    of heterogeneity within each motif/pattern.
    Arguments:
        `tfm_results`: imported TF-MoDISco results object; its
            `metacluster_idx_to_submetacluster_results` mapping is traversed
    The subclustering is computed by calling
    `compute_subclusters_and_embedding` on every pattern, so the patterns of
    `tfm_results` are updated in place. Returns nothing.
    """
    metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
    for metacluster_key in metaclusters.keys():
        metacluster = metaclusters[metacluster_key]
        patterns = metacluster.seqlets_to_patterns_result.patterns
        if not patterns:
            # A metacluster without patterns must not stop the processing of
            # the metaclusters that come after it
            continue
        for pattern in patterns:
            # Compute subclustering for each pattern (motif)
            pattern.compute_subclusters_and_embedding(
                pattern_comparison_settings=modisco.affinitymat.core.PatternComparisonSettings(
                    track_names=["task0_hypothetical_contribs", "task0_contrib_scores"],
                    track_transformer=modisco.affinitymat.L1Normalizer(),
                    min_overlap=None  # This argument is irrelevant here
                ),
                perplexity=30, n_jobs=4, verbose=True
            )

In [None]:
def trim_hcwm(pfm, hcwm, ic_threshold=0.2, pad=4):
    """
    Trims the flanks of an hCWM based on the information content of its PFM.
    Arguments:
        `pfm`: the PFM of the motif; passed to `pfm_info_content`, which is
            assumed to return one information content value per position
        `hcwm`: the hCWM to trim, indexed by position along its first axis
        `ic_threshold`: positions with information content of at least this
            value delimit the core of the motif
        `pad`: number of extra positions kept on either side of the core
    Returns the slice of `hcwm` spanning the first to the last passing
    position, expanded by `pad` on each side and clipped to the motif bounds.
    If no position passes the threshold, `hcwm` is returned untrimmed.
    """
    # Trim motif based on information content
    ic = pfm_info_content(pfm)
    pass_inds = np.where(ic >= ic_threshold)[0]  # Positions that are kept
    if not pass_inds.size:
        # Nothing passes the threshold; `np.min`/`np.max` would raise on the
        # empty array, so leave the motif as it is
        return hcwm

    # Expand trimming to +/- `pad` bp on either side
    start = max(0, np.min(pass_inds) - pad)
    end = min(len(pfm), np.max(pass_inds) + pad + 1)
    return hcwm[start:end]

In [None]:
def _plot_trimmed_hcwm(pattern):
    """
    Plots the forward hCWM of a pattern (or subpattern), trimmed with
    `trim_hcwm` using the pattern's forward PFM. Returns the figure.
    """
    pfm = pattern["sequence"].fwd
    hcwm = pattern["task0_hypothetical_contribs"].fwd
    trimmed_hcwm = trim_hcwm(pfm, hcwm)
    return viz_sequence.plot_weights(
        trimmed_hcwm, subticks_frequency=(len(trimmed_hcwm) + 1), return_fig=True
    )


def plot_motif_heterogeneity(tfm_results):
    """
    For every pattern of every metacluster in an imported TF-MoDISco results
    object, displays a table with one row for the aggregate pattern and one
    row for each of its subpatterns. Each row shows the number of seqlets, a
    scatter plot of the pattern's 2D seqlet embedding (colored by subcluster
    for the aggregate row, or by membership in the subcluster for a
    subpattern row), and the trimmed hCWM.
    Arguments:
        `tfm_results`: imported TF-MoDISco results object whose patterns
            already have `twod_embedding`, `subclusters`, and
            `subcluster_to_subpattern` available
    Displays the output directly; returns nothing.
    """
    colgroup = vdomh.colgroup(
        vdomh.col(style={"width": "5%"}),
        vdomh.col(style={"width": "5%"}),
        vdomh.col(style={"width": "50%"}),
        vdomh.col(style={"width": "40%"})
    )
    header = vdomh.thead(
        vdomh.tr(
            vdomh.th("Subpattern", style={"text-align": "center"}),
            vdomh.th("Seqlets", style={"text-align": "center"}),
            vdomh.th("Embeddings", style={"text-align": "center"}),
            vdomh.th("hCWM", style={"text-align": "center"})
        )
    )

    metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
    num_metaclusters = len(metaclusters.keys())
    for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
        metacluster = metaclusters[metacluster_key]
        display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
        patterns = metacluster.seqlets_to_patterns_result.patterns
        if not patterns:
            # A metacluster without patterns must not hide the metaclusters
            # that come after it
            continue
        num_patterns = len(patterns)
        for pattern_i, pattern in enumerate(patterns):
            display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))

            embedding = pattern.twod_embedding
            subpattern_clusters = pattern.subclusters

            # Aggregate motif
            hcwm_fig = _plot_trimmed_hcwm(pattern)
            emb_fig, ax = plt.subplots()
            ax.scatter(
                embedding[:,0], embedding[:,1], c=subpattern_clusters, cmap="tab20", alpha=0.3
            )

            table_rows = [vdomh.tr(
                vdomh.td("Agg."),
                vdomh.td(str(len(pattern.seqlets))),
                vdomh.td(figure_to_vdom_image(emb_fig)),
                vdomh.td(figure_to_vdom_image(hcwm_fig))
            )]

            # One row per subpattern, highlighting its seqlets in the embedding
            for subpattern_key, subpattern in pattern.subcluster_to_subpattern.items():
                hcwm_fig = _plot_trimmed_hcwm(subpattern)
                emb_fig, ax = plt.subplots()
                ax.scatter(
                    embedding[:,0], embedding[:,1], c=(subpattern_clusters == subpattern_key), alpha=0.3
                )

                table_rows.append(vdomh.tr(
                    vdomh.td(str(subpattern_key)),
                    vdomh.td(str(len(subpattern.seqlets))),
                    vdomh.td(figure_to_vdom_image(emb_fig)),
                    vdomh.td(figure_to_vdom_image(hcwm_fig))
                ))

            # The column group carries the column widths, so it must be part
            # of the table for them to take effect
            table = vdomh.table(colgroup, header, vdomh.tbody(*table_rows))
            display(table)
            plt.close("all")  # Remove all standing figures

In [None]:
def import_tfmodisco_motifs(tfm_results_path, trim=True, only_pos=True):
    """
    Imports the forward hCWM of each TF-MoDISco pattern into a dictionary
    keyed by the string `"x_y"`, where `x` is the metacluster index and `y`
    is the pattern index within that metacluster. Metaclusters without any
    patterns are skipped (but still count toward `x`).
    Arguments:
        `tfm_results_path`: path to HDF5 containing TF-MoDISco results
        `trim`: if True, trim the motif flanks using `trim_hcwm`
        `only_pos`: if True, leave out patterns whose contribution scores
            (the forward `task0_contrib_scores`) sum to a negative value
    Returns the dictionary of hCWMs.
    """
    hcwms = {}
    with h5py.File(tfm_results_path, "r") as results_file:
        metaclusters = results_file["metacluster_idx_to_submetacluster_results"]
        for mc_index, mc_key in enumerate(metaclusters.keys()):
            patterns_result = metaclusters[mc_key]["seqlets_to_patterns_result"]
            if "patterns" not in patterns_result:
                continue
            patterns = patterns_result["patterns"]
            for pat_index, raw_name in enumerate(patterns["all_pattern_names"][:]):
                # Pattern names are stored as bytes; decode to index the group
                pattern = patterns[raw_name.decode()]
                pfm = pattern["sequence"]["fwd"][:]
                hcwm = pattern["task0_hypothetical_contribs"]["fwd"][:]
                cwm = pattern["task0_contrib_scores"]["fwd"][:]

                # Keep only patterns with overall non-negative contributions
                if only_pos and np.sum(cwm) < 0:
                    continue

                key = "%d_%d" % (mc_index, pat_index)
                hcwms[key] = trim_hcwm(pfm, hcwm) if trim else hcwm
    return hcwms

In [None]:
def get_hit_peak_indices(hit_table, motif_keys):
    """
    Builds a dictionary mapping each motif key to a NumPy array holding the
    `peak_index` value of every row of `hit_table` whose `key` column equals
    that motif key. There is one entry per matching row, so the same peak
    index can appear more than once; a key without hits maps to an empty
    array.
    """
    return {
        motif_key: hit_table.loc[hit_table["key"] == motif_key, "peak_index"].values
        for motif_key in motif_keys
    }

In [None]:
def umap_transform(matrix):
    """
    Converts N x D matrix into transformed N x 2 matrix using
    UMAP. First projects down to (at most) 50 components using PCA.
    Arguments:
        `matrix`: N x D array of N points with D features each
    Returns the array produced by UMAP on the PCA-reduced points.
    """
    # First reduce using PCA
    centered = matrix - np.mean(matrix, axis=0, keepdims=True)
    # PCA cannot extract more components than min(N, D), so cap the count
    num_components = min(50, *np.shape(centered))
    pca = sklearn.decomposition.PCA(n_components=num_components)
    reduced = pca.fit_transform(centered)

    # Run UMAP on the PCA-reduced matrix (not on the full-dimensional one)
    um = umap.UMAP(verbose=False)
    return um.fit_transform(reduced)

In [None]:
def plot_peak_clustering(embeddings_path, motif_keys, hcwms, hit_emb_indices):
    """
    For each layer in the embeddings file, projects the embeddings of all
    entries to 2D with `umap_transform` and displays a table with one row per
    motif: the motif key, a scatter plot of the projection in which the
    entries hit by that motif are colored differently, and the motif's hCWM.
    Arguments:
        `embeddings_path`: path to HDF5 with an `embeddings` group holding
            the `mean`, `std`, `max`, and `min` datasets, each indexed as
            entry x layer x feature
        `motif_keys`: list of motif keys to show
        `hcwms`: dictionary mapping motif key to hCWM
        `hit_emb_indices`: dictionary mapping motif key to the indices of the
            embedding entries that contain a hit of that motif
    Displays the output directly; returns nothing.
    """
    colgroup = vdomh.colgroup(
        vdomh.col(style={"width": "5%"}),
        vdomh.col(style={"width": "55%"}),
        vdomh.col(style={"width": "40%"})
    )
    header = vdomh.thead(
        vdomh.tr(
            vdomh.th("Motif key", style={"text-align": "center"}),
            vdomh.th("Embeddings", style={"text-align": "center"}),
            vdomh.th("hCWM", style={"text-align": "center"})
        )
    )

    # The context manager closes the file even if plotting raises
    with h5py.File(embeddings_path, "r") as embeddings_reader:
        emb_group = embeddings_reader["embeddings"]
        num_layers = emb_group["mean"].shape[1]

        for i in range(num_layers):
            display(vdomh.h3("Layer %d/%d" % (i + 1, num_layers)))

            # Join the four summary statistics of this layer feature-wise
            embeddings = np.concatenate([
                emb_group["mean"][:, i],
                emb_group["std"][:, i],
                emb_group["max"][:, i],
                emb_group["min"][:, i]
            ], axis=1)  # Shape: N x (F * 4)

            umap_trans = umap_transform(embeddings)

            table_rows = []
            for motif_key in motif_keys:
                hcwm = hcwms[motif_key]
                hcwm_fig = viz_sequence.plot_weights(
                    hcwm, subticks_frequency=(len(hcwm) + 1), return_fig=True
                )
                emb_fig, ax = plt.subplots()
                # 1 for entries with a hit of this motif, 0 for the rest
                subset = np.zeros(len(embeddings), dtype=int)
                subset[hit_emb_indices[motif_key]] = 1
                ax.scatter(
                    umap_trans[:,0], umap_trans[:,1], c=subset, alpha=0.1
                )

                table_rows.append(vdomh.tr(
                    vdomh.td(motif_key),
                    vdomh.td(figure_to_vdom_image(emb_fig)),
                    vdomh.td(figure_to_vdom_image(hcwm_fig))
                ))

            # The column group carries the column widths, so it must be part
            # of the table for them to take effect
            table = vdomh.table(colgroup, header, vdomh.tbody(*table_rows))
            display(table)
            plt.close("all")  # Remove all standing figures

### Import TF-MoDISco results
Run motif subclustering

In [None]:
# Import the hypothetical scores, one-hot sequences, and coordinates from the
# SHAP scores file (the second returned value is not needed here)
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(
    shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size, remove_non_acgt=True
)
# The sequences/scores are cut down to exactly what TF-MoDISco saw, but the
# coordinates are left uncut

In [None]:
# Import the TF-MoDISco results object, using the results file together with
# the scores and sequences imported from the SHAP scores file
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)

In [None]:
# Compute subclusters (needed for older versions of TF-MoDISco); this takes a while!
# The heterogeneity plots read the subclusters and 2D embedding of each
# pattern of `tfm_obj`
compute_tfmodisco_motif_subclusters(tfm_obj)

### Import motif hits
For each motif, determine the peaks that contain it

In [None]:
# Import the hCWMs (trimmed, and only those with positive contributions, as
# per the defaults of `import_tfmodisco_motifs`)
hcwms = import_tfmodisco_motifs(tfm_results_path)
# Motif keys are of the form "<metacluster index>_<pattern index>"
motif_keys = list(hcwms.keys())

In [None]:
# Import the motif hits from the MOODS directory; the table is later indexed
# by its `key` (motif key) and `peak_index` columns
hit_table = moods.import_moods_hits(os.path.join(moods_dir, "moods_filtered_collapsed_scored.bed"))

In [None]:
# For each motif key, collect the `peak_index` of every hit of that motif
hit_peak_indices = get_hit_peak_indices(hit_table, motif_keys)

In [None]:
# Import peaks
peak_table = import_peak_table(peak_bed_paths)

# Expand to input length: re-center each peak on its summit (the peak start
# plus the summit offset) and make it exactly `input_length` wide
peak_table["peak_start"] = \
    (peak_table["peak_start"] + peak_table["summit_offset"]) - (input_length // 2)
peak_table["peak_end"] = peak_table["peak_start"] + input_length

In [None]:
# Import the coordinates for which embeddings were computed
with h5py.File(embeddings_path, "r") as f:
    emb_coords_table = pd.DataFrame({
        "chrom": f["coords"]["coords_chrom"][:].astype(str),
        "start": f["coords"]["coords_start"][:],
        "end": f["coords"]["coords_end"][:]
    })
# Expand to input length around the midpoint of each coordinate, so that the
# embedding coordinates can be matched to the expanded peak coordinates
emb_coords_table["start"] = \
    ((emb_coords_table["start"] + emb_coords_table["end"]) // 2) - (input_length // 2)
emb_coords_table["end"] = emb_coords_table["start"] + input_length

In [None]:
# Convert the peak indices to embedding indices
peak_coords_table = peak_table[["chrom", "peak_start", "peak_end"]]

# Left-join the peaks to the embedding coordinates on (chrom, start, end);
# `index_x` is the peak index, and `index_y` is the index of the matching
# embedding (NaN if the peak has no embedding)
matched_inds = peak_coords_table.reset_index().merge(
    emb_coords_table.reset_index(), how="left", left_on=["chrom", "peak_start", "peak_end"],
    right_on=["chrom", "start", "end"]
)[["index_x", "index_y"]].values
# `order_inds[i]` is the embedding index of peak `i`. Start from NaN rather
# than uninitialized memory, so that any slot that is never assigned ends up
# as -1 (no match) instead of an arbitrary value
order_inds = np.full(int(np.max(matched_inds[:, 0])) + 1, np.nan)
order_inds[matched_inds[:, 0].astype(int)] = matched_inds[:, 1]
order_inds = np.nan_to_num(order_inds, nan=-1).astype(int)

assert np.all(
    peak_coords_table.values[order_inds >= 0] == \
    emb_coords_table.iloc[order_inds].values[order_inds >= 0]
)  # Make sure the coordinates match up, at least those for which there was a match

# Convert peak indices into embedding indices
hit_emb_indices = {}
for key in hit_peak_indices:
    emb_inds = order_inds[hit_peak_indices[key]]
    # Remove -1s; this removes anything where there was a peak but
    # not a computed embedding
    hit_emb_indices[key] = emb_inds[emb_inds >= 0]

<a id="motif-subclusters"></a>
### Within-motif heterogeneity
For each motif (pattern) identified by TF-MoDISco, show the subclusters (subpatterns) found among its seqlets

In [None]:
# For every pattern, show a table of the aggregate motif and its subpatterns
plot_motif_heterogeneity(tfm_obj)

<a id="peaks"></a>
### Peak clustering
Project the peaks to 2D by their embeddings (one UMAP projection per layer) and highlight the peaks that contain each motif, to show how the different peaks and motifs are structured

In [None]:
# For every layer, show the projected embeddings with each motif's hits highlighted
plot_peak_clustering(embeddings_path, motif_keys, hcwms, hit_emb_indices)