In [None]:
import moods
import util
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import tqdm
# Load the notebook progress-bar submodule explicitly so that
# `tqdm.notebook.tqdm` can be used further down
import tqdm.notebook

In [None]:
# Plotting defaults
# Register the fonts found under the local fonts directory with Matplotlib's
# font manager so that the "font.family" setting below can resolve to them
_font_paths = font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
if hasattr(font_manager.fontManager, "addfont"):
    # `createFontList` is deprecated and absent from newer Matplotlib releases;
    # `FontManager.addfont` is its replacement
    for _font_path in _font_paths:
        font_manager.fontManager.addfont(_font_path)
else:
    # Older Matplotlib releases that do not provide `FontManager.addfont`
    font_manager.fontManager.ttflist.extend(
        font_manager.createFontList(_font_paths)
    )
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "font.size": 13,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

### Define constants and paths

In [None]:
# Experiment IDs; entry i is paired with entry i of `peakids` below
expids = [
    "ENCSR000BGZ", "ENCSR725VFL", "ENCSR240PRQ",
    "ENCSR000DTO", "ENCSR000BSE", "ENCSR000EFS",
    "ENCSR000FAH", "ENCSR000EWG", "ENCSR000BKM",
]
# Peak file IDs, in the same order as `expids`
peakids = [
    "ENCFF068YYR", "ENCFF154RAJ", "ENCFF463FGL",
    "ENCFF832INR", "ENCFF273SBR", "ENCFF190CGV",
    "ENCFF589QXC", "ENCFF497ISV", "ENCFF242YZU",
]
keys = ["profile", "counts"]

# (experiment ID, key) -> directory holding the MOODS results of that run
moods_dirs = dict(
    (
        (expid, key),
        os.path.join("/mnt/lab_data2/amtseng/tf_atlas/results/moods", expid, key),
    )
    for expid in expids
    for key in keys
)
# experiment ID -> path to the gzipped BED file of its peaks
peak_bed_paths = dict(
    (
        expid,
        os.path.join("/users/zahoor/TF-Atlas/data/idr_peaks/", peakid + ".bed.gz"),
    )
    for expid, peakid in zip(expids, peakids)
)

In [None]:
# FDR cutoffs to evaluate: start at 0.05 and advance in steps of 0.05; the stop
# value (1.05) is exclusive in `np.arange`, so the grid is meant to end at 1.00
fdrs = np.arange(0.05, 1.05, 0.05)

### Helper functions
Functions for counting the motif hits that fall in each peak, overall and at each FDR cutoff

In [None]:
def get_peak_hit_counts(hit_table, num_peaks):
    """
    For each peak, counts the number of motif hits that fall in that peak.
    Arguments:
        `hit_table`: table of motif hits with a "peak_index" column, which
            holds the index of the peak that each hit falls in
        `num_peaks`: total number of peaks, which is the length of the
            returned array
    Returns an integer NumPy array of length `num_peaks`, where entry i is the
    number of rows of `hit_table` whose "peak_index" is i. Peaks without any
    hits have a count of 0.
    """
    peak_hit_counts = np.zeros(num_peaks, dtype=int)
    # Group sizes give the counts directly, without materializing a subtable
    # for every peak
    hits_per_peak = hit_table.groupby("peak_index").size()
    if len(hits_per_peak):
        peak_hit_counts[hits_per_peak.index.values] = hits_per_peak.values
    return peak_hit_counts

In [None]:
def get_peak_hit_counts_for_fdrs(hit_tables, peak_counts, fdrs):
    """
    Counts the number of motif hits in each peak at each FDR cutoff.
    Arguments:
        `hit_tables`: dictionary mapping a key to a table of motif hits; the
            first entry of each key is the experiment ID, and each table has
            the columns "imp_pval" and "peak_index"
        `peak_counts`: dictionary mapping experiment ID to its number of peaks
        `fdrs`: sequence of FDR values to test
    Returns dictionary mapping hit table key to N x F array of hit counts,
    where N is the number of peaks and F is the number of FDR values to
    test. This tells us the number of hits in each peak for each FDR
    cutoff. If no p-value of a table passes the Benjamini-Hochberg criterion
    at some FDR, every peak has 0 hits for that FDR.
    """
    all_peak_hit_counts = {}
    for key, hit_table in tqdm.notebook.tqdm(hit_tables.items()):
        expid = key[0]
        num_peaks = peak_counts[expid]

        # For each FDR, compute the maximum p-value threshold using the
        # Benjamini-Hochberg critical values over all hits in the table
        all_pvals = np.sort(hit_table["imp_pval"].values)
        num_pvals = len(all_pvals)
        ranks = np.arange(1, num_pvals + 1)
        pval_threshes = np.empty(len(fdrs), dtype=float)
        for fdr_index, fdr in enumerate(fdrs):
            bh_crit_vals = fdr * ranks / num_pvals
            passing = np.where(all_pvals <= bh_crit_vals)[0]
            if passing.size:
                # Indices are in ascending order, so the last one is the
                # largest p-value that satisfies the criterion
                pval_threshes[fdr_index] = all_pvals[passing[-1]]
            else:
                # Nothing is significant at this FDR; a threshold of -inf
                # makes every peak count 0 hits below
                pval_threshes[fdr_index] = -np.inf

        # For each peak, compute the number of hits for each p-value threshold
        peak_hit_counts = np.zeros((num_peaks, len(fdrs)), dtype=int)
        for peak_index, matches in hit_table.groupby("peak_index"):
            if matches.empty:
                continue
            pvals = np.sort(matches["imp_pval"].values)
            # side="right" counts the hits with p-value <= threshold
            peak_hit_counts[peak_index] = np.searchsorted(
                pvals, pval_threshes, side="right"
            )

        all_peak_hit_counts[key] = peak_hit_counts
    return all_peak_hit_counts

### Import hit results

In [None]:
# Load the MOODS hit table from the results directory of every
# (experiment ID, key) pair
hit_tables = {
    key : moods.import_moods_hits(
        os.path.join(moods_dirs[key], "moods_filtered_collapsed_tested.bed")
    )
    for key in moods_dirs
}

In [None]:
# Count the peaks of each experiment: the length of the peak table imported
# from that experiment's BED file
peak_counts = {
    expid : len(util.import_peak_table([peak_bed_paths[expid]]))
    for expid in peak_bed_paths
}

In [None]:
# For every hit table, the number of hits in each peak at each FDR cutoff
# (one peaks-by-FDRs array per key of `hit_tables`)
all_peak_hit_counts = get_peak_hit_counts_for_fdrs(hit_tables, peak_counts, fdrs)

### Show peak hit count statistics

In [None]:
# FDR vs average number of hits per peak (total hits / number of peaks)
total_hits = {
    key : all_peak_hit_counts[key].sum(axis=0) for key in hit_tables
}
avg_hits_per_peak = {
    key : total_hits[key] / peak_counts[key[0]] for key in total_hits
}
fig, ax = plt.subplots(figsize=(20, 20))
# One line per (experiment ID, key) pair
for key, rats in avg_hits_per_peak.items():
    ax.plot(fdrs, rats, label=("%s_%s" % key))
ax.set(
    xlabel="FDR cutoff",
    xticks=fdrs,
    ylabel="Average number of motif hits per peak",
    title="FDR vs motif hits per peak",
)
ax.legend()
plt.show()

In [None]:
# FDR vs proportion of peaks that have no hits at all
num_zeros = {
    key : (all_peak_hit_counts[key] == 0).sum(axis=0) for key in hit_tables
}
prop_zeros = {
    key : num_zeros[key] / peak_counts[key[0]] for key in num_zeros
}
fig, ax = plt.subplots(figsize=(20, 20))
# One line per (experiment ID, key) pair
for key, nums in prop_zeros.items():
    ax.plot(fdrs, nums, label=("%s_%s" % key))
ax.set(
    xlabel="FDR cutoff",
    xticks=fdrs,
    ylabel="Proportion of peaks with 0 hits",
    title="FDR vs peaks with no hits",
)
ax.legend()
plt.show()

In [None]:
# FDR vs the 0.99 quantile (over peaks) of the number of hits per peak
top_quants = {
    key : np.quantile(all_peak_hit_counts[key], 0.99, axis=0)
    for key in hit_tables
}
fig, ax = plt.subplots(figsize=(20, 20))
# One line per (experiment ID, key) pair
for key, vals in top_quants.items():
    ax.plot(fdrs, vals, label=("%s_%s" % key))
ax.set(
    xlabel="FDR cutoff",
    xticks=fdrs,
    ylabel="99th quantile of number of motif hits per peak",
    title="FDR vs 99th quantile of hits per peak",
)
ax.legend()
plt.show()