In [None]:
import moods
import util
import os
import numpy as np
import pandas as pd
import pomegranate
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import tqdm
# Load the notebook progress-bar submodule explicitly; later cells call
# `tqdm.notebook.tqdm`, which is only reachable once this submodule is imported.
# (The previous bare `tqdm.tqdm_notebook()` call is deprecated and instantiated
# a progress bar with no iterable just to trigger this import.)
import tqdm.notebook

In [None]:
# Plotting defaults
# Register the fonts found in the custom font directory so that the "Roboto"
# family requested below can be resolved.
# NOTE(review): this is a machine-specific absolute path; consider making it
# configurable.
custom_font_paths = font_manager.findSystemFonts(
    fontpaths="/users/amtseng/modules/fonts"
)
if hasattr(font_manager.fontManager, "addfont"):
    # `font_manager.createFontList` was deprecated in Matplotlib 3.2 and removed
    # in 3.4; `FontManager.addfont` is the supported way to register a font file.
    for custom_font_path in custom_font_paths:
        try:
            font_manager.fontManager.addfont(custom_font_path)
        except (RuntimeError, OverflowError, ValueError):
            # Keep registration best-effort: a font file that cannot be parsed
            # is skipped instead of aborting the whole notebook.
            continue
else:
    # Older Matplotlib without `addfont`: keep the legacy registration path.
    font_manager.fontManager.ttflist.extend(
        font_manager.createFontList(custom_font_paths)
    )

# Global text sizes/weights applied to every figure created in this notebook
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "font.size": 13,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

### Define constants and paths

In [None]:
# Define parameters/fetch arguments
# All inputs are taken from environment variables; a missing variable raises
# KeyError here, before any data is loaded.
shap_scores_path = os.environ["TFM_SHAP_PATH"]
peaks_path = os.environ["TFM_PEAKS_PATH"]
peak_bed_paths = [peaks_path]  # one-element list; index 0 is what gets read later
moods_dir = os.environ["TFM_MOODS_DIR"]

# Echo the resolved inputs so the rendered notebook records what it ran on
print("DeepSHAP scores path: %s" % shap_scores_path)
print("Peaks path: %s" % peaks_path)
print("MOODS directory: %s" % moods_dir)

In [None]:
# Constants
# Key handed to `util.import_shap_scores` to select which score set to load
hyp_score_key = "hyp_scores"
# Width of the window rebuilt around each peak when reconstructing the peak
# coordinates (start = peak_start + offset - input_length // 2)
input_length = 2114

### Import hits and scores

In [None]:
# Import MOODS table
# The hits file is looked up by a fixed name inside the MOODS output directory
hits_filename = "moods_filtered_collapsed_tested.bed"
hits_path = os.path.join(moods_dir, hits_filename)
hit_table = moods.import_moods_hits(hits_path)

In [None]:
# Import DeepSHAP scores
# `util.import_shap_scores` must return exactly four items; only the second
# (kept as `imp_scores`) and the fourth (kept as `coords`) are used, the other
# two are discarded.
# `coords` is later asserted to equal the reconstructed peak table, and
# `imp_scores` is indexed by `peak_index` when scoring motif hits.
# NOTE(review): the key passed in is "hyp_scores" while the result is named
# `imp_scores` -- confirm which of the returned arrays is the one intended for
# scoring hits.
_, imp_scores, _, coords = util.import_shap_scores(
    shap_scores_path, hyp_score_key, remove_non_acgt=False
)

In [None]:
# Import peaks
# Only four columns of the tab-separated peak file are read: chromosome, start,
# end, and column 9, which is loaded under the name "offset".
peak_columns = ["peak_chrom", "peak_start", "peak_end", "offset"]
peak_table = pd.read_csv(
    peak_bed_paths[0],
    sep="\t",
    header=None,
    index_col=False,
    usecols=[0, 1, 2, 9],
    names=peak_columns
)

# Rebuild each peak as a fixed-width window of `input_length` positions whose
# start is (original start + offset) shifted left by half the window length.
window_start = (peak_table["peak_start"] + peak_table["offset"]) - (input_length // 2)
peak_table["peak_start"] = window_start
peak_table["peak_end"] = window_start + input_length

# Drop the offset column now that the windows are built
peak_table = peak_table[peak_columns[:3]]

# Sanity check: the rebuilt windows must match the coordinates that came with
# the DeepSHAP scores, row for row.
assert np.all(coords == peak_table.values)

In [None]:
# Merge in the peak starts/ends to the hit table
# (each hit's `peak_index` is matched against the row index of `peak_table`)
merged_hits = pd.merge(
    hit_table, peak_table, left_on="peak_index", right_index=True
)

# Important! Reset the indices of `merged_hits` after merging, otherwise
# iteration over the rows won't be in order. After this, every row's index
# label is also its position, which is what `scores` below is filled by.
merged_hits = merged_hits.reset_index(drop=True)

# Compute start and end of each motif relative to the peak
merged_hits["motif_rel_start"] = \
    merged_hits["start"] - merged_hits["peak_start"]
merged_hits["motif_rel_end"] = \
    merged_hits["end"] - merged_hits["peak_start"]

# Careful! Because of the merging step that only kept the top peak hit, some
# hits might overrun the edge of the peak; we limit the motif hit
# indices here so they stay in the peak; this should not be a common occurrence
# (start is floored at 0, end is capped at the peak length; the two helper
# columns exist only for the row-wise max/min and are removed afterwards)
merged_hits["peak_min"] = 0
merged_hits["peak_max"] = merged_hits["peak_end"] - merged_hits["peak_start"]
merged_hits["motif_rel_start"] = \
    merged_hits[["motif_rel_start", "peak_min"]].max(axis=1)
merged_hits["motif_rel_end"] = \
    merged_hits[["motif_rel_end", "peak_max"]].min(axis=1)
del merged_hits["peak_min"]
del merged_hits["peak_max"]

# Get score of each motif hit as average importance over the hit
# Score = mean of the per-position absolute importance inside the hit window,
# divided by the total absolute importance of the whole peak.
scores = np.empty(len(merged_hits))
for peak_index, group in tqdm.notebook.tqdm(merged_hits.groupby("peak_index")):
    # Collapse axis 1 of this peak's scores so there is one value per position
    score_track = np.sum(np.abs(imp_scores[peak_index]), axis=1)
    total_score = np.sum(score_track)
    # Walk the hit windows through plain column arrays instead of
    # `group.iterrows()`: iterrows builds a Series per row (slow for many hits)
    # and coerces every value in the row to one common dtype, which can turn
    # the integer slice bounds into floats.
    hit_windows = zip(
        group.index.values,
        group["motif_rel_start"].values,
        group["motif_rel_end"].values
    )
    for i, rel_start, rel_end in hit_windows:
        scores[i] = np.mean(score_track[rel_start:rel_end]) / total_score

In [None]:
# Fit a two-component mixture of Gamma distributions to the motif hit scores.
# NOTE(review): the initial fit receives the scores as a column vector while the
# follow-up `fit` call receives the 1-D array; no random seed is set here, so
# confirm whether the fit is reproducible across runs.
component_types = [pomegranate.GammaDistribution] * 2
model = pomegranate.GeneralMixtureModel.from_samples(
    component_types, 2, scores[:, None]
)
# `model` is rebound to whatever `fit` returns; later cells use that object
model = model.fit(scores)

In [None]:
# Pick the mixture component with the largest entry under "weights" in the
# model's dictionary form
component_weights = model.to_dict()["weights"]
major_dist = model.distributions[np.argmax(component_weights)]

In [None]:
# Evaluate the full mixture on a fixed grid of 200 points over [0, 0.03]
x = np.linspace(0, 0.03, 200)
y = model.probability(x)

# Normalized histogram of all hit scores with the mixture curve drawn on top
fig, ax = plt.subplots(figsize=(20, 8))
ax.hist(scores, bins=200, density=True, alpha=0.3)
ax.plot(x, y)
ax.set_title("Histogram of motif hit scores")
plt.show()

In [None]:
# Evaluate only the highest-weight component on the same fixed grid
x = np.linspace(0, 0.03, 200)
y = major_dist.probability(x)

# Normalized histogram of all hit scores with that single component on top
fig, ax = plt.subplots(figsize=(20, 8))
ax.hist(scores, bins=200, density=True, alpha=0.3)
ax.plot(x, y)
ax.set_title("Histogram of motif hit scores (first component only)")
plt.show()

In [None]:
# Tabulate the major component on a fine grid spanning the observed scores and
# turn it into a discrete distribution over the grid points (sums to 1).
score_range = np.linspace(np.min(scores), np.max(scores), 1000000)
pdf = major_dist.probability(score_range)
pdf = pdf / np.sum(pdf)

# `inverse_cdf[k]` is the probability mass strictly above grid point k (the
# same quantity as 1 - cumsum(pdf)[k]). It is accumulated from the top of the
# grid downward rather than as `1 - np.cumsum(pdf)`: subtracting a running sum
# that approaches 1 wipes out every digit of a small tail probability, and can
# even leave tiny negative values at the end of the grid because the running sum
# need not land exactly on 1.
upper_tail = np.cumsum(pdf[::-1])[::-1]  # mass at or above each grid point
inverse_cdf = np.empty(len(pdf))
inverse_cdf[:-1] = upper_tail[1:]
inverse_cdf[-1] = 0  # nothing on the grid lies above its last point

In [None]:
# Map every score to a grid index: with right=True a score falls in the bin
# whose upper edge is the first grid point greater than or equal to it.
assignments = np.digitize(x=scores, bins=score_range, right=True)

In [None]:
# Look up one value of `inverse_cdf` per hit, using the grid index assigned to
# that hit's score; these values are used as the hits' p-values below.
pvals = inverse_cdf[assignments]

In [None]:
# Sorted p-values against their rank (1-based), drawn in black
sorted_pvals = np.sort(pvals)
ranks = np.arange(1, len(pvals) + 1)

fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(ranks, sorted_pvals, color="black", label="p-values")

# One straight line per FDR level: critical value = rank / number of hits * FDR
for fdr in [0.05, 0.1, 0.2, 0.3]:
    crit_values = ranks / len(ranks) * fdr
    ax.plot(ranks, crit_values, label=("Crit values (FDR = %.2f)" % fdr))

ax.set_title("Step-up p-values and FDR corrective critical values")
plt.legend()
plt.show()