In [None]:
import h5py
import numpy as np
import scipy.stats
import scipy.ndimage
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import pandas as pd
import json
import os
import tqdm
# NOTE(review): this instantiates a notebook progress bar without an iterable and
# discards it; nothing visible in this notebook uses the result. Presumably a
# leftover or a widget warm-up -- confirm before removing.
tqdm.tqdm_notebook()

In [None]:
# Plotting defaults
# Register the fonts found in the local font directory, presumably so that the
# "Roboto" family requested below can be resolved.
font_paths = font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
if hasattr(font_manager, "createFontList"):
    # Older Matplotlib: keep the original registration path
    font_manager.fontManager.ttflist.extend(
        font_manager.createFontList(font_paths)
    )
else:
    # `createFontList` no longer exists in newer Matplotlib releases;
    # `FontManager.addfont` registers one font file at a time instead
    for font_path in font_paths:
        font_manager.fontManager.addfont(font_path)

plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

### Define constants and paths

In [None]:
# Name of the transcription factor; used to build file paths and plot titles
tf_name = "NR3C1"
# Number of tasks; must equal the number of peak BED files in the files spec
num_tasks = 7
# Fold whose saved predictions are loaded and whose chromosome split is used
best_fold = 6
# Intended to restrict the analysis to the test-set chromosomes of `best_fold`
test_set_only = True

In [None]:
# HDF5 file holding the saved coordinates, predictions, and performance metrics
# for the chosen TF and fold
prediction_path = "/users/amtseng/tfmodisco/results/peak_predictions/{0}/{0}_peak_prediction_performance_fold{1}.h5".format(tf_name, best_fold)

# JSON files specification (its "peak_beds" entry lists one peak BED per task).
# The path is derived from `tf_name` so it cannot drift out of sync with
# `prediction_path` when the TF is changed.
files_spec_json = "/users/amtseng/tfmodisco/data/processed/ENCODE/config/{0}/{0}_training_paths.json".format(tf_name)
with open(files_spec_json, "r") as f:
    files_spec = json.load(f)

# Chromosome splits, keyed by fold number (as a string); only the test
# chromosomes of the chosen fold are needed here
chrom_splits_json = "/users/amtseng/tfmodisco/data/processed/ENCODE/chrom_splits.json"
with open(chrom_splits_json, "r") as f:
    chrom_splits = json.load(f)
test_set_chroms = chrom_splits[str(best_fold)]["test"]

In [None]:
# Width (in bases) of the summit-centered interval each peak is expanded to
input_length = 1346
# Presumably the length of the predicted profiles; not referenced by the code
# visible in this notebook -- TODO confirm
profile_length = 1000

### Import peak coordinates for each task
For each task, import the set of peaks belonging to that task. This obtains a set of indices for coordinates in `prediction_path` that correspond to each task.

In [None]:
# Load the peak table of every task and re-center each peak on its summit, so
# that every interval is exactly `input_length` bases wide
peak_columns = [
    "chrom", "peak_start", "peak_end", "name", "score",
    "strand", "signal", "pval", "qval", "summit_offset"
]
half_length = input_length // 2

assert len(files_spec["peak_beds"]) == num_tasks
task_coords = []
for peak_bed_path in files_spec["peak_beds"]:
    # Compression is inferred by pandas from the file name
    peak_table = pd.read_csv(
        peak_bed_path, sep="\t", header=None, names=peak_columns
    )

    # The summit is stored as an offset relative to the peak start
    summits = peak_table["peak_start"] + peak_table["summit_offset"]
    starts = summits - half_length

    task_coords.append(pd.DataFrame({
        "chrom": peak_table["chrom"],
        "start": starts,
        "end": starts + input_length
    }))

In [None]:
# Read in coordinates within the saved predictions
with h5py.File(prediction_path, "r") as f:
    coords_group = f["coords"]
    pred_coords = pd.DataFrame(
        data={
            "chrom": coords_group["coords_chrom"][:].astype(str),
            "start": coords_group["coords_start"][:],
            "end": coords_group["coords_end"][:]
        }
    )

# If specified, limit only to test-set chromosomes. The original row index is
# kept on purpose: it is the position of each coordinate within the saved
# HDF5 arrays, and is used later to index into them.
if test_set_only:
    pred_coords = pred_coords[pred_coords["chrom"].isin(test_set_chroms)]

In [None]:
# Compute the set of indices of prediction coordinates corresponding to each task
coord_cols = ["chrom", "start", "end"]

# Loop-invariant: expose the original row positions as an "index" column and
# keep only the first occurrence of each distinct coordinate
unique_pred_coords = pred_coords.reset_index().drop_duplicates(coord_cols)

task_coord_inds = []
for task_index in range(num_tasks):
    # Inner join keeps only the prediction coordinates that are peaks of this task
    task_inds = unique_pred_coords.merge(
        task_coords[task_index][coord_cols], on=coord_cols
    )["index"].values

    # Indices are stored in increasing order for indexing into the HDF5 arrays
    task_coord_inds.append(np.sort(task_inds))

### Import the set of performance metrics
For each task, import the set of performance metrics corresponding to the peaks in that task, for that task only.

In [None]:
def import_tf_metrics(tf_name, prediction_path, task_coord_inds):
    """
    Imports the set of all metrics for the given TF, for each task.
    Arguments:
        `tf_name`: name of the TF (not used by the import itself)
        `prediction_path`: path to the HDF5 file of saved predictions, which
            must contain a `performance` group and a `predictions/true_profs`
            dataset
        `task_coord_inds`: one entry per task; each entry is the (increasing)
            array of indices corresponding to that task within the saved
            prediction coordinates
    Returns a dictionary of the following format:
        `nll`: [
            <task 0 NLL vector>
            <task 1 NLL vector>
            ...
        ],
        `count_mse`: [
            <task 0 MSE value>
            <task 1 MSE value>
            ...
        ]
        ...
    In addition to every key in the `performance` group, the dictionary has
    `norm_nll`: the NLL vector of each task divided by the number of true
    reads of each coordinate.
    """
    # The number of tasks is taken from the argument rather than from a
    # notebook-level global, so the function is self-contained
    num_tasks = len(task_coord_inds)
    perfs = {}

    # The context manager guarantees the file is closed even if a key is
    # missing or an index is invalid
    with h5py.File(prediction_path, "r") as pred_reader:
        perf_reader = pred_reader["performance"]

        for key in perf_reader.keys():
            if len(perf_reader[key].shape) >= 2:  # Profile metric
                # One value per coordinate and task: keep the task's own
                # coordinates, then the task's own column
                metrics = [
                    perf_reader[key][task_coord_inds[task_index]][:, task_index]
                    for task_index in range(num_tasks)
                ]
            else:  # Count metric
                # One value per task
                metrics = [
                    perf_reader[key][task_index]
                    for task_index in range(num_tasks)
                ]
            perfs[key] = metrics

        # Compute the normalized NLL for the actual performance
        true_profs = pred_reader["predictions"]["true_profs"]
        norm_nlls = []
        for task_index in range(num_tasks):
            nlls = perfs["nll"][task_index]
            # Sum over axis 2, average over the following axis, then keep this
            # task's column (assumes the axes are coordinate x task x
            # position x strand -- TODO confirm)
            num_reads = np.mean(np.sum(
                true_profs[task_coord_inds[task_index]], axis=2
            ), axis=2)[:, task_index]
            norm_nlls.append(nlls / num_reads)
        perfs["norm_nll"] = norm_nlls

    return perfs

### Plot the performance

In [None]:
def plot_perf_bounds(tf_perfs, tf_name):
    """
    Plots the task-specific performance metrics of a TF, one figure per metric.
    Arguments:
        `tf_perfs`: dictionary mapping each metric key to a list with one
            entry per task; profile metrics (`nll`, `norm_nll`, `jsd`,
            `profile_mse`, `profile_pearson`, `profile_spearman`) hold a
            vector per task, and count metrics (`count_mse`, `count_pearson`,
            `count_spearman`) hold a single value per task
        `tf_name`: name of the TF, used in the figure titles
    Profile metrics are shown as violin plots, count metrics as bar plots.
    """
    def create_violins(ax, perfs_list):
        """
        Draws one violin per vector in `perfs_list` on `ax`, with outliers
        removed, overlaid with the interquartile range and the median.
        """
        num_perfs = len(perfs_list)

        # Quartiles of each vector, ignoring NaNs; each result has one entry
        # per vector
        q1, med, q3 = np.stack([
            np.nanpercentile(data, [25, 50, 75], axis=0) for data in perfs_list
        ], axis=1)
        iqr = q3 - q1
        lower_outlier = q1 - (1.5 * iqr)
        upper_outlier = q3 + (1.5 * iqr)

        # Remove outliers (beyond 1.5 IQRs from the quartiles); NaNs fail both
        # comparisons, so they are removed as well
        sorted_clipped_data = [
            np.sort(vec[(vec >= lower_outlier[i]) & (vec <= upper_outlier[i])])
            for i, vec in enumerate(perfs_list)
        ]

        plot_parts = ax.violinplot(
            sorted_clipped_data, showmeans=False, showmedians=False, showextrema=False
        )
        violin_parts = plot_parts["bodies"]
        for i in range(num_perfs):
            violin_parts[i].set_facecolor("mediumorchid")
            violin_parts[i].set_edgecolor("mediumorchid")
            violin_parts[i].set_alpha(0.7)

        # Violins are placed at positions 1, 2, ...; draw the IQR as a thick
        # bar and the median as a white dot on top of it
        inds = np.arange(1, num_perfs + 1)
        ax.vlines(inds, q1, q3, color="black", linewidth=5, zorder=1)
        ax.scatter(inds, med, marker="o", color="white", s=30, zorder=2)

    # Profile metrics
    for metric_key, metric_name in [
        ("nll", "NLL"), ("norm_nll", "Normalized NLL"), ("jsd", "JSD"), ("profile_mse", "Profile MSE"),
        ("profile_pearson", "Profile Pearson"), ("profile_spearman", "Profile Spearman")
    ]:
        fig, ax = plt.subplots(figsize=(20, 5))
        create_violins(ax, tf_perfs[metric_key])
        ax.set_title("Task-specific %s of %s" % (metric_name, tf_name))
        ax.set_xticks(np.arange(1, num_tasks + 1))
        ax.set_xticklabels(["Task %d" % task_index for task_index in range(num_tasks)])
        plt.show()

    # Count metrics
    for metric_key, metric_name in [
        ("count_mse", "Count MSE"), ("count_pearson", "Count Pearson"), ("count_spearman", "Count Spearman")
    ]:
        fig, ax = plt.subplots(figsize=(20, 5))
        label_locs = np.arange(num_tasks)  # Location of labels
        ax.bar(
            label_locs, tf_perfs[metric_key],
            color=(num_tasks * ["mediumorchid"]), alpha=0.7
        )
        ax.set_title("Task-specific %s of %s" % (metric_name, tf_name))
        ax.set_xticks(label_locs)
        ax.set_xticklabels(["Task %d" % task_index for task_index in range(num_tasks)])
        plt.show()

In [None]:
# Gather every performance metric of each task from the saved predictions
perfs = import_tf_metrics(
    tf_name=tf_name,
    prediction_path=prediction_path,
    task_coord_inds=task_coord_inds,
)

In [None]:
# Draw one figure per metric: violins for profile metrics, bars for count metrics
plot_perf_bounds(tf_perfs=perfs, tf_name=tf_name)