In [None]:
import h5py
import numpy as np
import scipy.stats
import scipy.ndimage
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import pandas as pd
import json
import os
import tqdm
tqdm.tqdm_notebook()

In [None]:
# Plotting defaults
# Register the custom fonts with Matplotlib so that the "Roboto" family
# requested below can be resolved.
custom_font_paths = font_manager.findSystemFonts(
    fontpaths="/users/amtseng/modules/fonts"
)
if hasattr(font_manager.fontManager, "addfont"):
    # `createFontList` was deprecated in Matplotlib 3.2 and is absent from
    # later releases; `FontManager.addfont` is its replacement
    for custom_font_path in custom_font_paths:
        try:
            font_manager.fontManager.addfont(custom_font_path)
        except (OSError, RuntimeError) as font_error:
            # `createFontList` skipped fonts it could not read, so keep this
            # step best-effort instead of failing the whole notebook
            print("Skipping font %s: %s" % (custom_font_path, font_error))
else:
    # Older Matplotlib releases without `addfont`
    font_manager.fontManager.ttflist.extend(
        font_manager.createFontList(custom_font_paths)
    )

# Font sizes/family applied to every figure created after this cell
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

### Define constants and paths

In [None]:
# TFs covered by this notebook
tf_names = [
    "E2F6", "FOXA2", "SPI1", "CEBPB", "MAX",
    "NR3C1", "GABPA", "MAFK", "JUND", "REST"
]

# Fold number recorded for each TF (presumably the best-performing fold, going
# by the name -- TODO confirm)
best_folds = {
    "E2F6": 4, "FOXA2": 7, "SPI1": 7, "CEBPB": 7, "MAX": 1,
    "NR3C1": 6, "GABPA": 7, "MAFK": 8, "JUND": 7, "REST": 7
}

# predictions_paths[tf_name][fold] is the HDF5 path for that TF and fold, for
# folds 1 through 10
predictions_paths = dict(
    (
        tf_name,
        dict(
            (
                fold,
                "/users/amtseng/tfmodisco/results/peak_predictions/{0}/{0}_peak_prediction_performance_fold{1}.h5".format(tf_name, fold)
            )
            for fold in range(1, 11)
        )
    )
    for tf_name in tf_names
)

# performance_bounds_paths[tf_name] is the HDF5 path of the performance bounds
# for that TF
performance_bounds_paths = dict(
    (
        tf_name,
        "/users/amtseng/tfmodisco/results/performance_bounds/{0}/{0}_performance_bounds.h5".format(tf_name)
    )
    for tf_name in tf_names
)

# Parse the chromosome splits out of their JSON file
chrom_splits_json = "/users/amtseng/tfmodisco/data/processed/ENCODE/chrom_splits.json"
with open(chrom_splits_json, "r") as splits_file:
    chrom_splits = json.load(splits_file)

In [None]:
# Expected length of every coordinate interval saved with the predictions
# (asserted in `import_tf_metrics`)
input_length = 1346
# Expected length of every coordinate interval saved with the performance
# bounds (asserted in `import_tf_metrics`)
profile_length = 1000

### Import the set of performance metrics
For each peak, import the actual performance metrics together with the lower and upper performance bounds. The coordinates in the prediction files and in the performance-bounds files should match exactly\*.

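\*The saved coordinates in the predictions/actual performances are padded to the input length, but the coordinates in the performance bounds are padded to the profile length, so the two sets match only after accounting for the `(input_length - profile_length) // 2` bases of extra padding on each side.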

In [None]:
def import_tf_metrics(tf_name):
    """
    Imports the set of all metrics for the given TF: the lower and upper
    performance bounds, and the actual performance of folds 1 through 10.
    For every fold, the coordinates saved with the predictions are checked
    against the coordinates saved with the performance bounds.
    Arguments:
        `tf_name`: name of the TF; must be a key of `performance_bounds_paths`
            and `predictions_paths`
    Returns a dictionary of the following format:
        `lower_perf`:
            `nll`: ...
            ...
        `upper_perf`:
            ...
        `actual_perf`:
            `fold_num`:
                `nll`: ...
                `norm_nll`: ...
                ...
            `fold_num`:
                ...
    Each innermost value is the full array read from the HDF5 dataset of that
    name, except `norm_nll` under `actual_perf`, which is computed here.
    Raises an AssertionError if any of the coordinate checks fail.
    """
    # Predictions are saved at the input length and bounds at the profile
    # length, so matching intervals differ by this much on each side
    flank = (input_length - profile_length) // 2

    perfs = {}

    # Read everything needed from the bounds file up front, so the file is
    # closed (even on failure) and its coordinates are read only once rather
    # than once per fold
    with h5py.File(performance_bounds_paths[tf_name], "r") as perf_bound_reader:
        bound_coords = perf_bound_reader["coords"]
        bound_chroms = bound_coords["coords_chrom"][:]
        bound_starts = bound_coords["coords_start"][:]
        bound_ends = bound_coords["coords_end"][:]

        lower_perf = perf_bound_reader["performance_lower"]
        upper_perf = perf_bound_reader["performance_upper"]
        perfs["lower_perf"] = {key: lower_perf[key][:] for key in lower_perf.keys()}
        perfs["upper_perf"] = {key: upper_perf[key][:] for key in upper_perf.keys()}
    perfs["actual_perf"] = {}

    assert np.all(bound_ends - bound_starts == profile_length), \
        "Performance-bound coordinates of %s are not all of length %d" % (
            tf_name, profile_length
        )

    for fold in tqdm.notebook.trange(1, 11):
        with h5py.File(predictions_paths[tf_name][fold], "r") as pred_reader:
            # Check that the coordinates match up exactly
            pred_coords = pred_reader["coords"]
            pred_chroms = pred_coords["coords_chrom"][:]
            pred_starts = pred_coords["coords_start"][:]
            pred_ends = pred_coords["coords_end"][:]

            assert np.all(pred_ends - pred_starts == input_length), \
                "Prediction coordinates of %s fold %d are not all of length %d" % (
                    tf_name, fold, input_length
                )
            # `np.array_equal` also fails cleanly (instead of raising or
            # broadcasting) if the two files hold different numbers of peaks
            assert np.array_equal(bound_starts - flank, pred_starts), \
                "Start coordinates of %s fold %d do not match the bounds" % (
                    tf_name, fold
                )
            assert np.array_equal(bound_ends + flank, pred_ends), \
                "End coordinates of %s fold %d do not match the bounds" % (
                    tf_name, fold
                )
            assert np.array_equal(bound_chroms, pred_chroms), \
                "Chromosomes of %s fold %d do not match the bounds" % (
                    tf_name, fold
                )

            actual_perf = pred_reader["performance"]
            actual_perf_dict = {
                key: actual_perf[key][:] for key in actual_perf.keys()
            }
            true_profs = pred_reader["predictions"]["true_profs"][:]

        # Compute the normalized NLL for the actual performance: the NLL
        # divided by the true profiles summed over axis 2 and then averaged
        # over the last remaining axis (presumably the total read count
        # averaged over strands -- TODO confirm the axis layout)
        actual_perf_dict["norm_nll"] = actual_perf_dict["nll"] / \
            np.mean(np.sum(true_profs, axis=2), axis=2)

        perfs["actual_perf"][fold] = actual_perf_dict

    return perfs

### Plot the performance bounds

In [None]:
def plot_perf_bounds(tf_perfs, tf_name):
    """
    Plots the performance of every fold next to the lower and upper
    performance bounds, one figure per metric. Profile metrics are shown as
    violin plots over peaks; count metrics are shown as bar plots.
    Arguments:
        `tf_perfs`: dictionary of metrics in the format returned by
            `import_tf_metrics`, with keys `lower_perf`, `upper_perf`, and
            `actual_perf` (the latter keyed by fold number)
        `tf_name`: name of the TF, used in the figure titles
    Every figure is shown with `plt.show()` and then closed; nothing is
    returned.
    """
    # Plot whichever folds were imported, in ascending order
    folds = sorted(tf_perfs["actual_perf"].keys())
    num_groups = len(folds) + 2  # Lower bound + folds + upper bound
    tick_labels = ["Randomized"] + ["Fold %d" % fold for fold in folds] + \
        ["Pseudoreps"]

    def create_violins(ax, lower_perfs, actual_perfs_list, upper_perfs):
        """
        Draws one violin for the lower bound, one for each entry of
        `actual_perfs_list`, and one for the upper bound. Values farther than
        1.5 IQRs outside of the quartiles are left out of the violin bodies.
        A black bar spans the first to third quartile and a white dot marks
        the median; these are computed on the unclipped values, ignoring NaNs.
        """
        num_actual_perfs = len(actual_perfs_list)
        all_data = np.stack([lower_perfs] + actual_perfs_list + [upper_perfs], axis=0)
        # The third quartile is the 75th percentile
        q1, med, q3 = np.nanpercentile(all_data, [25, 50, 75], axis=1)
        iqr = q3 - q1
        lower_outlier = q1 - (1.5 * iqr)
        upper_outlier = q3 + (1.5 * iqr)

        # Remove outliers; NaNs fail both comparisons and are dropped too
        sorted_clipped_data = [
            np.sort(vec[(vec >= lower_outlier[i]) & (vec <= upper_outlier[i])])
            for i, vec in enumerate(all_data)
        ]

        plot_parts = ax.violinplot(
            sorted_clipped_data, showmeans=False, showmedians=False, showextrema=False
        )
        violin_parts = plot_parts["bodies"]
        violin_parts[0].set_facecolor("coral")
        violin_parts[0].set_edgecolor("coral")
        violin_parts[0].set_alpha(0.7)
        for i in range(1, num_actual_perfs + 1):
            violin_parts[i].set_facecolor("mediumorchid")
            violin_parts[i].set_edgecolor("mediumorchid")
            violin_parts[i].set_alpha(0.7)
        violin_parts[-1].set_facecolor("slateblue")
        violin_parts[-1].set_edgecolor("slateblue")
        violin_parts[-1].set_alpha(0.7)

        # Violins are placed at x = 1, 2, ...
        inds = np.arange(1, num_actual_perfs + 2 + 1)
        ax.vlines(inds, q1, q3, color="black", linewidth=5, zorder=1)
        ax.scatter(inds, med, marker="o", color="white", s=30, zorder=2)

    # Profile metrics
    for metric_key, metric_name in [
        ("nll", "NLL"), ("norm_nll", "Normalized NLL"), ("jsd", "JSD"), ("profile_mse", "Profile MSE"),
        ("profile_pearson", "Profile Pearson"), ("profile_spearman", "Profile Spearman")
    ]:
        # Average over axis 1 to get one value per row (presumably one value
        # per peak, averaged over tasks -- TODO confirm the axis layout)
        lower_perfs = np.nanmean(tf_perfs["lower_perf"][metric_key], axis=1)
        upper_perfs = np.nanmean(tf_perfs["upper_perf"][metric_key], axis=1)
        actual_perfs_list = [
            np.nanmean(tf_perfs["actual_perf"][fold][metric_key], axis=1)
            for fold in folds
        ]
        fig, ax = plt.subplots(figsize=(20, 5))
        create_violins(ax, lower_perfs, actual_perfs_list, upper_perfs)
        ax.set_title("%s of %s predictions across folds\nGenome-wide performance" % (metric_name, tf_name))
        ax.set_xticks(np.arange(1, num_groups + 1))
        ax.set_xticklabels(tick_labels)
        plt.show()
        plt.close(fig)  # Do not let figures accumulate across metrics and TFs

    # Count metrics
    for metric_key, metric_name in [
        ("count_mse", "Count MSE"), ("count_pearson", "Count Pearson"), ("count_spearman", "Count Spearman")
    ]:
        # Each count metric is reduced to a single number per bar
        lower_bound = np.nanmean(tf_perfs["lower_perf"][metric_key])
        upper_bound = np.nanmean(tf_perfs["upper_perf"][metric_key])
        actual_perfs_list = [
            np.nanmean(tf_perfs["actual_perf"][fold][metric_key]) for fold in folds
        ]
        fig, ax = plt.subplots(figsize=(20, 5))

        label_locs = np.arange(num_groups)  # Location of labels

        ax.bar(
            label_locs, [lower_bound] + actual_perfs_list + [upper_bound],
            color=(["coral"] + (len(folds) * ["mediumorchid"]) + ["slateblue"]), alpha=0.7
        )
        ax.set_title("%s of %s predictions across folds\nGenome-wide performance" % (metric_name, tf_name))
        ax.set_xticks(label_locs)
        ax.set_xticklabels(tick_labels)
        plt.show()
        plt.close(fig)  # Do not let figures accumulate across metrics and TFs

### Plot performance metrics

In [None]:
# Handle one TF at a time: import its metrics, plot them, and then drop the
# reference to them before moving on to the next TF
for tf_name in tf_names:
    print(tf_name)
    tf_perfs = import_tf_metrics(tf_name)
    plot_perf_bounds(tf_perfs, tf_name)
    del tf_perfs