### Direct links to results
[Multi-task profile model performance across all 10 folds](#multitask-fold)

[Single-task profile model performance across all 10 folds](#singletask-fold)

[Fine-tuned multi-task profile model task-specific performance](#finetune-multitask-task)

In [None]:
import h5py
import numpy as np
import scipy.stats
import scipy.special
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import pandas as pd
import json
import os
import tqdm
tqdm.tqdm_notebook()

In [None]:
# Plotting defaults
# Register the custom fonts with Matplotlib so that the "Roboto" family set
# below can be resolved. `font_manager.createFontList` was deprecated in
# Matplotlib 3.2 and subsequently removed; `FontManager.addfont` is its
# replacement, so use it whenever it exists and fall back to the legacy call
# only on older Matplotlib versions
font_paths = font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
if hasattr(font_manager.fontManager, "addfont"):
    for font_path in font_paths:
        try:
            font_manager.fontManager.addfont(font_path)
        except (OSError, RuntimeError):
            # The legacy `createFontList` skipped unreadable font files
            # instead of failing, so keep that best-effort behavior
            continue
else:
    font_manager.fontManager.ttflist.extend(
        font_manager.createFontList(font_paths)
    )

# Global text sizes and font used by every figure in this notebook
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)

### Define constants and paths

In [None]:
# Notebook parameters are passed in through environment variables; a missing
# variable raises a KeyError here
tf_name = os.environ["TFM_RESULTS_TF_NAME"]
num_tasks = int(os.environ["TFM_RESULTS_NUM_TASKS"])
best_multitask_fold = int(os.environ["TFM_RESULTS_BEST_MULTITASK_FOLD"])
# Comma-separated list of fold numbers (used below as one entry per task)
best_singletask_folds = list(
    map(int, os.environ["TFM_RESULTS_BEST_SINGLETASK_FOLDS"].split(","))
)

# Echo the parameters so that the rendered notebook records what was run
print("TF name: %s" % tf_name)
print("Number of tasks: %d" % num_tasks)
print("Best multi-task fold: %d" % best_multitask_fold)
print("Best single-task folds: %s" % " ".join(map(str, best_singletask_folds)))

In [None]:
# Root directory under which all prediction/performance HDF5 files are saved
preds_base = "/users/amtseng/tfmodisco/results/peak_predictions/"

# Fold numbers for which models are available (folds 1 through 10)
all_folds = range(1, 11)

# Multi-task profile models: one path per fold
multitask_path_template = "{0}_multitask_profile_fold{1}/{0}_multitask_profile_fold{1}_pred_perf.h5"
multitask_preds_paths = [
    os.path.join(
        preds_base,
        "multitask_profile",
        multitask_path_template.format(tf_name, fold)
    )
    for fold in all_folds
]

# Single-task profile models: outer list is over tasks, inner list is over
# the 10 folds for that task
singletask_path_template = "{0}_singletask_profile_fold{1}/task_{2}/{0}_singletask_profile_task{2}_fold{1}_pred_perf.h5"
singletask_preds_paths = [
    [
        os.path.join(
            preds_base,
            "singletask_profile",
            singletask_path_template.format(tf_name, fold, task_index)
        )
        for fold in all_folds
    ]
    for task_index in range(num_tasks)
]

# Fine-tuned multi-task profile model (only the best multi-task fold)
multitask_finetuned_path_template = "{0}_multitask_profile_finetune_fold{1}/{0}_multitask_profile_finetune_fold{1}_pred_perf.h5"
multitask_finetuned_preds_path = os.path.join(
    preds_base,
    "multitask_profile_finetune",
    multitask_finetuned_path_template.format(tf_name, best_multitask_fold)
)

# Fine-tuned single-task profile models: one path per task, where entry `i`
# of `best_singletask_folds` gives the fold used for task `i`
singletask_finetuned_path_template = "{0}_singletask_profile_finetune_fold{1}/task_{2}/{0}_singletask_profile_finetune_task{2}_fold{1}_pred_perf.h5"
singletask_finetuned_preds_paths = [
    os.path.join(
        preds_base,
        "singletask_profile_finetune",
        singletask_finetuned_path_template.format(tf_name, fold, task_index)
    )
    for task_index, fold in enumerate(best_singletask_folds)
]

# Upper-bound and lower-bound performance metrics
perf_bounds_path = "/users/amtseng/tfmodisco/results/performance_bounds/{0}_performance_bounds.h5".format(tf_name)

# File specifications (including peak files)
files_spec_json = "/users/amtseng/tfmodisco/data/processed/ENCODE/config/{0}/{0}_training_paths.json".format(tf_name)
with open(files_spec_json, "r") as spec_file:
    files_spec = json.load(spec_file)

# Chromosome split definition, keyed by fold number as a string; only the
# "test" chromosomes of each fold are used here
chrom_splits_json = "/users/amtseng/tfmodisco/data/processed/ENCODE/chrom_splits.json"
with open(chrom_splits_json, "r") as splits_file:
    chrom_splits = json.load(splits_file)

all_fold_test_chroms = [chrom_splits[str(fold)]["test"] for fold in all_folds]
best_multitask_fold_test_chroms = chrom_splits[str(best_multitask_fold)]["test"]
best_singletask_fold_test_chroms = [
    chrom_splits[str(fold)]["test"] for fold in best_singletask_folds
]

In [None]:
# `input_length` is the length that peak coordinates are padded to around each
# summit; `profile_length` is defined alongside it but is not referenced in the
# rest of this section
input_length, profile_length = 2114, 1000

### Import peak coordinates for each task
For each task, import the set of peaks belonging to that task. This allows us to get a set of indices for coordinates in the saved predictions/performance files that correspond to each task.

In [None]:
# Build one coordinate table per task from its peak BED file, with every peak
# re-centered on its summit and padded to `input_length`
narrowpeak_columns = [
    "chrom", "peak_start", "peak_end", "name", "score",
    "strand", "signal", "pval", "qval", "summit_offset"
]
half_input_length = input_length // 2

task_coords = []
assert len(files_spec["peak_beds"]) == num_tasks
for peak_bed_path in files_spec["peak_beds"]:
    # Compression is inferred by Pandas from the file name
    peaks = pd.read_csv(
        peak_bed_path, sep="\t", header=None, names=narrowpeak_columns
    )

    # The summit is stored as an offset from the start of the peak
    summits = peaks["peak_start"] + peaks["summit_offset"]

    # Window of exactly `input_length` around the summit
    starts = summits - half_input_length
    task_coords.append(
        pd.DataFrame({
            "chrom": peaks["chrom"],
            "start": starts,
            "end": starts + input_length
        })
    )

### Helper functions
For subsetting predictions/performance metrics to peak subsets, extracting performance metrics, and plotting

In [None]:
def subset_coord_inds(superset_coords, subset_coords):
    """
    Finds which rows of `superset_coords` have coordinates that also appear
    in `subset_coords`.
    Arguments:
        `superset_coords`: a Pandas DataFrame with (at least) the columns
            `chrom`, `start`, and `end`
        `subset_coords`: a Pandas DataFrame with (at least) the columns
            `chrom`, `start`, and `end`
    Returns a sorted NumPy array of unique indices into `superset_coords`
    (indices being the 0-indexed row numbers, regardless of the DataFrame's
    own index labels). If a coordinate is duplicated in `superset_coords`,
    only its first occurrence is returned; duplicates in `subset_coords` do
    not cause an index to be repeated.
    """
    coord_cols = ["chrom", "start", "end"]

    # Attach explicit row numbers so the result does not depend on whatever
    # index labels the caller's DataFrame happens to carry
    superset = superset_coords[coord_cols].reset_index(drop=True)
    superset["row"] = np.arange(len(superset))
    superset = superset.drop_duplicates(coord_cols)

    # Deduplicate the subset as well; otherwise the inner join would emit
    # the same superset row once per duplicate
    subset = subset_coords[coord_cols].drop_duplicates()

    matched = superset.merge(subset, on=coord_cols)

    # `np.unique` both sorts and guarantees uniqueness
    return np.unique(matched["row"].values)

In [None]:
def compute_nll_log_probs(nlls, profiles):
    """
    Computes the log probability portion of the NLL by adding back the log
    of the multinomial coefficient, log(N!/x1!...xk!), which equals
    log(N!) - sum_i log(xi!).
    Arguments:
        `nlls`: An N x T array of NLLs (strands averaged)
        `profiles`: An N x T x O x 2 corresponding array of true profile counts
            (that were used to compute the NLLs)
    Returns an N x T array of NLL log probabilities.
    """
    profiles = np.asarray(profiles)

    # Total counts per strand: N x T x 2
    counts = np.sum(profiles, axis=2)

    # gammaln(n + 1) == log(n!)
    log_n_fact = scipy.special.gammaln(counts + 1)
    log_x_fact_sum = np.sum(scipy.special.gammaln(profiles + 1), axis=2)

    # Log multinomial coefficient per strand, averaged over the two strands
    # to match the strand-averaged NLLs. Shape: N x T
    log_coeff = np.mean(log_n_fact - log_x_fact_sum, axis=2)
    return nlls + log_coeff

In [None]:
def extract_performance_metrics(pred_perf_path, coord_sets=None, task_inds=None):
    """
    Extracts the set of performance metrics from a saved predictions/performance
    HDF5 file. If specified, filters for coordinates that are in `coord_sets`.
    `coord_sets` is a list of coordinate DataFrames, and a set of metrics will
    be fetched for each table of coordinates provided. Otherwise, will simply
    return all coordinates available (i.e. only one coordinate set with all
    coordinates). If `task_inds` is specified, it must be a list of indices
    parallel to `coord_sets`. For each coordinate set, the metrics extracted
    will be for that task index only. If unspecified, the average over all tasks
    is retained for each coordinate set.
    Raises a ValueError if `task_inds` has fewer entries than there are
    coordinate sets, or if the file contains an unrecognized count metric
    while `coord_sets` is given.
    Returns a dictionary of the following form:
        `nll`: [
            <NLL vector for coord set 1>
            <NLL vector for coord set 2>
            ...
        ],
        `count_mse`: [
            <MSE scalar for coord set 1>
            <MSE scalar for coord set 2>
            ...
        ]
        ...
    In addition to every metric saved in the file, the result has the key
    `nll_log_probs`, computed from the saved NLLs and true profiles.
    """
    result = {}

    # Opening the file as a context manager guarantees the HDF5 handle is
    # released even if extraction raises partway through
    with h5py.File(pred_perf_path, "r") as reader:
        coord_reader = reader["coords"]
        pred_reader = reader["predictions"]
        perf_reader = reader["performance"]

        # First, get the set of indices within the HDF5 predictions/performance
        # that correspond to the given coordinate sets
        if coord_sets is None:
            subset_inds = [np.arange(perf_reader["nll"].shape[0])]  # The entire vector
        else:
            # Import the DataFrame of coordinates in this HDF5
            pred_perf_coords = pd.DataFrame(
                data={
                    "chrom": coord_reader["coords_chrom"][:].astype(str),
                    "start": coord_reader["coords_start"][:],
                    "end": coord_reader["coords_end"][:]
                }
            )
            subset_inds = [
                subset_coord_inds(pred_perf_coords, coord_set)
                for coord_set in coord_sets
            ]

        # Get one NumPy array of task indices per coordinate set. If no task
        # indices were given, every coordinate set uses all tasks (a single
        # shared entry would break as soon as there are 2+ coordinate sets)
        if task_inds is None:
            all_tasks = np.arange(perf_reader["nll"].shape[1])
            task_inds = [all_tasks] * len(subset_inds)
        else:
            if len(task_inds) < len(subset_inds):
                raise ValueError(
                    "Got %d task indices for %d coordinate sets" % (
                        len(task_inds), len(subset_inds)
                    )
                )
            task_inds = [np.array([i]) for i in task_inds]

        # For each performance metric, for each coordinate set/task index,
        # extract the metrics values
        for key in perf_reader.keys():
            metrics_list = []
            for subset, tasks in zip(subset_inds, task_inds):
                if len(perf_reader[key].shape) >= 2:  # Profile metric
                    metrics_list.append(
                        np.mean(perf_reader[key][subset][:, tasks], axis=1)
                    )
                elif coord_sets is None:
                    # Count metric over all coordinates: the saved value
                    # already covers the entire set, so no need to recompute
                    metrics_list.append(np.mean(perf_reader[key][tasks]))
                else:
                    # Count metric over a limited coordinate set: recompute
                    # it (i.e. MSE and correlations) from the saved counts,
                    # since the saved value was for the entire set
                    log_true_counts = np.ravel(
                        np.log(pred_reader["true_counts"][subset][:, tasks] + 1)
                    )
                    log_pred_counts = np.ravel(
                        pred_reader["log_pred_counts"][subset][:, tasks]
                    )
                    if key == "count_mse":
                        metrics_list.append(
                            np.mean(np.square(log_true_counts - log_pred_counts))
                        )
                    elif key == "count_pearson":
                        metrics_list.append(
                            scipy.stats.pearsonr(log_true_counts, log_pred_counts)[0]
                        )
                    elif key == "count_spearman":
                        metrics_list.append(
                            scipy.stats.spearmanr(log_true_counts, log_pred_counts)[0]
                        )
                    else:
                        raise ValueError("Unknown count metric key: %s" % key)

            result[key] = metrics_list

        # Compute the NLL log probs for the actual performance
        nll_log_probs = []
        for subset, tasks in zip(subset_inds, task_inds):
            log_probs = compute_nll_log_probs(
                perf_reader["nll"][subset][:, tasks],
                pred_reader["true_profs"][subset][:, tasks]
            )
            nll_log_probs.append(np.mean(log_probs, axis=1))
        result["nll_log_probs"] = nll_log_probs

    return result

In [None]:
def extract_performance_bounds(perf_bounds_path, input_length, coord_sets=None, task_inds=None):
    """
    Extracts the set of lower and upper bound performance metrics from a saved
    HDF5 file. `input_length` is the length of input sequence to use.
    If specified, filters for coordinates that are in `coord_sets`.
    `coord_sets` is a list of coordinate DataFrames, and a set of metrics will
    be fetched for each table of coordinates provided. Otherwise, will simply
    return all coordinates available (i.e. only one coordinate set with all
    coordinates). If `task_inds` is specified, it must be a list of indices
    parallel to `coord_sets`. For each coordinate set, the metrics extracted
    will be for that task index only. If unspecified, the average over all tasks
    is retained for each coordinate set.
    Raises a ValueError if `task_inds` has fewer entries than there are
    coordinate sets.
    Returns a dictionary of the following form:
        `nll`: [
            (
                <lower-bound NLL vector for coord set 1>,
                <upper-bound NLL vector for coord set 1>
            ),
            (
                <lower-bound NLL vector for coord set 2>,
                <upper-bound NLL vector for coord set 2>
            ),
            
            ...
        ],
        `count_mse`: [
            (
                <lower-bound MSE scalar for coord set 1>,
                <upper-bound MSE scalar for coord set 1>
            ),
            (
                <lower-bound MSE scalar for coord set 2>,
                <upper-bound MSE scalar for coord set 2>
            ),
            ...
        ]
        ...
    """
    result = {}

    # Opening the file as a context manager guarantees the HDF5 handle is
    # released even if extraction raises partway through
    with h5py.File(perf_bounds_path, "r") as reader:
        coord_reader = reader["coords"]
        lower_perf_reader = reader["performance_lower"]
        upper_perf_reader = reader["performance_upper"]

        # First, get the set of indices within the HDF5 performance bounds
        # that correspond to the given coordinate sets
        if coord_sets is None:
            subset_inds = [np.arange(lower_perf_reader["nll"].shape[0])]  # The entire vector
        else:
            # Import the DataFrame of coordinates in this HDF5
            perf_coords = pd.DataFrame(
                data={
                    "chrom": coord_reader["coords_chrom"][:].astype(str),
                    "start": coord_reader["coords_start"][:],
                    "end": coord_reader["coords_end"][:]
                }
            )

            # Unlike the predictions, the performance bounds are computed
            # solely based on profiles, so their saved coordinates have a
            # different length, although they are centered at the same
            # summit. So we need to re-pad them.
            perf_coords["midpoint"] = (perf_coords["start"] + perf_coords["end"]) // 2
            perf_coords["start"] = perf_coords["midpoint"] - (input_length // 2)
            perf_coords["end"] = perf_coords["start"] + input_length
            del perf_coords["midpoint"]

            subset_inds = [
                subset_coord_inds(perf_coords, coord_set)
                for coord_set in coord_sets
            ]

        # Get one NumPy array of task indices per coordinate set. If no task
        # indices were given, every coordinate set uses all tasks (a single
        # shared entry would break as soon as there are 2+ coordinate sets)
        if task_inds is None:
            all_tasks = np.arange(lower_perf_reader["nll"].shape[1])
            task_inds = [all_tasks] * len(subset_inds)
        else:
            if len(task_inds) < len(subset_inds):
                raise ValueError(
                    "Got %d task indices for %d coordinate sets" % (
                        len(task_inds), len(subset_inds)
                    )
                )
            task_inds = [np.array([i]) for i in task_inds]

        # For each performance metric, for each coordinate set/task index,
        # extract the metrics values for lower and upper bound
        for key in lower_perf_reader.keys():
            metrics_list = []
            for subset, tasks in zip(subset_inds, task_inds):
                if len(lower_perf_reader[key].shape) >= 2:  # Profile metric
                    bounds = (
                        np.mean(lower_perf_reader[key][subset][:, tasks], axis=1),
                        np.mean(upper_perf_reader[key][subset][:, tasks], axis=1)
                    )
                else:  # Count metric
                    # For the performance bounds, we'll use counts metrics
                    # as-is without recomputing them; this is because the
                    # counts metrics are distributed very uniformly
                    bounds = (
                        np.mean(lower_perf_reader[key][tasks]),
                        np.mean(upper_perf_reader[key][tasks])
                    )
                metrics_list.append(bounds)

            result[key] = metrics_list

        # Note that the NLL log probs is already part of the saved HDF5,
        # so we don't need to compute it separately here

    return result

In [None]:
def plot_performances(perf_dict, title=None, cond_labels=None, cond_colors=None):
    """
    Creates plots for a performance dictionary of the following form:
        `nll`: [
            <NLL vector for cond 1>
            <NLL vector for cond 2>
            ...
        ],
        `count_mse`: [
            <MSE scalar for cond 1>
            <MSE scalar for cond 2>
            ...
        ]
        ...
    For profile metrics (i.e. where the metrics are a vector), creates violin
    plots. For count metrics (i.e. where the metrics are a scalar), creates bar
    plots. After each plot, the average value of each condition is printed.
    `title`, if given, is prepended to the metric name in each plot title.
    `cond_labels` and `cond_colors` must be arrays parallel to the set of
    vectors or scalars for each metric. If `cond_colors` is not given, every
    condition is drawn in the same default color.
    """
    def create_violins(ax, metrics_list, colors):
        """
        Creates a violin plot on the given instantiated axes.
        `metrics_list` is a list of vectors. `colors` is a parallel
        list of colors for each violin.
        """
        num_perfs = len(metrics_list)

        # Quartiles of each vector (ignoring NaNs); after stacking, each of
        # `q1`, `med`, and `q3` holds one value per vector
        q1, med, q3 = np.stack([
            np.nanpercentile(data, [25, 50, 75], axis=0) for data in metrics_list
        ], axis=1)
        iqr = q3 - q1
        lower_outlier = q1 - (1.5 * iqr)
        upper_outlier = q3 + (1.5 * iqr)

        # Remove outliers based on the 1.5 * IQR rule; NaNs fail both
        # comparisons and so are removed as well
        sorted_clipped_data = [
            np.sort(vec[(vec >= lower_outlier[i]) & (vec <= upper_outlier[i])])
            for i, vec in enumerate(metrics_list)
        ]

        plot_parts = ax.violinplot(
            sorted_clipped_data, showmeans=False, showmedians=False, showextrema=False
        )
        violin_parts = plot_parts["bodies"]
        for i in range(num_perfs):
            violin_parts[i].set_facecolor(colors[i])
            violin_parts[i].set_edgecolor(colors[i])
            violin_parts[i].set_alpha(0.7)

        # Overlay the interquartile range as a thick bar and the median as a
        # white dot; violins are placed at x-positions 1, 2, ...
        inds = np.arange(1, num_perfs + 1)
        ax.vlines(inds, q1, q3, color="black", linewidth=5, zorder=1)
        ax.scatter(inds, med, marker="o", color="white", s=30, zorder=2)

    num_conds = len(perf_dict["nll"])
    if not cond_colors:
        cond_colors = ["mediumorchid"] * num_conds

    # Profile metrics
    for metric_key, metric_name in [
        ("nll", "NLL"), ("nll_log_probs", "NLL log probs"), ("jsd", "JSD"), ("profile_mse", "Profile MSE"),
        ("profile_pearson", "Profile Pearson"), ("profile_spearman", "Profile Spearman")
    ]:
        fig, ax = plt.subplots(figsize=(20, 5))
        create_violins(ax, perf_dict[metric_key], cond_colors)
        if title:
            ax.set_title("%s: %s" % (title, metric_name))
        else:
            ax.set_title(metric_name)
        if cond_labels:
            ax.set_xticks(np.arange(1, num_conds + 1))
            ax.set_xticklabels(cond_labels)
        plt.show()
        print("Average values: " + " ".join([("%.3f" % np.nanmean(arr)) for arr in perf_dict[metric_key]]))

    # Count metrics
    for metric_key, metric_name in [
        ("count_mse", "Count MSE"), ("count_pearson", "Count Pearson"), ("count_spearman", "Count Spearman")
    ]:
        fig, ax = plt.subplots(figsize=(20, 5))
        label_locs = np.arange(num_conds)  # Location of labels
        ax.bar(
            label_locs, perf_dict[metric_key], color=cond_colors, alpha=0.7
        )
        if title:
            ax.set_title("%s: %s" % (title, metric_name))
        else:
            ax.set_title(metric_name)
        if cond_labels:
            ax.set_xticks(label_locs)
            ax.set_xticklabels(cond_labels)
        plt.show()
        print("Average values: " + " ".join([("%.3f" % val) for val in perf_dict[metric_key]]))

<a id="multitask-fold"></a>
### Multi-task profile model performance across all 10 folds
A comparison of the test-set performance (averaged across tasks) between:
1. Multi-task profile models trained across all 10 folds
2. A fine-tuned multi-task profile model on the best-performing fold
3. Upper and lower bounds on the best-performing fold

In [None]:
# With no coordinate sets given, the bounds hold a single (lower, upper) pair
# per metric, covering all coordinates and averaged over tasks
multitask_bounds_perf_dict = extract_performance_bounds(
    perf_bounds_path, input_length
)

# Each metric's list of conditions starts with the lower bound
multitask_perf_dict = {
    key: [bounds[0][0]] for key, bounds in multitask_bounds_perf_dict.items()
}

# Next come the 10 folds, followed by the fine-tuned model
for pred_path in multitask_preds_paths + [multitask_finetuned_preds_path]:
    perf_dict = extract_performance_metrics(pred_path)
    for key, values in multitask_perf_dict.items():
        values.append(perf_dict[key][0])

# The upper bound goes last
for key, values in multitask_perf_dict.items():
    values.append(multitask_bounds_perf_dict[key][0][1])

# Labels and colors parallel to the conditions assembled above
cond_labels = (
    ["Randomized"]
    + ["Fold %d" % i for i in range(1, 11)]
    + ["Fine-tuned\n(fold %d)" % best_multitask_fold]
    + ["Pseudoreps"]
)
cond_colors = ["coral"] + ["mediumorchid"] * 10 + ["seagreen"] + ["slateblue"]
plot_performances(
    multitask_perf_dict,
    title="%s multi-task models" % tf_name,
    cond_labels=cond_labels,
    cond_colors=cond_colors
)

del multitask_perf_dict

<a id="singletask-fold"></a>
### Single-task profile model performance across all 10 folds
**For each task**, a comparison of the test-set performance between:
1. Single-task profile models trained across all 10 folds
2. A fine-tuned single-task profile model on the best-performing fold
3. Upper and lower bounds on the best-performing fold

In [None]:
for task_index, coords in enumerate(task_coords):
    # Bounds restricted to this task's peaks and this task's index, giving a
    # single (lower, upper) pair per metric
    singletask_bounds_perf_dict = extract_performance_bounds(
        perf_bounds_path, input_length,
        coord_sets=[coords], task_inds=[task_index]
    )

    # Each metric's list of conditions starts with the lower bound
    singletask_perf_dict = {
        key: [bounds[0][0]] for key, bounds in singletask_bounds_perf_dict.items()
    }

    # Next come the 10 folds, followed by the fine-tuned model. No need to
    # specify specific coordinates or task indices, because single-task model
    # predictions are saved only for that one task
    model_paths = singletask_preds_paths[task_index] + [
        singletask_finetuned_preds_paths[task_index]
    ]
    for pred_path in model_paths:
        perf_dict = extract_performance_metrics(pred_path)
        for key, values in singletask_perf_dict.items():
            values.append(perf_dict[key][0])

    # The upper bound goes last
    for key, values in singletask_perf_dict.items():
        values.append(singletask_bounds_perf_dict[key][0][1])

    # Labels and colors parallel to the conditions assembled above
    cond_labels = (
        ["Randomized"]
        + ["Fold %d" % i for i in range(1, 11)]
        + ["Fine-tuned\n(fold %d)" % best_singletask_folds[task_index]]
        + ["Pseudoreps"]
    )
    cond_colors = ["coral"] + ["mediumorchid"] * 10 + ["seagreen"] + ["slateblue"]
    plot_performances(
        singletask_perf_dict,
        title="%s single-task models (task %d)" % (tf_name, task_index),
        cond_labels=cond_labels,
        cond_colors=cond_colors
    )

    del singletask_perf_dict

<a id="finetune-multitask-task"></a>
### Fine-tuned multi-task profile model task-specific performance
A comparison of the test-set performance between the individual tasks of a multi-task profile model fine-tuned on the best-performing fold

In [None]:
# One condition per task: each task's own peaks, scored on that task's
# output of the fine-tuned multi-task model
num_task_sets = len(task_coords)
finetune_multitask_perf_dict = extract_performance_metrics(
    multitask_finetuned_preds_path,
    coord_sets=task_coords,
    task_inds=list(range(num_task_sets))
)

cond_labels = ["Task %d" % i for i in range(num_task_sets)]
cond_colors = ["mediumorchid" for _ in range(num_task_sets)]
plot_performances(
    finetune_multitask_perf_dict,
    title="%s fine-tuned multi-task model" % tf_name,
    cond_labels=cond_labels,
    cond_colors=cond_colors
)

del finetune_multitask_perf_dict