In [1]:
%matplotlib inline
import glob
import json
import matplotlib
import os
import sys

sys.path.insert(0,'/srv/home/nmew/myprojects/neural-nets/')
from collections import defaultdict, OrderedDict, Counter
import common.util as ut
from itertools import izip
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix, r2_score, precision_recall_fscore_support
import common.metrics as met
import common.plots as plo
from common.h5py_loading import load_target_map, load_dataset
from common.h5py_data_loader import H5pyDataLoader
from common.prediction_analysis import df_from_chembl_export, intersect_truth_prediction
from lasagne_nn.run_nn import get_predictions_of_knowns, get_network_from_weights
from lasagne_nn.output_loader import df_from_prediction_path
import cPickle as pkl
# setups
matplotlib.rcParams['savefig.dpi'] = 300
context = 'paper'

    
##### misc functions #####
class  Experiment(dict):
    def __init__(self, name):
        self.name = name
        self.folds = []
        self.converged_epochs = []
        self.trained_paths = []
      
    def __repr__(self):
        return str(vars(self))
    
    def __str__(self):
        return json.dumps(vars(self), indent=2)
        
    def set_converged_epoch(self, epoch, train_path, fold=None):        
        self.folds.append(fold)
        self.converged_epochs.append(epoch)
        self.trained_paths.append(train_path)

        
def get_expts(expt_json):
    """
    Given a filename for a pnr experiment that contains converged epochs, 
    
    Parameters
    ----------
    expt_json : str
        *.json containing experiment converged epochs, folds, and paths

    Returns
    -------
    expt_epochs_json : list of Experiment
        list of class Experiment containing names, trained_paths, folds, and converged epochs for an experiment

    """
    converged_epochs = (expt_json)
    with open(converged_epochs, "r") as fp:
        expts = json.load(fp)
    experiments = []
    for e in expts:
        tmp = Experiment(e["name"])
        for epoch, path, fold in izip(e["converged_epochs"], e["trained_paths"], e["folds"]):
            tmp.set_converged_epoch(epoch, path, fold)
        experiments.append(tmp)
        del(tmp)    
    return experiments


##### Filesystem handler functions #####

def get_env_var(handle):
    """Get an environment variable given the handle for the bash variable
    
    Params
    ========
    handle : str
        handle for a bash variable
        
    Returns
    ========
    tmp : str
        environment variable as a string
    """ 
    tmp = os.getenv(handle)
    if not tmp:
        raise LookupError("Environment variable: {} not set.".format(handle))
    return tmp.strip("'")


##### Data reader functions #####

def get_easy_percentages(df):
    df['enforced_neg_percent'] = (100 / (1 + df['positive_negative_ratio']))
    df['ppos'] = 100 - df["enforced_neg_percent"]
    df.loc[df['positive_negative_ratio'] == 0.0, 'enforced_neg_percent'] = 0.0
    df.loc[df['positive_negative_ratio'] == 0.0, 'ppos'] = 100.0  
    df = df.sort_values('enforced_neg_percent')
    return df


##### Plotting functions #####

def set_nice_spines(ax=None, sides=('left', 'bottom'), color="black", linewidth=1.5):
    """
    Nice spine formatting for plots
    """
    if not ax:
        ax = plt.gca()
    for side in sides:
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color(color)
        ax.spines[side].set_linewidth(linewidth)
        

def plot_metrics_by_pnr(resdf, metrics, palette=None, metric_names=None, label_suffix='', linestyle='-', hatch=''):  
    """
    Plots a positive-negative ratio experiment given a dataframe of experiments, metrics of interest, and a list of converged epochs
    
    Parameters
    ----------
    df : pd.DataFrame
        pandas dataframe containing experiment metrics and info
    metric : str
        metric of interest (e.g. r2_train)
    expt_epochs_json : list of dicts
        list of dicts containing: trained_paths, folds, and converged epochs for an experiment
        
    Kwargs
    -------
    metric_names : list
        list of names for provided metrics
    label_suffix : str
        Name to add to end of legend names (Default '')
    linestyle : str
        matplotlib plotting variable for linestyle (Default '-')
    hatch : str
        What hatch to use for matplotlib filling of fold variance (Default '')
    """
    if metric_names is None:
        metric_names = list(metrics)
    if not palette:
        palette = sns.color_palette("husl", len(metrics))

    for i, (metric, metric_name) in enumerate(zip(metrics, metric_names)):
        metric_df = resdf.sort_values('positive_negative_ratio')
        metric_df = metric_df[['positive_negative_ratio', 'enforced_neg_percent', metric]]
        metric_df['is_mean'] = False
        metric_df = metric_df.rename(index=str, columns={'Score':metric})
        cfold_ctr = metric_df.groupby("positive_negative_ratio").count()
        missing_vals = cfold_ctr[cfold_ctr[metric]<5].index.values
        if len(missing_vals) > 0:
            print("Missing values for PNR {} tests: {}".format(metric, missing_vals))
        metric_df = metric_df[['enforced_neg_percent', metric]]

        mean_df = metric_df.groupby('enforced_neg_percent', as_index=False).mean()        
        max_err = metric_df.groupby('enforced_neg_percent', as_index=False).max()
        min_err = metric_df.groupby('enforced_neg_percent', as_index=False).min()


        plt.plot(mean_df['enforced_neg_percent'], mean_df[metric], label=metric_name+label_suffix, linestyle=linestyle, color=palette[i], lw=2)
        plt.fill_between(mean_df['enforced_neg_percent'], min_err[metric], max_err[metric], alpha=.25, color=palette[i], hatch=hatch)


because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

ERROR (theano.gpuarray): Could not initialize pygpu, support disabled
Traceback (most recent call last):
  File "/srv/home/ecaceres/anaconda2/envs/features/lib/python2.7/site-packages/theano/gpuarray/__init__.py", line 227, in <module>
    use(config.device)
  File "/srv/home/ecaceres/anaconda2/envs/features/lib/python2.7/site-packages/theano/gpuarray/__init__.py", line 214, in use
    init_dev(device, preallocate=preallocate)
  File "/srv/home/ecaceres/anaconda2/envs/features/lib/python2.7/site-packages/theano/gpuarray/__init__.py", line 99, in init_dev
    **args)
  File "pygpu/gpuarray.pyx", line 658, in pygpu.gpuarray.init
  File "pygpu/gpuarray.pyx", line 587, in pygpu.gpuarray.pygpu_init
GpuArrayException: Could not load "libcuda.so": libcuda.so: cannot open shared object file: No such file or directory


In [2]:
# data handling

def get_preds(dset_dict, expt, dset, fold, ratio=None):
    indir = data_dict[expt]["indir"]
    if ratio is not None:
        knowns_file = data_dict[expt]["knowns_fmt"].format(indir, ratio, dset, fold)
        preds_file = data_dict[expt]["preds_fmt"].format(indir, ratio, dset, fold)
    else:
        knowns_file = data_dict[expt]["knowns_fmt"].format(indir, dset, fold)
        preds_file = data_dict[expt]["preds_fmt"].format(indir, dset, fold)
        
    prediction = np.load(preds_file)
    truth = np.load(knowns_file)
    
    if dset_dict[expt]["npz_fmt"] == True:
        prediction = prediction["arr_0"]
        truth = truth["arr_0"]
    return prediction, truth


def predictions_knowns_from_trained_network_and_data(hdf5_file, weights_filename, build_nn_script,
                                                     test_indices_file=None,
                                                     npKi=False, multitask=False,
                                                     network_target_map_file=None,
                                                     dataset_target_map_file=None):
    network = get_network_from_weights(weights_filename, build_nn=build_nn_script)
    network_target_map = load_target_map(network_target_map_file) if network_target_map_file else None
    data_loader = H5pyDataLoader(hdf5_file=hdf5_file, test_indices_file=test_indices_file,
                                 npKi=npKi, multitask=multitask,
                                 target_map_file=dataset_target_map_file,
                                 train_percentage=None)
    data_loader.load_training_data()
    predictions, knowns = get_predictions_of_knowns(data_loader=data_loader,
                                                    weights_filename=weights_filename,
                                                    indices=data_loader.train_indices,
                                                    network=network,
                                                    network_target_map=network_target_map)
    return predictions, knowns

In [3]:
home_save_dir = get_env_var("HOME_SAVE_BASE")
srv_save_dir = get_env_var("DATA_SAVE_BASE")
proj_dir = get_env_var("NMEW_PROJ_BASE")
loss_dir = "{}/nnet_output".format(srv_save_dir)

std_sma_dir = "{}/lr_nesterov_1024_2048_3072/".format(loss_dir)
std_sma_basepath = os.path.join(std_sma_dir, "STD_SMA_RATIOS")

neg_rm_dir = "{}/all_negs_stochastic/".format(loss_dir)
neg_rm_basepath = os.path.join(neg_rm_dir, "NEG_RM_RATIOS")

In [4]:
preds_dir = "{}/20190410_SMA_Investigation/predictions".format(srv_save_dir)
sma_pnr_epochs = get_expts("{}/{}/experiments.json".format(preds_dir, "STD_SMA_RATIOS"))
neg_rm_pnr_epochs = get_expts("{}/{}/experiments.json".format(preds_dir, "NEG_RM_RATIOS"))

In [5]:
reg_data_file = "./regression_preds_file_lookup.pkl"
with open(reg_data_file, "rb") as f:
    data_dict = pkl.load(f)

In [6]:
df = pd.DataFrame(columns=["expt", "dset", "fold", "ratio", "AUROC", "AUPRC", "R2"])
thresh=5.0
expts = ["STD_SMA_RATIOS", "NEG_RM_RATIOS"]
make_data = False

if make_data:
    for expt in expts:
        ratio_explicit = data_dict[expt]["ratios"]
        for ratio in ratio_explicit:
            for dset in ["drugmatrix", "timesplit", "test", "train"]:
                for fold in np.arange(0,5,1):
                    prediction, truth = get_preds(data_dict, expt, dset, fold, ratio=ratio)

                    if dset != "drugmatrix":
                        known_mask = ~np.isnan(truth)
                        truth = truth[known_mask]
                        prediction = prediction[known_mask]

                    auroc, _, _, _, _ = met.compute_binary_auc(prediction, truth, 
                                                                   threshold=thresh, x_y_type="tpr-fpr")
                    auprc, _, _, _, _ = met.compute_binary_auc(prediction, truth, 
                                                                   threshold=thresh, x_y_type="precision-recall")
                    if dset == "drugmatrix":
                        known_mask = ~np.isnan(truth)
                        truth = truth[known_mask]
                        prediction = prediction[known_mask]

                    r2 = met.compute_rsquared(prediction, truth)
                    input_data={"expt":[expt],"dset" :[dset], "fold": [fold], "ratio":[ratio], "AUROC":[auroc], 
                                "AUPRC":[auprc], "R2":[r2]}
                    df = pd.concat([df, pd.DataFrame(input_data)], sort=False)

    plot_data_save_base = "{}/20190410_SMA_Investigation/plot_data/regression".format(get_env_var("DATA_SAVE_BASE"))
    df.to_csv("{}/ratio_plot_vals.tsv".format(plot_data_save_base), sep='\t')            

In [7]:
plot_data_save_base = "{}/20190410_SMA_Investigation/plot_data/regression".format(get_env_var("DATA_SAVE_BASE"))
sma_df = pd.read_csv("{}/ratio_plot_vals.tsv".format(plot_data_save_base), sep='\t', index_col=0)
sma_df.columns = [u'expt', u'dset', u'fold', u'positive_negative_ratio', u'AUROC', u'AUPRC', u'R2']
sma_df["positive_negative_ratio"] = sma_df.positive_negative_ratio.astype(np.float)
sma_df = get_easy_percentages(sma_df)


In [8]:
other_df = pd.read_csv("{}/neighbors_plot_vals.tsv".format(plot_data_save_base), sep='\t', index_col=0)
other_df.columns = [u'expt', u'dset', u'fold', u'positive_negative_ratio', u'AUROC', u'AUPRC', u'R2']
other_df["positive_negative_ratio"] = other_df.positive_negative_ratio.astype(np.float)
other_df = get_easy_percentages(other_df)
other_expts = ['NEG_RM', 'STD']
to_add = other_df[other_df.expt.isin(other_expts)]
to_add.positive_negative_ratio=0.0
to_add = get_easy_percentages(to_add)
m = to_add["expt"] == "NEG_RM"
to_add.expt[m] = 'NEG_RM_RATIOS'
to_add.expt[~m] = 'STD_SMA_RATIOS'

df = pd.concat([sma_df, to_add])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-

In [9]:
sma_resdf = df[df.expt == "STD_SMA_RATIOS"]
neg_resdf = df[df.expt == "NEG_RM_RATIOS"]

In [21]:
axes_size = 20
legend_size= 20
legend_col = 2
title_size=25

name_dict = {
    "drugmatrix" : "Drug Matrix",
    "timesplit" : "Time Split",
    "test" : "Test",
    "train" : "Train",
}

metrics = ["AUROC", "AUPRC", "R2"]
metric_names = metrics

dsets = ["drugmatrix", "timesplit", "test", "train"]


for d in dsets:
    fig, ax = plt.subplots(figsize=(20,12))
    sns.set_style("whitegrid")
    palette = sns.color_palette("husl", len(metrics))

    plot_metrics_by_pnr(sma_resdf[sma_resdf["dset"]==d], metrics, metric_names=metric_names, palette=palette, label_suffix=" SNA")
    plot_metrics_by_pnr(neg_resdf[neg_resdf["dset"]==d], metrics, metric_names=metric_names, palette=palette, label_suffix=' Negatives Removed +SNA', linestyle='--', hatch='//')
    set_nice_spines(ax=ax)
#     plt.title(d, fontsize=30)
    
    plt.ylim(ymin=0.)
    sns.despine()
    set_nice_spines(ax)
    ax.xaxis.grid(True, which='major', linestyle=':') 
    ax.set_xlim(0,101)
    ax.set_ylim(0,1.01)

    ax.yaxis.grid(True, which='major', linestyle=':')
    ax.tick_params(top=False, right=False)

    plt.suptitle('{} Regression Network Performance on Standard vs. Negative-removed Models'.format(name_dict[d]), size=title_size)
    plt.xlabel('Percent Negative Composition aim for training data (by protein target when # negatives < # positives)', size=axes_size)
    plt.ylabel('Score', size=axes_size)
    plt.legend(loc='lower center')
    plt.legend(bbox_to_anchor=(0, -0.25), loc=3, borderaxespad=0., fontsize=legend_size, ncol=legend_col)
    sns.set_context("poster")
    save_dir = "{}/paper_figs".format(get_env_var("HOME"))
    plt.savefig("{}/{}_regression_SNA_ratios.png".format(save_dir, d), dpi=300)
    plt.clf()
    

<matplotlib.figure.Figure at 0x7f873312a7d0>

<matplotlib.figure.Figure at 0x7f8732fb1190>

<matplotlib.figure.Figure at 0x7f8733326990>

<matplotlib.figure.Figure at 0x7f873a6bae10>

In [11]:
grouped = sma_resdf.groupby(["expt", "dset", "positive_negative_ratio", "enforced_neg_percent", "ppos"])
g1 = grouped.mean()
g2 = grouped.std()
g2.columns = ["fold_std", "AUROC std", "AUPRC std", "R2 std"]
sma_ratio_df = pd.merge(g1.reset_index(), g2.reset_index(), on=["expt", "dset", "positive_negative_ratio", 
                                                                "enforced_neg_percent", "ppos"])


In [12]:
grouped = neg_resdf.groupby(["expt", "dset", "positive_negative_ratio", "enforced_neg_percent", "ppos"])
g1 = grouped.mean()
g2 = grouped.std()
g2.columns = ["fold_std", "AUROC std", "AUPRC std", "R2 std"]
neg_ratio_df = pd.merge(g1.reset_index(), g2.reset_index(), on=["expt", "dset", "positive_negative_ratio", 
                                                                "enforced_neg_percent", "ppos"])

In [13]:
all_df = pd.concat([sma_ratio_df, neg_ratio_df]).groupby(["dset", "expt", "positive_negative_ratio", 
                                                                "enforced_neg_percent", "ppos"]).mean()

In [14]:
all_df = all_df[['AUROC', 'AUROC std', 'AUPRC', 'AUPRC std', 'R2', 'R2 std']]

In [15]:
all_df.to_csv("{}/all_ratio_plot_mean_std.csv".format(plot_data_save_base), sep='\t', index=True)