In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects
import seaborn as sns
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix, r2_score, precision_recall_fscore_support
import common.metrics as met
import common.plots as plo
from common.h5py_loading import load_target_map, load_dataset
from common.h5py_data_loader import H5pyDataLoader
from common.prediction_analysis import df_from_chembl_export, intersect_truth_prediction
from lasagne_nn.run_nn import get_predictions_of_knowns, get_network_from_weights
from lasagne_nn.output_loader import df_from_prediction_path

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

ERROR (theano.gpuarray): Could not initialize pygpu, support disabled
Traceback (most recent call last):
  File "/srv/home/ecaceres/anaconda2/envs/features/lib/python2.7/site-packages/theano/gpuarray/__init__.py", line 227, in <module>
    use(config.device)
  File "/srv/home/ecaceres/anaconda2/envs/features/lib/python2.7/site-packages/theano/gpuarray/__init__.py", line 214, in use
    init_dev(device, preallocate=preallocate)
  File "/srv/home/ecaceres/anaconda2/envs/features/lib/python2.7/site-packages/theano/gpuarray/__init__.py", line 99, in init_dev
    **args)
  File "pygpu/gpuarray.pyx", line 658, in pygpu.gpuarray.init
  File "pygpu/gpuarray.pyx", line 587, in pygpu.gpuarray.pygpu_init
GpuArrayException: Could not load "libcuda.so": libcuda.so: cannot open shared object file: No such file or directory


In [2]:
# data handling

def get_env_var(handle):
    '''Return the value of the environment variable named `handle`.

    Single-quote characters at either end of the value are stripped.

    Raises:
        LookupError: if the variable is unset or set to an empty string.
    '''
    value = os.environ.get(handle)
    if value:
        return value.strip("'")
    raise LookupError("Environment variable: {} not set.".format(handle))

def get_preds(dset, fold):
    '''Load the saved regression predictions and known values for one dataset/fold.

    Both archives are read from
    `$HOME_SAVE_BASE/output/20180815_Paper_Retrains/predictions/scrambled_idx`.

    Args:
        dset: dataset tag embedded in the file names (the notebook passes
            "train", "test", "timesplit" and "drugmatrix").
        fold: fold identifier embedded in the file names.

    Returns:
        (prediction, truth): the `arr_0` array of the `*_preds.npz` archive and
        the `arr_0` array of the `*_knowns.npz` archive, in that order.
    '''
    indir="{}/output/20180815_Paper_Retrains/predictions/scrambled_idx".format(get_env_var("HOME_SAVE_BASE"))
    knowns_file = "{}/scrambled_idx_1.0_{}_fold_{}_regression_knowns.npz".format(indir, dset, fold)
    preds_file = "{}/scrambled_idx_1.0_{}_fold_{}_regression_preds.npz".format(indir, dset, fold)

    # np.load on an .npz archive returns a lazy NpzFile that keeps the file
    # open; read the array inside a `with` so the handle is released instead of
    # accumulating across the per-fold plotting loops.
    with np.load(preds_file) as pred_npz:
        prediction = pred_npz["arr_0"]
    with np.load(knowns_file) as known_npz:
        truth = known_npz["arr_0"]

    return prediction, truth

def predictions_knowns_from_trained_network_and_data(hdf5_file, weights_filename, build_nn_script,
                                                     test_indices_file=None,
                                                     npKi=False, multitask=False,
                                                     network_target_map_file=None,
                                                     dataset_target_map_file=None):
    '''Rebuild a trained network and return its predictions alongside the knowns.

    Steps, in order: build the network from `weights_filename` via
    `get_network_from_weights`, optionally load the network's target map,
    create an `H5pyDataLoader` over `hdf5_file` (with `train_percentage=None`),
    load its training data, and call `get_predictions_of_knowns` on
    `data_loader.train_indices`.

    Args:
        hdf5_file: dataset file handed to `H5pyDataLoader`.
        weights_filename: trained weights, used both to build the network and
            by `get_predictions_of_knowns`.
        build_nn_script: forwarded to `get_network_from_weights` as `build_nn`.
        test_indices_file, npKi, multitask: forwarded to `H5pyDataLoader`.
        network_target_map_file: if truthy, loaded with `load_target_map` and
            passed on as `network_target_map`; otherwise `None` is passed.
        dataset_target_map_file: forwarded to `H5pyDataLoader` as
            `target_map_file`.

    Returns:
        (predictions, knowns) as produced by `get_predictions_of_knowns`.
    '''
    network = get_network_from_weights(weights_filename, build_nn=build_nn_script)

    network_target_map = None
    if network_target_map_file:
        network_target_map = load_target_map(network_target_map_file)

    loader_options = dict(hdf5_file=hdf5_file,
                          test_indices_file=test_indices_file,
                          npKi=npKi,
                          multitask=multitask,
                          target_map_file=dataset_target_map_file,
                          train_percentage=None)
    data_loader = H5pyDataLoader(**loader_options)
    data_loader.load_training_data()

    preds, knowns = get_predictions_of_knowns(data_loader=data_loader,
                                              weights_filename=weights_filename,
                                              indices=data_loader.train_indices,
                                              network=network,
                                              network_target_map=network_target_map)
    return preds, knowns

In [3]:
# plotting 

def set_nice_spines(ax=None, sides=('left', 'bottom'), color="black", linewidth=1.5):
    '''Style the given spines and thin out the tick labels of an axes.

    Args:
        ax: axes to modify; defaults to the current axes (`plt.gca()`).
        sides: names of the spines to make visible and restyle.
        color: spine color.
        linewidth: spine line width.

    Side effects on `ax`:
        * top and right tick labels are switched off;
        * every x tick label except the last is blanked;
        * every y tick label except the first and last is blanked;
        * the axis-label padding is set to -10 (x) and -18 (y).
    '''
    if ax is None:
        ax = plt.gca()
    for side in sides:
        spine = ax.spines[side]
        spine.set_visible(True)
        spine.set_color(color)
        spine.set_linewidth(linewidth)
    # Booleans rather than the legacy 'off' strings: a non-empty string is
    # truthy, so 'off' can end up enabling the labels on newer matplotlib.
    ax.tick_params(axis='both', labeltop=False, labelright=False)
    # Build real lists (not a lazy `map` object) so the slice assignments below
    # work regardless of whether `map` returns a list or an iterator.
    x_labels = [str(tick) for tick in ax.get_xticks().tolist()]
    x_labels[:-1] = [''] * (len(x_labels) - 1)
    ax.set_xticklabels(x_labels, size=10)
    y_labels = [str(tick) for tick in ax.get_yticks().tolist()]
    y_labels[1:-1] = [''] * (len(y_labels) - 2)
    ax.set_yticklabels(y_labels, size=10)
    ax.xaxis.labelpad = -10
    ax.yaxis.labelpad = -18

def plot_roc_curves(train_dir, fold, save_name=None, thresh=6.0,
                    dm_export='/srv/home/nmew/data/drugmatrix/drugmatrix_full_chembl20_cutoff800.csv'):
    '''Overlay train, test, Drug Matrix and time-split ROC curves for one fold.

    Args:
        train_dir: directory of the trained network for this fold; the Drug
            Matrix predictions are read from
            `<train_dir>/drug-matrix/model_at_epoch_200_prediction.csv`.
        fold: fold identifier, forwarded to `get_preds` and shown in the title.
        save_name: if given, the figure is written to this path and closed;
            otherwise it is shown with `plt.show()`.
        thresh: forwarded to `plo.plot_binary_auc` as `threshold`.
        dm_export: path of the Drug Matrix ChEMBL export CSV used as the truth
            table for the Drug Matrix curve.

    Returns:
        None.
    '''
    figsize = (10, 10)
    current_palette = sns.color_palette("deep")
    sns.set_palette(current_palette)
    # Create the figure directly: a preceding plt.clf() would instantiate an
    # extra empty figure when none exists and leave it open after this call.
    fig, ax = plt.subplots(figsize=figsize)
    sns.set_context("paper")
    sns.set(font_scale=2)

    def known_only(dset):
        # Saved predictions/knowns for `dset`, restricted to non-NaN knowns.
        prediction, truth = get_preds(dset, fold)
        known_mask = ~np.isnan(truth)
        return prediction[known_mask], truth[known_mask]

    # (legend text, line style) per curve, in the order the curves are drawn.
    labels = []

    # TRAIN
    prediction, truth = known_only("train")
    auc = plo.plot_binary_auc(prediction, truth, threshold=thresh, plot_rand=False,
                              ax=ax, x_y_type='tpr-fpr', show_legend=False)
    labels.append(("Train Set (auc={:0.2f})".format(auc), "-"))

    # TEST
    prediction, truth = known_only("test")
    auc = plo.plot_binary_auc(prediction, truth, threshold=thresh, plot_rand=False,
                              ax=ax, x_y_type='tpr-fpr', show_legend=False)
    labels.append(("Test Set (auc={:0.2f})".format(auc), "--"))

    # DM: truth comes from the ChEMBL export, predictions from the network's
    # prediction CSV; the two are aligned by intersect_truth_prediction.
    dm_prediction = os.path.join(train_dir, 'drug-matrix', 'model_at_epoch_200_prediction.csv')
    tdf, pdf = intersect_truth_prediction(df_from_chembl_export(dm_export, fill_unknowns=False),
                                          df_from_prediction_path(dm_prediction))
    auc = plo.plot_binary_auc(pdf.values, tdf.values, threshold=thresh, plot_rand=False,
                              ax=ax, x_y_type='tpr-fpr', show_legend=False)
    labels.append(("Drug Matrix (auc={:0.3f})".format(auc), "-."))

    # TS: the only call with plot_rand=True, whose result is unpacked as a pair.
    prediction, truth = known_only("timesplit")
    auc, _rand_auc = plo.plot_binary_auc(prediction, truth, threshold=thresh,
                                         ax=ax, x_y_type='tpr-fpr', show_legend=False,
                                         plot_rand=True)
    labels.append(("Time Split (auc={:0.2f})".format(auc), ":"))
    labels.append(("Random (auc={:0.2f})".format(0.5), "--"))

    for side in ('left', 'bottom'):
        spine = ax.spines[side]
        spine.set_visible(True)
        spine.set_color("black")
        spine.set_linewidth(1.5)

    # Pairs lines with labels positionally; presumably plot_binary_auc adds one
    # line per curve plus one for the random baseline -- verify if it changes.
    for line, (label, style) in zip(ax.lines, labels):
        line.set_label(label)
        line.set_linestyle(style)
        line.set_clip_on(False)

    ax.legend(loc="lower right")
    # Title first, then tight_layout, so the layout pass sees the title.
    ax.set_title("AUROC Curve for fold {}".format(fold))
    fig.tight_layout()

    if save_name:
        fig.savefig(save_name)
        plt.close(fig)
    else:
        plt.show()
    
    
def plot_test_train_auprc(fold, save_name=None, thresh=6.0, figsize=(4,4), dpi=300):
    '''Plot the train and test precision-recall curves of one fold on shared axes.

    Args:
        fold: fold identifier, forwarded to `get_preds` and shown in the title.
        save_name: if given, the figure is written to this path and closed;
            otherwise it is shown with `plt.show()`.
        thresh: forwarded to `plo.plot_binary_auc` as `threshold` and shown in
            the title.
        figsize, dpi: forwarded to `plt.subplots`.

    Returns:
        None.
    '''
    # Create the figure directly: a preceding plt.clf() would instantiate an
    # extra empty figure when none exists and leave it open after this call.
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    sns.set_context("paper")

    # train (no random baseline: plot_rand=False, single value returned)
    prediction, truth = get_preds("train", fold)
    known_mask = ~np.isnan(truth)
    truth = truth[known_mask]
    prediction = prediction[known_mask]
    tr_auc = plo.plot_binary_auc(prediction, truth, threshold=thresh,
                                 ax=ax, x_y_type='precision-recall',
                                 show_legend=False, plot_rand=False)

    # test (plot_rand left at its default; the result is unpacked as a pair)
    prediction, truth = get_preds("test", fold)
    known_mask = ~np.isnan(truth)
    truth = truth[known_mask]
    prediction = prediction[known_mask]
    auc, rand_auc = plo.plot_binary_auc(prediction, truth, threshold=thresh,
                                        ax=ax, x_y_type='precision-recall',
                                        show_legend=False)

    for side in ('left', 'bottom'):
        spine = ax.spines[side]
        spine.set_visible(True)
        spine.set_color("black")
        spine.set_linewidth(1.5)

    # Lines are paired with labels positionally, in plotting order.
    curve_labels = [("Train Set (auc = {:0.3f})".format(tr_auc), '-'),
                    ("Test Set  (auc = {:0.3f})".format(auc), '-'),
                    ("Random  (auc = {:0.3f})".format(rand_auc), '--')]
    for line, (label, style) in zip(ax.lines, curve_labels):
        line.set_label(label)
        line.set_linestyle(style)
        line.set_clip_on(False)

    ax.legend(loc='lower right')
    ax.set_title("Scrambled index AUPRC Curve for fold {} at threshold {}".format(fold, thresh))

    if save_name:
        fig.savefig(save_name)
        plt.close(fig)
    else:
        plt.show()

def plot_dm_auprc(fold, save_name=None, thresh=6.0, figsize=(6,6), dpi=300):
    '''Plot the Drug Matrix precision-recall curve of one fold.

    Args:
        fold: fold identifier, forwarded to `get_preds` and shown in the title.
        save_name: if given, the figure is written to this path and closed;
            otherwise it is shown with `plt.show()`.
        thresh: forwarded to `plo.plot_binary_auc` as `threshold` and shown in
            the title.
        figsize, dpi: forwarded to `plt.subplots`.

    Returns:
        None.
    '''
    # NOTE(review): unlike the train/test/timesplit plots, NaN knowns are not
    # masked out here -- confirm the drugmatrix arrays contain no NaNs or that
    # plot_binary_auc copes with them.
    prediction, truth = get_preds("drugmatrix", fold)

    # Create the figure directly: a preceding plt.clf() would instantiate an
    # extra empty figure when none exists and leave it open after this call.
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    sns.set_context("paper")

    # Use the caller's threshold; it was previously pinned to 6. while the
    # title reported `thresh`, so any other value produced a mislabeled plot.
    auc, rand_auc = plo.plot_binary_auc(prediction, truth, threshold=thresh,
                                        ax=ax, x_y_type='precision-recall',
                                        show_legend=False)

    for side in ('left', 'bottom'):
        spine = ax.spines[side]
        spine.set_visible(True)
        spine.set_color("black")
        spine.set_linewidth(1.5)

    # Lines are paired with labels positionally, in plotting order.
    curve_labels = [("DM  (auc = {:0.3f})".format(auc), '-'),
                    ("Base (auc = {:0.3f})".format(rand_auc), '--')]
    for line, (label, style) in zip(ax.lines, curve_labels):
        line.set_label(label)
        line.set_linestyle(style)
        line.set_clip_on(False)

    ax.set_title("Scrambled index AUPRC Curve for fold {} at threshold {}".format(fold, thresh))
    ax.legend(loc='upper right', borderpad=0., borderaxespad=0.25, prop={'family': 'monospace', 'size': 11})
    fig.tight_layout()

    if save_name:
        fig.savefig(save_name)
        plt.close(fig)
    else:
        plt.show()

def plot_ts_auprc(fold, save_name=None, thresh=6.0, figsize=(4,4), dpi=300):
    '''Plot the time-split precision-recall curve of one fold.

    Args:
        fold: fold identifier, forwarded to `get_preds` and shown in the title.
        save_name: if given, the figure is written to this path and closed;
            otherwise it is shown with `plt.show()`.
        thresh: forwarded to `plo.plot_binary_auc` as `threshold` and shown in
            the title.
        figsize, dpi: forwarded to `plt.subplots`.

    Returns:
        None.
    '''
    prediction, truth = get_preds("timesplit", fold)
    # Keep only the entries whose known value is not NaN.
    known_mask = ~np.isnan(truth)
    truth = truth[known_mask]
    prediction = prediction[known_mask]

    # Create the figure directly: a preceding plt.clf() would instantiate an
    # extra empty figure when none exists and leave it open after this call.
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    sns.set_context("paper")

    # Use the caller's threshold; it was previously pinned to 6. while the
    # title reported `thresh`, so any other value produced a mislabeled plot.
    auc, rand_auc = plo.plot_binary_auc(prediction, truth, threshold=thresh,
                                        x_y_type='precision-recall',
                                        ax=ax, show_legend=False)

    for side in ('left', 'bottom'):
        spine = ax.spines[side]
        spine.set_visible(True)
        spine.set_color("black")
        spine.set_linewidth(1.5)

    # Lines are paired with labels positionally, in plotting order.
    curve_labels = [("TS  (auc = {:0.3f})".format(auc), '-'),
                    ("Base (auc = {:0.3f})".format(rand_auc), '--')]
    for line, (label, style) in zip(ax.lines, curve_labels):
        line.set_label(label)
        line.set_linestyle(style)
        line.set_clip_on(False)

    ax.set_title("Scrambled index AUPRC Curve for fold {} at threshold {}".format(fold, thresh))
    ax.legend(loc='lower right', borderpad=0., borderaxespad=0.25,
              prop={'family': 'monospace', 'size': 11})
    fig.tight_layout()

    if save_name:
        fig.savefig(save_name)
        plt.close(fig)
    else:
        plt.show()


In [4]:
# Root of the saved outputs, read from the HOME_SAVE_BASE environment variable.
save_base = get_env_var("HOME_SAVE_BASE")
# Directory holding the trained networks of the scrambled-index experiment.
train_base = "{}/output/20180815_Paper_Retrains/trained_nets/lr_nesterov_1024_2048_3072/scrambled_idx".format(save_base)
# Passing "{}" as the second argument leaves one placeholder in the result,
# which the plotting cell fills with the fold id via train_dir_fmter.format(f).
train_dir_fmter = "{}/fold_{}/pnr_1.0/".format(train_base, "{}")

In [5]:
# Figures are written below $HOME, into the lab-notebook wiki checkout.
new_save_base = get_env_var("HOME")

img_base ="{}/labgits/lab-notebook-caceres.wiki/images".format(new_save_base)
expt_base = "{}/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx".format(img_base)
# Each template keeps three "{}" placeholders after this first format call; the
# plotting cells fill them with (dataset name, fold, threshold).
auroc_fmter = "{}/{}_at_fold_{}_thresh_{}_AUROC.png".format(expt_base, "{}", "{}", "{}")
auprc_fmter = "{}/{}_at_fold_{}_thresh_{}_AUPRC.png".format(expt_base, "{}", "{}", "{}")

## AUROC All at Thresh: 6.0

In [6]:
# Threshold passed to the plotting calls and embedded in the saved file names.
thresh=6.0
# figsize/dpi are not passed to plot_roc_curves in the next cell; they are
# consumed by the later AUPRC cells that forward figsize=figsize, dpi=dpi.
figsize=(4,4)
dpi=300

In [7]:
# One saved ROC figure per fold, filed under the "all" dataset tag.
dset_name = "all"
folds = ["0", "1", "2", "3", "4"]
for f in folds:
    fold_train_dir = train_dir_fmter.format(f)
    roc_png = auroc_fmter.format(dset_name, f, thresh)
    plot_roc_curves(fold_train_dir, f, save_name=roc_png, thresh=thresh)

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  tdf.sortlevel(['target', 'compound'], inplace=True)
  df.sortlevel(['target', 'compound'], inplace=True)
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  pdf = pdf.loc[tdf.index].dropna()
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotl

<matplotlib.figure.Figure at 0x7f6c344a2f10>

## AUPRC Train and Test: Thresh 6.0

Good AUPRC performance on the train set (expected), with higher-than-expected performance on the test set. Is this due to the similarity between small molecules? It is still pretty close to baseline, though...

In [8]:
# Same values as set for the AUROC section above; the AUPRC cells that follow
# pass thresh, figsize and dpi on to the plotting functions.
thresh=6.0
figsize=(4,4)
dpi=300

In [9]:
# One saved train/test AUPRC figure per fold.
dset_name = "traintest"
folds = ["0", "1", "2", "3", "4"]
for f in folds:
    prc_png = auprc_fmter.format(dset_name, f, thresh)
    plot_test_train_auprc(f, save_name=prc_png, thresh=thresh, figsize=figsize, dpi=dpi)

<matplotlib.figure.Figure at 0x7f6bbd897890>

In [10]:
# One saved Drug Matrix AUPRC figure per fold (plot_dm_auprc's default figsize/dpi).
thresh = 6.0
dset_name = "drugmatrix"
folds = ["0", "1", "2", "3", "4"]
for f in folds:
    dm_png = auprc_fmter.format(dset_name, f, thresh)
    plot_dm_auprc(f, save_name=dm_png, thresh=thresh)

<matplotlib.figure.Figure at 0x7f6bbd0c1310>

In [11]:
# One saved time-split AUPRC figure per fold.
thresh = 6.0
dset_name = "timesplit"
folds = ["0", "1", "2", "3", "4"]
for f in folds:
    ts_png = auprc_fmter.format(dset_name, f, thresh)
    plot_ts_auprc(f, save_name=ts_png, thresh=thresh, figsize=figsize, dpi=dpi)

<matplotlib.figure.Figure at 0x7f6bbc542590>

## AUPRC Train and Test: Thresh 5.0

In [None]:
# Lower threshold for this section; figsize and dpi keep the earlier values.
thresh=5.0
figsize=(4,4)
dpi=300

In [None]:
# Show (rather than save) the train/test AUPRC figure of every fold.
folds = ["0", "1", "2", "3", "4"]
auprc_kwargs = dict(thresh=thresh, figsize=figsize, dpi=dpi)
for f in folds:
    plot_test_train_auprc(f, **auprc_kwargs)

In [None]:
# NOTE(review): thresh is reset to 6.0 here although this section's heading
# says "Thresh 5.0" -- confirm whether 5.0 was intended for this plot.
thresh=6.0
folds = ["0", "1", "2", "3", "4"]
# No save_name is passed, so each figure is shown instead of written to disk.
for f in folds:
    plot_dm_auprc(f, thresh=thresh)

In [None]:
# NOTE(review): thresh is reset to 6.0 here although this section's heading
# says "Thresh 5.0" -- confirm whether 5.0 was intended for this plot.
thresh=6.0
folds = ["0", "1", "2", "3", "4"]
# No save_name is passed, so each figure is shown instead of written to disk.
for f in folds:
    plot_ts_auprc(f, thresh=thresh,figsize=figsize, dpi=dpi)

## format for MD

In [16]:
from glob import glob

def format_md_img(link_name, rel_plot_loc):
    '''Return `[[<link_name>/<rel_plot_loc>]]`, the double-bracket image link
    printed into the Markdown tables below.'''
    return "[[{}/{}]]".format(link_name, rel_plot_loc)

def print_table(list_of_headers):
    '''Return one Markdown table row built from `list_of_headers`.

    Despite the name nothing is printed; callers print the returned string.
    Each entry becomes a `| <entry> ` cell, followed by a closing `|` and six
    trailing spaces (the padding the original template-based version emitted).

    Entries are inserted verbatim, so text containing `{` or `}` is safe; the
    previous implementation re-fed the partially built row through
    `str.format`, which raised or mangled such entries.

    Args:
        list_of_headers: iterable of cell values; each is rendered with
            `str.format`.

    Returns:
        The row as a string, e.g. `"| a | b |      "`.
    '''
    cells = "".join("| {} ".format(header) for header in list_of_headers)
    return cells + "|" + " " * 6

In [17]:
# Emit a Markdown table (one row per fold) linking the saved AUPRC figures.
github_wiki_link = "https://github.com/keiserlab/lab-notebook-caceres/wiki/images"
github_expt_base = "{}/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx".format(github_wiki_link)

# Column order; each entry is the leading "<dataset>_" token of a PNG file name.
img_order = ['traintest', 'timesplit', 'drugmatrix']
ratio = 1.0
headers = ["PNR_{}".format(ratio)] + img_order
print(print_table(headers))
print(print_table([":---"] * len(headers)))  # alignment row

png_files = glob(auprc_fmter.format("*", "*", "*"))
for f in np.arange(5):
    name_fmter = "fold_{}".format(str(f))
    # Keep this fold's files and order them like the header columns.
    fold_pngs = sorted([i for i in png_files if name_fmter in i],
                       key=lambda x: img_order.index(x.rsplit("/", 1)[-1].split(".")[0].split("_")[0]))
    md_print_fmt = [name_fmter]
    md_print_fmt.extend([format_md_img(github_expt_base, i.rsplit("/", 1)[-1]) for i in fold_pngs])
    print(print_table(md_print_fmt))
print("\n\n")

| PNR_1.0 | traintest | timesplit | drugmatrix |      
| :--- | :--- | :--- | :--- |      
| fold_0 | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/traintest_at_fold_0_thresh_6.0_AUPRC.png]] | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/timesplit_at_fold_0_thresh_6.0_AUPRC.png]] | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/drugmatrix_at_fold_0_thresh_6.0_AUPRC.png]] |      
| fold_1 | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/traintest_at_fold_1_thresh_6.0_AUPRC.png]] | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/timesplit_at_fold_1_thresh_6.0_AUPRC.png]] | [[https://

In [18]:
# Emit a Markdown table (one row per fold) linking the saved AUROC figures.
# Relies on github_expt_base defined in the previous cell.
img_order = ['all']
ratio = 1.0
headers = ["PNR_{}".format(ratio)] + img_order
print(print_table(headers))
print(print_table([":---"] * len(headers)))  # alignment row

png_files = glob(auroc_fmter.format("*", "*", "*"))
for f in np.arange(5):
    name_fmter = "fold_{}".format(str(f))
    # Keep this fold's files and order them like the header columns.
    fold_pngs = sorted([i for i in png_files if name_fmter in i],
                       key=lambda x: img_order.index(x.rsplit("/", 1)[-1].split(".")[0].split("_")[0]))
    md_print_fmt = [name_fmter]
    md_print_fmt.extend([format_md_img(github_expt_base, i.rsplit("/", 1)[-1]) for i in fold_pngs])
    print(print_table(md_print_fmt))
print("\n\n")

| PNR_1.0 | all |      
| :--- | :--- |      
| fold_0 | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/all_at_fold_0_thresh_6.0_AUROC.png]] |      
| fold_1 | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/all_at_fold_1_thresh_6.0_AUROC.png]] |      
| fold_2 | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/all_at_fold_2_thresh_6.0_AUROC.png]] |      
| fold_3 | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/all_at_fold_3_thresh_6.0_AUROC.png]] |      
| fold_4 | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/all_at_fold_4_thresh_6.0_AUROC.png]] |      



