In [1]:
import os
from common.plot_fcns import get_env_var
import cPickle as pkl
import numpy as np
import pandas as pd
from itertools import product

In [16]:
def get_mean_std(df):
    """Summarize the mean and standard deviation of each metric per group.

    Rows are grouped by ``expt`` and ``dset``. For every group the mean and
    the standard deviation (pandas default estimator) of ``AUROC``, ``AUPRC``
    and ``R2`` are computed and rounded to 4 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``expt``, ``dset``, ``AUROC``, ``AUPRC`` and
        ``R2``. Any other column is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per (expt, dset) group with the columns
        ``expt, dset, R2, std_R2, AUROC, std_AUROC, AUPRC, std_AUPRC``.
    """
    metrics = ['AUROC', 'AUPRC', 'R2']
    # Pick the metric columns *before* aggregating so that unrelated
    # (possibly non-numeric) columns can never make mean()/std() fail.
    grouped = df.groupby(["expt", "dset"])[metrics]
    mean_df = grouped.mean().round(decimals=4).reset_index()
    std_df = grouped.std().round(decimals=4)
    std_df.columns = ['std_' + metric for metric in metrics]
    std_df = std_df.reset_index()
    # Join explicitly on the group keys instead of relying on "all shared columns".
    merged_df = pd.merge(mean_df, std_df, on=["expt", "dset"])
    return merged_df[['expt', 'dset', 'R2', 'std_R2', 'AUROC', 'std_AUROC', 'AUPRC', 'std_AUPRC']]

In [3]:
def rename_vals(df):
    """Replace raw experiment and dataset identifiers with display labels.

    The ``expt`` and ``dset`` columns of ``df`` are overwritten in place;
    values without an entry in the label tables are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``expt`` and ``dset`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after relabeling.
    """
    expt_labels = {
        "scrambled_idx_LC": "SNA scrambled",
        "scrambled_idx_no_SMA_LC": "STD scrambled",
        "STD_SMA": "SNA",
        "NEG_RM": "Negatives Removed",
        "NEG_UW": "Negatives Upweighted",
        "SEA_SMA": "SNA +SEA blacklist",
        "NEG_RM_SMA": "Negatives Removed +SNA",
        "NEG_RM_scrambled": "Negatives Removed scrambled",
    }
    dset_labels = {
        "drugmatrix": "Drugmatrix",
        "timesplit": "Time Split",
        "test": "Test",
        "train": "Train",
    }
    # Assign whole columns instead of `df.expt[mask] = ...`: that chained
    # assignment emits SettingWithCopyWarning and does not update the frame
    # at all when pandas Copy-on-Write is enabled.
    df["expt"] = df["expt"].replace(expt_labels)
    df["dset"] = df["dset"].replace(dset_labels)
    return df

In [4]:
def ci95(df):
    """Compute per-group means with bounds at mean -/+ 1.96 standard errors.

    Rows are grouped by ``dset`` and ``expt``; the remaining columns are
    averaged and their standard error of the mean is scaled by 1.96.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(lower_bound, upper_bound, mean)`` -- note the order: lower first.
        All three frames are indexed by (dset, expt).
    """
    grouped = df.groupby(["dset", "expt"])
    center = grouped.mean()
    half_width = grouped.sem() * 1.96
    return center - half_width, center + half_width, center

In [5]:
def get_ci_df(df):
    """Build a table of 95% confidence intervals per (dset, expt) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``ci95``; must yield ``AUROC``, ``AUPRC`` and ``R2``
        columns after aggregation.

    Returns
    -------
    pandas.DataFrame
        Columns ``dset``, ``expt``, ``AUROC_95%CI``, ``AUPRC_95%CI`` and
        ``R2_95%CI``; each CI cell is a ``(lower, upper)`` tuple rounded to
        4 decimals.
    """
    # ci95 returns (lower, upper, mean) in that order; the names here now
    # match what the variables actually hold.
    ci95_low, ci95_high, mean = ci95(df)
    for metric in ("AUROC", "AUPRC", "R2"):
        lower = ci95_low[metric].round(decimals=4)
        upper = ci95_high[metric].round(decimals=4)
        # All three frames come from the same groupby, so rows line up positionally.
        mean["{}_95%CI".format(metric)] = list(zip(lower, upper))
    return mean[['AUROC_95%CI', 'AUPRC_95%CI', 'R2_95%CI']].reset_index()

In [6]:
# Load a pickled object from the working directory (presumably a lookup of
# regression prediction files, judging by the file name -- TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
# NOTE(review): data_dict is not referenced by any other cell visible in this notebook.
reg_data_file = "./regression_preds_file_lookup.pkl"
with open(reg_data_file, "rb") as f:
    data_dict = pkl.load(f)

In [7]:
# Directory for the regression plot data, rooted at the value get_env_var returns for DATA_SAVE_BASE.
plot_data_save_base = "{}/20190410_SMA_Investigation/plot_data/regression".format(get_env_var("DATA_SAVE_BASE"))
# Tab-separated input table whose first column becomes the index; later cells
# read its expt, dset, AUROC, AUPRC and R2 columns.
df = pd.read_csv("{}/neighbors_plot_vals.tsv".format(plot_data_save_base), sep="\t", index_col=0)

In [8]:
# Row order for the output tables: datasets vary slowest, experiments fastest.
dset_order = ['drugmatrix', 'timesplit', 'test', 'train']
expt_order = [
    'STD',
    'scrambled_idx_no_SMA_LC',
    'STD_SMA',
    'scrambled_idx_LC',
    'NEG_RM',
    'NEG_RM_scrambled',
    'NEG_RM_SMA',
    'NEG_UW',
    'SEA_SMA',
]
# Rank of every (dset, expt) pair in the order above; used as a sort key.
sort_dict = dict(
    (pair, rank) for rank, pair in enumerate(product(dset_order, expt_order))
)

In [9]:
# 95% CI table restricted to the STD / STD_SMA runs and their scrambled counterparts.
mean = get_ci_df(df)
mean = mean[mean["expt"].isin(['STD', 'STD_SMA', 'scrambled_idx_LC', 'scrambled_idx_no_SMA_LC'])]
# Rank rows by their (dset, expt) position in sort_dict; this must happen
# before rename_vals, because sort_dict is keyed by the raw identifiers.
mean["sort_val"] = mean.apply(lambda row: sort_dict[(row["dset"], row["expt"])], axis=1)
mean = rename_vals(mean.sort_values(["sort_val"]))
mean = (
    mean[['dset', 'expt', 'AUROC_95%CI', 'AUPRC_95%CI', 'R2_95%CI']]
    .rename(columns={
        'dset': "Dataset",
        'expt': "Training Type",
        'AUROC_95%CI': "95% CI AUROC",
        'AUPRC_95%CI': "95% CI AUPRC",
        'R2_95%CI': "95% CI R2",
    })
    .set_index(["Dataset", "Training Type"])
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [10]:
# Write the table to disk; the file is tab-separated even though it is named *.csv.
mean.to_csv("{}/table_95ci_STD_expts_ratio1.csv".format(plot_data_save_base), sep="\t")


In [11]:
# 95% CI table covering all nine experiments listed below.
all_df = get_ci_df(df)
all_df = all_df[all_df["expt"].isin([
    'NEG_RM', "NEG_UW", 'NEG_RM_SMA', 'SEA_SMA', 'STD', 'STD_SMA',
    'scrambled_idx_LC', 'scrambled_idx_no_SMA_LC', "NEG_RM_scrambled",
])]
# Rank rows by their (dset, expt) position in sort_dict; this must happen
# before rename_vals, because sort_dict is keyed by the raw identifiers.
all_df["sort_val"] = all_df.apply(lambda row: sort_dict[(row["dset"], row["expt"])], axis=1)
all_df = rename_vals(all_df.sort_values(["sort_val"]))
all_df = (
    all_df[['dset', 'expt', 'AUROC_95%CI', 'AUPRC_95%CI', 'R2_95%CI']]
    .rename(columns={
        'dset': "Dataset",
        'expt': "Training Type",
        'AUROC_95%CI': "95% CI AUROC",
        'AUPRC_95%CI': "95% CI AUPRC",
        'R2_95%CI': "95% CI R2",
    })
    .set_index(["Dataset", "Training Type"])
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [12]:
# Show the 95% CI table (indexed by Dataset and Training Type) as the cell output.
all_df

Unnamed: 0_level_0,Unnamed: 1_level_0,95% CI AUROC,95% CI AUPRC,95% CI R2
Dataset,Training Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Drugmatrix,STD,"(0.6804, 0.6968)","(0.1422, 0.1557)","(0.1763, 0.2089)"
Drugmatrix,STD scrambled,"(0.5452, 0.5625)","(0.0775, 0.0856)","(0.0073, 0.0234)"
Drugmatrix,SNA,"(0.7782, 0.7885)","(0.4336, 0.4475)","(0.403, 0.4507)"
Drugmatrix,SNA scrambled,"(0.4724, 0.4959)","(0.0661, 0.0714)","(0.0, 0.0041)"
Drugmatrix,Negatives Removed,"(0.6053, 0.6187)","(0.1017, 0.1061)","(0.1819, 0.2128)"
Drugmatrix,Negatives Removed scrambled,"(0.5269, 0.536)","(0.0743, 0.0769)","(0.0036, 0.0093)"
Drugmatrix,Negatives Removed +SNA,"(0.7802, 0.7894)","(0.443, 0.4537)","(0.4101, 0.4414)"
Drugmatrix,Negatives Upweighted,"(0.6969, 0.7079)","(0.1599, 0.1741)","(0.2009, 0.2346)"
Drugmatrix,SNA +SEA blacklist,"(0.7813, 0.7902)","(0.4468, 0.4588)","(0.4263, 0.4559)"
Time Split,STD,"(0.7367, 0.7409)","(0.9427, 0.9441)","(0.2124, 0.2181)"


In [13]:
# Write the table to disk; the file is tab-separated even though it is named *.csv.
all_df.to_csv("{}/table_95ci_all_reg_expts_ratio1.csv".format(plot_data_save_base), sep="\t")


In [14]:
# Mean-metric table restricted to the STD / STD_SMA runs and their scrambled counterparts.
ci95_lo, ci95_hi, m = ci95(df)
m = m.reset_index()
# .copy() so the column assignment below works on an independent frame, not a filtered slice.
m = m[m.expt.isin(['STD', 'STD_SMA', 'scrambled_idx_LC', 'scrambled_idx_no_SMA_LC'])].copy()
# Rank rows by their (dset, expt) position in sort_dict; this must happen
# before rename_vals, because sort_dict is keyed by the raw identifiers.
m["sort_val"] = m.apply(lambda x: sort_dict[(x["dset"], x["expt"])], axis=1)
m = m.sort_values(["sort_val"])
m = rename_vals(m)
# The former `m.drop(...)` and `m.groupby(...).mean()` statements discarded
# their results and had no effect; the column selection below already leaves
# out the helper sort_val column.
m = m[["dset", "expt", "AUROC", "AUPRC", "R2"]]
m.columns = ["Dataset", "Training Type", "mean AUROC", "mean AUPRC", "mean R2"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [15]:
# Write the table to disk; the file is tab-separated even though it is named *.csv.
# NOTE(review): m has no set_index call, so its leftover integer row index is written as the first column.
m.to_csv("{}/table_mean_STD_expts_ratio1.csv".format(plot_data_save_base), sep="\t")



In [16]:
# Mean-metric table covering all nine experiments listed below.
ci95_lo, ci95_hi, a = ci95(df)
a = a.reset_index()
# .copy() so the column assignment below works on an independent frame, not a filtered slice.
a = a[a.expt.isin(['NEG_RM', "NEG_UW", 'NEG_RM_SMA', 'SEA_SMA', 'STD', 'STD_SMA',
                   'scrambled_idx_LC', 'scrambled_idx_no_SMA_LC', "NEG_RM_scrambled"])].copy()
# Rank rows by their (dset, expt) position in sort_dict; this must happen
# before rename_vals, because sort_dict is keyed by the raw identifiers.
a["sort_val"] = a.apply(lambda x: sort_dict[(x["dset"], x["expt"])], axis=1)
a = a.sort_values(["sort_val"])
a = rename_vals(a)
# The former `a.drop(...)` and `a.groupby(...).mean()` statements discarded
# their results and had no effect; the column selection below already leaves
# out the helper sort_val column.
a = a[["dset", "expt", "AUROC", "AUPRC", "R2"]]
a.columns = ["Dataset", "Training Type", "mean AUROC", "mean AUPRC", "mean R2"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [19]:
# Write the table to disk; the file is tab-separated even though it is named *.csv.
# NOTE(review): a has no set_index call, so its leftover integer row index is written as the first column.
a.to_csv("{}/table_mean_all_reg_expts_ratio1.csv".format(plot_data_save_base), sep="\t")


In [31]:
# Mean / standard-deviation table covering all nine experiments listed below.
mean_std_df = get_mean_std(df)
# keep only the experiments of interest (.copy() so the column assignment
# below works on an independent frame, not a filtered slice)
mean_std_df = mean_std_df[mean_std_df.expt.isin(['NEG_RM', "NEG_UW", 'NEG_RM_SMA', 'SEA_SMA', 'STD', 'STD_SMA',
                                                 'scrambled_idx_LC', 'scrambled_idx_no_SMA_LC',
                                                 "NEG_RM_scrambled"])].copy()

# sort rows by dataset, then experiment (sort_dict is keyed by the raw identifiers)
mean_std_df["sort_val"] = mean_std_df.apply(lambda x: sort_dict[(x["dset"], x["expt"])], axis=1)
mean_std_df = mean_std_df.sort_values(["sort_val"])
# rename experiments and datasets to their display labels
mean_std_df = rename_vals(mean_std_df)
# keep only the columns of interest; 'dset' must come before 'expt' so the
# "Dataset" / "Training Type" headers assigned below land on the right
# columns (they were previously swapped)
mean_std_df = mean_std_df[['dset', 'expt', 'R2', 'std_R2', 'AUROC', 'std_AUROC', 'AUPRC',
                           'std_AUPRC']]
mean_std_df.columns = ["Dataset", "Training Type", "mean R2", "R2 std", "mean AUROC", "AUROC std",
                       "mean AUPRC", "AUPRC std"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [33]:
# Write the table to disk; the file is tab-separated even though it is named *.csv.
# NOTE(review): mean_std_df has no set_index call, so its leftover integer row index is written as the first column.
mean_std_df.to_csv("{}/table_mean_stddev_all_reg_expts_ratio1.csv".format(plot_data_save_base), sep="\t")

In [21]:
# List the output directory (oldest first, human-readable sizes) to check the written tables.
!ls -lrth $plot_data_save_base

total 95K
-rw-r--r--. 1 ecaceres keiser  40K May 17 13:15 ratio_plot_vals.tsv
-rw-r--r--. 1 ecaceres keiser  21K Jun 14 00:24 neighbors_plot_vals.tsv
-rw-r--r--. 1 ecaceres keiser  16K Aug 14 00:42 all_ratio_plot_mean_std.csv
-rw-r--r--. 1 ecaceres keiser 1.2K Sep 23 15:15 table_95ci_STD_expts_ratio1.csv
-rw-r--r--. 1 ecaceres keiser 2.7K Sep 23 15:15 table_95ci_all_reg_expts_ratio1.csv
-rw-r--r--. 1 ecaceres keiser 1.1K Sep 23 15:15 table_mean_STD_expts_ratio1.csv
-rw-r--r--. 1 ecaceres keiser 2.6K Sep 23 15:17 table_mean_all_reg_expts_ratio1.csv


In [22]:
# Echo the resolved output directory as the cell output.
plot_data_save_base

'/srv/nas/mk1/users/ecaceres//20190410_SMA_Investigation/plot_data/regression'