In [2]:
import os
from common.plot_fcns import get_env_var
import cPickle as pkl
import numpy as np
import pandas as pd
from itertools import product

In [13]:
def get_mean_std(df):
    std_df = df.groupby(["expt","dset"]).std()
    mean_df = df.groupby(["expt", "dset"]).mean()
    std_df = std_df[['AUROC', 'AUPRC']]
    mean_df = mean_df[['AUROC', 'AUPRC']]
    std_df.columns = ['std_AUROC', 'std_AUPRC']
    std_df = std_df.reset_index()
    mean_df = mean_df.reset_index()
    merged_df = pd.merge(mean_df, std_df)
    merged_df = merged_df[['expt','dset', 'AUROC', 'std_AUROC', 'AUPRC',  'std_AUPRC']]
    return merged_df

In [4]:
def rename_vals(df):
    # rename expts
    df.expt[df.expt == 'CLASSIFIER_STD'] = "STD (classifier)"
    df.expt[df.expt == 'CLASSIFIER_NEG_RM'] = "Negatives Removed (classifier)"
    df.expt[df.expt == 'CLASSIFIER_NEG_RM_scrambled'] = "Negatives Removed scrambled (classifier)"
    df.expt[df.expt == 'CLASSIFIER_SEA_SMA'] = "SNA +SEA blacklist (classifier)"
    
    df.expt[df.expt == 'CLASSIFIER_scrambled_idx_LC'] = "SNA scrambled (classifier)"
    df.expt[df.expt == 'CLASSIFIER_scrambled_idx_No_SMA_LC'] = "STD scrambled (classifier)"
    df.expt[df.expt == 'CLASSIFIER_SMA_RATIOS'] = "SNA (classifier)"
    df.expt[df.expt == 'CLASSIFIER_NEG_RM_SMA_RATIOS'] = "Negatives Removed +SNA (classifier)"
    df.expt[df.expt == 'CLASSIFIER_NEG_RM_RATIOS_scrambled'] = "Negatives Removed +SNA scrambled (classifier)"
    # and dsets
    df.dset[df.dset == "drugmatrix"] = "Drug Matrix"
    df.dset[df.dset == "timesplit"] = "Time Split"
    df.dset[df.dset == "test"] = "Test"
    df.dset[df.dset == "train"] = "Train" 
    return df

In [5]:
def ci95(df):
    g = df.groupby(["dset", "expt"])
    mean = g.mean()
    ci95_hi = mean + g.sem() * 1.96
    ci95_lo = mean - g.sem() * 1.96
    return ci95_lo, ci95_hi, mean

In [6]:
def get_ci_df(df):
    ci95_high, ci95_low, mean = ci95(df)
    idx=ci95_high.index
    mean["AUROC_95%CI"] = list(zip(ci95_high.loc[idx].AUROC.round(decimals=4), ci95_low.loc[idx].AUROC.round(decimals=4)))
    mean["AUPRC_95%CI"] = list(zip(ci95_high.loc[idx].AUPRC.round(decimals=4), ci95_low.loc[idx].AUPRC.round(decimals=4)))
    mean = mean[['AUROC_95%CI', 'AUPRC_95%CI']].reset_index()
    return mean

In [7]:
reg_data_file = "./classification_preds_file_lookup.pkl"
with open(reg_data_file, "rb") as f:
    data_dict = pkl.load(f)

In [8]:
plot_data_save_base = "{}/20190410_SMA_Investigation/plot_data/classification".format(get_env_var("DATA_SAVE_BASE"))
df = pd.read_csv("{}/neighbors_plot_vals.tsv".format(plot_data_save_base), sep="\t", index_col=0)

In [9]:
df.expt.unique().tolist()

['CLASSIFIER_NEG_RM_RATIOS_scrambled',
 'CLASSIFIER_STD',
 'CLASSIFIER_scrambled_idx',
 'CLASSIFIER_NEG_RM',
 'CLASSIFIER_scrambled_idx_No_SMA',
 'CLASSIFIER_NEG_RM_scrambled',
 'CLASSIFIER_NEG_RM_SMA_RATIOS',
 'CLASSIFIER_scrambled_idx_LC',
 'CLASSIFIER_scrambled_idx_No_SMA_LC',
 'CLASSIFIER_SMA_RATIOS',
 'CLASSIFIER_SEA_SMA']

In [10]:
dset_order = ['drugmatrix', 'timesplit', 'test', 'train']
expt_order = [
     'CLASSIFIER_STD', 'CLASSIFIER_scrambled_idx_No_SMA_LC',
     'CLASSIFIER_SMA_RATIOS',  'CLASSIFIER_scrambled_idx_LC',
     'CLASSIFIER_NEG_RM', 'CLASSIFIER_NEG_RM_scrambled',
     'CLASSIFIER_NEG_RM_SMA_RATIOS', 'CLASSIFIER_NEG_RM_RATIOS_scrambled',
     'CLASSIFIER_SEA_SMA']
sort_dict = {x: i for i,x in enumerate(product(dset_order, expt_order))}

In [26]:
mean = get_ci_df(df)
mean = mean[mean.expt.isin(['CLASSIFIER_STD', 'CLASSIFIER_scrambled_idx_LC',
       'CLASSIFIER_scrambled_idx_No_SMA_LC', 'CLASSIFIER_SMA_RATIOS'])]
mean["sort_val"] = mean.apply(lambda x: sort_dict[(x["dset"], x["expt"])], axis=1)
mean = mean.sort_values(["sort_val"])
mean = rename_vals(mean)
mean = mean[['dset', 'expt', 'AUROC_95%CI', 'AUPRC_95%CI']]
mean.columns = ["Dataset", "Training Type", "95% CI AUROC", "95% CI AUPRC"]
mean = mean.set_index(["Dataset", "Training Type",])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [27]:
mean.to_csv("{}/table_95ci_STD_expts_ratio1_classification.csv".format(plot_data_save_base), sep="\t")

In [28]:
all_df = get_ci_df(df)
all_df = all_df[all_df.expt.isin(['CLASSIFIER_STD', 'CLASSIFIER_NEG_RM',
       'CLASSIFIER_NEG_RM_SMA_RATIOS', "CLASSIFIER_NEG_RM_scrambled", 'CLASSIFIER_scrambled_idx_LC',
       'CLASSIFIER_scrambled_idx_No_SMA_LC', 'CLASSIFIER_SMA_RATIOS','CLASSIFIER_NEG_RM_RATIOS_scrambled',
       'CLASSIFIER_SEA_SMA'])]
all_df["sort_val"] = all_df.apply(lambda x: sort_dict[(x["dset"], x["expt"])], axis=1)
all_df = all_df.sort_values(["sort_val"])
all_df = rename_vals(all_df)
all_df = all_df[['dset', 'expt', 'AUROC_95%CI', 'AUPRC_95%CI']]
all_df.columns = ["Dataset", "Training Type", "95% CI AUROC", "95% CI AUPRC"]
all_df = all_df.set_index(["Dataset", "Training Type",])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [29]:
all_df.to_csv("{}/table_95ci_all_class_expts_ratio1.csv".format(plot_data_save_base), sep="\t")

In [30]:
ci95_lo, ci95_hi, m = ci95(df)
m = m.reset_index()
m = m[m.expt.isin(['CLASSIFIER_STD', 'CLASSIFIER_scrambled_idx_LC',
       'CLASSIFIER_scrambled_idx_No_SMA_LC', 'CLASSIFIER_SMA_RATIOS'])]
m["sort_val"] = m.apply(lambda x: sort_dict[(x["dset"], x["expt"])], axis=1)
m = m.sort_values(["sort_val"])
m = rename_vals(m)
m.drop("sort_val", axis=1)
m.groupby(["dset", "expt"]).mean()
m = m[["dset", "expt", "AUROC", "AUPRC"]]
m.columns = ["Dataset", "Training Type", "mean AUROC", "mean AUPRC"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [31]:
m.to_csv("{}/table_mean_STD_expts_ratio1.csv".format(plot_data_save_base), sep="\t")

In [32]:
ci95_lo, ci95_hi, a = ci95(df)
a = a.reset_index()
a = a[a.expt.isin(['CLASSIFIER_STD', 'CLASSIFIER_NEG_RM',
       'CLASSIFIER_NEG_RM_SMA_RATIOS', "CLASSIFIER_NEG_RM_scrambled",'CLASSIFIER_scrambled_idx_LC',
       'CLASSIFIER_scrambled_idx_No_SMA_LC', 'CLASSIFIER_SMA_RATIOS','CLASSIFIER_NEG_RM_RATIOS_scrambled',
       'CLASSIFIER_SEA_SMA'])]
a["sort_val"] = a.apply(lambda x: sort_dict[(x["dset"], x["expt"])], axis=1)
a = a.sort_values(["sort_val"])
a = rename_vals(a)
a.drop("sort_val", axis=1)
a.groupby(["dset", "expt"]).mean()
a = a[["dset", "expt", "AUROC", "AUPRC"]]
a.columns = ["Dataset", "Training Type", "mean AUROC", "mean AUPRC",]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [33]:
a.to_csv("{}/table_mean_all_class_expts_ratio1.csv".format(plot_data_save_base), sep="\t")

In [34]:
a

Unnamed: 0,Dataset,Training Type,mean AUROC,mean AUPRC
6,Drug Matrix,STD (classifier),0.720208,0.169047
10,Drug Matrix,STD scrambled (classifier),0.606986,0.107459
5,Drug Matrix,SNA (classifier),0.816827,0.423998
8,Drug Matrix,SNA scrambled (classifier),0.56454,0.084489
0,Drug Matrix,Negatives Removed (classifier),0.543427,0.079392
3,Drug Matrix,Negatives Removed scrambled (classifier),0.527522,0.074814
2,Drug Matrix,Negatives Removed +SNA (classifier),0.803516,0.31025
1,Drug Matrix,Negatives Removed +SNA scrambled (classifier),0.544033,0.093577
4,Drug Matrix,SNA +SEA blacklist (classifier),0.819885,0.431897
28,Time Split,STD (classifier),0.731373,0.939683


In [16]:
mean_std_df = get_mean_std(df)
# filter out ratios expts
mean_std_df = mean_std_df[mean_std_df.expt.isin(['CLASSIFIER_STD', 'CLASSIFIER_NEG_RM',
       'CLASSIFIER_NEG_RM_SMA_RATIOS', "CLASSIFIER_NEG_RM_scrambled",'CLASSIFIER_scrambled_idx_LC',
       'CLASSIFIER_scrambled_idx_No_SMA_LC', 'CLASSIFIER_SMA_RATIOS','CLASSIFIER_NEG_RM_RATIOS_scrambled',
       'CLASSIFIER_SEA_SMA'])]

# sort vals by expt and dataset
mean_std_df["sort_val"] = mean_std_df.apply(lambda x: sort_dict[(x["dset"], x["expt"])], axis=1)
mean_std_df = mean_std_df.sort_values(["sort_val"])
# rename experiments
mean_std_df = rename_vals(mean_std_df)
# save only cols of interest
mean_std_df=mean_std_df[['expt', 'dset', 'AUROC', 'std_AUROC', 'AUPRC',
       'std_AUPRC', ]]
mean_std_df.columns = ["Dataset", "Training Type","mean AUROC", "AUROC std","mean AUPRC", "AUPRC std",]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [18]:
mean_std_df.to_csv("{}/table_mean_stddev_all_class_expts_ratio1.csv".format(plot_data_save_base), sep="\t")

In [19]:
plot_data_save_base

'/srv/nas/mk1/users/ecaceres//20190410_SMA_Investigation/plot_data/classification'