In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
from IPython.display import Image, display
import glob
from itertools import chain
import json

In [2]:
def get_env_var(handle):
    """Return the value of the environment variable named ``handle``.

    Single quotes at either end of the value are stripped before it is
    returned.

    Raises
    ------
    LookupError
        If the variable is unset or set to an empty string.
    """
    value = os.getenv(handle)
    if value:
        return value.strip("'")
    raise LookupError("Environment variable: {} not set.".format(handle))

In [3]:
def get_loss_df(loss_file):
    """Load a header-less loss CSV into a two-column frame.

    The file is read with a single named column, ``loss``. The frame's index
    (the row number for a one-column file) is then moved into a column, and
    the two columns are labelled ``epoch`` and ``loss``.
    """
    losses = pd.read_csv(loss_file, header=None, names=["loss"]).reset_index()
    losses.columns = ["epoch", "loss"]
    return losses

def find_nearest_epoch(array, value, n=5):
    """Return the element of ``array`` that is closest to ``value``.

    On a tie the element that appears first in ``array`` is returned.
    ``n`` is accepted but not used by the implementation.
    """
    candidates = np.asarray(array)
    distances = np.abs(candidates - value)
    return candidates[np.argmin(distances)]
    
class Experiment(dict):
    """Per-fold record of the converged epoch chosen for one experiment.

    All state is kept in instance attributes, not in the mapping itself:

    * ``name`` -- experiment name passed to the constructor
    * ``folds`` -- fold identifiers, in insertion order
    * ``converged_epochs`` -- epoch recorded for each fold
    * ``trained_paths`` -- training directory recorded for each fold

    The three lists are parallel: entry ``i`` of each describes the same fold.

    Because nothing is ever stored as a dict item, the underlying mapping is
    always empty. An instance is therefore falsy, and ``json.dump(exp)`` would
    write ``{}``; serialize with ``vars(exp)`` instead.
    """

    def __init__(self, name):
        # Initialise the dict base explicitly so the (empty) mapping is in a
        # well-defined state.
        super(Experiment, self).__init__()
        self.name = name
        self.folds = []
        self.converged_epochs = []
        self.trained_paths = []

    @staticmethod
    def _json_default(obj):
        """Fallback for values ``json`` cannot encode (e.g. numpy scalars)."""
        to_python = getattr(obj, "item", None)
        return to_python() if callable(to_python) else str(obj)

    def __repr__(self):
        return str(vars(self))

    def __str__(self):
        # The fallback keeps printing from raising TypeError when an epoch
        # was recorded as a numpy scalar.
        return json.dumps(vars(self), indent=2, default=self._json_default)

    def set_converged_epoch(self, epoch, train_path, fold=None):
        """Append one fold's result to the three parallel lists.

        Parameters
        ----------
        epoch
            Epoch recorded as converged for this fold.
        train_path
            Training directory the epoch was selected from.
        fold
            Fold identifier; ``None`` when not supplied.
        """
        self.folds.append(fold)
        self.converged_epochs.append(epoch)
        self.trained_paths.append(train_path)

    # Original (misspelled) public name, kept so existing callers keep working.
    set_convereged_epoch = set_converged_epoch
        

In [4]:
def load_losses(train_base):
    """Read the test and train loss tables stored under ``train_base``.

    The files are ``<train_base>/test_loss.csv`` and
    ``<train_base>/train_loss.csv``; each is parsed with ``get_loss_df``.
    The test table is read first.

    Returns
    -------
    tuple
        ``(test_loss, train_loss)`` -- note the order, test first.
    """
    loss_fmter = "{}_loss.csv"

    def _read_split(split):
        # Build "<train_base>/<split>_loss.csv" and parse it.
        return get_loss_df("{}/{}".format(train_base, loss_fmter.format(split)))

    test_loss = _read_split("test")
    train_loss = _read_split("train")
    return test_loss, train_loss


def get_best_epochs(train_base):
    """Pick the saved checkpoint with the lowest test loss for one run.

    Only epochs that have a ``model_at_epoch_<N>.npz`` file directly under
    ``train_base`` are candidates. Among those, the epoch with the smallest
    test loss wins; on a tie the epoch that appears first in the test loss
    table is chosen.

    Parameters
    ----------
    train_base : str
        Directory holding ``test_loss.csv``, ``train_loss.csv`` and the
        ``model_at_epoch_*.npz`` checkpoint files.

    Returns
    -------
    best_epoch
        Epoch of the selected checkpoint.
    curr_test_val
        Test loss recorded at ``best_epoch``.
    return_weights : str
        Path of the selected checkpoint file.

    Raises
    ------
    IndexError
        If no checkpoint file corresponds to an epoch present in the test
        loss table.
    """
    # load_losses reads both loss files; only the test loss drives the
    # selection, so the train table is discarded here.
    test_loss, _ = load_losses(train_base)
    weights_files = glob.glob("{}/model_at_epoch_*.npz".format(train_base))

    # Map epoch number -> checkpoint path. The epoch is the integer between
    # the last "_" of the path and the following ".". setdefault keeps the
    # first path seen for an epoch.
    epoch_to_weights = {}
    for path in weights_files:
        epoch = int(path.split("_")[-1].split(".")[0])
        epoch_to_weights.setdefault(epoch, path)

    # Iterating the Series yields Python scalars, so best_epoch stays a
    # plain Python number (and therefore JSON-serializable downstream).
    candidates = [
        (loss, epoch)
        for loss, epoch in zip(test_loss["loss"], test_loss["epoch"])
        if epoch in epoch_to_weights
    ]
    if not candidates:
        raise IndexError(
            "No model_at_epoch_*.npz checkpoint in {} matches an epoch in "
            "the test loss table".format(train_base)
        )

    # min() returns the first minimal pair, i.e. the earliest epoch on ties.
    _, best_epoch = min(candidates, key=lambda pair: pair[0])
    curr_test_val = test_loss.loc[test_loss["epoch"] == best_epoch, "loss"].values[0]
    return_weights = epoch_to_weights[int(best_epoch)]

    return best_epoch, curr_test_val, return_weights

In [5]:
# Root of all saved data, taken from the DATA_SAVE_BASE environment variable.
SAVE_BASE = get_env_var("DATA_SAVE_BASE")
model_base = "{}/20190410_SMA_Investigation/trained_nets/*".format(SAVE_BASE)
loss_fmter = "{}_loss.csv"

# Fold directories sit two levels below trained_nets/. The trailing "/" in the
# pattern is kept in the results, and the fold-name extraction further down
# (path.split("/")[-2]) relies on it for folds without pnr_* sub-directories.
dirs = glob.glob("{}/*/fold_*/".format(model_base))

# A training directory is either a pnr_* sub-directory of a fold or, when the
# fold has none, the fold directory itself. Each fold is globbed once.
subdirs = []
for fold_dir in dirs:
    pnr_dirs = glob.glob("{}/pnr_*".format(fold_dir))
    subdirs.extend(pnr_dirs if pnr_dirs else [fold_dir])
subdirs = sorted(subdirs)

# Experiment name = second path component after "trained_nets/".
names = set(i.split("trained_nets/")[-1].split("/")[1] for i in subdirs)

In [6]:
def _build_experiment(expt_name, train_dirs):
    """Create an Experiment and record the best epoch of every training dir.

    Directories whose loss files cannot be read (IOError) are reported and
    skipped; the Experiment is returned even if no directory succeeded.
    """
    exp = Experiment(expt_name)
    for train_base in sorted(train_dirs):
        # The fold name is the second-to-last path component, both for
        # ".../fold_k/pnr_x" and for fold directories that keep the trailing
        # "/" from the glob pattern (".../fold_k/").
        fold = train_base.split("/")[-2]
        try:
            best_epoch, _, _ = get_best_epochs(train_base)
        except IOError:
            print("No test or train loss file for {}".format(train_base))
            continue
        exp.set_convereged_epoch(best_epoch, train_base, fold)
    return exp


# Experiment name -> list of Experiment records (one per pnr value, or a
# single record when the experiment has no pnr_* sub-directories).
expt_dict = {n: [] for n in names}

for expt_key, experiments in expt_dict.items():
    expt_dirs = set(d for d in subdirs if "/{}/".format(expt_key) in d)

    # Distinct pnr_* directory names for this experiment. The test is made on
    # the last path component only, so "pnr_" elsewhere in the path is ignored.
    # Sorted (lexicographically) so the experiment order is reproducible.
    unique_pnrs = sorted(
        set(d.split("/")[-1] for d in expt_dirs if d.split("/")[-1].startswith("pnr_"))
    )

    if unique_pnrs:
        for curr_pnr in unique_pnrs:
            # Exact match on the last component: a substring test would let
            # e.g. "pnr_0.2" also pick up the "pnr_0.25" directories.
            pnr_dirs = [d for d in expt_dirs if d.split("/")[-1] == curr_pnr]
            experiments.append(
                _build_experiment("{}_{}".format(expt_key, curr_pnr), pnr_dirs)
            )
    else:
        experiments.append(_build_experiment("{}".format(expt_key), expt_dirs))

No test or train loss file for /srv/nas/mk1/users/ecaceres//20190410_SMA_Investigation/trained_nets/lr_nesterov_binary_classifier_1024_2048_3072/CLASSIFIER_SMA_RATIOS_scrambled_LR01/fold_0/pnr_19.0
No test or train loss file for /srv/nas/mk1/users/ecaceres//20190410_SMA_Investigation/trained_nets/lr_nesterov_binary_classifier_1024_2048_3072/CLASSIFIER_SMA_RATIOS_scrambled_LR03/fold_0/pnr_0.25


In [7]:
# Path template: SAVE_BASE is filled in now, and the literal "{}" passed as
# the second argument leaves a placeholder for the experiment name.
outdir = "{}/20190410_SMA_Investigation/predictions/{}".format(SAVE_BASE, "{}")

for k, experiment_epochs in expt_dict.items():
    json_dir = outdir.format(k)
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)

    expt_json_name = os.path.join(json_dir, 'experiments.json')
    # An existing experiments.json is left untouched, never overwritten.
    if os.path.exists(expt_json_name):
        continue

    print("saving experiments as json to:")
    print(json_dir)
    with open(expt_json_name, 'w') as fp:
        # Experiment keeps its data in attributes, hence vars(e).
        json.dump([vars(e) for e in experiment_epochs], fp, indent=2)

saving experiments as json to:
/srv/nas/mk1/users/ecaceres//20190410_SMA_Investigation/predictions/CLASSIFIER_NEG_RM_SMA_RATIOS_scrambled_LR03
saving experiments as json to:
/srv/nas/mk1/users/ecaceres//20190410_SMA_Investigation/predictions/NEG_RM_scrambled


In [12]:
# List the contents of the NEG_RM_scrambled predictions directory.
# NOTE(review): the absolute path is hard-coded; it matches the directory
# printed by the save cell above.
!ls /srv/nas/mk1/users/ecaceres//20190410_SMA_Investigation/predictions/NEG_RM_scrambled

experiments.json


In [17]:
# List the contents of one training directory (NEG_RM_scrambled, fold_4),
# showing which model_at_epoch_*.npz checkpoints and loss files it holds.
# NOTE(review): the absolute path is hard-coded.
!ls /srv/nas/mk1/users/ecaceres//20190410_SMA_Investigation/trained_nets/all_negs_stochastic/NEG_RM_scrambled/fold_4/

all_negs_stochastic_args.txt
all_negs_stochastic_INFO.log
model_at_epoch_0.npz
model_at_epoch_100.npz
model_at_epoch_13.npz
model_at_epoch_200.npz
model_at_epoch_25.npz
model_at_epoch_300.npz
model_at_epoch_400.npz
model_at_epoch_499.npz
test_indices.npy
test_loss.csv
test_loss.png
test_no_gt_loss.csv
test_no_gt_loss.png
training_log.txt
train_loss.csv
train_loss.png
ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5_target_index.pkl


In [None]:
      276,
      310,
      293,
      387,
      349

In [None]:
    "converged_epochs": [
      300,
      300,
      300,
      400,
      300
    ],
    "name": "NEG_RM_scrambled",
    "trained_paths": [
      "/srv/nas/mk1/users/ecaceres//20190410_SMA_Investigation/trained_nets/all_negs_stochastic/NEG_RM_scrambled/fold_0/",
