For more info see https://github.com/keiserlab/lab-notebook-caceres/wiki/20180815_Paper_Retrains_scrambled_idx

The purpose of this experiment is to create an artificial baseline for neural net training on scrambled indices. I will scramble the training set from lab-notebook-caceres/Projects/nnets/20180613_ChEMBL_only_runs and then evaluate performance on the validation, Drug Matrix, and Time Split hold-outs to provide baseline measures.

In [1]:
import numpy as np

In [2]:
help(np.random.shuffle)

Help on built-in function shuffle:

shuffle(...)
    shuffle(x)
    
    Modify a sequence in-place by shuffling its contents.
    
    This function only shuffles the array along the first axis of a
    multi-dimensional array. The order of sub-arrays is changed but
    their contents remains the same.
    
    Parameters
    ----------
    x : array_like
        The array or list to be shuffled.
    
    Returns
    -------
    None
    
    Examples
    --------
    >>> arr = np.arange(10)
    >>> np.random.shuffle(arr)
    >>> arr
    [1 7 5 2 9 4 3 6 0 8]
    
    Multi-dimensional arrays are only shuffled along the first axis:
    
    >>> arr = np.arange(9).reshape((3, 3))
    >>> np.random.shuffle(arr)
    >>> arr
    array([[3, 4, 5],
           [6, 7, 8],
           [0, 1, 2]])



In [26]:
# Toy 5x4 array in which every row holds a single repeated value, so a
# reordering of the rows is easy to spot in the printed output.
a = np.asarray([[1,1,1,1], [2,2,2,2], [3,3,3,3],[4,4,4,4], [5,5,5,5]])

In [27]:
a

array([[1, 1, 1, 1],
       [2, 2, 2, 2],
       [3, 3, 3, 3],
       [4, 4, 4, 4],
       [5, 5, 5, 5]])

In [28]:
# np.random.shuffle reorders the rows (first axis) of `a` in place and
# returns None; the contents of each row are left untouched.
np.random.shuffle(a)

In [29]:
a

array([[3, 3, 3, 3],
       [2, 2, 2, 2],
       [1, 1, 1, 1],
       [5, 5, 5, 5],
       [4, 4, 4, 4]])

# Scramble hdf5

In [1]:
import h5py
import numpy as np
import os
import logging
np.random.seed(42)

def get_env_var(handle):
    """Return the value of the environment variable named ``handle``.

    Any single-quote characters at the start or end of the value are
    stripped before it is returned.

    Raises:
        LookupError: if the variable is unset or set to an empty string.
    """
    value = os.getenv(handle)
    if value:
        return value.strip("'")
    raise LookupError("Environment variable: {} not set.".format(handle))

In [2]:
# Base directories come from environment variables rather than being
# hard-coded; get_env_var raises LookupError if either one is not set.
input_base = get_env_var("DATA_SAVE_BASE")
output_base = get_env_var("HOME_SAVE_BASE")

In [3]:
# Source training set (read-only) under input_base, and the destination for
# the scrambled copy under output_base.
input_file = "{}/20180525_DM_scrubbing/train_data/train_ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5.hdf5".format(input_base)
output_file = "{}/output/20180815_Paper_Retrains/scrambled_idx/SCRAMBLED_train_ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5.hdf5".format(output_base)

In [7]:
# Load the whole training set into memory so it can be scrambled and
# re-written to a new file.
with h5py.File(input_file, 'r') as f:
    # The case count and fingerprint length are taken from the fingerprint
    # dataset's shape, which is what the data written out later must match.
    num_training_cases, fp_len = f['fp_array'].shape
    num_targets = f.attrs['num_targets']
    print("Number of training cases: %d" % num_training_cases)
    print("Fingerprint length: %d" % fp_len)
    print("Number of targets: %d" % num_targets)
    # list(f.keys()) works on both Python 2 and Python 3 h5py.
    print(list(f.keys()))
    # Indexing a dataset with [()] reads it into a new in-memory array that
    # stays valid after the file is closed, so no extra .copy() is needed.
    acts = f['activity'][()]
    pos = f['position'][()]
    fps = f['fp_array'][()]
    rels = f['relation'][()]
    years = f['year'][()]

Number of training cases: 296190
Fingerprint length: 4096
Number of targets: 2038
[u'activity', u'fp_array', u'position', u'relation', u'year']


In [10]:
fps.sum(axis=1)

array([43, 40, 44, ..., 53, 55, 58])

In [11]:
# This is the scrambling step: the rows of `fps` are permuted in place.
# acts, pos, rels and years are NOT shuffled, so (presumably, if the arrays
# are row-aligned) each fingerprint ends up paired with another row's labels.
# NOTE(review): np.random.seed(42) is set in the import cell; the permutation
# is only reproducible if the cells are run in order from a fresh kernel.
np.random.shuffle(fps)

In [12]:
fps.sum(axis=1)

array([57, 26, 55, ..., 50, 61, 53])

In [18]:
# Write the scrambled training set. Only `fps` was shuffled above; every other
# array is written back in its original order.
with h5py.File(output_file, 'w') as f:
    # Builtin `bool` is used for the fingerprint dtype: the `np.bool` alias
    # meant the same thing but is deprecated and removed in newer NumPy.
    fp_arr = f.create_dataset('fp_array', fps.shape, dtype=bool, chunks=True, fillvalue=False, compression="lzf")
    act_arr = f.create_dataset('activity', acts.shape, dtype=np.float32, chunks=True, fillvalue=0.0, compression="lzf")
    pos_arr = f.create_dataset('position', pos.shape, dtype=np.uint16, chunks=True, fillvalue=0, compression="lzf")
    rel_arr = f.create_dataset('relation', rels.shape, dtype="S1", chunks=True, fillvalue='', compression="lzf")
    year_arr = f.create_dataset('year', years.shape, dtype=np.uint16, chunks=True, fillvalue=0, compression="lzf")

    # set values
    fp_arr[:] = fps
    act_arr[:] = acts
    pos_arr[:] = pos
    rel_arr[:] = rels
    year_arr[:] = years

    # declare attributes for meta-data
    f.attrs["activity_units"] = "nM, median"
    f.attrs["relationship_type"] = "mode"
    f.attrs["year_type"] = "First publication date. If None given, value = 0"
    f.attrs["training_cases"] = num_training_cases
    f.attrs["num_targets"] = num_targets
    f.attrs["fprint_len"] = fp_len
    f.attrs["fprint_type"] = "bit/ECFP4"
    f.attrs["desc"] = "Scrambled Training data for ECFP multi-task network with DM scrubbed and no PCBA. 10 positive ligands/target with a cutoff of pac50 of 5.0.  See lookup tables for target indexing"

### On mk-gpu-1, I ran 5-fold cross validation with a 1.0 pnr
```
./scrambled_trains.sh
```

### Then, I ran get_metrics to get the best performing epoch
```
./get_metrics.sh
```

### Run get_best_epochs.sh to get best epoch then change experiments.json to agree with visual inspection

In [1]:
import pandas as pd
import os
from glob import glob
import numpy as np
import json
from itertools import izip

In [4]:
def get_env_var(handle):
    """Look up the bash environment variable ``handle`` and return its value.

    Leading and trailing single-quote characters are removed from the value.
    A LookupError is raised when the variable is missing or empty.
    """
    raw = os.environ.get(handle)
    if raw is None or raw == "":
        raise LookupError("Environment variable: {} not set.".format(handle))
    stripped = raw.strip("'")
    return stripped

def format_md_img(link_name, rel_plot_loc):
    """Return a wiki image link of the form ``[[link_name/rel_plot_loc]]``."""
    return "[[{link}/{plot}]]".format(link=link_name, plot=rel_plot_loc)

def print_table(list_of_headers):
    """Build one row of a Markdown pipe table and return it as a string.

    Despite the name, nothing is printed; the caller prints the result.

    Args:
        list_of_headers: iterable of cell values; each is rendered with
            ``str.format`` and emitted as ``"| <cell> "``.

    Returns:
        The row text, closed by ``"|"`` and followed by six trailing spaces
        (the same padding this function has always produced). An empty
        input yields ``"|"`` plus the six spaces.
    """
    # Each cell is formatted exactly once and then joined, so a cell that
    # contains "{" or "}" is emitted literally instead of being re-read as
    # a format field.
    cells = "".join("| {} ".format(header) for header in list_of_headers)
    return cells + "|" + " " * 6

class Experiment(dict):
    """Record of the converged epoch and trained-network path for each fold.

    The class derives from ``dict``, but all state set here lives in instance
    attributes, not in dict items:

    - ``name``: experiment label passed to the constructor.
    - ``folds``, ``converged_epochs``, ``trained_paths``: parallel lists that
      grow by one entry per call to ``set_converged_epoch``.
    """

    def __init__(self, name):
        super(Experiment, self).__init__()
        self.name = name
        self.folds = []
        self.converged_epochs = []
        self.trained_paths = []

    def __repr__(self):
        return str(vars(self))

    def __str__(self):
        # Pretty-printed JSON of the instance attributes.
        return json.dumps(vars(self), indent=2)

    def set_converged_epoch(self, epoch, train_path, fold=None):
        """Append one fold's converged epoch and trained path.

        Args:
            epoch: converged epoch for the fold.
            train_path: path of the trained network for the fold.
            fold: fold label; stored as ``None`` when omitted.
        """
        self.folds.append(fold)
        self.converged_epochs.append(epoch)
        self.trained_paths.append(train_path)

    # Original (misspelled) method name, kept so existing callers still work.
    set_convereged_epoch = set_converged_epoch
        

In [6]:
# Local checkout of the wiki image directory for this experiment.
# NOTE: `base` ends with "/" on purpose; see expt_dir below.
home = get_env_var("HOME")
base = "{}/labgits/lab-notebook-caceres.wiki/images/20180815_Paper_Retrains/".format(home)

home_save_dir = get_env_var("HOME_SAVE_BASE")
srv_save_dir = get_env_var("DATA_SAVE_BASE")
proj_dir = get_env_var("NMEW_PROJ_BASE")

sneg_pnrs = [1.0]
# Glob pattern, relative to parent_dir, for the per-fold / per-PNR images.
fold_tmplt = "fold_[0-9]*/pnr_*/*.png"

# github formatting info:
github_wiki_link = "https://github.com/keiserlab/lab-notebook-caceres/wiki/images"
expt_name = "20180815_Paper_Retrains"
expt_sub_name = "scrambled_idx"
# Link prefix is <wiki images>/<expt_name> only. expt_sub_name is not part of
# it: the relative image path appended when the table is rendered already
# contains the sub-experiment directory.
github_wiki_expt = "{}/{}".format(github_wiki_link, expt_name)

# Because `base` already ends with "/", this path contains "//". The table
# rendering code splits image paths on "//" to recover the part relative to
# `base`, so do not normalise this path.
expt_dir = "{}/lr_nesterov_1024_2048_3072/".format(base)

metrics_for_convergance = ['matthews-corrcoef_binary-5.0_test', 'matthews-corrcoef_binary-6.0_test', 'r2_test']
experiments = []

parent_dir = os.path.join(expt_dir, expt_sub_name)

In [7]:
!mkdir -p $parent_dir

### copied images of interest into parent dir

```
!rsync -azrRv $HOME_SAVE_BASE/output/20180815_Paper_Retrains/trained_nets/./lr_nesterov_1024_2048_3072/scrambled_idx/fold_*/pnr_*/*.png $base
```

In [10]:
# Every PNG under parent_dir that matches fold_tmplt (fold_*/pnr_*/*.png).
png_files = glob(os.path.join(parent_dir, fold_tmplt))

In [11]:
# Reset the list of Experiment objects before (re)loading from disk.
experiments = []
# NOTE: despite its name, `converged_epochs` holds the *path* to
# experiments.json, not the epochs themselves.
converged_epochs = ("{}/output/20180815_Paper_Retrains/predictions/{}/experiments.json".format(home_save_dir, expt_sub_name))
with open(converged_epochs, "r") as fp:
    expts = json.load(fp)

# NOTE(review): expt_list is not used anywhere in the visible notebook.
expt_list = []
for e in expts:
    # Each JSON record has a "name" plus three parallel lists
    # ("converged_epochs", "trained_paths", "folds") walked in lockstep.
    tmp = Experiment(e["name"])
    for epoch, path, fold in izip(e["converged_epochs"], e["trained_paths"], e["folds"]):
        tmp.set_convereged_epoch(epoch, path, fold)
    experiments.append(tmp)
    del(tmp)    
    
# Nested lookup {ratio: {fold index: converged epoch}}. The ratio is the float
# parsed from the text after the last "_" in the experiment name; the fold
# index is the int parsed from the second "_"-separated token of the fold label.
expt_dict = {float(i.name.split("_")[-1]): dict(zip([int(j.split("_")[1]) for j in i.folds], i.converged_epochs)) for i in experiments}

In [12]:
expt_dict

{1.0: {0: 9, 1: 35, 2: 13, 3: 26, 4: 32}}

In [13]:
# Print one Markdown table per ratio: a row per fold, a column per image.
# img_order fixes the column order and is also used to sort each fold's images.
img_order = ['train_loss','test_loss', 'test_no_gt_loss', 'test_sneg_loss']
for ratio in sneg_pnrs:
    # Images for this ratio are picked by substring match on str(ratio)
    # (e.g. "1.0") anywhere in the file path.
    png_subset = [f for f in png_files if str(ratio) in f]
    headers = ["PNR_{}".format(ratio)]
    headers.extend(img_order)
    print(print_table(headers))
    # Markdown alignment row: one ":---" per column.
    print(print_table([":---"]*len(headers)))
    # Folds 0..4 are hard-coded here.
    for f in np.arange(0, 5, 1):
        name_fmter = "fold_{} best epoch: {}".format(str(f), expt_dict[ratio][f])
        fold_fmter = "fold_{}".format(str(f))
        # Substring match on "fold_<n>" selects this fold's images.
        fold_pngs = [i for i in png_subset if fold_fmter in i]
        # Sort by the position of the file stem in img_order; a PNG whose stem
        # is not listed there makes .index() raise ValueError.
        fold_pngs = sorted(fold_pngs, key=lambda x: img_order.index(x.split("/")[-1].split(".")[0]))
        # split("//")[-1] keeps only the text after the last "//" in the path;
        # that remainder is appended to the wiki link prefix.
        md_print_fmt = [name_fmter] + [format_md_img(github_wiki_expt, i.split("//")[-1]) for i in fold_pngs]
        print(print_table(md_print_fmt))
    print("\n\n")

| PNR_1.0 | train_loss | test_loss | test_no_gt_loss | test_sneg_loss |      
| :--- | :--- | :--- | :--- | :--- |      
| fold_0 best epoch: 9 | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/fold_0/pnr_1.0/train_loss.png]] | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/fold_0/pnr_1.0/test_loss.png]] | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/fold_0/pnr_1.0/test_no_gt_loss.png]] | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/fold_0/pnr_1.0/test_sneg_loss.png]] |      
| fold_1 best epoch: 35 | [[https://github.com/keiserlab/lab-notebook-caceres/wiki/images/20180815_Paper_Retrains/lr_nesterov_1024_2048_3072/scrambled_idx/fold_1/pnr_1.0/train_loss.png]] |

### Since the images show poor performance on the test set (as expected), I simply set the epoch in the .json files to the default used for PNR training: 200

In [14]:
!cp ../STD_SMA_RATIOS/plot_fcns.py ./