The purpose of this experiment is to create an artificial baseline for neural network training on scrambled indices. I will scramble the training set and then evaluate performance on the validation, Drug Matrix, and Time Split hold-out sets to provide baseline measures.

In [1]:
import numpy as np
import h5py
import numpy as np
import os
import logging
# Fix NumPy's global RNG seed so any random operation run later in this
# notebook is reproducible from one execution to the next.
np.random.seed(42)

def get_env_var(handle):
    """Return the value of the environment variable named `handle`.

    Leading and trailing single-quote characters are removed from the
    value before it is returned.

    Raises:
        LookupError: if the variable is unset or set to an empty string.
    """
    value = os.environ.get(handle)
    if value:
        return value.strip("'")
    raise LookupError("Environment variable: {} not set.".format(handle))

In [2]:
# Root directory the input dataset path is built from.
input_base = get_env_var(handle="DATA_SAVE_BASE")
# Root directory the output file path is built from.
output_base = get_env_var(handle="HOME_SAVE_BASE")

In [3]:
# Source HDF5 training set, located under the input root directory.
input_file = input_base + (
    "/20180525_DM_scrubbing/train_data/"
    "train_ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5.hdf5"
)
# Destination HDF5 file, located under the output root directory.
output_file = output_base + (
    "/output/20180815_Paper_Retrains/scrambled_idx/"
    "SCRAMBLED_train_ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5.hdf5"
)

In [4]:
# Load the complete source training set into memory.
with h5py.File(input_file, 'r') as f:
    # Only the target count has to come from the file metadata. The number
    # of training cases and the fingerprint length are taken from the
    # fingerprint matrix itself, so they always describe the data that is
    # actually loaded (the attribute values were previously read and then
    # immediately overwritten by these shape values).
    num_targets = f.attrs['num_targets']
    num_training_cases, fp_len = f['fp_array'].shape
    print("Number of training cases: %d" % num_training_cases)
    print("Fingerprint length: %d" % fp_len)
    print("Number of targets: %d" % num_targets)
    # keys() works on both Python 2 and Python 3; iterkeys() is a
    # Python 2-only mapping method and fails on Python 3.
    print(list(f.keys()))
    # Indexing a dataset with [()] reads it into a new in-memory array that
    # stays valid after the file is closed, so no additional copy is made.
    acts = f['activity'][()]
    pos = f['position'][()]
    fps = f['fp_array'][()]
    rels = f['relation'][()]
    years = f['year'][()]

In [5]:
# Write the in-memory arrays and their metadata to the output HDF5 file.
# NOTE(review): the arrays are written here exactly as they are held in
# memory; no shuffle/permutation step is visible in this part of the notebook
# even though the file name and description say "scrambled" -- confirm the
# scrambling is applied before this cell runs.
with h5py.File(output_file, 'w') as f:
    # np.bool_ is NumPy's boolean scalar type; the bare np.bool alias was
    # removed in NumPy 1.24 and raises AttributeError there.
    fp_arr = f.create_dataset('fp_array', fps.shape, dtype=np.bool_, chunks=True, fillvalue=False, compression="lzf")
    act_arr = f.create_dataset('activity', acts.shape, dtype=np.float32, chunks=True, fillvalue=0.0, compression="lzf")
    pos_arr = f.create_dataset('position', pos.shape, dtype=np.uint16, chunks=True, fillvalue=0, compression="lzf")
    rel_arr = f.create_dataset('relation', rels.shape, dtype="S1", chunks=True, fillvalue='', compression="lzf")
    year_arr = f.create_dataset('year', years.shape, dtype=np.uint16, chunks=True, fillvalue=0, compression="lzf")

    # set values
    fp_arr[:] = fps
    act_arr[:] = acts
    pos_arr[:] = pos
    rel_arr[:] = rels
    year_arr[:] = years

    # declare attributes for meta-data
    f.attrs["activity_units"] = "nM, median"
    f.attrs["relationship_type"] = "mode"
    f.attrs["year_type"] = "First publication date. If None given, value = 0"
    f.attrs["training_cases"] = num_training_cases
    f.attrs["num_targets"] = num_targets
    f.attrs["fprint_len"] = fp_len
    f.attrs["fprint_type"] = "bit/ECFP4"
    f.attrs["desc"] = "Scrambled Training data for ECFP multi-task network with DM scrubbed and no PCBA. 10 positive ligands/target with a cutoff of pac50 of 5.0.  See lookup tables for target indexing"