In [1]:
%matplotlib inline
__author__ = "Elena Caceres"
__email__ = "ecaceres@keiserlab.org"
"""Purpose: Deterimine time-splits for our training sets and write them to file."""

'Purpose: Deterimine time-splits for our training sets and write them to file.'

In [2]:
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd
import cPickle as pkl
from common.data_converter import convert_to_pki

np.random.seed(42)

  from ._conv import register_converters as _register_converters


In [3]:
def get_env_var(handle):
    """Look up the environment variable `handle` and return its value.

    Any single-quote characters at the start or end of the value are removed
    before it is returned.

    Raises:
        LookupError: if the variable is unset or set to an empty string.
    """
    raw_value = os.getenv(handle)
    if raw_value:
        # Only leading/trailing single quotes are stripped; inner ones stay.
        return raw_value.strip("'")
    raise LookupError("Environment variable: {} not set.".format(handle))


# what are the new valid targets? Get rid of targets with <10pos ligands
def get_targets_with_count(positions, affinities, query_idxes, num_pos=10, affinity_cutoff=5.0):
    """Return the set of targets that have at least `num_pos` positive examples.

    Args:
        positions: array of target positions; indexed with `query_idxes`.
        affinities: array of activity values, indexed the same way and then
            passed through `convert_to_pki` (presumably nM -> pKi-like scale;
            the helper lives in common.data_converter -- confirm there).
        query_idxes: index/mask selecting the entries to count over.
        num_pos: minimum number of positives a target needs to be kept.
        affinity_cutoff: a converted value must be strictly greater than this
            to count as a positive.

    Returns:
        A set of the position values (as found in `positions`) whose positive
        count is greater than `num_pos - 1`.
    """
    tmp_pos = positions[query_idxes]
    tmp_aff = convert_to_pki(affinities[query_idxes])

    # Start every selected target at zero so targets with no positives are
    # still present in the tally (and then filtered out below).
    pos_counts = {i: 0 for i in set(tmp_pos)}
    # get column counts for each target above the cutoff
    for target, value in zip(tmp_pos, tmp_aff):
        # Falsy values (0, None) are skipped; NaN fails the comparison.
        if value and value > affinity_cutoff:
            pos_counts[target] += 1
    # .items() instead of the Python-2-only .iteritems() so this also runs on
    # Python 3; the resulting set is the same.
    return {k for k, v in pos_counts.items() if v > num_pos - 1}


def target_reindexer(targs):
    """Map each item of `targs` to its 0-based position in iteration order.

    If an item occurs more than once, its last position wins.
    """
    return dict((targ, new_idx) for new_idx, targ in enumerate(targs))


def make_new_tid_index_map(orig_tid_index_file, reindexed_targs, save_name=None):
    """Build a {target id: new index} map from the original pickled index.

    The pickle at `orig_tid_index_file` is expected to hold a dict; it is
    inverted (value -> key) and each key of `reindexed_targs` is looked up in
    that inverted dict to recover the original key.

    Args:
        orig_tid_index_file: path to the pickled original index dict.
        reindexed_targs: dict of {old index: new index}.
        save_name: optional path; when given, the new map is pickled there.

    Returns:
        dict of {original key: new index}.

    Raises:
        KeyError: if a key of `reindexed_targs` is not a value in the
            original index.
    """
    # load target index and create a new pkl with the mapping for later
    # NOTE(security): unpickling runs arbitrary code from the file; only use
    # this on index files produced by our own pipeline.
    with open(orig_tid_index_file, 'rb') as t:
        orig = pkl.load(t)
    # .items() rather than the Python-2-only .iteritems(): same result, and
    # the function keeps working under Python 3.
    rv_orig = {v: k for k, v in orig.items()}
    new_targ_map = {rv_orig[k]: v for k, v in reindexed_targs.items()}
    if save_name:
        # store new target index
        with open(save_name, 'wb') as o:
            pkl.dump(new_targ_map, o)
    return new_targ_map


def load_hdf5_data(in_file):
    """Read the five datasets of `in_file` fully into memory.

    Returns:
        A tuple (activity, fp_array, position, year, rel_array) of arrays
        copied out of the HDF5 file, which is closed before returning.
    """
    # Read order and return order are the same: act, fp, pos, year, rel.
    dataset_names = ("activity", "fp_array", "position", "year", "rel_array")
    with h5py.File(in_file, 'r') as handle:
        loaded = tuple(handle[name][:].copy() for name in dataset_names)
    return loaded


# make numpy arrays to write
def make_empty_arrays(num_mols, max_examples):
    """Allocate padded (num_mols, max_examples) output arrays.

    Returns:
        (act, pos, rel, year) where
        - act is float32 filled with NaN (NaN marks "no example here"),
        - pos is uint16 filled with 0,
        - rel is "S1" filled with the empty string,
        - year is uint16 filled with 0.
    """
    shape = (num_mols, max_examples)
    act = np.full(shape, np.nan, dtype=np.float32)
    # NaN cannot be represented in an unsigned integer or a 1-byte string:
    # casting it to uint16 is undefined (and warns on recent NumPy), and
    # casting it to "S1" stores the truncated text of "nan". Use explicit
    # padding values instead.
    pos = np.zeros(shape, dtype=np.uint16)
    rel = np.zeros(shape, dtype="S1")
    year = np.zeros(shape, dtype=np.uint16)
    return act, pos, rel, year


def get_masks(idxes, target_mask):
    """Combine a split mask with the target mask and size the output arrays.

    Args:
        idxes: 2-D boolean array selecting the entries that belong to this
            split (rows are molecules, columns are example slots).
        target_mask: boolean array of the same shape selecting the entries
            whose target is kept.

    Returns:
        (num_fps, num_cols, good_fps, total_mask) where
        - total_mask is `target_mask & idxes`,
        - good_fps is a per-row boolean: True if the row has any kept entry,
        - num_fps is the number of such rows,
        - num_cols is the largest number of kept entries in any row.
    """
    # A row with no True in `idxes` is already all-False after the AND, so no
    # separate "molecule still has examples" mask is needed.
    total_mask = target_mask & idxes
    # Reduce once and reuse for both the row filter and the column count.
    kept_per_row = np.sum(total_mask, axis=1)
    good_fps = kept_per_row > 0
    num_cols = kept_per_row.max()
    num_fps = good_fps.sum()
    return num_fps, num_cols, good_fps, total_mask


def make_arrays(num_fps, num_cols, mask, fp_mask, act_arr, fp_arr, pos_arr, year_arr, rel_arr, old_pos_to_new_pos):
    """Pack the masked entries of each source row into fresh, left-aligned arrays.

    For every row of `mask` that selects at least one entry, the selected
    activities, relations and years are copied into the next free output row,
    and the selected positions are translated through `old_pos_to_new_pos`
    before being copied. Rows that select nothing are skipped, so the output
    has one row per non-empty source row.

    Args:
        num_fps, num_cols: output shape, passed to `make_empty_arrays`.
        mask: 2-D boolean array selecting entries to keep.
        fp_mask: row selector applied to `fp_arr`.
        act_arr, fp_arr, pos_arr, year_arr, rel_arr: source arrays.
        old_pos_to_new_pos: dict translating old position values to new ones.

    Returns:
        (new_fp, new_act, new_pos, new_rel, new_year)
    """
    new_act, new_pos, new_rel, new_year = make_empty_arrays(num_fps, num_cols)
    new_fp = fp_arr[fp_mask]
    out_row = 0
    # given the good molecules AND the train indices, move values around.
    for src_row in range(mask.shape[0]):
        keep = mask[src_row]
        kept_act = act_arr[src_row][keep]
        kept_pos = [old_pos_to_new_pos[old] for old in pos_arr[src_row][keep]]
        kept_rel = rel_arr[src_row][keep]
        kept_year = year_arr[src_row][keep]
        kept_count = len(kept_act)
        assert kept_count == len(kept_pos) == len(kept_rel) == len(kept_year)
        if kept_count <= 0:
            # Nothing selected in this source row; it gets no output row.
            continue
        slots = range(kept_count)
        np.put(new_act[out_row], slots, kept_act)
        np.put(new_pos[out_row], slots, kept_pos)
        np.put(new_rel[out_row], slots, kept_rel)
        np.put(new_year[out_row], slots, kept_year)
        out_row += 1
    return new_fp, new_act, new_pos, new_rel, new_year

def save_h5py(save_name, num_targets, tmp_fp_arr, tmp_act, tmp_pos, tmp_rel, tmp_year, desc=""):
    """Write one split to `save_name` as an lzf-compressed HDF5 file.

    Datasets written: 'fp_array' (bool), 'activity' (float32),
    'position' (uint16), 'relation' ("S1") and 'year' (uint16). The
    fingerprint dataset takes its shape from `tmp_fp_arr`; the other four are
    (number of fingerprints, tmp_act.shape[1]).

    Args:
        save_name: output path; an existing file is overwritten (mode 'w').
        num_targets: stored in the file's "num_targets" attribute.
        tmp_fp_arr: 2-D fingerprint array.
        tmp_act, tmp_pos, tmp_rel, tmp_year: 2-D per-example arrays.
        desc: free-text description stored in the "desc" attribute.
    """
    with h5py.File(save_name, 'w') as f:
        # pre-allocate arrays for our dataset
        num_fps = tmp_fp_arr.shape[0]
        fp_len = tmp_fp_arr.shape[1]
        max_cols = tmp_act.shape[1]

        # np.bool_ rather than the np.bool alias, which NumPy deprecated in
        # 1.20 and removed in 1.24; the stored dtype is the same.
        fp_arr = f.create_dataset('fp_array', (num_fps, fp_len), dtype=np.bool_, chunks=True, fillvalue=False, compression="lzf")
        act_arr = f.create_dataset('activity', (num_fps, max_cols), dtype=np.float32, chunks=True, fillvalue=0.0, compression="lzf")
        pos_arr = f.create_dataset('position', (num_fps, max_cols), dtype=np.uint16, chunks=True, fillvalue=0, compression="lzf")
        rel_arr = f.create_dataset('relation', (num_fps, max_cols), dtype="S1", chunks=True, fillvalue='', compression="lzf")
        year_arr = f.create_dataset('year', (num_fps, max_cols), dtype=np.uint16, chunks=True, fillvalue=0, compression="lzf")

        # set values
        fp_arr[:] = tmp_fp_arr
        act_arr[:] = tmp_act
        pos_arr[:] = tmp_pos
        rel_arr[:] = tmp_rel
        year_arr[:] = tmp_year

        # declare attributes for meta-data
        f.attrs["activity_units"] = "nM, median"
        f.attrs["relationship_type"] = "mode"
        f.attrs["year_type"] = "First publication date. If None given, value = 0"
        f.attrs["training_cases"] = num_fps
        f.attrs["fprint_len"] = fp_len
        f.attrs["num_targets"] = num_targets
        f.attrs["fprint_type"] = "bit/ECFP4"
        f.attrs["desc"] = desc

In [4]:
# for mk-1-a server
# SAVE_IMAGE_DIR="/srv/home/ecaceres/labgits/lab-notebook-caceres.wiki/images/20160328-Time-Split-Info"

# Configuration for the time split. DATA_SAVE_BASE must be set in the
# environment; get_env_var raises LookupError otherwise.
base = get_env_var("DATA_SAVE_BASE")
expt_base = "{}/20180525_DM_scrubbing/train_data".format(base)
bv_type = "4096"
# Split year: the driver cell compares the year array against this value.
# NOTE(review): the output file names below embed "ts2012"; rename them if
# this value is changed.
CUTOFF_YEAR = 2012
# Not referenced by the cells visible here -- possibly used further down.
fp_len=int(bv_type)

# input files: the full dataset and its target-index pickle
in_h5py = "{}/chembl20_MWmax800_scrubDM_minpos10_cutoff5.hdf5".format(expt_base)
in_target_index = "{}/chembl20_MWmax800_scrubDM_minpos10_cutoff5_target_index.pkl".format(expt_base)

# output files: validation split, training split, and the re-indexed target map
val_hdf5 = "{}/val_ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5.hdf5".format(expt_base)
train_hdf5 = "{}/train_ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5.hdf5".format(expt_base)
out_target_index = "{}/ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5_target_index.pkl".format(expt_base)

In [5]:
# read data to arrays
act_arr, fp_arr, pos_arr, year_arr, rel_arr = load_hdf5_data(in_h5py)

# indexes with a value in them (non-NaN activity)
valid_idxes = ~np.isnan(act_arr)

# indexes where the year is before CUTOFF_YEAR
# NOTE(review): entries whose year is 0 also satisfy this and so land in the
# training split; save_h5py's "year_type" attribute describes 0 as "no year
# given" -- confirm that is the intended treatment.
year_lt_2012 = year_arr<CUTOFF_YEAR

# new train and validation masks: valid entries before / from the cutoff year.
# (author's note: new_val_idxes ~1/5 of new_train indexes confirmed.)
new_train_idxes = valid_idxes & year_lt_2012
new_val_idxes = valid_idxes & ~year_lt_2012

# get targets with >=10 positive ligands, counted over the training entries only
targs_to_use = get_targets_with_count(pos_arr, act_arr, new_train_idxes)
# give new positions to those targets
old_pos_to_new_pos = target_reindexer(targs_to_use)
# make a new map of those positions and save it
targ_reindex = make_new_tid_index_map(in_target_index, old_pos_to_new_pos, save_name=out_target_index)
# get a mask for the positions we'd like to keep based on the target & valid indexes in general
valid_pos = [k for k in old_pos_to_new_pos.keys()]
# target mask where there is data stored and where the targets are in our new dataset.
target_mask = np.isin(pos_arr, valid_pos) & valid_idxes

# get masks and good fingerprints to write; the same target_mask is used for
# both splits, so validation only keeps targets selected from the training data
num_train_fps, num_train_cols, train_fp_mask, train_mask = get_masks(new_train_idxes, target_mask)
num_val_fps, num_val_cols, val_fp_mask, val_mask = get_masks(new_val_idxes, target_mask)

# get actual training and validation arrays
train_fp_arr, train_act_arr, train_pos_arr, train_rel_arr, train_year_arr = make_arrays(num_train_fps, num_train_cols, train_mask, train_fp_mask, act_arr, fp_arr, pos_arr, year_arr, rel_arr, old_pos_to_new_pos)
val_fp_arr, val_act_arr, val_pos_arr, val_rel_arr, val_year_arr = make_arrays(num_val_fps, num_val_cols, val_mask, val_fp_mask, act_arr, fp_arr, pos_arr, year_arr, rel_arr, old_pos_to_new_pos)

# save data; the {} in desc is filled with the split name
num_targets = len(targ_reindex)
desc = "TS 2012 {} data for ECFP multi-task network with DM scrubbed and no PCBA. 10 positive ligands/target with a cutoff of pac50 of 5.0.  See lookup tables for target indexing"
save_h5py(train_hdf5, num_targets, train_fp_arr, train_act_arr, train_pos_arr, train_rel_arr, train_year_arr, 
          desc=desc.format("Train"))
save_h5py(val_hdf5, num_targets, val_fp_arr, val_act_arr, val_pos_arr, val_rel_arr, val_year_arr, 
          desc=desc.format("Validation"))