In [2]:
import os, sys
import json
from itertools import izip
from common.plot_fcns import *
import numpy as np
import pandas as pd

sys.path.insert(0,'/srv/home/nmew/myprojects/clean-neural-nets/')
from common.h5py_loading import load_target_map, align_target_maps
from common.chembl_export_data_loader import DrugMatrixDataLoader
from common.h5py_data_loader import H5pyDataLoader
from lasagne_nn.run_nn import get_predictions_of_knowns, get_network_from_weights

In [3]:
# dataset can be 'test', 'train', 'val' or 'drugmatrix'# datase 
def predictions_knowns_from_trained_network_and_data(dataset, network, weights_file, train_dl, ts_dl, dm_dl):
    if dataset == 'test' or dataset == 'train':
        data_loader = train_dl
    if dataset == 'timesplit':
        data_loader = ts_dl
    if dataset == 'drugmatrix':
        data_loader = dm_dl

    network_target_map = load_target_map(train_dl.target_map_file)

    if dataset == 'train':
        km = data_loader.get_known_mask(data_loader.train_indices)
        inds = data_loader.train_indices
    elif dataset == 'test':
        km = data_loader.get_known_mask(data_loader.test_indices)
        inds = data_loader.test_indices
    elif dataset == 'timesplit':
        km = data_loader.get_known_mask(np.arange(len(data_loader.all_pos), dtype=int))
        inds = None
    elif dataset == 'drugmatrix': 
        known_target_slice, _ = align_target_maps(data_loader.target_map, train_dl.target_map)
        km = data_loader.get_known_mask(np.arange(len(data_loader.fingerprints), dtype=int))
        km = km[known_target_slice]
        inds = None
    predictions, knowns = get_predictions_of_knowns(data_loader=data_loader,
                                                    weights_filename=weights_file,
                                                    indices=inds,
                                                    network=network,
                                                    network_target_map=network_target_map)    

    # unravel and save predictions
    pred_matrix = np.zeros(km.shape)
    pred_matrix[:] = np.nan
    pred_matrix[km] = predictions

    # unravel and save knowns
    known_matrix = np.zeros(km.shape)
    known_matrix[:] = np.nan
    known_matrix[km] = knowns

    return pred_matrix, known_matrix
    
class  Experiment(dict):
    def __init__(self, name):
        self.name = name
        self.folds = []
        self.converged_epochs = []
        self.trained_paths = []
      
    def __repr__(self):
        return str(vars(self))
    
    def __str__(self):
        return json.dumps(vars(self), indent=2)
        
    def set_converged_epoch(self, epoch, train_path, fold=None):        
        self.folds.append(fold)
        self.converged_epochs.append(epoch)
        self.trained_paths.append(train_path)
        

In [4]:
def get_network_script_from_train_path(train_path, network_script_fmter):
    script_name = train_path.split("trained_nets/")[-1].split("/")[0]
    return network_script_fmter.format(script_name)

In [5]:
SAVE_BASE = get_env_var("DATA_SAVE_BASE")
outdir = "{}/20190410_SMA_Investigation/predictions/{}".format(SAVE_BASE, "{}")
expt_base = outdir.format("STD_SMA_RATIOS")
converged_epochs = "{}/experiments.json".format(expt_base)

In [6]:
# datasets
srv_save_dir = get_env_var("DATA_SAVE_BASE")
new_timesplit_dir = '{}/20180525_DM_scrubbing/train_data/'.format(srv_save_dir)
new_timesplit_train = os.path.join(new_timesplit_dir, 'train_ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5.hdf5')
new_timesplit_val = os.path.join(new_timesplit_dir, 'val_ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5.hdf5')
new_timesplit_map = os.path.join(new_timesplit_dir, 'ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5_target_index.pkl')

In [7]:
with open(converged_epochs, "r") as fp:
    expts = json.load(fp)

In [8]:
network_script_fmter = "{}/labgits/neural-nets/experiments/{}.py".format(get_env_var("HOME"), "{}")

ts_dl = H5pyDataLoader(
    hdf5_file=new_timesplit_val,
    target_map_file=new_timesplit_map, 
    train_percentage=None, multitask=True)
ts_dl.load_training_data()
dm_dl = DrugMatrixDataLoader()
train_dl = H5pyDataLoader(hdf5_file=new_timesplit_train,
                          target_map_file=new_timesplit_map, 
                          train_percentage=None, 
                          multitask=True)

datasets = ['test', 'train', 'timesplit', 'drugmatrix']

for e in expts:
    first = True
    for epoch, path, fold in izip(e["converged_epochs"], e["trained_paths"], e["folds"]):
        # epoch network info
        network_script = get_network_script_from_train_path(path, network_script_fmter)
        test_index_file = "{}/test_indices.npy".format(path)
        train_dl.test_indices_file = test_index_file
        weights_f = os.path.join(path, "model_at_epoch_{}.npz".format(epoch))
        network = get_network_from_weights(weights_f, build_nn=network_script)
        
        # this should update the indices each time.
        train_dl.train_indices, train_dl.test_indices = train_dl.get_train_test_indices()
        train_dl.load_training_data()
        # get data ready for predictions
        train_known = train_dl.all_act[train_dl.train_indices]
        test_known = train_dl.all_act[train_dl.test_indices]
        # n_molecules shouldn't change
        assert(train_known.shape[0] + test_known.shape[0] == train_dl.all_act.shape[0])
        # make predictions
        for ds in datasets:
            preds, knowns = predictions_knowns_from_trained_network_and_data(ds, network, weights_f, train_dl, ts_dl, dm_dl)
            predf = os.path.join(outdir, '{}_{}_{}_regression_preds.npz'.format(e["name"], ds, fold))
            knwnf = os.path.join(outdir, '{}_{}_{}_regression_knowns.npz'.format(e["name"], ds, fold))
            
            break
        break
#             np.savez_compressed(predf, preds)
#             np.savez_compressed(knwnf, knowns)
            
#         # save targets to file
#         if first: 
#             np.savez('{}/targets/ValTrain_targets.npz'.format(outdir), load_target_list(train_dl.target_map_file))
#             # these two should map to the same protein targets
#             dm_target_slice, train_target_slice = align_target_maps(dm_dl.target_map, train_dl.target_map)
#             np.savez('{}/targets/drugmatrix_targets.npz'.format(outdir), train_dl.target_map[train_target_slice[-1]])
#             np.savez('{}/targets/timesplit_targets.npz'.format(outdir), load_target_list(ts_dl.target_map_file))
#             first = False

  tdf.sortlevel(['target', 'compound'], inplace=True)
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [9]:
dm_target_slice, train_target_slice = align_target_maps(dm_dl.target_map, train_dl.target_map)

In [10]:
train_dl.target_map[train_target_slice[-1]]

TypeError: unhashable type: 'list'

In [20]:
tm_lookup = {v,k for k,v in train_dl.target_map}

SyntaxError: invalid syntax (<ipython-input-20-34554e4f0360>, line 1)