In [1]:
%matplotlib inline
import os
import cPickle as pkl
from common.h5py_data_loader import H5pyDataLoader
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from itertools import combinations
from common.chembl_export_data_loader import DrugMatrixDataLoader

In [2]:
##### Environment, data-loader and counting helper functions #####

def get_env_var(handle):
    """Look up an environment variable and return it without wrapping quotes.

    Params
    ========
    handle : str
        name of the environment variable to read

    Returns
    ========
    str
        the variable's value with leading/trailing single quotes removed

    Raises
    ========
    LookupError
        if the variable is unset or set to an empty string
    """
    value = os.environ.get(handle)
    if value:
        # Values exported as 'some/path' keep their quotes; drop them here.
        return value.strip("'")
    raise LookupError("Environment variable: {} not set.".format(handle))


def get_kfold_dataloader(dset, kfold_file):
    """Build an H5pyDataLoader for one k-fold split of a dataset.

    Params
    ========
    dset : str
        path of the hdf5 file, handed to the loader as `hdf5_file`
    kfold_file : str
        path handed to the loader as `test_indices_file`

    Returns
    ========
    H5pyDataLoader
        loader constructed with no target map and no train percentage
    """
    # Use the caller-supplied path. Previously the notebook-level `dataset`
    # global was read here and the `dset` argument was silently ignored.
    return H5pyDataLoader(
        hdf5_file=dset,
        target_map_file=None,
        train_percentage=None,
        test_indices_file=kfold_file,
    )

def get_counts(activity_df):
    """Count positive, negative and known entries among activity values.

    NaN marks an unknown entry and is excluded from every count.

    NOTE(review): a value of exactly 5.0 counts as negative here, whereas the
    per-target cell further down uses `>= 5.0` for positives -- confirm which
    convention is intended.

    Params
    ========
    activity_df : array of float
        activity values, NaN where unknown

    Returns
    ========
    npos : number of non-NaN entries strictly greater than 5.0
    nneg : number of non-NaN entries less than or equal to 5.0
    ntot : number of non-NaN entries
    """
    known = ~np.isnan(activity_df)
    is_pos = known & (activity_df > 5.0)
    is_neg = known & (activity_df <= 5.0)
    return np.sum(is_pos), np.sum(is_neg), np.sum(known)


In [3]:
# Directory holding this experiment's data, rooted at the DATA_SAVE_BASE environment variable.
expt_base="{}/20180525_DM_scrubbing/train_data".format(get_env_var("DATA_SAVE_BASE"))
# HDF5 file opened below as the training loader `dl` (and re-opened per fold).
dataset = "{}/train_ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5.hdf5".format(expt_base)
# HDF5 file opened below as `ts_dl`; its counts are reported under the "timesplit" label.
ts_dataset =  "{}/val_ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5.hdf5".format(expt_base)
# Pickled target map: passed to both loaders and also unpickled directly in a later cell.
target_map = "{}/ts2012_chembl20_MWmax800_scrubDM_minpos10_cutoff5_target_index.pkl".format(expt_base)

In [4]:
# Loader over the training HDF5 file.
dl = H5pyDataLoader(hdf5_file=dataset, target_map_file=target_map, train_percentage=None)
# Loader over the validation ("val_...") HDF5 file, sharing the same target map.
ts_dl = H5pyDataLoader(hdf5_file=ts_dataset, target_map_file=target_map, train_percentage=None)

In [5]:
# Directory containing the per-fold index files (pickleKF_<fold>_indices) read by the k-fold loaders.
kfold_dir = "{}/20190410_SMA_Investigation/kfold_indices".format(get_env_var("DATA_SAVE_BASE"))

In [6]:
# Summarise positive/negative/known counts for the full training set and for
# the held-out and training portions of every cross-validation fold.
n_folds = 5
fold_list = np.arange(n_folds)
# Formatting an empty dict renders as "{}", which leaves a placeholder that is
# filled with the fold number inside the loop.
kfold_file = "{}/pickleKF_{}_indices".format(kfold_dir, {})

df = pd.DataFrame(columns=["dset", "npos", "nneg", "ntot"])

# Row for the whole training set.
a = dl.load_activity()
npos, nneg, ntot = get_counts(a)
df = df.append(dict(dset="train_all", npos=npos, nneg=nneg, ntot=ntot), ignore_index=True)

for i in fold_list:
    dl_i = get_kfold_dataloader(dataset, kfold_file.format(i))
    a_i = dl_i.load_activity()
    train_idx, test_idx = dl_i.get_train_test_indices()
    a_test = a_i[test_idx]
    a_train = a_i[train_idx]
    # Held-out rows first, then training rows, so each fold adds two adjacent rows.
    for dset_label, subset in (("test_fold_{}", a_test), ("train_fold_{}", a_train)):
        npos, nneg, ntot = get_counts(subset)
        df = df.append(
            dict(dset=dset_label.format(i), npos=npos, nneg=nneg, ntot=ntot),
            ignore_index=True,
        )



In [7]:
# Add a summary row for the validation file loaded as `ts_dl`.
# Note: this rebinds `a_i`, which previously held the last fold's activities.
a_i = ts_dl.load_activity()
npos, nneg, ntot = get_counts(a_i)
df = df.append({"dset":"timesplit", "npos": npos, "nneg": nneg, "ntot":ntot}, ignore_index=True)



In [8]:
# Load the target map. Later cells test DrugMatrix column names against its
# keys and invert it into a value -> key lookup.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
# NOTE(review): `d` is rebound to a DataFrame further down, so re-run this cell
# before re-running any cell that needs the map.
with open(target_map, "rb") as f:
    d = pkl.load(f)

In [9]:
# DrugMatrix export restricted to the columns that are keys of the target map;
# only known (non-NaN) values are counted.
dm_dl = DrugMatrixDataLoader()
a = dm_dl.chembl_export_df
# Test membership against the dict itself: under Python 2 `d.keys()` builds a
# fresh list for every column, turning the filter into a quadratic scan.
dm_targets = [col for col in a.columns if col in d]
dm_stuff = a[dm_targets]
# Same thresholds as the other rows (> 5.0 positive, <= 5.0 negative, NaN
# ignored), so reuse the shared helper instead of repeating the mask logic.
# Its total (number of non-NaN entries) equals npos + nneg.
npos, nneg, ntot = get_counts(dm_stuff.values)
df = df.append({"dset":"drugmatrix_known_vals", "npos": npos, "nneg": nneg, "ntot":ntot}, ignore_index=True)

  tdf.sortlevel(['target', 'compound'], inplace=True)
  
  import sys


In [10]:
# Same DrugMatrix slice as the "known vals" row, but here unknown (NaN) entries
# are counted as negatives, so npos + nneg covers every cell of the slice.
dm_dl = DrugMatrixDataLoader()
a = dm_dl.chembl_export_df
# Test membership against the dict itself: under Python 2 `d.keys()` builds a
# fresh list for every column, turning the filter into a quadratic scan.
dm_targets = [col for col in a.columns if col in d]
dm_stuff = a[dm_targets]
# Compute the NaN mask once and derive its complement from it.
isna = np.isnan(dm_stuff)
notna = ~isna
gfive = dm_stuff>5.0
leqfive = dm_stuff<=5.0
npos = np.sum(notna.values & gfive.values)
nneg = np.sum(isna.values | leqfive.values)
ntot = npos+nneg
df = df.append({"dset":"drugmatrix_all_vals", "npos": npos, "nneg": nneg, "ntot":ntot}, ignore_index=True)

In [11]:
# Positive and negative counts as a percentage of each row's `ntot`.
df["ppos"] = (df["npos"]/df["ntot"])*100
df["pneg"] = (df["nneg"]/df["ntot"])*100

In [12]:
# Write the summary table; the file is tab-separated despite the .csv extension.
df.to_csv("split_ratios.csv", sep="\t")

In [15]:
# Reload the training activities (rebinds `a`, which last held the DrugMatrix export frame).
a = dl.load_activity()

In [16]:
# Mask that is True wherever an activity value is known (not NaN).
m = ~np.isnan(a)

In [17]:
# Number of known (non-NaN) activity entries.
n_interactions = m.sum()

In [18]:
# Molar concentrations; the next cell turns each into an activity threshold via -log10.
ten_micromolar = 1e-5
one_micromolar = 1e-6
onehundred_nanomolar = 1e-7

In [19]:
# Boolean flags over the known entries: activity at or above -log10 of each
# concentration, i.e. thresholds of 5, 6 and 7 respectively.
geq_5 = a[m] >= -np.log10(ten_micromolar)
geq_6 = a[m] >= -np.log10(one_micromolar)
geq_7 = a[m] >= -np.log10(onehundred_nanomolar)

In [20]:
# Percentage of known entries at or above the 10 micromolar threshold.
100*(float(geq_5.sum())/n_interactions)

72.80876333443801

In [21]:
# Percentage of known entries at or above the 1 micromolar threshold.
100*(float(geq_6.sum())/n_interactions)

55.47806927190162

In [22]:
# Percentage of known entries at or above the 100 nanomolar threshold.
100*(float(geq_7.sum())/n_interactions)

34.309385832131625

In [23]:
# NOTE(review): presumably the target index of each activity entry, aligned
# with `a` -- confirm against H5pyDataLoader.load_pos.
t = dl.load_pos()

In [24]:
# Keep only the entries whose activity is known, using the same mask applied to `a`.
targs = t[m]

In [32]:
# Invert the target map (value -> key) so per-target rows can be labelled.
# Requires `d` to still be the unpickled dict; a later cell rebinds `d` to a DataFrame.
target_lookup_dict = {v:k for k,v in d.items()}

In [None]:
# Per-target counts of positive (>= 5.0) and negative (< 5.0) known activities.
# NOTE(review): get_counts() treats exactly 5.0 as negative (it uses > 5.0),
# whereas this cell treats it as positive -- confirm which is intended.
no_negatives = 0  # targets with no known entry below 5.0
no_positives = 0  # targets with no known entry at or above 5.0
# Boolean-mask indexing builds a new array each time, so extract the known
# values once instead of once per target.
a_known = a[m]
target_rows = []
for i in np.arange(0, max(targs)+1, 1):
    targ_mask = targs == i
    mols = a_known[targ_mask]
    pos_mask = mols >= 5.0
    neg_mask = mols < 5.0
    # Count each side once and reuse the results below.
    n_target_pos = pos_mask.sum()
    n_target_neg = neg_mask.sum()
    if n_target_pos == 0:
        no_positives += 1
    if n_target_neg == 0:
        no_negatives += 1
    if (n_target_pos == 0) and (n_target_neg == 0):
        # This target index has no known activity at all.
        print("ERROR")
    target_rows.append({"target": target_lookup_dict[i], "n_pos": n_target_pos, "n_neg": n_target_neg})
# Build the table in a single step rather than enlarging it row by row via
# .loc; the resulting row labels are still 0..max(targs).
dat = pd.DataFrame(target_rows, columns=["target", "n_pos", "n_neg"])

In [None]:
# Display the per-target count table.
dat

In [51]:
# Save the per-target counts; the file is tab-separated despite the .csv extension.
dat.to_csv("targets_and_counts.csv", sep="\t")

In [21]:
# Number of targets with no known activity below 5.0.
no_negatives

139

In [22]:
# Number of targets with no known activity at or above 5.0.
no_positives

0

In [24]:
# Percentage of targets that have no negatives.
# NOTE(review): 139 matches the `no_negatives` value displayed above; 2037 is
# presumably the total number of targets -- confirm. Both are hard-coded and
# will go stale if the data changes.
(float(139)/2037)*100

6.823760432007854

In [41]:
# Empty table that the cutoff sweep fills in, one row per cutoff.
# NOTE(review): this rebinds `d`, which until now held the unpickled target map.
d = pd.DataFrame(columns=["cutoff", "n_pos", "n_neg"])

In [42]:
# Sweep integer activity cutoffs 0..15 and count the known entries at or above
# (n_pos) and below (n_neg) each one.
# The known values do not change between iterations, so extract them once
# instead of re-applying the boolean mask twice per cutoff.
a_known = a[m]
for i in np.arange(0,16, 1):
    cutoff = i
    n_pos = (a_known >= i).sum()
    n_neg = (a_known < i).sum()
    d.loc[i] = {"cutoff": cutoff, "n_pos": n_pos, "n_neg": n_neg}

In [47]:
# Share of known entries on each side of the cutoff, as percentages of n_pos + n_neg.
d["percent_pos"] = 100*((d["n_pos"])/(d["n_pos"]+d["n_neg"]))
d["percent_neg"] = 100*((d["n_neg"])/(d["n_pos"]+d["n_neg"]))

In [52]:
# Save the cutoff sweep; the file is tab-separated despite the .csv extension.
d.to_csv("cutoffs_and_counts.csv", sep="\t")

In [53]:
# Display the cutoff table.
d

Unnamed: 0,cutoff,n_pos,n_neg,percent_pos,percent_neg
0,0,558235,0,100.0,0.0
1,1,553759,4476,99.1982,0.801813
2,2,531531,26704,95.2164,4.78365
3,3,475564,82671,85.1906,14.8094
4,4,451808,106427,80.9351,19.0649
5,5,406444,151791,72.8088,27.1912
6,6,309698,248537,55.4781,44.5219
7,7,191527,366708,34.3094,65.6906
8,8,86705,471530,15.532,84.468
9,9,23784,534451,4.26057,95.7394


In [54]:
# Display the per-target count table again.
dat

Unnamed: 0,target,n_pos,n_neg
0,0,16,34
1,1,83,61
2,2,26,1
3,3,15,5
4,4,928,373
5,5,30,25
6,6,45,7
7,7,207,13
8,8,14,8
9,9,47,17
