In [1]:
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import tensorflow as tf
import deepchem as dc
import pandas as pd
import imblearn
from deepchem.models.tensorgraph.models.graph_models import GraphConvModel

In [3]:
# Do transfer learning

# Build separate models for each binary endpoint
df = pd.read_csv("/home/nolelin/ULCT_Train_1.5.0.csv")
endpoints = list(df.endpoint.unique())
binary_endpoints = [x for x in endpoints if 'Binary' in x]
b_e_models = {}
for b_e in binary_endpoints:
    df = pd.read_csv('/home/nolelin/ULCTBinaryDFs/' + b_e + '.csv')
    ulct_tasks = [x for x in df.columns if x != 'canonical_smiles']
    featurizer = dc.feat.ConvMolFeaturizer()
    loader = dc.data.CSVLoader(tasks=ulct_tasks, smiles_field="canonical_smiles", featurizer=featurizer)
    dataset = loader.featurize('/home/nolelin/ULCTBinaryDFs/' + b_e + '.csv', shard_size=8192)

    transformers = [
        dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]
    for transformer in transformers:
        dataset = transformer.transform(dataset)
        
    model = GraphConvModel(
        len(ulct_tasks), batch_size=50, mode='classification', dense_layer_size=128)
    model.fit(dataset, nb_epoch=10)
    b_e_models[b_e] = model

Loading raw samples now.
shard_size: 8192
About to start loading CSV from /home/nolelin/ULCTBinaryDFs/Acute_Oral_Binary.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 18.225 s
Loading shard 2 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 1 took 19.870 s
Loading shard 3 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 2 took 21.265 s
Loading shard 4 of si

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Instructions for updating:
Use standard file APIs to delete files with this prefix.
Loading raw samples now.
shard_size: 8192
About to start loading CSV from /home/nolelin/ULCTBinaryDFs/Acute_Dermal_Binary.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 17.952 s
Loading shard 2 of size 8192.
Featurizing sample 0
Featurizing sample 1000
TIMING: featurizing shard 1 took 4.282 s
TIMING: dataset construction took 25.603 s
Loading dataset from disk.
TIMING: dataset construction took 4.124 s
Loading dataset from disk.
Loading raw samples now.
shard_size: 8192
About to start loading CSV from /home/nolelin/ULCTBinaryDFs/Acute_Inhalation_Binary.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 30

In [77]:
# Get outputs of trained baseline models after last graphconv layer

import sys
for b_e in binary_endpoints:
    df = pd.read_csv('/home/nolelin/ULCTBinaryDFs/' + b_e + '.csv')
    ulct_tasks = [x for x in df.columns if x != 'canonical_smiles']
    featurizer = dc.feat.ConvMolFeaturizer()
    loader = dc.data.CSVLoader(tasks=ulct_tasks, smiles_field="canonical_smiles", featurizer=featurizer)
    dataset = loader.featurize('/home/nolelin/ULCTBinaryDFs/' + b_e + '.csv', shard_size=8192)

    transformers = [
        dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]
    for transformer in transformers:
        dataset = transformer.transform(dataset)
    concatenated_outputs = []
    for b_e2 in binary_endpoints:
        model = b_e_models[b_e2]
        trunc_output = model.predict(dataset, transformers, outputs=model.layers['GraphConv_13'])
        concatenated_outputs.append(trunc_output)
    concatenated_outputs = np.hstack(tuple(concatenated_outputs))
    concatenated_df = pd.DataFrame(concatenated_outputs)
    concatenated_df.to_csv('/home/nolelin/ULCTTransferLearning/' + b_e + '.csv', index=False)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from /home/nolelin/ULCTBinaryDFs/Acute_Oral_Binary.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 16.860 s
Loading shard 2 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 1 took 19.584 s
Loading shard 3 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 2 took 19.337 s
Loading shard 4 of si

TIMING: featurizing shard 0 took 22.360 s
Loading shard 2 of size 8192.
Featurizing sample 0
Featurizing sample 1000
TIMING: featurizing shard 1 took 2.672 s
TIMING: dataset construction took 28.533 s
Loading dataset from disk.
TIMING: dataset construction took 4.610 s
Loading dataset from disk.


In [99]:
layer_names = list(model.layers.keys())
for l_name in layer_names:
    try:
        trunc_output = model.predict(dataset, transformers, outputs=model.layers[l_name])
        print(l_name, trunc_output.shape)
    except:
        continue
#trunc_output = model.predict(dataset, transformers, outputs=model.layers['GraphPool_11'])
#trunc_output.shape

SoftMax_1 (9341, 1, 2)
TrimGraphOutput_2 (9341, 1, 2)
Reshape_3 (9350, 1, 2)
Dense_4 (9350, 2)
GraphGather_5 (9350, 256)
BatchNorm_6 (216338, 128)
Dense_7 (216338, 128)
GraphPool_8 (216338, 64)
BatchNorm_9 (216338, 64)
GraphConv_10 (216338, 64)
GraphPool_11 (216338, 64)
BatchNorm_12 (216338, 64)
GraphConv_13 (216338, 64)
Feature_14 (216338, 75)
Feature_15 (2057, 2)
Feature_16 (216338,)
Feature_17 (39596, 1)
Feature_18 (115184, 2)
Feature_19 (58317, 3)
Feature_20 (2011, 4)
Feature_21 (2, 5)
Feature_22 (1, 6)
Feature_23 (1, 7)
Feature_24 (0, 8)
Feature_25 (0, 9)
Feature_26 (0, 10)
Weights_28 (9341, 1)
SoftMaxCrossEntropy_30 (9341, 1)
Label_31 (9341, 1, 2)


In [96]:
[x for x in trunc_output]

[array([[-1.8926654,  1.7063129]], dtype=float32),
 array([[-1.4716318,  2.0025737]], dtype=float32),
 array([[-4.1078463,  4.139362 ]], dtype=float32),
 array([[-1.3646259,  1.444082 ]], dtype=float32),
 array([[-2.2318816,  1.8412462]], dtype=float32),
 array([[-1.9952691,  1.8405786]], dtype=float32),
 array([[-2.2269242,  1.9596369]], dtype=float32),
 array([[-1.5949316,  0.3796339]], dtype=float32),
 array([[-3.2079086,  2.9203014]], dtype=float32),
 array([[-2.2028327,  1.4735292]], dtype=float32),
 array([[-3.0948555,  2.2504964]], dtype=float32),
 array([[-1.0994891 ,  0.36110717]], dtype=float32),
 array([[-2.1405084,  0.8258286]], dtype=float32),
 array([[-1.4629049 ,  0.33393428]], dtype=float32),
 array([[-2.1759837,  0.7349798]], dtype=float32),
 array([[-2.4997714,  1.6093571]], dtype=float32),
 array([[-1.1237078,  0.2508222]], dtype=float32),
 array([[-3.7633345,  2.962167 ]], dtype=float32),
 array([[-3.7916367,  2.8052537]], dtype=float32),
 array([[-3.3527853,  2.654

In [21]:
b_e_models['Mutagenic_Binary'].layers

{'SoftMax_1': <deepchem.models.tensorgraph.layers.SoftMax at 0x7f9a3063fc88>,
 'TrimGraphOutput_2': <deepchem.models.tensorgraph.models.graph_models.TrimGraphOutput at 0x7f9a90fd4d30>,
 'Reshape_3': <deepchem.models.tensorgraph.layers.Reshape at 0x7f9ad838f320>,
 'Dense_4': <deepchem.models.tensorgraph.layers.Dense at 0x7f9ad9f53eb8>,
 'GraphGather_5': <deepchem.models.tensorgraph.layers.GraphGather at 0x7f9a29dbb160>,
 'BatchNorm_6': <deepchem.models.tensorgraph.layers.BatchNorm at 0x7f9a2be98c50>,
 'Dense_7': <deepchem.models.tensorgraph.layers.Dense at 0x7f9a90289198>,
 'GraphPool_8': <deepchem.models.tensorgraph.layers.GraphPool at 0x7f9ae3f70470>,
 'BatchNorm_9': <deepchem.models.tensorgraph.layers.BatchNorm at 0x7f9ad9dc5630>,
 'GraphConv_10': <deepchem.models.tensorgraph.layers.GraphConv at 0x7f9a2b4d5ac8>,
 'GraphPool_11': <deepchem.models.tensorgraph.layers.GraphPool at 0x7f9a30b54320>,
 'BatchNorm_12': <deepchem.models.tensorgraph.layers.BatchNorm at 0x7f9ad8969710>,
 'GraphC

In [29]:
layers = b_e_models['Mutagenic_Binary'].layers
replacements = {}
layer_copy = layers['SoftMaxCrossEntropy_30'].shared()

TypeError: shared() missing 1 required positional argument: 'in_layers'

In [66]:
testmodel = GraphConvModel(
        len(ulct_tasks), batch_size=50, mode='classification')
#print(testmodel.layers)
layers_to_skip = ['Weights_28', 'WeightedError_29', 'SoftMaxCrossEntropy_30', 'Label_31']
#layers_to_skip = []
testsubmodel = testmodel.create_submodel(list({k:v for (k,v) in testmodel.layers.items() if k not in layers_to_skip}.values()))
#testsubmodel.layers
testmodel.fit(dataset, nb_epoch=10)


2.4283764656252482

In [47]:
testmodel.layers

{'SoftMax_1': <deepchem.models.tensorgraph.layers.SoftMax at 0x7f9a328b84e0>,
 'TrimGraphOutput_2': <deepchem.models.tensorgraph.models.graph_models.TrimGraphOutput at 0x7f9a328b84a8>,
 'Reshape_3': <deepchem.models.tensorgraph.layers.Reshape at 0x7f9a328b86d8>,
 'Dense_4': <deepchem.models.tensorgraph.layers.Dense at 0x7f9a328b8048>,
 'GraphGather_5': <deepchem.models.tensorgraph.layers.GraphGather at 0x7f9a328b80f0>,
 'BatchNorm_6': <deepchem.models.tensorgraph.layers.BatchNorm at 0x7f9a328b8208>,
 'Dense_7': <deepchem.models.tensorgraph.layers.Dense at 0x7f9a328b85c0>,
 'GraphPool_8': <deepchem.models.tensorgraph.layers.GraphPool at 0x7f9a328b8160>,
 'BatchNorm_9': <deepchem.models.tensorgraph.layers.BatchNorm at 0x7f9a328b8278>,
 'GraphConv_10': <deepchem.models.tensorgraph.layers.GraphConv at 0x7f9a328b81d0>,
 'GraphPool_11': <deepchem.models.tensorgraph.layers.GraphPool at 0x7f9a328b86a0>,
 'BatchNorm_12': <deepchem.models.tensorgraph.layers.BatchNorm at 0x7f9a328b8550>,
 'GraphC

In [67]:
result = testmodel.predict(dataset, outputs=testmodel.layers['GraphConv_13'])
result.shape

(216338, 64)

In [65]:
result = testmodel.predict(dataset, outputs=testmodel.layers['Feature_15'])
result.shape

(2057, 2)

In [51]:
result

array([[0.43120676, 0.        , 0.30532956, ..., 0.17277968, 0.42479753,
        0.        ],
       [0.43120676, 0.        , 0.30532956, ..., 0.17277968, 0.42479753,
        0.        ],
       [0.43120676, 0.        , 0.30532956, ..., 0.17277968, 0.42479753,
        0.        ],
       ...,
       [0.        , 1.0867612 , 1.0480222 , ..., 0.        , 0.1579994 ,
        1.4061632 ],
       [0.        , 1.0867612 , 1.0480222 , ..., 0.        , 0.1579994 ,
        1.4061632 ],
       [0.        , 1.1291945 , 0.26606447, ..., 0.        , 0.        ,
        1.231877  ]], dtype=float32)

In [13]:
dc.feat.UserDefinedFeaturizer(feature_fields=[])

<deepchem.feat.base_classes.UserDefinedFeaturizer at 0x7f99db2f9518>

In [6]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

In [7]:
X

array([[ 0.60013068, -1.42766402, -0.8128431 , ..., -0.55302378,
         0.14876985, -1.74795877],
       [ 0.25364115, -1.45638734, -0.52259796, ...,  1.69409346,
        -0.24969905,  0.4587598 ],
       [ 0.18225579, -1.27960188,  0.6780773 , ..., -0.81115113,
        -0.28733609, -1.8361891 ],
       ...,
       [-1.47024551,  1.44291998, -0.61025173, ..., -0.80856535,
         0.92899574, -2.11510777],
       [ 1.16080297,  1.07930132, -0.10917057, ..., -2.58494382,
        -1.74326156, -0.89317882],
       [-0.29874435, -1.48144343, -0.34051766, ..., -0.17772046,
        -0.13834763, -0.81195185]])

In [8]:
X_res

array([[ 0.60013068, -1.42766402, -0.8128431 , ..., -0.55302378,
         0.14876985, -1.74795877],
       [ 0.25364115, -1.45638734, -0.52259796, ...,  1.69409346,
        -0.24969905,  0.4587598 ],
       [ 0.18225579, -1.27960188,  0.6780773 , ..., -0.81115113,
        -0.28733609, -1.8361891 ],
       ...,
       [ 0.20191803, -1.09843048, -0.33465617, ..., -0.37267686,
        -0.75324544, -1.93477673],
       [ 0.61852541, -0.10485812,  0.88110485, ...,  0.64689718,
         0.58448752, -1.1286644 ],
       [-1.57200353, -0.43539858,  1.17987188, ...,  1.36385934,
         0.15761316, -1.36429082]])

In [4]:
#df = pd.read_csv('/home/nolelin/ULCTBinaryDFs/Acute_Oral_Binary.trainfold.0.csv')
df = pd.read_csv('/home/nolelin/ULCTBinaryDFs/Acute_Oral_Binary.trainfold.0.resampled.csv')
print(df.shape[0], np.sum(df['Acute_Oral_Binary']))

39113 19847.0


In [2]:
import pandas as pd

multiclass_cols = ['NTPAcuteOralChallenge_EPA', 'NTPAcuteOralChallenge_GHS']
only_one_class = ['H200', 'H203', 'H221', 'H223', 'H227', 'H240', 'H252', 'H316', 'H320', 'H335', 'H336', 'H370', 
                 'H371', 'H372', 'H373']
df = pd.read_csv('/home/nolelin/ULCT_Train_1.5.0.csv')
df = df.loc[~df['endpoint'].isin(only_one_class)]
df = df.pivot(index='canonical_smiles', columns='endpoint', values='value')
df = df.replace(-1.0, 0.0)
df = pd.get_dummies(df, columns=multiclass_cols)
df['canonical_smiles'] = df.index
df.to_csv('/home/nolelin/ULCT_Train_1.5.0.featurized.csv', index=False)

In [7]:
pd.read_csv('/home/nolelin/ULCTBinaryDFs/holdout_1.5.0.csv', delimiter='\t', header=None)

Unnamed: 0,0,1,2
0,O1CC1C2OC2,H335,1.0
1,O1CC1C2OC2,H340,1.0
2,O1CC1C2OC2,H341,-1.0
3,O1CC1C2OC2,H350,-1.0
4,O1CC1C2OC2,H351,1.0
5,O1CCNNCC1,H250,-1.0
6,O1CCNNCC1,H251,-1.0
7,O1CCNNCC1,H260,-1.0
8,O1CCNNCC1,H261,-1.0
9,O1CCNNCC1,H270,-1.0


In [10]:
df = pd.read_csv('/home/nolelin/ULCTBinaryDFs/holdout_1.5.0.csv', delimiter='\t', header=None)
df.columns = ['canonical_smiles', 'endpoint', 'value']
bin_cols = [x for x in list(df.endpoint.unique()) if 'Binary' in x]
for bin_col in bin_cols:
    newdf = df.loc[df['endpoint'] == bin_col][['canonical_smiles', 'value']]
    newdf.columns = ['canonical_smiles', bin_col]
    newdf = newdf.replace(-1.0, 0.0)
    newdf.to_csv('/home/nolelin/ULCTBinaryDFs/' + bin_col + '_holdout.csv', index=False)

In [12]:
# Get class imbalance measurements and resample

import sys
from sklearn.model_selection import KFold
import sklearn
class_count = {}
dfs_to_append = []
df = pd.read_csv("/home/nolelin/ULCT_Train_1.5.0.csv")
endpoints = list(df.endpoint.unique())
df = df.loc[df['endpoint'].isin([x for x in endpoints if 'Binary' in x])]
df = df.replace(-1.0, 0.0)
df = df.pivot(index='canonical_smiles', columns='endpoint', values='value')
df['canonical_smiles'] = df.index
'''
for ep in list(df.endpoint.unique()):
    print(ep)
    one_count = df.loc[(df['endpoint'] == ep) & (df['value'] == 1.0)].shape[0]
    zero_count = df.loc[(df['endpoint'] == ep) & (df['value'] != 1.0)].shape[0]
    one_df = df.loc[(df['endpoint'] == ep) & (df['value'] == 1.0)]
    zero_df = df.loc[(df['endpoint'] == ep) & (df['value'] != 1.0)]
    class_count[ep] = (zero_count, one_count)
    if one_count > zero_count:
        zero_multiplier = round(float(one_count) / float(zero_count)) - 1
        dfs_to_append.extend([zero_df] * zero_multiplier)
    else:
        one_multiplier = round(float(zero_count) / float(one_count)) - 1
        dfs_to_append.extend([one_df] * one_multiplier)
    
append_df = pd.concat(dfs_to_append)
df = pd.concat([df, append_df])
#df = df.pivot_table(index='canonical_smiles', columns='endpoint', values='value')
#df = df.replace(-1.0, 0.0)
#df['canonical_smiles'] = df.index
#df.to_csv('/home/nolelin/ULCT_Train_1.5.0.featurized.resampled.csv', index=False)
'''
binary_endpoints = [x for x in endpoints if 'Binary' in x]
for b_e in binary_endpoints:
    this_df = df[['canonical_smiles', b_e]]
    this_df = this_df.dropna(how='any')
    this_df.reset_index(drop=True, inplace=True)
    kf = KFold(n_splits=4, shuffle=True)
    kf.get_n_splits(this_df)
    count = 0
    #for train_index, test_index in kf.split(this_df):
        #train, test = this_df.iloc[train_index], this_df.iloc[test_index]
        #train.to_csv('/home/nolelin/ULCTBinaryDFs/' + b_e + '.trainfold.' + str(count) + '.csv', index=False)
        #test.to_csv('/home/nolelin/ULCTBinaryDFs/' + b_e + '.testfold.' + str(count) + '.csv', index=False)
        #count += 1
    this_df.to_csv('/home/nolelin/ULCTBinaryDFs/' + b_e + '.csv', index=False)

for b_e in binary_endpoints:
    for fold in range(4):
        dfs_to_append = []
        this_df = pd.read_csv('/home/nolelin/ULCTBinaryDFs/' + b_e + '.trainfold.' + str(fold) + '.csv')
        one_count = this_df.loc[this_df[b_e] == 1.0].shape[0]
        zero_count = this_df.loc[this_df[b_e] != 1.0].shape[0]
        assert(one_count > 0 and zero_count > 0)
        one_df = this_df.loc[this_df[b_e] == 1.0]
        zero_df = this_df.loc[this_df[b_e] != 1.0]
        if one_count > zero_count:
            zero_multiplier = round(float(one_count) / float(zero_count)) - 1
            dfs_to_append.extend([zero_df] * zero_multiplier)
        else:
            one_multiplier = round(float(zero_count) / float(one_count)) - 1
            dfs_to_append.extend([one_df] * one_multiplier)
        append_df = pd.concat(dfs_to_append)
        this_df = pd.concat([this_df, append_df])
        this_df.to_csv('/home/nolelin/ULCTBinaryDFs/' + b_e + '.trainfold.' + str(fold) + '.resampled.csv', index=False)

#sys.exit(0)

b_e_bas = {}
for b_e in binary_endpoints:
    df = pd.read_csv('/home/nolelin/ULCTBinaryDFs/' + b_e + '.csv')
    ulct_tasks = [x for x in df.columns if x != 'canonical_smiles']
    featurizer = dc.feat.ConvMolFeaturizer()
    loader = dc.data.CSVLoader(tasks=ulct_tasks, smiles_field="canonical_smiles", featurizer=featurizer)
    dataset = loader.featurize('/home/nolelin/ULCTBinaryDFs/' + b_e + '.csv', shard_size=8192)

    transformers = [
        dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]
    for transformer in transformers:
        dataset = transformer.transform(dataset)
        
    model = GraphConvModel(
        len(ulct_tasks), batch_size=50, mode='classification', dense_layer_size=128)
    model.fit(dataset, nb_epoch=10)
    
    test_dataset = loader.featurize('/home/nolelin/ULCTBinaryDFs/' + b_e + '_holdout.csv', shard_size=8192)
    balanced_accuracies = []
    predictions = model.predict(test_dataset)
    y_trues = test_dataset.y
    for idx in range(y_trues.shape[1]):
        task_preds = [list(x).index(max(x)) for x in predictions[:,idx,:]]
        task_y_trues = y_trues[:, idx]
        bas = sklearn.metrics.balanced_accuracy_score(task_y_trues, task_preds)
        balanced_accuracies.append(bas)
    print(balanced_accuracies)
    print("Validation Balanced Accuracy Score: %f" % np.mean(balanced_accuracies))
    b_e_bas[b_e] = np.mean(balanced_accuracies)
'''
b_e_bas = {}
for b_e in binary_endpoints:
    print(b_e)
    for fold in range(4):
        df = pd.read_csv('/home/nolelin/ULCTBinaryDFs/' + b_e + '.trainfold.' + str(fold) + '.csv')
        ulct_tasks = [x for x in df.columns if x != 'canonical_smiles']
        featurizer = dc.feat.ConvMolFeaturizer()
        loader = dc.data.CSVLoader(tasks=ulct_tasks, smiles_field="canonical_smiles", featurizer=featurizer)
        dataset = loader.featurize('/home/nolelin/ULCTBinaryDFs/' + b_e + '.trainfold.' + str(fold) + '.csv', shard_size=8192)
        
        transformers = [
            dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
        ]
        for transformer in transformers:
            dataset = transformer.transform(dataset)
        
        test_dataset = loader.featurize('/home/nolelin/ULCTBinaryDFs/' + b_e + '.testfold.' + str(fold) + '.csv', shard_size=8192)
        
        test_transformers = [
            dc.trans.BalancingTransformer(transform_w=True, dataset=test_dataset)
        ]
        for transformer in test_transformers:
            test_dataset = transformer.transform(test_dataset)
        
        model = GraphConvModel(
            len(ulct_tasks), batch_size=50, mode='classification', dense_layer_size=128*2)
        model.fit(dataset, nb_epoch=15)
        balanced_accuracies = []
        predictions = model.predict(test_dataset, transformers)
        #predictions = model.predict(test_dataset)
        #valid_scores = model.evaluate(datasplits[fold][1], [metric], transformers)
        y_trues = test_dataset.y
        for idx in range(y_trues.shape[1]):
            task_preds = [list(x).index(max(x)) for x in predictions[:,idx,:]]
            task_y_trues = y_trues[:, idx]
            bas = sklearn.metrics.balanced_accuracy_score(task_y_trues, task_preds)
            balanced_accuracies.append(bas)
        print(balanced_accuracies)
        print("Validation Balanced Accuracy Score: %f" % np.mean(balanced_accuracies))
        b_e_bas[(b_e, fold)] = np.mean(balanced_accuracies)
        #sys.exit(0)
'''

Loading raw samples now.
shard_size: 8192
About to start loading CSV from /home/nolelin/ULCTBinaryDFs/Acute_Oral_Binary.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 15.556 s
Loading shard 2 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 1 took 18.367 s
Loading shard 3 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 2 took 18.203 s
Loading shard 4 of si

[0.6659277642160928]
Validation Balanced Accuracy Score: 0.665928
Loading raw samples now.
shard_size: 8192
About to start loading CSV from /home/nolelin/ULCTBinaryDFs/Chronic_Aquatic_Binary.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 21.209 s
Loading shard 2 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
TIMING: featurizing shard 1 took 18.943 s
TIMING: dataset construction took 46.878 s
Loading dataset from disk.
TIMING: dataset construction took 7.935 s
Loading dataset from disk.
Loading raw samples now.
shard_size: 8192
About to start loading CSV from /home/nolelin/ULCTBinaryDFs/Chronic_Aquatic_Binary_holdout.csv
Loading

'\nb_e_bas = {}\nfor b_e in binary_endpoints:\n    print(b_e)\n    for fold in range(4):\n        df = pd.read_csv(\'/home/nolelin/ULCTBinaryDFs/\' + b_e + \'.trainfold.\' + str(fold) + \'.csv\')\n        ulct_tasks = [x for x in df.columns if x != \'canonical_smiles\']\n        featurizer = dc.feat.ConvMolFeaturizer()\n        loader = dc.data.CSVLoader(tasks=ulct_tasks, smiles_field="canonical_smiles", featurizer=featurizer)\n        dataset = loader.featurize(\'/home/nolelin/ULCTBinaryDFs/\' + b_e + \'.trainfold.\' + str(fold) + \'.csv\', shard_size=8192)\n        \n        transformers = [\n            dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)\n        ]\n        for transformer in transformers:\n            dataset = transformer.transform(dataset)\n        \n        test_dataset = loader.featurize(\'/home/nolelin/ULCTBinaryDFs/\' + b_e + \'.testfold.\' + str(fold) + \'.csv\', shard_size=8192)\n        \n        test_transformers = [\n            dc.trans.Bal

In [13]:
b_e_bas

{'Acute_Oral_Binary': 0.7148254310663258,
 'Acute_Dermal_Binary': 0.7263810849783701,
 'Acute_Inhalation_Binary': 0.6999355187257106,
 'Skin_Corrosion_Binary': 0.6706432582089166,
 'Eye_Irritation_Binary': 0.6659277642160928,
 'Chronic_Aquatic_Binary': 0.7690749538136712,
 'Skin_Sensitisation_Binary': 0.7573228794700615,
 'Mutagenic_Binary': 0.7606796931862718,
 'Acute_Aquatic_Binary': 0.8741371375977911}

In [3]:
b_e_bas

{('Acute_Oral_Binary', 0): 0.7281567789408658,
 ('Acute_Oral_Binary', 1): 0.7001025941757981,
 ('Acute_Oral_Binary', 2): 0.6934649704419864,
 ('Acute_Oral_Binary', 3): 0.6852924752198799,
 ('Acute_Dermal_Binary', 0): 0.7352329598275533,
 ('Acute_Dermal_Binary', 1): 0.7499479302265035,
 ('Acute_Dermal_Binary', 2): 0.712266941864465,
 ('Acute_Dermal_Binary', 3): 0.7384728841839708,
 ('Acute_Inhalation_Binary', 0): 0.7125566458623638,
 ('Acute_Inhalation_Binary', 1): 0.73137981953314,
 ('Acute_Inhalation_Binary', 2): 0.7207980353034813,
 ('Acute_Inhalation_Binary', 3): 0.6880644410681778,
 ('Skin_Corrosion_Binary', 0): 0.6799003287388269,
 ('Skin_Corrosion_Binary', 1): 0.7078233658866055,
 ('Skin_Corrosion_Binary', 2): 0.7180245971793249,
 ('Skin_Corrosion_Binary', 3): 0.6946181876884738,
 ('Eye_Irritation_Binary', 0): 0.6834896856732802,
 ('Eye_Irritation_Binary', 1): 0.6722352426692686,
 ('Eye_Irritation_Binary', 2): 0.6996079295915897,
 ('Eye_Irritation_Binary', 3): 0.668178942877738,


In [4]:
for b_e in binary_endpoints:
    b_a_mean = np.mean(list({k:v for (k,v) in b_e_bas.items() if k[0] == b_e}.values()))
    print(b_e, b_a_mean)

Acute_Oral_Binary 0.7017542046946326
Acute_Dermal_Binary 0.7339801790256232
Acute_Inhalation_Binary 0.7131997354417907
Skin_Corrosion_Binary 0.7000916198733078
Eye_Irritation_Binary 0.6808779502029692
Chronic_Aquatic_Binary 0.6817634718952116
Skin_Sensitisation_Binary 0.7188023211536254
Mutagenic_Binary 0.7397246032007855
Acute_Aquatic_Binary 0.829685446544693


In [11]:
df = pd.read_csv("/home/nolelin/ULCT_Train_1.5.0.csv")
endpoint_values = df.groupby('endpoint').agg({'value': lambda x: tuple(set(x))})
for index, row in endpoint_values.iterrows():
    print(index, row['value'])
#endpoint_values.columns

Acute_Aquatic_Binary (1.0, -1.0)
Acute_Dermal_Binary (1.0, -1.0)
Acute_Inhalation_Binary (1.0, -1.0)
Acute_Oral_Binary (1.0, -1.0)
Chronic_Aquatic_Binary (1.0, -1.0)
Eye_Irritation_Binary (1.0, -1.0)
H200 (1.0,)
H201 (1.0, -1.0)
H203 (1.0,)
H220 (1.0, -1.0)
H221 (1.0,)
H223 (1.0,)
H224 (1.0, -1.0)
H225 (1.0, -1.0)
H226 (1.0, -1.0)
H227 (1.0,)
H228 (1.0, -1.0)
H240 (1.0,)
H242 (1.0, -1.0)
H250 (1.0, -1.0)
H251 (1.0, -1.0)
H252 (1.0,)
H260 (1.0, -1.0)
H261 (1.0, -1.0)
H270 (1.0, -1.0)
H271 (1.0, -1.0)
H272 (1.0, -1.0)
H280 (1.0, -1.0)
H290 (1.0, -1.0)
H300 (1.0, -1.0)
H301 (1.0, -1.0)
H302 (1.0, -1.0)
H303 (1.0, -1.0)
H304 (1.0, -1.0)
H310 (1.0, -1.0)
H311 (1.0, -1.0)
H312 (1.0, -1.0)
H314 (1.0, -1.0)
H315 (1.0, -1.0)
H316 (1.0,)
H317 (1.0, -1.0)
H318 (1.0, -1.0)
H319 (1.0, -1.0)
H320 (1.0,)
H330 (1.0, -1.0)
H331 (1.0, -1.0)
H332 (1.0, -1.0)
H334 (1.0, -1.0)
H335 (1.0,)
H336 (1.0,)
H340 (1.0, -1.0)
H341 (1.0, -1.0)
H350 (1.0, -1.0)
H351 (1.0, -1.0)
H360 (1.0, -1.0)
H361 (1.0, -1.0)
H362 

In [3]:
df

Unnamed: 0_level_0,Acute_Aquatic_Binary,Acute_Dermal_Binary,Acute_Inhalation_Binary,Acute_Oral_Binary,Chronic_Aquatic_Binary,Eye_Irritation_Binary,H201,H220,H224,H225,...,NTPAcuteOralChallenge_EPA_1.0,NTPAcuteOralChallenge_EPA_2.0,NTPAcuteOralChallenge_EPA_3.0,NTPAcuteOralChallenge_EPA_4.0,NTPAcuteOralChallenge_GHS_1.0,NTPAcuteOralChallenge_GHS_2.0,NTPAcuteOralChallenge_GHS_3.0,NTPAcuteOralChallenge_GHS_4.0,NTPAcuteOralChallenge_GHS_5.0,canonical_smiles
canonical_smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B#B,,,1.0,,,,,1.0,,,...,0,0,0,0,0,0,0,0,0,B#B
B(B(N(C)C)N(C)C)(N(C)C)N(C)C,,,,1.0,,1.0,,0.0,,1.0,...,0,0,0,0,0,0,0,0,0,B(B(N(C)C)N(C)C)(N(C)C)N(C)C
B(B(N1CCCC1)N2CCCC2)(N3CCCC3)N4CCCC4,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,B(B(N1CCCC1)N2CCCC2)(N3CCCC3)N4CCCC4
B(C(C)CC)(C(C)CC)C(C)CC,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,B(C(C)CC)(C(C)CC)C(C)CC
B(C)(C)C,,0.0,0.0,0.0,,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,B(C)(C)C
B(CC)(CC)CC,,,1.0,1.0,,1.0,0.0,,,1.0,...,0,1,0,0,0,0,1,0,0,B(CC)(CC)CC
B(CCCC)(CCCC)CCCC,,,,1.0,,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,B(CCCC)(CCCC)CCCC
B(C[Si](C)(C)C)(C[Si](C)(C)C)C[Si](C)(C)C,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,B(C[Si](C)(C)C)(C[Si](C)(C)C)C[Si](C)(C)C
B(N(C)C)(N(C)C)N(C)C,,,,,,1.0,,,0.0,1.0,...,0,0,0,0,0,0,0,0,0,B(N(C)C)(N(C)C)N(C)C
B(N1CCCC1)(N2CCCC2)N3CCCC3,,,,,,1.0,,,,,...,0,0,0,0,0,0,0,0,0,B(N1CCCC1)(N2CCCC2)N3CCCC3


In [4]:
ulct_tasks = [x for x in df.columns if x != 'NTPAcuteOralChallenge_LD50' and x != 'canonical_smiles']
for idx, value in enumerate(ulct_tasks):
    if 'Binary' in value:
        print(idx, value)
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(tasks=ulct_tasks, smiles_field="canonical_smiles", featurizer=featurizer)
dataset = loader.featurize('/home/nolelin/ULCT_Train_1.5.0.featurized.csv', shard_size=8192)

0 Acute_Aquatic_Binary
1 Acute_Dermal_Binary
2 Acute_Inhalation_Binary
3 Acute_Oral_Binary
4 Chronic_Aquatic_Binary
5 Eye_Irritation_Binary
54 Mutagenic_Binary
57 Skin_Corrosion_Binary
58 Skin_Sensitisation_Binary
Loading raw samples now.
shard_size: 8192
About to start loading CSV from /home/nolelin/ULCT_Train_1.5.0.featurized.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 16.626 s
Loading shard 2 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 1 took 16.812 s
Loading shard 3 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 20

In [5]:
transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]
for transformer in transformers:
    dataset = transformer.transform(dataset)

TIMING: dataset construction took 33.830 s
Loading dataset from disk.


In [6]:
splitter = dc.splits.RandomSplitter()
datasplits = splitter.k_fold_split(dataset, 4)

TIMING: dataset construction took 16.698 s
Loading dataset from disk.
TIMING: dataset construction took 28.173 s
Loading dataset from disk.
TIMING: dataset construction took 51.751 s
Loading dataset from disk.
TIMING: dataset construction took 16.301 s
Loading dataset from disk.
TIMING: dataset construction took 13.694 s
Loading dataset from disk.
TIMING: dataset construction took 19.275 s
Loading dataset from disk.
TIMING: dataset construction took 50.829 s
Loading dataset from disk.
TIMING: dataset construction took 33.827 s
Loading dataset from disk.
TIMING: dataset construction took 10.785 s
Loading dataset from disk.
TIMING: dataset construction took 10.895 s
Loading dataset from disk.
TIMING: dataset construction took 50.778 s
Loading dataset from disk.
TIMING: dataset construction took 51.130 s
Loading dataset from disk.
TIMING: dataset construction took 7.559 s
Loading dataset from disk.
TIMING: dataset construction took 0.005 s
Loading dataset from disk.
TIMING: dataset constr

In [7]:
from sklearn.metrics import accuracy_score
import sklearn

def balanced_accuracy_score(y, y_pred):
    """Computes balanced accuracy score."""
    num_positive = float(np.count_nonzero(y))
    num_negative = float(len(y) - num_positive)
    pos_weight = num_negative / num_positive
    weights = np.ones_like(y)
    weights[y != 0] = pos_weight
    return accuracy_score(y, y_pred, sample_weight=weights)

for fold in range(4):
    model = GraphConvModel(
        len(ulct_tasks), batch_size=50, mode='classification')
    # Set nb_epoch=10 for better results.
    model.fit(datasplits[fold][0], nb_epoch=10)
    #metric = dc.metrics.Metric(
        #dc.metrics.roc_auc_score, np.mean, mode="classification")

    #metric = dc.metrics.Metric(
        #dc.metrics.accuracy_score, np.mean, mode="classification")
        
    #metric = dc.metrics.Metric(
        #sklearn.metrics.balanced_accuracy_score, np.mean, mode="classification")

    balanced_accuracies = []
    predictions = model.predict(datasplits[fold][1], transformers)
    #valid_scores = model.evaluate(datasplits[fold][1], [metric], transformers)
    y_trues = datasplits[fold][1].y
    for idx in range(y_trues.shape[1]):
        task_preds = [list(x).index(max(x)) for x in predictions[:,idx,:]]
        task_y_trues = y_trues[:, idx]
        bas = sklearn.metrics.balanced_accuracy_score(task_y_trues, task_preds)
        balanced_accuracies.append(bas)
    print(balanced_accuracies)
    print("Validation Balanced Accuracy Score: %f" % np.mean(balanced_accuracies))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Instructions for updating:
Use standard file APIs to delete files with this prefix.




[0.7183518189496225, 0.5404084133479985, 0.5185715682311194, 0.5270310017874106, 0.6906465952559404, 0.5604974169332699, 0.7649132848720737, 0.8751129695395934, 0.9458345261952477, 0.8892658030575036, 0.792773364004913, 0.659005532323576, 0.8128698956354268, 0.8802180123706764, 0.9799713876967096, 0.8591662899073066, 0.8803735105407883, 0.4666647590706192, 0.7341424318754294, 0.8984947311493141, 0.9819218427687371, 0.7400323893602385, 0.7165448669351266, 0.6320706335217975, 0.5429522780445492, 0.9656099794003205, 0.8908341503431441, 0.7357325730271957, 0.6913905806993594, 0.5275535621301635, 0.8278083729232116, 0.5555934576108446, 0.5098705470765597, 0.6792535165779974, 0.5736029613023303, 0.7249616462475128, 0.6990823181394265, 0.5193621628969753, 0.5842940511343986, 0.6737710222912564, 0.5884300788296637, 0.6303102655147886, 0.5439981041748383, 0.6671464305274822, 0.5775611132486854, 0.707763737315719, 0.719659011025833, 0.8784046692607004, 0.9581664186791805, 0.6447906235148346, 0.5

In [21]:
datasplits[0][1].get_shape()

((17476,), (17476, 68), (17476, 68), (17476,))

In [21]:
valid_scores

{'mean-accuracy_score': 0.7876297940151127}

In [35]:
datasplits[0][1].y[:, 0]

array([0., 0., 0., ..., 0., 0., 0.])

In [22]:
predictions = model.predict(datasplits[0][1], transformers)

array([[0.8730956 , 0.12690444],
       [0.59595054, 0.40404952],
       [0.63571453, 0.3642855 ],
       [0.7095415 , 0.29045856],
       [0.88570446, 0.11429551],
       [0.68373555, 0.31626442],
       [0.2827851 , 0.717215  ],
       [0.01608323, 0.9839168 ],
       [0.12678489, 0.8732151 ],
       [0.19476506, 0.8052349 ],
       [0.3368949 , 0.66310513],
       [0.5880199 , 0.4119801 ],
       [0.60987496, 0.3901251 ],
       [0.39637512, 0.60362494],
       [0.06493384, 0.9350661 ],
       [0.24029492, 0.75970507],
       [0.37336218, 0.6266378 ],
       [0.01313169, 0.9868684 ],
       [0.542001  , 0.457999  ],
       [0.57648927, 0.42351076],
       [0.02064835, 0.97935164],
       [0.44898778, 0.5510122 ],
       [0.4456479 , 0.55435205],
       [0.48891178, 0.5110882 ],
       [0.5756717 , 0.42432827],
       [0.9849321 , 0.01506786],
       [0.8901492 , 0.10985085],
       [0.4623978 , 0.5376022 ],
       [0.6332491 , 0.36675084],
       [0.48814744, 0.51185256],
       [0.

In [27]:
predictions.shape

(17476, 68, 2)

In [31]:
predictions[:,0,:][0:5]

array([[0.8730956 , 0.12690444],
       [0.73695505, 0.26304498],
       [0.8881445 , 0.11185554],
       [0.4479316 , 0.5520684 ],
       [0.76400954, 0.23599048]], dtype=float32)

In [30]:
[list(x).index(max(x)) for x in predictions[:,0,:]]

[0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [2]:
pd.read_csv("/home/nolelin/ULCT_Train_1.5.0.csv")
#df.loc[df['endpoint'] == 'H203']

Unnamed: 0,vset_name,canonical_smiles,inchi,endpoint,value
0,ULCT_Train_Data_1.5.0,O=C(O)CC(O)(C(=O)O)CC(=O)O.O(C=1C=CC=CC1CC=2C=...,ZZYHCCDMBJTROG-UHFFFAOYSA-N,H300,-1.0
1,ULCT_Train_Data_1.5.0,O=C(O)CC(O)(C(=O)O)CC(=O)O.O(C=1C=CC=CC1CC=2C=...,ZZYHCCDMBJTROG-UHFFFAOYSA-N,H301,-1.0
2,ULCT_Train_Data_1.5.0,O=C(O)CC(O)(C(=O)O)CC(=O)O.O(C=1C=CC=CC1CC=2C=...,ZZYHCCDMBJTROG-UHFFFAOYSA-N,H302,1.0
3,ULCT_Train_Data_1.5.0,O=C(O)CC(O)(C(=O)O)CC(=O)O.O(C=1C=CC=CC1CC=2C=...,ZZYHCCDMBJTROG-UHFFFAOYSA-N,H303,-1.0
4,ULCT_Train_Data_1.5.0,O=C(O)CC(O)(C(=O)O)CC(=O)O.O(C=1C=CC=CC1CC=2C=...,ZZYHCCDMBJTROG-UHFFFAOYSA-N,H312,1.0
5,ULCT_Train_Data_1.5.0,O=C(O)CC(O)(C(=O)O)CC(=O)O.O(C=1C=CC=CC1CC=2C=...,ZZYHCCDMBJTROG-UHFFFAOYSA-N,H332,1.0
6,ULCT_Train_Data_1.5.0,O=C(O)CC(O)(C(=O)O)CC(=O)O.O(C=1C=CC=CC1CC=2C=...,ZZYHCCDMBJTROG-UHFFFAOYSA-N,Acute_Oral_Binary,1.0
7,ULCT_Train_Data_1.5.0,O=C(O)CC(O)(C(=O)O)CC(=O)O.O(C=1C=CC=CC1CC=2C=...,ZZYHCCDMBJTROG-UHFFFAOYSA-N,Acute_Dermal_Binary,1.0
8,ULCT_Train_Data_1.5.0,O=C(O)CC(O)(C(=O)O)CC(=O)O.O(C=1C=CC=CC1CC=2C=...,ZZYHCCDMBJTROG-UHFFFAOYSA-N,Acute_Inhalation_Binary,1.0
9,ULCT_Train_Data_1.5.0,O=S(=O)(O)C=1C=CC=C(C1C)C,ZZXDRXVIRVJQBT-UHFFFAOYSA-N,H314,-1.0


In [4]:
pd.read_csv("/home/nolelin/tox21.csv")

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,TOX5110,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX6619,O=S(=O)(Cl)c1ccccc1
7,0.0,,0.0,,1.0,,,1.0,0.0,1.0,0.0,1.0,TOX25232,O=C(O)Cc1cc(I)c(Oc2ccc(O)c(I)c2)c(I)c1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,TOX22514,OC[C@H](O)[C@@H](O)[C@H](O)CO
9,,,,,,,,0.0,,0.0,,,TOX22517,CCCCCCCC(=O)[O-].CCCCCCCC(=O)[O-].[Zn+2]


In [8]:
import pandas as pd

def tf_to_int(tf_str):
    return 1 if tf_str else 0

df = pd.read_csv('/home/nolelin/trainingset_171130_3.csv')
#df['very_toxic'] = df['very_toxic'].apply(tf_to_int)
#df['nontoxic'] = df['nontoxic'].apply(tf_to_int)
#df.to_csv('/home/nolelin/trainingset_171130_2.csv', index=False)
#type(list(df['LD50_mgkg'])[4])
#df = pd.get_dummies(df, columns = ['EPA_category', 'GHS_category'])
#df.to_csv('/home/nolelin/trainingset_171130_3.csv', index=False)
df

Unnamed: 0,very_toxic,nontoxic,LD50_mgkg,Canonical_QSARr,EPA_category_1.0,EPA_category_2.0,EPA_category_3.0,EPA_category_4.0,GHS_category_1.0,GHS_category_2.0,GHS_category_3.0,GHS_category_4.0,GHS_category_5.0
0,0,0,460.00,[O-][N+](=O)C1C=CC(Cl)=CC=1,0,1,0,0,0,0,0,1,0
1,0,0,750.00,NC1=CC=C(C=C1)[N+]([O-])=O,0,0,1,0,0,0,0,1,0
2,0,0,170.00,[O-][N+](=O)C1C=CC(O)=CC=1,0,1,0,0,0,0,1,0,0
3,0,0,1809.00,[O-][N+](=O)C1C=CC(CCl)=CC=1,0,0,1,0,0,0,0,1,0
4,0,1,,CNC1C=CC(=CC=1)[N+]([O-])=O,0,0,1,0,0,0,0,0,1
5,0,1,2300.00,COC1C=CC(=CC=1)[N+]([O-])=O,0,0,1,0,0,0,0,0,1
6,0,1,,CC(C)C1C=CC(=CC=1)C(C)C,0,0,1,0,0,0,0,0,1
7,0,1,2500.00,O=C(Cl)C1C=CC(=CC=1)C(=O)Cl,0,0,1,0,0,0,0,0,1
8,0,0,1960.00,OC(=O)C1C=CC(=CC=1)C(O)=O,0,0,1,0,0,0,0,1,0
9,0,0,1600.00,[O-][N+](=O)C1C=CC(=CC=1)N=C=O,0,0,1,0,0,0,0,1,0


In [6]:
tox21_tasks = [
      'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
      'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
  ]
loader = dc.data.CSVLoader(tasks=tox21_tasks, smiles_field="smiles", featurizer='GraphConv')
dataset = loader.featurize('/home/nolelin/tox21.csv', shard_size=8192)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from /home/nolelin/tox21.csv
Loading shard 1 of size 8192.
Featurizing sample 0


AttributeError: 'str' object has no attribute 'featurize'

In [9]:
#ld50_tasks = ['very_toxic', 'nontoxic', 'EPA_category', 'GHS_category']
ld50_tasks = [x for x in df.columns if x != 'LD50_mgkg' and x != 'Canonical_QSARr']
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(tasks=ld50_tasks, smiles_field="Canonical_QSARr", featurizer=featurizer)
dataset = loader.featurize('/home/nolelin/trainingset_171130_3.csv', shard_size=8192)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from /home/nolelin/trainingset_171130_3.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 18.807 s
Loading shard 2 of size 8192.
Featurizing sample 0
TIMING: featurizing shard 1 took 1.730 s
TIMING: dataset construction took 23.525 s
Loading dataset from disk.


In [2]:
#ld50_tasks = ['very_toxic', 'nontoxic', 'EPA_category', 'GHS_category']
ld50_tasks = ['LD50_mgkg']
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(tasks=ld50_tasks, smiles_field="Canonical_QSARr", featurizer=featurizer)
dataset = loader.featurize('/home/nolelin/trainingset_171130_3.csv', shard_size=8192)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from /home/nolelin/trainingset_171130_3.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 16.439 s
Loading shard 2 of size 8192.
Featurizing sample 0
TIMING: featurizing shard 1 took 1.591 s
TIMING: dataset construction took 20.683 s
Loading dataset from disk.


In [3]:
transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]
for transformer in transformers:
    dataset = transformer.transform(dataset)

AssertionError: 
Not equal to tolerance rtol=1e-07, atol=0

(shapes (2092,), (2,) mismatch)
 x: array([0.00e+00, 1.20e-02, 2.00e-02, ..., 6.00e+04, 6.45e+04, 7.00e+04])
 y: array([0., 1.])

In [4]:
split = 'index'
splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter(),
      'butina': dc.splits.ButinaSplitter(),
      'task': dc.splits.TaskSplitter()
}
splitter = splitters[split]
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)

TIMING: dataset construction took 3.523 s
Loading dataset from disk.
TIMING: dataset construction took 1.325 s
Loading dataset from disk.
TIMING: dataset construction took 1.511 s
Loading dataset from disk.


In [5]:
model = GraphConvModel(
    len(ld50_tasks), batch_size=50, mode='regression')
# Set nb_epoch=10 for better results.
model.fit(train_dataset, nb_epoch=5)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


867760782.4444444

In [12]:
model = GraphConvModel(
    len(ld50_tasks), batch_size=50, mode='classification')
# Set nb_epoch=10 for better results.
model.fit(train_dataset, nb_epoch=5)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


487.07027327219646

In [11]:
metric = dc.metrics.Metric(
    dc.metrics.mae_score, np.mean, mode="regression")

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric])
print("Training MAE Score: %f" % train_scores['mean-mae_score'])
valid_scores = model.evaluate(valid_dataset, [metric])
print("Validation MAE Score: %f" % valid_scores['mean-mae_score'])

Evaluating model
computed_metrics: [2138.673805837574]
Training MAE Score: 2138.673806
computed_metrics: [2258.3374299537204]
Validation MAE Score: 2258.337430


In [10]:
train_scores

{'mean-mae_score': 2138.673805837574}

In [13]:
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, np.mean, mode="classification")

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
print("Training ROC-AUC Score: %f" % train_scores["mean-roc_auc_score"])
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Validation ROC-AUC Score: %f" % valid_scores["mean-roc_auc_score"])

Evaluating model
computed_metrics: [0.9126540241742707, 0.7998227925683882, 0.9122862776411916, 0.7343646708602265, 0.664616693120281, 0.7833849789217617, 0.952373941847626, 0.897397568743437, 0.7630683477248611, 0.6980390023312979, 0.7997945272307823]
Training ROC-AUC Score: 0.810709
computed_metrics: [0.7510856792520308, 0.7219547592833551, 0.7430620761606677, 0.6966896301667875, 0.5413197242367267, 0.6951582763126467, 0.9175084175084175, 0.7253003406849561, 0.6562320632257495, 0.6891457481580565, 0.7203898951668619]
Validation ROC-AUC Score: 0.714350


In [24]:
# Load Tox21 dataset
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = tox21_datasets

Loading dataset from disk.
Loading dataset from disk.
Loading dataset from disk.


In [13]:
tox21_datasets

(<deepchem.data.datasets.DiskDataset at 0x7fcbf601cbe0>,
 <deepchem.data.datasets.DiskDataset at 0x7fcbf7621048>,
 <deepchem.data.datasets.DiskDataset at 0x7fcbf40143c8>)

In [14]:
model = GraphConvModel(
    len(tox21_tasks), batch_size=50, mode='classification')
# Set nb_epoch=10 for better results.
model.fit(train_dataset, nb_epoch=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


581.2145543174138

In [4]:
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, np.mean, mode="classification")

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
print("Training ROC-AUC Score: %f" % train_scores["mean-roc_auc_score"])
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Validation ROC-AUC Score: %f" % valid_scores["mean-roc_auc_score"])

Evaluating model
computed_metrics: [0.8077090871667596, 0.80883108542068072, 0.83718687229262456, 0.73910735547346063, 0.68028700348341964, 0.76186917681171784, 0.75815841402908002, 0.69998460160846254, 0.76642308937034997, 0.6281806984369307, 0.79220694916941459, 0.74514096324886725]
Training ROC-AUC Score: 0.752090
computed_metrics: [0.7184979963389897, 0.74983465608465605, 0.80125608701588313, 0.6727249618708695, 0.63149999999999995, 0.62280701754385959, 0.6362062997577016, 0.70774325371662083, 0.7119378384742171, 0.56241010427903637, 0.76310809839756644, 0.68673557278208452]
Validation ROC-AUC Score: 0.688730


In [5]:
from deepchem.models.tensorgraph.tensor_graph import TensorGraph

tg = TensorGraph(use_queue=False)

In [6]:
from deepchem.models.tensorgraph.layers import Feature

atom_features = Feature(shape=(None, 75))
degree_slice = Feature(shape=(None, 2), dtype=tf.int32)
membership = Feature(shape=(None,), dtype=tf.int32)

deg_adjs = []
for i in range(0, 10 + 1):
    deg_adj = Feature(shape=(None, i + 1), dtype=tf.int32)
    deg_adjs.append(deg_adj)

In [7]:
from deepchem.models.tensorgraph.layers import Dense, GraphConv, BatchNorm
from deepchem.models.tensorgraph.layers import GraphPool, GraphGather

batch_size = 50

gc1 = GraphConv(
    64,
    activation_fn=tf.nn.relu,
    in_layers=[atom_features, degree_slice, membership] + deg_adjs)
batch_norm1 = BatchNorm(in_layers=[gc1])
gp1 = GraphPool(in_layers=[batch_norm1, degree_slice, membership] + deg_adjs)
gc2 = GraphConv(
    64,
    activation_fn=tf.nn.relu,
    in_layers=[gp1, degree_slice, membership] + deg_adjs)
batch_norm2 = BatchNorm(in_layers=[gc2])
gp2 = GraphPool(in_layers=[batch_norm2, degree_slice, membership] + deg_adjs)
dense = Dense(out_channels=128, activation_fn=tf.nn.relu, in_layers=[gp2])
batch_norm3 = BatchNorm(in_layers=[dense])
readout = GraphGather(
    batch_size=batch_size,
    activation_fn=tf.nn.tanh,
    in_layers=[batch_norm3, degree_slice, membership] + deg_adjs)

In [8]:
from deepchem.models.tensorgraph.layers import Dense, SoftMax, \
    SoftMaxCrossEntropy, WeightedError, Stack
from deepchem.models.tensorgraph.layers import Label, Weights

costs = []
labels = []
for task in range(len(tox21_tasks)):
    classification = Dense(
        out_channels=2, activation_fn=None, in_layers=[readout])

    softmax = SoftMax(in_layers=[classification])
    tg.add_output(softmax)

    label = Label(shape=(None, 2))
    labels.append(label)
    cost = SoftMaxCrossEntropy(in_layers=[label, classification])
    costs.append(cost)
all_cost = Stack(in_layers=costs, axis=1)
weights = Weights(shape=(None, len(tox21_tasks)))
loss = WeightedError(in_layers=[all_cost, weights])
tg.set_loss(loss)

In [11]:
from deepchem.metrics import to_one_hot
from deepchem.feat.mol_graphs import ConvMol

def data_generator(dataset, epochs=1, predict=False, pad_batches=True):
    for epoch in range(epochs):
        if not predict:
            print('Starting epoch %i' % epoch)
        for ind, (X_b, y_b, w_b, ids_b) in enumerate(
            dataset.iterbatches(
                batch_size, pad_batches=pad_batches, deterministic=True)):
          d = {}
          for index, label in enumerate(labels):
            d[label] = to_one_hot(y_b[:, index])
          d[weights] = w_b
          multiConvMol = ConvMol.agglomerate_mols(X_b)
          d[atom_features] = multiConvMol.get_atom_features()
          d[degree_slice] = multiConvMol.deg_slice
          d[membership] = multiConvMol.membership
          for i in range(1, len(multiConvMol.get_deg_adjacency_lists())):
            d[deg_adjs[i - 1]] = multiConvMol.get_deg_adjacency_lists()[i]
          yield d

In [12]:
tg.fit_generator(data_generator(train_dataset, epochs=1))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Starting epoch 0


590.5404334900871

In [13]:
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, np.mean, mode="classification")

def reshape_y_pred(y_true, y_pred):
    """
    TensorGraph.Predict returns a list of arrays, one for each output
    We also have to remove the padding on the last batch
    Metrics taks results of shape (samples, n_task, prob_of_class)
    """
    n_samples = len(y_true)
    retval = np.stack(y_pred, axis=1)
    return retval[:n_samples]


print("Evaluating model")
train_predictions = tg.predict_on_generator(data_generator(train_dataset, predict=True))
train_predictions = reshape_y_pred(train_dataset.y, train_predictions)
train_scores = metric.compute_metric(train_dataset.y, train_predictions, train_dataset.w)
print("Training ROC-AUC Score: %f" % train_scores)

valid_predictions = tg.predict_on_generator(data_generator(valid_dataset, predict=True))
valid_predictions = reshape_y_pred(valid_dataset.y, valid_predictions)
valid_scores = metric.compute_metric(valid_dataset.y, valid_predictions, valid_dataset.w)
print("Valid ROC-AUC Score: %f" % valid_scores)

Evaluating model
computed_metrics: [0.81576472641253339, 0.81526824421965305, 0.82699658753835303, 0.76797467906880745, 0.68571770264230114, 0.7750655608383259, 0.78129298949811155, 0.66845162478136788, 0.76453068560650883, 0.67815831931023718, 0.78350012565114091, 0.73536387537058778]
Training ROC-AUC Score: 0.758174
computed_metrics: [0.74852817493692181, 0.71875, 0.80862877167432767, 0.75400355871886116, 0.62449999999999994, 0.66379513806938872, 0.59570785739010046, 0.67025998731769187, 0.71137273369437248, 0.59198579647608773, 0.7510838582960071, 0.66162790697674412]
Valid ROC-AUC Score: 0.691687


In [29]:
import pybel
import sys

for mol in pybel.readfile('sdf', '/home/nolelin/tox21_10k_data_all.sdf'):
    print(mol)
    for atom in mol:
        coords = atom.coords
        print(atom)
        for neighbor in pybel.ob.OBAtomAtomIter(atom.OBAtom):
            neighbor_coords = pybel.Atom(neighbor).coords
            print(neighbor_coords)
    sys.exit(0)

[Cl-].C[n+]1c2cc(N)ccc2cc2c1cc(N)cc2.n1c2cc(N)ccc2cc2c1cc(N)cc2	NCGC00178831-03

Atom: 17 (4.88 -2.74 0.00)
Atom: 6 (2.86 -2.48 0.00)
(2.8647, -1.6501, 0.0)
Atom: 7 (2.86 -1.65 0.00)
(2.8647, -2.4751, 0.0)
(3.5808, -1.2318, 0.0)
(2.1485, -1.2318, 0.0)
Atom: 6 (3.58 -1.23 0.00)
(2.8647, -1.6501, 0.0)
(4.297, -1.6501, 0.0)
(3.5808, -0.4068, 0.0)
Atom: 6 (4.30 -1.65 0.00)
(3.5808, -1.2318, 0.0)
(5.0017, -1.2318, 0.0)
Atom: 6 (5.00 -1.23 0.00)
(4.297, -1.6501, 0.0)
(5.7179, -1.6501, 0.0)
(5.0017, -0.4068, 0.0)
Atom: 7 (5.72 -1.65 0.00)
(5.0017, -1.2318, 0.0)
Atom: 6 (5.00 -0.41 0.00)
(5.0017, -1.2318, 0.0)
(4.297, 0.0, 0.0)
Atom: 6 (4.30 0.00 0.00)
(5.0017, -0.4068, 0.0)
(3.5808, -0.4068, 0.0)
Atom: 6 (3.58 -0.41 0.00)
(3.5808, -1.2318, 0.0)
(4.297, 0.0, 0.0)
(2.8647, 0.0, 0.0)
Atom: 6 (2.86 0.00 0.00)
(3.5808, -0.4068, 0.0)
(2.1485, -0.4068, 0.0)
Atom: 6 (2.15 -0.41 0.00)
(2.8647, 0.0, 0.0)
(2.1485, -1.2318, 0.0)
(1.4324, 0.0, 0.0)
Atom: 6 (2.15 -1.23 0.00)
(2.8647, -1.6501, 0.0)
(2.1485,

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
