In [4]:
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
import deepchem as dc
from deepchem.molnet import load_delaney
from deepchem.models import GraphConvModel
import tensorflow as tf

#Setting up model

In [5]:
##input malaria file
delaney_tasks = ['Kau_IC50']
featurizer = dc.feat.ConvMolFeaturizer()

input_dataset = 'malaria_GSK_Syngene_Dundee_subset.csv'

loader = dc.data.CSVLoader(tasks=delaney_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(input_dataset, shard_size=8192)

# Initialize transformers
transformers = [
  dc.trans.NormalizationTransformer(
      transform_y=True, dataset=dataset)
]

print("About to transform data")
for transformer in transformers:
    dataset = transformer.transform(dataset)

# splitters = {
#   'index': dc.splits.IndexSplitter(),
#   'random': dc.splits.RandomSplitter(),
#   'scaffold': dc.splits.ScaffoldSplitter()
# }
splitter = dc.splits.ScaffoldSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)


Loading raw samples now.
shard_size: 8192
About to start loading CSV from malaria_GSK_Syngene_Dundee_subset.csv
Loading shard 1 of size 8192.
Featurizing sample 0
TIMING: featurizing shard 0 took 0.175 s
TIMING: dataset construction took 0.228 s
Loading dataset from disk.
About to transform data
TIMING: dataset construction took 0.043 s
Loading dataset from disk.
TIMING: dataset construction took 0.036 s
Loading dataset from disk.
TIMING: dataset construction took 0.017 s
Loading dataset from disk.
TIMING: dataset construction took 0.016 s
Loading dataset from disk.


In [11]:
# Load Delaney dataset
# delaney_tasks, delaney_datasets, transformers = load_delaney(
#     featurizer='GraphConv', split='index')
# train_dataset, valid_dataset, test_dataset = dataset

# Fit models
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

# Do setup required for tf/keras models
# Number of features on conv-mols
n_feat = 250
# Batch size of models
batch_size = 128
model = GraphConvModel(
    len(delaney_tasks), batch_size=batch_size, mode='regression')


##Train model

In [13]:
# Fit trained model
model.fit(train_dataset, nb_epoch=500)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)

Evaluating model
computed_metrics: [0.8957445340227508]
computed_metrics: [0.8562918277333406]
Train scores
{'mean-pearson_r2_score': 0.8957445340227508}
Validation scores
{'mean-pearson_r2_score': 0.8562918277333406}
