Setting up imports

In [1]:
%load_ext autoreload
%autoreload 2
%pdb off
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Joseph Gomes"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "LGPL"

import os
import unittest
import tempfile
import shutil

import numpy as np
import numpy.random

from deepchem import metrics
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.hyperparameters import HyperparamOpt
from deepchem.metrics import Metric
from deepchem.models import Model
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskRegressor
from deepchem.models.tensorflow_models import TensorflowModel
from deepchem.transformers import NormalizationTransformer
from deepchem.utils.evaluate import Evaluator

Automatic pdb calling has been turned OFF


Creating temporary directories

In [2]:
# Five independent scratch directories, created in this order. feature_dir
# receives the featurized data and train/valid/test_dir receive the splits in
# later cells.
(feature_dir,
 train_dir,
 valid_dir,
 test_dir,
 model_dir) = (tempfile.mkdtemp() for _ in range(5))

Setting up model variables

In [3]:
from deepchem.featurizers.coulomb_matrices import CoulombMatrixEig

# Input file and the field names handed to the featurizer.
input_file = "../datasets/gdb1k.sdf"
smiles_field = "smiles"
mol_field = "mol"

# A single task; every task is mapped to the same task type.
task_type = "regression"
tasks = ["atomization_energy"]
task_types = dict.fromkeys(tasks, task_type)

# 23 matches the logged data_shape (23,); presumably the maximum atom
# count -- confirm against CoulombMatrixEig.
compound_featurizers = [CoulombMatrixEig(23, remove_hydrogens=False)]
complex_featurizers = []

Featurize the raw data

In [4]:
# Compound featurizers come first, followed by the complex featurizers
# (an empty list in this notebook).
featurizers = compound_featurizers + complex_featurizers
featurizer = DataFeaturizer(
    tasks=tasks,
    smiles_field=smiles_field,
    mol_field=mol_field,
    featurizers=featurizers,
    verbosity="high")

In [5]:
# Featurize input_file into feature_dir; shard_size=None is passed explicitly.
featurized_dataset = featurizer.featurize(
    input_file, feature_dir, shard_size=None)

Loading raw samples now.
Reading structures from ../datasets/gdb1k.sdf.
Loaded shard 1 of size None from file.
About to featurize shard.
Applying processing transformation to shard.
Currently featurizing feature_type: CoulombMatrixEig
Featurizing sample 0
About to start initializing dataset


  if features[feature_ind] == "":
  if y[ind, task] == "":


Perform Train, Validation, and Testing Split

In [6]:
from deepchem.splits import RandomSplitter

# Each split is written to its own directory.
# NOTE(review): no seed is passed here, so the split presumably differs
# between runs -- confirm, and pin a seed if reproducibility matters.
random_splitter = RandomSplitter()
(train_dataset,
 valid_dataset,
 test_dataset) = random_splitter.train_valid_test_split(
    featurized_dataset, train_dir, valid_dir, test_dir)

Transforming datasets

In [7]:
# Both normalizers are constructed from train_dataset only, then applied to
# all three splits.
input_transformers = [
    NormalizationTransformer(transform_X=True, dataset=train_dataset)]
output_transformers = [
    NormalizationTransformer(transform_y=True, dataset=train_dataset)]
transformers = input_transformers + output_transformers

# Same call order as before: every transformer on train, then valid, then test.
for dataset in (train_dataset, valid_dataset, test_dataset):
    for transformer in transformers:
        transformer.transform(dataset)

Build a fully connected network (FCNet) with TensorFlow and run a hyperparameter search

In [11]:
def tf_model_builder(tasks, task_types, params_dict, model_dir, verbosity=None):
    """Builds a TensorflowModel around TensorflowMultiTaskRegressor.

    Args:
        tasks: Forwarded unchanged to TensorflowModel.
        task_types: Forwarded unchanged to TensorflowModel.
        params_dict: Hyperparameters, forwarded unchanged to TensorflowModel.
        model_dir: Forwarded unchanged to TensorflowModel.
        verbosity: Verbosity forwarded to TensorflowModel. When None, "low"
            is used, which is the value this builder used to hard-code.

    Returns:
        The constructed TensorflowModel.
    """
    # The argument used to be ignored in favour of a fixed "low"; honour the
    # caller's choice and keep "low" only as the fallback.
    if verbosity is None:
        verbosity = "low"
    return TensorflowModel(tasks, task_types, params_dict, model_dir,
                           tf_class=TensorflowMultiTaskRegressor,
                           verbosity=verbosity)

# Every value is a list of candidate settings. Only layer_sizes (4 options)
# and optimizer (2 options) have more than one candidate, which gives the
# 8 combinations reported in the "Fitting model 1/8" log.
params_dict = {
    "layer_sizes": [
        [1000, 1000, 100],
        [1000, 1000, 50],
        [1000, 1000, 250],
        [1000, 1000, 500],
    ],
    "dropouts": [[.1] * 3],
    "learning_rate": [0.0001],
    "momentum": [.9],
    "batch_size": [100],
    "num_regression_tasks": [len(tasks)],
    # The same value, sqrt(6)/sqrt(1000), for each of the three layers.
    "weight_init_stddevs": [[np.sqrt(6) / np.sqrt(1000)] * 3],
    "bias_init_consts": [[1.] * 3],
    "nb_epoch": [500],
    "penalty": [0.0],
    "optimizer": ["rmsprop", "momentum"],
    "data_shape": [train_dataset.get_data_shape()],
}

verbosity = "high"

# Mean absolute error is the selection metric; use_max=False is passed
# because a lower error is better.
metric = Metric(metrics.mean_absolute_error)
optimizer = HyperparamOpt(
    tf_model_builder, tasks, task_types, verbosity=verbosity)
best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
    params_dict,
    train_dataset,
    valid_dataset,
    output_transformers,
    metric,
    use_max=False,
    logdir=None)

Fitting model 1/8
hyperparameters: {u'optimizer': u'rmsprop', u'layer_sizes': [1000, 1000, 100], u'data_shape': (23,), u'learning_rate': 0.0001, u'batch_size': 100, u'penalty': 0.0, u'bias_init_consts': [1.0, 1.0, 1.0], u'weight_init_stddevs': [0.077459666924148338, 0.077459666924148338, 0.077459666924148338], u'num_regression_tasks': 1, u'dropouts': [0.1, 0.1, 0.1], u'nb_epoch': 500, u'momentum': 0.9}
Training for 500 epochs
Ending epoch 0: loss 1.36035
Ending epoch 1: loss 1.30184
Ending epoch 2: loss 1.12685
Ending epoch 3: loss 1.0381
Ending epoch 4: loss 1.17485
Ending epoch 5: loss 0.796555
Ending epoch 6: loss 1.02207
Ending epoch 7: loss 0.646329
Ending epoch 8: loss 0.814315
Ending epoch 9: loss 0.546832
Ending epoch 10: loss 0.725649
Ending epoch 11: loss 0.42637
Ending epoch 12: loss 0.48826
Ending epoch 13: loss 0.222245
Ending epoch 14: loss 0.244443
Ending epoch 15: loss 0.295166
Ending epoch 16: loss 0.15471
Ending epoch 17: loss 0.103894
Ending epoch 18: loss 0.116969
E

In [12]:
# Score the model returned by the search on the test split, using the same
# metric object as the search.
test_evaluator = Evaluator(
    best_model, test_dataset, transformers, verbosity=verbosity)
test_scores = test_evaluator.compute_model_performance([metric])

# print_function is imported in the first cell, so sep= is available.
print("Test scores", test_scores, sep="\n")

Test scores
{'mean_absolute_error': 23.479550284211136}
