In [1]:
# Logging/warning configuration comes first: it has to be in place before
# deepchem (and any TensorFlow backend it may load) is imported, otherwise
# import-time messages are emitted before these settings take effect.
import os
import warnings

warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # hide TensorFlow C++ INFO and WARNING messages

# NOTE(review): the star import presumably supplies the grid_search_* helpers
# used below — confirm, and prefer importing them by name.
from util import *
import pandas as pd
import numpy as np
import deepchem as dc
from matplotlib import pyplot as plt

## Graph Convolutional Models

In [2]:
# ESOL data set: read the CSV and featurize the "smiles" column with ConvMolFeaturizer
tasks = ['measured log solubility in mols per litre']
loader = dc.data.CSVLoader(
    tasks=tasks,
    feature_field="smiles",
    featurizer=dc.feat.ConvMolFeaturizer(),
)
dataset = loader.create_dataset('esol.csv')

# random train/test partition (80% train) with a fixed seed
splitter = dc.splits.RandomSplitter()
train_set, test_set = splitter.train_test_split(
    dataset,
    frac_train=0.8,
    seed=0,
)

In [None]:
# Candidate values per hyper-parameter; the grid-search helper receives this
# mapping together with the training split.
hyper_params = dict(
    batch_size=[32, 16],
    graph_conv_layers=[[64, 64], [128, 128], [256, 256]],
    dense_layer_size=[256, 128],
    dropout=[0.0],
)

# The helper returns the results table plus the selected parameter tuple,
# which is unpacked into the names used by the model cell.
(
    search_results,
    (batch_size, conv_layers, layer_sizes, dropout_rate),
) = grid_search_graph_conv(train_set, hyper_params)

In [5]:
# Bare last expression: the notebook renders the results table with its rich
# display instead of the plain-text form produced by print().
search_results

       rmse  batch_size conv_layers  layer_sizes  dropout_rate
7  0.980285          32  [128, 128]          128           0.0
6  1.003968          32  [128, 128]          256           0.0
5  1.033282          32    [64, 64]          128           0.0
4  1.036602          32    [64, 64]          256           0.0
3  1.055363          64  [128, 128]          128           0.0
2  1.080275          64  [128, 128]          256           0.0
0  1.093807          64    [64, 64]          256           0.0
1  1.130356          64    [64, 64]          128           0.0


In [None]:
# Fit the target normaliser on the training split only (dataset=train_set),
# so no statistics from the test split enter the preprocessing.
transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_set, move_mean=True)]

# Preprocess into separate, stage-suffixed variables. Overwriting train_set /
# test_set in place would make this cell non-idempotent: on a second run the
# transformer would be fitted on already-normalised targets, and the scores
# reported below would no longer be expressed in the original units.
train_set_norm, test_set_norm = train_set, test_set
for transformer in transformers:
    train_set_norm = transformer.transform(train_set_norm)
    test_set_norm = transformer.transform(test_set_norm)

# instantiate and fit model with the hyper-parameters selected by the grid search
model = dc.models.GraphConvModel(
    1,
    mode='regression',
    batch_size=batch_size,
    graph_conv_layers=conv_layers,
    dense_layer_size=layer_sizes,
    dropout=dropout_rate,
)
model.fit(train_set_norm, nb_epoch=100)

# evaluate model: RMSE, MAE and Pearson r^2, each averaged over tasks with np.mean
metric = [
    dc.metrics.Metric(dc.metrics.rms_score, np.mean),
    dc.metrics.Metric(dc.metrics.mae_score, np.mean),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)
]
# The same transformers are passed to evaluate() alongside the normalised data.
train_scores = model.evaluate(train_set_norm, metric, transformers)
test_scores = model.evaluate(test_set_norm, metric, transformers)

print("Train scores")
print(train_scores)

print("Test scores")
print(test_scores)

## Message Passing Neural Network

In [None]:
# ESOL data set again, this time featurized with WeaveFeaturizer for the MPNN section
tasks = ['measured log solubility in mols per litre']
loader = dc.data.CSVLoader(
    tasks=tasks,
    feature_field="smiles",
    featurizer=dc.feat.WeaveFeaturizer(),
)
dataset = loader.create_dataset('esol.csv')

# random train/test partition (80% train) with a fixed seed
splitter = dc.splits.RandomSplitter()
train_set, test_set = splitter.train_test_split(
    dataset,
    frac_train=0.8,
    seed=0,
)

In [None]:
# Candidate values per hyper-parameter for the MPNN; only batch_size has more
# than one candidate, the remaining entries are single-valued.
hyper_params = dict(
    batch_size=[32, 16],
    n_atom_feat=[75],
    n_pair_feat=[14],
    n_hidden=[100],
)

# The helper returns the results table plus the selected parameter tuple,
# which is unpacked into the names used by the model cell.
(
    search_results,
    (batch_size, n_atom_feat, n_pair_feat, n_hidden),
) = grid_search_mpnn(train_set, hyper_params)

0.7057699567631636
(1.00) rmse = 0.7058 => (64, 75, 14, 100)
Best params = (64, 75, 14, 100)


In [None]:
# Bare last expression: the notebook renders the results table with its rich
# display instead of the plain-text form produced by print().
search_results

In [None]:
# Fit the target normaliser on the training split only (dataset=train_set),
# so no statistics from the test split enter the preprocessing.
transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_set, move_mean=True)]

# Preprocess into separate, stage-suffixed variables. Overwriting train_set /
# test_set in place would make this cell non-idempotent: on a second run the
# transformer would be fitted on already-normalised targets, and the scores
# reported below would no longer be expressed in the original units.
train_set_norm, test_set_norm = train_set, test_set
for transformer in transformers:
    train_set_norm = transformer.transform(train_set_norm)
    test_set_norm = transformer.transform(test_set_norm)

# instantiate and fit model with the hyper-parameters selected by the grid search
model = dc.models.MPNNModel(
    1,
    mode='regression',
    batch_size=batch_size,
    use_queue=False,
    n_atom_feat=n_atom_feat,
    n_pair_feat=n_pair_feat,
    n_hidden=n_hidden,
    learning_rate=0.0001,
    T=3,
    M=5,
)
model.fit(train_set_norm, nb_epoch=50, checkpoint_interval=100)

# evaluate model: RMSE, MAE and Pearson r^2, each averaged over tasks with np.mean
metric = [
    dc.metrics.Metric(dc.metrics.rms_score, np.mean),
    dc.metrics.Metric(dc.metrics.mae_score, np.mean),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)
]
# The same transformers are passed to evaluate() alongside the normalised data.
train_scores = model.evaluate(train_set_norm, metric, transformers)
test_scores = model.evaluate(test_set_norm, metric, transformers)

print("Train scores")
print(train_scores)

print("Test scores")
print(test_scores)

## Random Forest Regressor

In [2]:
# ESOL data set featurized as circular fingerprints (size 2048, radius 4)
# for the random-forest section
tasks = ['measured log solubility in mols per litre']
loader = dc.data.CSVLoader(
    tasks=tasks,
    feature_field="smiles",
    featurizer=dc.feat.CircularFingerprint(size=2048, radius=4),
)
dataset = loader.create_dataset('esol.csv')

# random train/test partition (80% train) with a fixed seed
splitter = dc.splits.RandomSplitter()
train_set, test_set = splitter.train_test_split(
    dataset,
    frac_train=0.8,
    seed=0,
)

In [3]:
# Random-forest search space, passed to the grid-search helper with the training split.
# NOTE(review): newer scikit-learn releases renamed the 'mse'/'mae' criteria and
# dropped max_features='auto' — confirm these values against the installed version.
hyper_params = dict(
    n_estimators=[100, 250, 500],
    criterion=['mse', 'mae'],
    max_features=['auto', 'sqrt', 'log2'],
)

# The helper returns the results table plus the selected parameter tuple,
# which is unpacked into the names used by the model cell.
(
    search_results,
    (n_estimators, criterion, max_features),
) = grid_search_random_forest(train_set, hyper_params)

[####################      ]

In [4]:
# Bare last expression: the notebook renders the results table with its rich
# display instead of the plain-text form produced by print().
search_results

       rmse  n_estimators criterion max_features
0  1.286859            10       mse         auto
3  1.294778            10       mae         auto
1  1.335149            10       mse         sqrt
4  1.365178            10       mae         sqrt
2  1.407378            10       mse         log2
5  1.413807            10       mae         log2


In [5]:
from sklearn.ensemble import RandomForestRegressor

# instantiate the forest with the grid-search selection, wrap it for deepchem, and fit
sklearn_model = RandomForestRegressor(
    n_estimators=n_estimators,
    criterion=criterion,
    max_features=max_features,
    random_state=0,
)
model = dc.models.SklearnModel(sklearn_model)
model.fit(train_set)

# evaluate model: RMSE, MAE and Pearson r^2, each averaged over tasks with np.mean
metric = [
    dc.metrics.Metric(score_fn, np.mean)
    for score_fn in (
        dc.metrics.rms_score,
        dc.metrics.mae_score,
        dc.metrics.pearson_r2_score,
    )
]
# No transformers were applied to these splits, so an empty list is passed.
train_scores = model.evaluate(train_set, metric, [])
test_scores = model.evaluate(test_set, metric, [])

print("Train scores", train_scores, sep="\n")

print("Test scores", test_scores, sep="\n")

Train scores
{'mean-rms_score': 0.5164803548946703, 'mean-mae_score': 0.35819262379896527, 'mean-pearson_r2_score': 0.9473553599901838}
Test scores
{'mean-rms_score': 1.1804430520170475, 'mean-mae_score': 0.9221431489675517, 'mean-pearson_r2_score': 0.6808385798655135}
