In [1]:
# Star import stays first so the explicit imports below win over any
# same-named symbols that util happens to export.
from util import *

import os
import warnings

import deepchem as dc
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor

# Reduce the verbosity of TensorFlow's native logging.
# NOTE(review): this runs after deepchem has already been imported above;
# TensorFlow is usually expected to see this variable before its first import,
# so confirm the setting actually takes effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Silence every Python warning for the remainder of the session.
warnings.simplefilter('ignore')

## Graph Convolutional Models

In [2]:
# Read esol.csv, featurizing the "smiles" column with ConvMolFeaturizer and
# using the measured log-solubility column as the single regression task.
tasks = ['measured log solubility in mols per litre']
loader = dc.data.CSVLoader(
    tasks=tasks,
    feature_field="smiles",
    featurizer=dc.feat.ConvMolFeaturizer(),
)
dataset = loader.create_dataset('esol.csv')

# Random train/test partition: 80% of the data for training, fixed seed.
splitter = dc.splits.RandomSplitter()
train_set, test_set = splitter.train_test_split(
    dataset,
    frac_train=0.8,
    seed=0,
)

In [None]:
# Candidate values explored by the graph-convolution grid search.
hyper_params = dict(
    batch_size=[32, 16],
    graph_conv_layers=[[64, 64], [128, 128], [256, 256]],
    dense_layer_size=[256, 128],
    dropout=[0.0],
)

# The helper hands back a results table and a 4-item parameter tuple
# (presumably the best-scoring combination — confirm in util).
search_results, best_params = grid_search_graph_conv(train_set, hyper_params)
batch_size, conv_layers, layer_sizes, dropout_rate = best_params

In [4]:
# Show the results table returned by the graph-convolution grid search.
print(search_results)

        rmse  batch_size conv_layers  layer_sizes  dropout_rate
10  0.936538          16  [256, 256]          256           0.0
11  0.939595          16  [256, 256]          128           0.0
9   0.944217          16  [128, 128]          128           0.0
4   0.954203          32  [256, 256]          256           0.0
5   0.972162          32  [256, 256]          128           0.0
7   0.982165          16    [64, 64]          128           0.0
2   0.984718          32  [128, 128]          256           0.0
8   0.995951          16  [128, 128]          256           0.0
6   1.005711          16    [64, 64]          256           0.0
3   1.017335          32  [128, 128]          128           0.0
0   1.034305          32    [64, 64]          256           0.0
1   1.060179          32    [64, 64]          128           0.0


In [5]:
# Target normalisation is fitted on the training split only, so no statistics
# from the test split are used when scaling y.
transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_set, move_mean=True)]

# preprocess data
# The normalised splits are bound to new names instead of overwriting
# train_set/test_set: this keeps the cell idempotent. Re-running it would
# otherwise fit a new transformer on already-normalised targets and the
# reported metrics would silently change units.
train_norm, test_norm = train_set, test_set
for transformer in transformers:
    train_norm = transformer.transform(train_norm)
    test_norm = transformer.transform(test_norm)

# instantiate and fit model using the hyper-parameters selected by the grid search
model = dc.models.GraphConvModel(
    1,
    mode='regression',
    batch_size=batch_size,
    graph_conv_layers=conv_layers,
    dense_layer_size=layer_sizes,
    dropout=dropout_rate,
)
model.fit(train_norm, nb_epoch=100)

# evaluate model
# The transformers are handed to evaluate() alongside the normalised splits
# (per DeepChem's API this lets it undo the y-normalisation before scoring —
# confirm against the installed version).
metric = [
    dc.metrics.Metric(dc.metrics.rms_score, np.mean),
    dc.metrics.Metric(dc.metrics.mae_score, np.mean),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)
]
train_scores = model.evaluate(train_norm, metric, transformers)
test_scores = model.evaluate(test_norm, metric, transformers)

print("Train scores")
print(train_scores)

print("Test scores")
print(test_scores)

Train scores
{'mean-rms_score': 0.35644945087154256, 'mean-mae_score': 0.28097528781080516, 'mean-pearson_r2_score': 0.9792561778491514}
Test scores
{'mean-rms_score': 0.7986796892479974, 'mean-mae_score': 0.6178939486484055, 'mean-pearson_r2_score': 0.8544007404847818}


## Message Passing Neural Network

In [6]:
# Read esol.csv again, this time featurizing the "smiles" column with
# WeaveFeaturizer for the message-passing model.
tasks = ['measured log solubility in mols per litre']
loader = dc.data.CSVLoader(
    tasks=tasks,
    feature_field="smiles",
    featurizer=dc.feat.WeaveFeaturizer(),
)
dataset = loader.create_dataset('esol.csv')

# Random train/test partition: 80% of the data for training, fixed seed.
splitter = dc.splits.RandomSplitter()
train_set, test_set = splitter.train_test_split(
    dataset,
    frac_train=0.8,
    seed=0,
)

In [7]:
# Candidate values explored by the MPNN grid search; only batch_size varies.
hyper_params = dict(
    batch_size=[32, 16],
    n_atom_feat=[75],
    n_pair_feat=[14],
    n_hidden=[100],
)

# The helper hands back a results table and a 4-item parameter tuple
# (presumably the best-scoring combination — confirm in util).
search_results, best_params = grid_search_mpnn(train_set, hyper_params)
batch_size, n_atom_feat, n_pair_feat, n_hidden = best_params

[####################                    ]

In [8]:
# Show the results table returned by the MPNN grid search.
print(search_results)

       rmse  batch_size  n_atom_feat  n_pair_feat  n_hidden
1  0.655984          16           75           14       100
0  0.685698          32           75           14       100


In [9]:
# Target normalisation is fitted on the training split only, so no statistics
# from the test split are used when scaling y.
transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_set, move_mean=True)]

# preprocess data
# The normalised splits are bound to new names instead of overwriting
# train_set/test_set: this keeps the cell idempotent. Re-running it would
# otherwise fit a new transformer on already-normalised targets and the
# reported metrics would silently change units.
train_norm, test_norm = train_set, test_set
for transformer in transformers:
    train_norm = transformer.transform(train_norm)
    test_norm = transformer.transform(test_norm)

# instantiate and fit model using the hyper-parameters selected by the grid search
model = dc.models.MPNNModel(
    1,
    mode='regression',
    batch_size=batch_size,
    use_queue=False,
    n_atom_feat=n_atom_feat,
    n_pair_feat=n_pair_feat,
    n_hidden=n_hidden,
    learning_rate=0.0001,
    T=3,
    M=5,
)
model.fit(train_norm, nb_epoch=50, checkpoint_interval=100)

# evaluate model
# The transformers are handed to evaluate() alongside the normalised splits
# (per DeepChem's API this lets it undo the y-normalisation before scoring —
# confirm against the installed version).
metric = [
    dc.metrics.Metric(dc.metrics.rms_score, np.mean),
    dc.metrics.Metric(dc.metrics.mae_score, np.mean),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)
]
train_scores = model.evaluate(train_norm, metric, transformers)
test_scores = model.evaluate(test_norm, metric, transformers)

print("Train scores")
print(train_scores)

print("Test scores")
print(test_scores)

Train scores
{'mean-rms_score': 0.33132370266898736, 'mean-mae_score': 0.2588325988489977, 'mean-pearson_r2_score': 0.9758838930741033}
Test scores
{'mean-rms_score': 0.5928773782943892, 'mean-mae_score': 0.4536293255027136, 'mean-pearson_r2_score': 0.9237314639815634}


## Random Forest Regressor

In [10]:
# Read esol.csv once more, featurizing the "smiles" column as circular
# fingerprints (size 2048, radius 4) for the random-forest model.
tasks = ['measured log solubility in mols per litre']
loader = dc.data.CSVLoader(
    tasks=tasks,
    feature_field="smiles",
    featurizer=dc.feat.CircularFingerprint(size=2048, radius=4),
)
dataset = loader.create_dataset('esol.csv')

# Random train/test partition: 80% of the data for training, fixed seed.
splitter = dc.splits.RandomSplitter()
train_set, test_set = splitter.train_test_split(
    dataset,
    frac_train=0.8,
    seed=0,
)

In [11]:
# Candidate values explored by the random-forest grid search.
# NOTE(review): newer scikit-learn releases renamed 'mse'/'mae' to
# 'squared_error'/'absolute_error' and dropped max_features='auto' —
# confirm these values against the installed scikit-learn version.
hyper_params = dict(
    n_estimators=[100, 250, 500],
    criterion=['mse', 'mae'],
    max_features=['auto', 'sqrt', 'log2'],
)

# The helper hands back a results table and a 3-item parameter tuple
# (presumably the best-scoring combination — confirm in util).
search_results, best_params = grid_search_random_forest(train_set, hyper_params)
n_estimators, criterion, max_features = best_params

[####################  ]

In [12]:
# Show the results table returned by the random-forest grid search.
print(search_results)

        rmse  n_estimators criterion max_features
12  1.225431           500       mse         auto
6   1.226358           250       mse         auto
0   1.226689           100       mse         auto
9   1.233063           250       mae         auto
15  1.235029           500       mae         auto
3   1.239940           100       mae         auto
13  1.240248           500       mse         sqrt
7   1.244766           250       mse         sqrt
1   1.253478           100       mse         sqrt
16  1.262508           500       mae         sqrt
10  1.265118           250       mae         sqrt
4   1.274192           100       mae         sqrt
14  1.279299           500       mse         log2
8   1.282334           250       mse         log2
2   1.297865           100       mse         log2
11  1.309369           250       mae         log2
17  1.310112           500       mae         log2
5   1.312742           100       mae         log2


In [13]:
# RandomForestRegressor is imported in the notebook's first (imports) cell so
# that all dependencies are declared in one place.

# instantiate and fit model using the hyper-parameters selected by the grid search;
# random_state is pinned so the forest is reproducible across runs
sklearn_model = RandomForestRegressor(
    n_estimators=n_estimators,
    criterion=criterion,
    max_features=max_features,
    random_state=0,
)
model = dc.models.SklearnModel(sklearn_model)
model.fit(train_set)

# evaluate model
# No transformer was applied to the splits in this section, hence the empty
# transformer list passed to evaluate().
metric = [
    dc.metrics.Metric(dc.metrics.rms_score, np.mean),
    dc.metrics.Metric(dc.metrics.mae_score, np.mean),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)
]
train_scores = model.evaluate(train_set, metric, [])
test_scores = model.evaluate(test_set, metric, [])

print("Train scores")
print(train_scores)

print("Test scores")
print(test_scores)

Train scores
{'mean-rms_score': 0.4557074378032759, 'mean-mae_score': 0.32590803615678415, 'mean-pearson_r2_score': 0.965785107964993}
Test scores
{'mean-rms_score': 1.1422869134998823, 'mean-mae_score': 0.8716654813527174, 'mean-pearson_r2_score': 0.7005435810374528}
