In [4]:
import glob,os
import pandas as pd
import deepchem as dc
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw, PyMol, rdFMCS
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase
from deepchem import metrics
from IPython.display import Image, display
from rdkit.Chem.Draw import SimilarityMaps
import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")

In [5]:
# ------------------------------ Featurization & split ------------------------------
# Input CSV and the root directory under which trained models are stored.
DATASET_FILE = 'aug-dataset.csv'
MODEL_DIR = 'temp_model'

# Featurize the "Smiles" column with ConvMolFeaturizer; "Tg" is the only task.
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
    tasks=["Tg"],
    feature_field="Smiles",
    featurizer=featurizer,
)
dataset = loader.create_dataset(
    DATASET_FILE,
    shard_size=10000,
)
print("\nLoad data successfully！\n")

# Random train/test split (frac_train=0.8) with a fixed seed so the split is repeatable.
# NOTE(review): the file name suggests an augmented dataset; if several augmented
# entries derive from the same polymer, a random split may place them on both
# sides of the split and inflate test scores -- confirm against the data.
splitter = dc.splits.splitters.RandomSplitter()
trainset, testset = splitter.train_test_split(
    dataset,
    frac_train=0.8,
    seed=1,
)



Load data successfully！



In [7]:
###########################################Two Conv.##########################################
# Train and evaluate a two-layer graph-convolution regressor N_RUNS times on the
# fixed train/test split, then report mean (+/- std) of R2, RMSE and MAE.
#
# Hyperparameters used by this cell (named constants below, so this description
# cannot drift from the code):
#   graph_conv_layers: [64, 64]
#   dense_layer_size:  not passed -> GraphConvModel's library default is used
#   batch_size:        20
#   dropout:           0.1
#   nb_epoch:          1000

import time

N_RUNS = 5                      # number of independent training repetitions
GRAPH_CONV_LAYERS = [64, 64]
BATCH_SIZE = 20
DROPOUT = 0.1
NB_EPOCH = 1000
RUN_SEED_BASE = 0               # run k (0-based) is seeded with RUN_SEED_BASE + k

start = time.time()

metrics_rmse_train = []
metrics_mae_train = []
metrics_r2_train = []
metrics_rmse_test = []
metrics_mae_test = []
metrics_r2_test = []

for i in range(N_RUNS):
    print("Executing: %d/%d" % (i + 1, N_RUNS))
    print("#"*60)

    # Seed both NumPy and TensorFlow so each repetition can be re-run with the
    # same starting state, while the repetitions still differ from one another.
    # NOTE(review): some GPU kernels may remain nondeterministic; bit-exact
    # repeatability across devices is not guaranteed by seeding alone.
    np.random.seed(RUN_SEED_BASE + i)
    tf.random.set_seed(RUN_SEED_BASE + i)

    # One checkpoint directory per repetition. MODEL_DIR is deliberately rebound
    # (as in the original cell) so any later cell that reads it still sees the
    # directory of the last trained model.
    MODEL_DIR = 'temp_model' + '/2 layers/' + 'loop' + str(i + 1)
    os.makedirs(MODEL_DIR, exist_ok=True)   # no check-then-create race

    ########################################Model######################################
    model = dc.models.GraphConvModel(
        1,
        graph_conv_layers=GRAPH_CONV_LAYERS,
        mode="regression",
        batch_normalize=False,
        batch_size=BATCH_SIZE,
        model_dir=MODEL_DIR,
        dropout=DROPOUT,
    )

    ########################################Fit########################################
    model.fit(trainset, nb_epoch=NB_EPOCH)

    ########################################Predict####################################
    test_pred = model.predict(testset)
    train_pred = model.predict(trainset)

    ########################################Metrics####################################
    # RMSE is computed as sqrt(MSE) instead of `squared=False`: that keyword was
    # deprecated and later removed from scikit-learn's mean_squared_error, while
    # sqrt(MSE) works on every version and is identical for a single task.
    rmse = np.sqrt(metrics.mean_squared_error(y_true=trainset.y, y_pred=train_pred))
    r2 = metrics.r2_score(y_true=trainset.y, y_pred=train_pred)
    mae = metrics.mean_absolute_error(y_true=trainset.y, y_pred=train_pred)

    rmse_test = np.sqrt(metrics.mean_squared_error(y_true=testset.y, y_pred=test_pred))
    r2_test = metrics.r2_score(y_true=testset.y, y_pred=test_pred)
    mae_test = metrics.mean_absolute_error(y_true=testset.y, y_pred=test_pred)

    metrics_r2_train.append(r2)
    metrics_rmse_train.append(rmse)
    metrics_mae_train.append(mae)
    metrics_r2_test.append(r2_test)
    metrics_rmse_test.append(rmse_test)
    metrics_mae_test.append(mae_test)

end = time.time()

print("Time cost for GNN on polymer dataset: %.3f min" % ((end-start)/60))

# Mean and (population) standard deviation over the N_RUNS repetitions.
print("Train_R2: %.2f (+/- %.2f)" % (np.mean(metrics_r2_train), np.std(metrics_r2_train)))
print("Train_RMSE: %.2f (+/- %.2f)" % (np.mean(metrics_rmse_train), np.std(metrics_rmse_train)))
print("Train_MAE: %.2f (+/- %.2f)" % (np.mean(metrics_mae_train), np.std(metrics_mae_train)))

print("Test_R2: %.2f (+/- %.2f)" % (np.mean(metrics_r2_test), np.std(metrics_r2_test)))
print("Test_RMSE: %.2f (+/- %.2f)" % (np.mean(metrics_rmse_test), np.std(metrics_rmse_test)))
print("Test_MAE: %.2f (+/- %.2f)" % (np.mean(metrics_mae_test), np.std(metrics_mae_test)))

Executing: 1/5
############################################################
Executing: 2/5
############################################################
Executing: 3/5
############################################################
Executing: 4/5
############################################################
Executing: 5/5
############################################################
Time cost for GNN on polymer dataset: 36.084 min
Train_R2: 1.00 (+/- 0.00)
Train_RMSE: 9.06 (+/- 0.27)
Train_MAE: 5.07 (+/- 0.10)
Test_R2: 0.92 (+/- 0.00)  # Note: this run was performed on a different device as a reproducibility test; the ~3% difference in results across devices supports the reproducibility and validity of the model.
Test_RMSE: 36.65 (+/- 0.58)
Test_MAE: 23.65 (+/- 0.89)
