In [7]:
# Reload imported modules automatically before each cell executes, so edits to
# the imported packages (e.g. ddc_pub) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Reserve a GPU for the model. PCI_BUS_ID ordering makes CUDA number the
# devices by PCI bus ID, i.e. in the same order `nvidia-smi` lists them.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
# Index of the GPU to use; if that one is occupied, change it to a free index
# shown by `!nvidia-smi`. These variables are set ahead of the imports below.
%env CUDA_VISIBLE_DEVICES=2

import numpy as np
import pandas as pd
import rdkit
from rdkit import Chem
import h5py, ast, pickle

from ddc_pub.vectorizers import SmilesVectorizer
from ddc_pub import ddc_v3 as ddc

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=2


In [8]:
# Read the training/validation table (SMILES plus descriptor columns) from CSV.
df = pd.read_csv(
    filepath_or_buffer='./datasets/OPD_Data/Desc_Training_Validation.csv',
)

In [9]:
# Parse every SMILES string of the table into an RDKit molecule object.
mol_data = df['smiles'].tolist()
parsed_mols = [Chem.MolFromSmiles(smiles) for smiles in mol_data]

# Chem.MolFromSmiles returns None (it does not raise) when a SMILES string
# cannot be parsed. Fail here with the offending row positions instead of
# letting a None reach the vectorizer or the model. Rows are deliberately not
# dropped: the descriptor matrix is built from the same table and must stay
# aligned one-to-one with the molecules.
unparsed_rows = [row for row, mol in enumerate(parsed_mols) if mol is None]
if unparsed_rows:
    raise ValueError(
        f"RDKit could not parse {len(unparsed_rows)} SMILES string(s); "
        f"first affected row positions: {unparsed_rows[:10]}. "
        "Remove or fix these rows in the dataset before training."
    )

binmols = np.array(parsed_mols)

# Fit the vectorizer on the molecules to obtain the character set and the
# sequence length it derived from this dataset.
sv = SmilesVectorizer()
sv.fit(binmols)

# NOTE(review): the +35 adds headroom on top of the length reported by the
# vectorizer; the reason for this particular value is not visible here -
# confirm before changing it.
maxlen = sv.maxlength + 35
charset = sv.charset

In [10]:
# Conditioning descriptors for the model, one row per molecule, taken from
# the 'homo', 'gap' and 'lumo' columns (in that order) as a NumPy array.
descr = df.loc[:, ['homo', 'gap', 'lumo']].values

In [11]:
# Label under which this dataset is reported by the model.
name = "OPD_Descr"

# Dataset metadata handed to the model: the SMILES character set, the maximum
# sequence length and the dataset label.
dataset_info = dict(
    charset=charset,
    maxlen=maxlen,
    name=name,
)

In [12]:
# Construct the DDC model: descriptors are the input, molecules the output.
model = ddc.DDC(
    x=descr,                    # input: descriptor matrix
    y=binmols,                  # output: molecules to reconstruct
    dataset_info=dataset_info,  # charset / maxlen / name of the dataset
    scaling=True,               # scale the descriptors
    noise_std=0.1,              # std of the noise layer
    lstm_dim=512,               # breadth of the LSTM layers
    dec_layers=3,               # number of decoding layers
    batch_size=128,             # batch size used for training
)

Initializing model in train mode.
Input type is 'molecular descriptors'.
Applying scaling on input.
Model received 12251 train samples and 1362 validation samples.
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Latent_Input (InputLayer)       [(None, 3)]          0                                            
__________________________________________________________________________________________________
Decoder_Inputs (InputLayer)     [(None, 244, 47)]    0                                            
__________________________________________________________________________________________________
latent_to_states_model (Model)  [(None, 512), (None, 24576       Latent_Input[0][0]               
__________________________________________________________________________________________________
batch_model (Model)        

In [14]:
# The learning-rate schedule has to start from the same value that is given to
# Adam, so the value is defined once and passed to both arguments.
initial_lr = 1e-3

# Train the model and write checkpoints along the way.
model.fit(
    epochs=100,                  # number of epochs
    lr=initial_lr,               # initial learning rate for Adam, recommended
    model_name="opd_descr",      # base name of the checkpoint files
    checkpoint_dir="./models/",  # directory the checkpoints are written to
    mini_epochs=10,              # sub-epochs within an epoch that trigger lr decay
    save_period=50,              # checkpoint frequency (in mini_epochs)
    lr_decay=True,               # use exponential lr decay
    sch_epoch_to_start=500,      # mini-epoch at which lr decay starts (bypassed if lr_decay=False)
    sch_lr_init=initial_lr,      # initial lr of the schedule, equal to lr (bypassed if lr_decay=False)
    sch_lr_final=1e-6,           # final lr before training finishes (bypassed if lr_decay=False)
    patience=25,                 # patience for Keras' ReduceLROnPlateau (bypassed if lr_decay=True)
)


Model trained with dataset OPD_Descr that has maxlen=240 and charset=]Z=g@N78Ma/+(PlC9I2s3)S6%rc0Hp5oFeO#1-\[Bin4^$? for 2 epochs.
noise_std: 0.100000, lstm_dim: 512, dec_layers: 3, td_dense_dim: 0, batch_size: 128, codelayer_dim: 3, lr: 0.001000.

Epoch 00001: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 1/2
Model saved in ./models/opd_descr--01--0.4807--0.0010000.
95/95 - 31s - loss: 0.5956 - val_loss: 0.4807

Epoch 00002: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 2/2
Model saved in ./models/opd_descr--02--0.3681--0.0010000.
95/95 - 32s - loss: 0.3277 - val_loss: 0.3681
