In [11]:
# Enable the autoreload extension; mode 2 re-imports edited modules before each
# cell runs, so changes to local packages are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Reserve a GPU for the model. These variables are set here, ahead of the
# library imports that follow in this cell.
# Number CUDA devices by PCI bus ID so the ID below refers to the same device
# that `nvidia-smi` lists.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
# GPU ID exposed to this kernel; if that GPU is occupied, change it to a free
# GPU ID listed by `!nvidia-smi`.
%env CUDA_VISIBLE_DEVICES=1

import ast
import logging
import pickle
import sys

import h5py
import numpy as np
import pandas as pd
import rdkit
from rdkit import Chem, DataStructs

from ddc_pub import ddc_v3 as ddc
from ddc_pub.vectorizers import SmilesVectorizer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [10]:
# Location of the training/validation CSV, relative to the notebook's working directory.
DATASET_CSV = './datasets/OPD_Data/FP_C_Training_Validation.csv'

# Load the SMILES strings as a plain Python list.
# - usecols: only the 'smiles' column is used in this cell, so the other columns are not parsed.
# - dropna: pandas reads empty cells as float NaN, which is not a SMILES string
#   and cannot be parsed into a molecule downstream.
data = pd.read_csv(DATASET_CSV, usecols=['smiles'])['smiles'].dropna().tolist()

In [17]:
# Parse every SMILES string into an RDKit molecule. MolFromSmiles returns None
# (instead of raising) for a string it cannot parse, so such entries are
# filtered out here; otherwise the ToBinary call below fails on the first None.
parsed_mols = [Chem.MolFromSmiles(smiles) for smiles in data]

# Despite its name, `binmols` holds RDKit Mol objects (not binary strings).
binmols = [mol for mol in parsed_mols if mol is not None]

n_invalid = len(parsed_mols) - len(binmols)
if n_invalid:
    # NOTE: once entries are dropped, `binmols` / `mols_in` are no longer
    # index-aligned with `data`.
    logging.warning(
        "Dropped %d of %d SMILES that RDKit could not parse.",
        n_invalid,
        len(parsed_mols),
    )

# Serialize each molecule to RDKit's binary representation.
mols_in = [mol.ToBinary() for mol in binmols]

In [20]:
# Fit a SMILES vectorizer on the parsed molecules, then read back the maximum
# length and character set it exposes; both are needed to build `dataset_info`.
sv = SmilesVectorizer()
sv.fit(binmols)
maxlen, charset = sv.maxlength, sv.charset

In [23]:
# Name of the dataset
name = "OPD_FP_C"

# Extra length added on top of the maximum length reported by the vectorizer.
# NOTE(review): presumably headroom so that SMILES longer than those in the
# training set still fit -- confirm why 35 was chosen.
MAXLEN_MARGIN = 35

# Dataset description passed to the DDC constructor: character set, maximum
# sequence length (with margin) and dataset name.
dataset_info = {"charset": charset, "maxlen": maxlen + MAXLEN_MARGIN, "name": name}

In [24]:
# Model hyperparameters, gathered in one mapping so they can be inspected or
# logged separately from the constructor call.
ddc_hyperparams = {
    "noise_std": 0.1,
    "lstm_dim": 256,
    "dec_layers": 3,
    "td_dense_dim": 0,
    "batch_size": 128,
    "codelayer_dim": 128,
}

# The same binary mols are passed as both input (x) and target (y).
model = ddc.DDC(x=mols_in, y=mols_in, dataset_info=dataset_info, **ddc_hyperparams)

Initializing model in train mode.
Input type is 'binary mols'.
Model received 141898 train samples and 15767 validation samples.
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder_Inputs (InputLayer)     [(None, 460, 67)]    0                                            
__________________________________________________________________________________________________
mol_to_latent_model (Model)     (None, 128)          733824      Encoder_Inputs[0][0]             
__________________________________________________________________________________________________
Decoder_Inputs (InputLayer)     [(None, 459, 67)]    0                                            
__________________________________________________________________________________________________
latent_to_states_model (Model)  [(None, 256), (None, 204288   

In [None]:
# Initial learning rate, defined once: `lr` and `sch_lr_init` are meant to be
# equal, so both read from this single value and cannot drift apart.
INITIAL_LR = 1e-3

# Train the model. With 200 epochs of 5 mini-epochs each, Keras reports 1000 epochs.
# NOTE(review): `model_name` carries a "./models/" prefix, so checkpoints
# presumably land in that folder; make sure it exists before a long run -- it is
# not visible here whether the checkpoint callback creates it.
model.fit(epochs              = 200,              # number of epochs
          lr                  = INITIAL_LR,       # initial learning rate for Adam
          model_name          = "./models/FP_C_", # base name to append the checkpoints with
          checkpoint_dir      = "",               # no extra directory prefix (location presumably comes from model_name -- confirm)
          mini_epochs         = 5,                # number of sub-epochs within an epoch to trigger lr decay
          save_period         = 50,               # checkpoint frequency (in mini_epochs)
          lr_decay            = True,             # whether to use exponential lr decay or not
          sch_epoch_to_start  = 100,              # mini-epoch to start lr decay (bypassed if lr_decay=False)
          sch_lr_init         = INITIAL_LR,       # initial lr of the schedule, kept equal to lr (bypassed if lr_decay=False)
          sch_lr_final        = 1e-6,             # final lr before finishing training (bypassed if lr_decay=False)
          patience            = 25)               # patience for Keras' ReduceLROnPlateau (bypassed if lr_decay=True)


Model trained with dataset OPD_fingerprint that has maxlen=455 and charset=a0bMAuGpgFL=(-2l+i)5mYtVnD\%TPI8y1S3N7OZ/#CsRUfd@Be9Eo[WKcrH4]6h^$? for 200 epochs.
noise_std: 0.100000, lstm_dim: 256, dec_layers: 3, td_dense_dim: 0, batch_size: 128, codelayer_dim: 128, lr: 0.001000.

Epoch 00001: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 1/1000
222/221 - 122s - loss: 0.3226 - val_loss: 0.1940

Epoch 00002: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 2/1000
222/221 - 108s - loss: 0.1639 - val_loss: 0.1628

Epoch 00003: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 3/1000
222/221 - 115s - loss: 0.1383 - val_loss: 0.1688

Epoch 00004: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 4/1000
222/221 - 115s - loss: 0.1248 - val_loss: 0.3504

Epoch 00005: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 5/1000
