In [1]:
%load_ext autoreload
%autoreload 2


import numpy as np
import rdkit
from rdkit import Chem
import pandas as pd
import h5py, ast, pickle

# Reserve a single GPU for the model that is loaded below.
# PCI_BUS_ID ordering makes CUDA number the devices by PCI bus ID, so the index
# used here lines up with the GPU IDs printed by `nvidia-smi`.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
# GPU ID to expose to this kernel; if that GPU is occupied, change it to a free
# GPU ID listed by `!nvidia-smi`.
%env CUDA_VISIBLE_DEVICES=2 

from ddc_pub import ddc_v3 as ddc

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=2


# Load model

In [2]:
# Import an existing (trained) model from disk.
# Any warnings about the training configuration or non-serializable keyword
# arguments that appear while loading can be ignored.
model_name = "./models/opd_fp_complete" # complete model
# Alternative checkpoint (the "retrain" model); uncomment to use it instead.
# model_name = "./models/opd_fp_tl" # retrain model
model = ddc.DDC(model_name=model_name)

Initializing model in test mode.
Loading model.
Loading finished in 1 seconds.
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder_Inputs (InputLayer)     [(None, 460, 67)]    0                                            
__________________________________________________________________________________________________
mol_to_latent_model (Model)     (None, 128)          733824      Encoder_Inputs[0][0]             
__________________________________________________________________________________________________
Decoder_Inputs (InputLayer)     [(None, 459, 67)]    0                                            
__________________________________________________________________________________________________
latent_to_states_model (Model)  [(None, 256), (None, 136192      mol_to_latent_model[1][0]        
_______________

# Load data from dataset

In [3]:
# Read the seed table, then keep only its 'smiles' column as a plain Python list.
seeds_frame = pd.read_csv('./datasets/OPD_Data/FP_C_TL_Seeds.csv')
data = seeds_frame['smiles'].tolist()

# Alternatively, use your own SMILES

In [5]:
# Input SMILES to auto-encode
smiles_in = data

# Parse every SMILES once so that failures can be reported before encoding.
parsed_mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_in]

# Chem.MolFromSmiles returns None for input it cannot parse. Fail here with the
# offending strings instead of letting ToBinary(None) raise an opaque error
# that does not say which SMILES was bad.
unparsable = [smiles for smiles, mol in zip(smiles_in, parsed_mols) if mol is None]
if unparsable:
    raise ValueError(
        "RDKit could not parse {} of {} input SMILES (showing up to 10): {}".format(
            len(unparsable), len(parsed_mols), unparsable[:10]
        )
    )

# MUST convert SMILES to binary mols for the model to accept them (it re-converts them to SMILES internally)
mols_in = [mol.ToBinary() for mol in parsed_mols]

In [6]:
# Encode the binary mols into their latent representations
latent = model.transform(model.vectorize(mols_in))

In [7]:
# Decode every latent vector back to SMILES. model.predict yields a pair per
# vector: the SMILES string and a second value kept in nll_out
# (presumably a negative log-likelihood -- confirm against ddc_pub).
decoded = [model.predict(lat, temp=0) for lat in latent]
smiles_out = [smiles for smiles, _ in decoded]
nll_out = [nll for _, nll in decoded]

In [8]:
# To compare the results, rewrite smiles_out in place with CANONICAL SMILES.
# Entries RDKit cannot turn into a molecule are replaced by the "INVALID" marker.
decoded_mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_out]
smiles_out[:] = [
    Chem.MolToSmiles(mol, canonical=True) if mol else "INVALID"
    for mol in decoded_mols
]