### License

Copyright 2019 Ruud van Deursen, Firmenich SA.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

### Example notebook Smiles-GEN using Chembl

In this notebook a generative model is trained to generate SMILES for a subset of ChEMBL. It is an example notebook associated with the publication mentioned in the GitHub repository.

In [None]:
#!pip install livelossplot
from SmilesGEN_utils_fixed import DataUtils
import keras.backend as K
from SmilesGEN_bilstm_lstm_multi_encoding import *
import tensorflow as tf
from numpy import arange
from rdkit.Chem import MolFromSmiles,MolToSmiles
# Sentinel for the "Memory block" cell: the TensorFlow session is only
# configured while this is still None. Re-running this cell resets it.
config = None

In [None]:
def IsValidMolecule(smi):
    """Return True when ``smi`` parses into a molecule with at least one atom.

    Passed to the trainer as its ``sanitycheck`` callback.

    Args:
        smi: SMILES string to check.

    Returns:
        bool: False when the parser returns no molecule, when the molecule
        has no atoms, or when parsing raises; True otherwise.
    """
    try:
        mol = MolFromSmiles(smi)
    except Exception:
        # Best-effort check: any parser error means "not valid". Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit through.
        return False
    # A failed parse yields None instead of raising, so test it explicitly.
    return mol is not None and mol.GetNumAtoms() > 0
    
def augment(smi, naug=10):
    """Return a set of alternative SMILES spellings for one molecule.

    The first candidate is ``smi`` itself; the remaining ``naug - 1``
    candidates are written with ``MolToSmiles(doRandom=True,
    canonical=False)``. Duplicates collapse in the set, so fewer than
    ``naug`` strings may come back.

    Args:
        smi: SMILES string to augment.
        naug: Number of candidates to produce, including ``smi``.

    Returns:
        set: The distinct SMILES strings, or an empty set when ``smi`` cannot
        be parsed, when ``naug`` is below 1, or when RDKit raises.
    """
    try:
        mol = MolFromSmiles(smi)
        if mol is None:
            # Unparseable input: never hand it back, whatever naug is.
            return set()
        return {
            smi if idx == 0 else MolToSmiles(mol, doRandom=True, canonical=False)
            for idx in range(naug)
        }
    except Exception:
        # Best-effort augmentation: skip molecules RDKit cannot handle, but
        # let KeyboardInterrupt / SystemExit propagate.
        return set()

### Memory block

In [None]:
if config is None:
    # Runs only on the first execution: once config is set, re-running this
    # cell does not open another session.
    config = tf.ConfigProto()
    # Log the device each operation is placed on.
    config.log_device_placement = True
    # Let GPU memory allocation grow instead of being fixed up front.
    config.gpu_options.allow_growth = True
    # Hand the configured session to the Keras backend.
    sess = tf.Session(config=config)
    K.set_session(sess)

### Prepare the data

Select the dataset, configure the preparation, and compute X, y to fit the model.

In [None]:
# Point at the training set and create the data helper
trainingset = "<your_dataset.smi>"
Utils = DataUtils(maxlen=42, step=3)

# Prepare the corpus, passing augment() as the augmentation method
text = Utils.Prepare(trainingset, augment=augment, naug=10)
print("Corpus length: %s" % len(text))
print("Total chars %s" % len(Utils.chars))

# Encode the corpus into X, y. The maxlen returned by Encode is kept,
# because it may vary with the dataset used.
X, y, maxlen = Utils.Encode(text)

### Make and train the model

In [None]:
# Training settings read as globals by RunModel: passed to the trainer's Fit
# as num_epochs and batch_size respectively.
num_iterations,batch_size = 80,256
def TestModel(Utils, l1=64, l2=64, Bidirectional=None, nmodels=4, merge=0):
    """Create a ``BaseModel`` setup and initialise its model.

    Args:
        Utils: Data helper forwarded to ``BaseModel``.
        l1: First entry of the ``Layers`` list given to ``BaseModel``.
        l2: Second entry of the ``Layers`` list given to ``BaseModel``.
        Bidirectional: Two flags forwarded as ``Bidirectional``; defaults to
            ``[True, True]`` when omitted.
        nmodels: Forwarded to ``BaseModel`` as ``minimodels``.
        merge: Forwarded to ``BaseModel`` as ``merge``.

    Returns:
        The initialised ``BaseModel`` setup object. The value returned by
        ``Init`` itself is discarded.
    """
    # Build the default per call so no list is shared between invocations.
    if Bidirectional is None:
        Bidirectional = [True, True]
    model_setup = BaseModel(
        Utils,
        Layers=[l1, l2],
        Bidirectional=Bidirectional,
        minimodels=nmodels,
        merge=merge,
    )
    # Called for its effect on model_setup; the return value is not needed.
    model_setup.Init(verbose=True)
    return model_setup

def RunModel(model_setup, filepath, verbose=0, minimum=.95, pat=10):
    """Train the model held by ``model_setup`` on the global ``X``, ``y``.

    The number of epochs and the batch size come from the module-level
    ``num_iterations`` and ``batch_size``.

    Args:
        model_setup: Setup object providing ``InitTrainer``.
        filepath: Checkpoint path template handed to the trainer. The CSV log
            file name is the part of this path before ``"-cuda5"`` plus
            ``".csv"``.
        verbose: Verbosity forwarded to ``Fit``.
        minimum: Forwarded to ``InitTrainer`` as ``minimum``.
        pat: Forwarded to ``InitTrainer`` as ``patience``.

    Returns:
        tuple: ``(model, history)`` as returned by the trainer's ``Fit``.
    """
    # partition() instead of index(): a path without the "-cuda5" marker
    # falls back to "<filepath>.csv" rather than raising ValueError.
    prefix = filepath.partition("-cuda5")[0]
    logfile = "%s.csv" % (prefix,)
    trainer = model_setup.InitTrainer(
        sanitycheck=IsValidMolecule,
        verbose=True,
        minimum=minimum,
        patience=pat,
    )
    model, history = trainer.Fit(
        X,
        y,
        filepath,
        logfile,
        num_epochs=num_iterations,
        batch_size=batch_size,
        mycallbacks=[],
        verbose=verbose,
    )
    return (model, history)

In [None]:
# Layer sizes and per-layer bidirectional flags for this experiment;
# l1 and l2 are reused in the checkpoint file names of the run cells.
l1, l2 = 128, 64
b1, b2 = True, False
model_setup = TestModel(Utils, l1=l1, l2=l2, Bidirectional=[b1, b2])

In [None]:
# Training run 1. The {epoch}/{loss} fields are left in the path for the
# trainer; only the layer sizes and run number are filled in here.
# NOTE(review): every run cell reuses the same model_setup - confirm whether
# the trainer starts from fresh weights or continues the previous run.
runno = 1
filepath = (
    "models/Chembl_45k_bilstm_4x-lstm-%s-%s-merge-0-run-%s-cuda5-sg-{epoch:03d}-{loss:.4f}.hdf5"
    % (l1, l2, runno)
)
models, history = RunModel(model_setup, filepath, verbose=1)

In [None]:
# Training run 2. The {epoch}/{loss} fields are left in the path for the
# trainer; only the layer sizes and run number are filled in here.
# NOTE(review): every run cell reuses the same model_setup - confirm whether
# the trainer starts from fresh weights or continues the previous run.
runno = 2
filepath = (
    "models/Chembl_45k_bilstm_4x-lstm-%s-%s-merge-0-run-%s-cuda5-sg-{epoch:03d}-{loss:.4f}.hdf5"
    % (l1, l2, runno)
)
models, history = RunModel(model_setup, filepath, verbose=1)

In [None]:
# Training run 3. The {epoch}/{loss} fields are left in the path for the
# trainer; only the layer sizes and run number are filled in here.
# NOTE(review): every run cell reuses the same model_setup - confirm whether
# the trainer starts from fresh weights or continues the previous run.
runno = 3
filepath = (
    "models/Chembl_45k_bilstm_4x-lstm-%s-%s-merge-0-run-%s-cuda5-sg-{epoch:03d}-{loss:.4f}.hdf5"
    % (l1, l2, runno)
)
models, history = RunModel(model_setup, filepath, verbose=1)

### Do predictions

This block runs the generation and must be updated with the preferred models. The generation method has been set up to produce only a small sample of SMILES; increase the size for larger generations. Select your models, update your directories, set start, end and batch size, and launch the generation.

In [None]:
# Placeholder names of the trained model files to generate from; replace
# them with the checkpoints you selected.
fnames = ["model1", "model2", "model3"]

In [None]:
# Generate SMILES with every selected model and write them to disk in batches.
import os
from datetime import datetime

# SMILES requested per Predict call (one output file each) and the total
# number to generate per model.
batch, end = 180, 540

for entry in fnames:
    # An entry is either a plain model file name, which uses the global
    # model_setup, or a (file, setup) pair carrying its own setup object.
    if isinstance(entry, str):
        file, setup = entry, model_setup
    else:
        file, setup = entry

    model_file = "models/%s" % (file,)
    # Drop the checkpoint extension when present; the batch range and the
    # .smi extension are appended per output file below.
    stem = file[:-len(".hdf5")] if file.endswith(".hdf5") else file
    gen = setup.InitGenerator(model_file)

    start = 0
    while start < end:
        now = datetime.now()
        mols = gen.Predict(ncollect=batch)
        now2 = datetime.now()

        ios = "generated/%s_%09d_%09d.smi" % (stem, start, start + batch)
        # Create the output directory on demand so the first write cannot
        # fail on a fresh checkout.
        outdir = os.path.dirname(ios)
        if not os.path.isdir(outdir):
            os.makedirs(outdir)
        with open(ios, "w") as f:
            f.writelines("%s\n" % (smi,) for smi in mols)
        now3 = datetime.now()

        diff1 = (now2 - now).total_seconds()
        diff2 = (now3 - now2).total_seconds()
        # Report the path that was written, not the closed file object.
        print("Written %s Gen=%.3f Writing=%.3f" % (ios, diff1, diff2))
        # Advance exactly one batch per iteration.
        start += batch