A model for generating SMILES strings from images of molecules.

In [None]:
## setup cell
# import required libraries
import csv
import math

import keras
import keras.layers as layers
import matplotlib.image as mpimg
import numpy as np
import pandas as pd
from sklearn import model_selection

# load the list of compounds and image conversion errors
compounds = pd.read_csv("compounds.csv")
with open("errors.csv", "r", newline = "") as file:
    errors = list(csv.reader(file))
# the failed SubstanceIDs are stored on the first row; an empty file means nothing failed
errors = [int(field) for field in errors[0] if field.strip()] if errors else []

# filter out the compounds that threw errors
compounds = compounds[~compounds["SubstanceID"].isin(errors)]

# generate a dictionary of unique characters in SMILES strings and its inverse
# ("".join avoids the repeated string concatenation performed by Series.sum())
characters = sorted(set("".join(compounds["SMILES"])))
character_dictionary = {character: code for code, character in enumerate(characters)}
character_dictionary_inverse = {code: character for character, code in character_dictionary.items()}

# augment the dictionary with start, stop, and "blank" codons
# the codes follow directly after the last character code so they can never collide
# with a real character, whatever the number of distinct characters in the dataset
# NOTE: any code that hard-codes 68/69/70 for these codons only agrees with the
# dictionary when the dataset contains exactly 68 distinct characters
start_code = len(characters)
stop_code = start_code + 1
blank_code = start_code + 2
character_dictionary.update({"start": start_code, "stop": stop_code, "blank": blank_code})
character_dictionary_inverse.update({start_code: "start", stop_code: "stop", blank_code: ""})

# find the longest SMILES string in the dataset; add space for start and end codons
max_length = int(compounds["SMILES"].map(len).max()) + 2

In [3]:
## test, train, validation, split
# hold out 20% of the compounds as the test set
train_and_validation, test = model_selection.train_test_split(
    compounds,
    test_size = 0.2,
    random_state = 1935235,
)

# split the remaining 80% 75/25 into train and validation sets,
# which gives a 60/20/20 train/validation/test split overall
train, validation = model_selection.train_test_split(
    train_and_validation,
    test_size = 0.25,
    random_state = 358235,
)

In [93]:
## a simple function to pad encoded strings with blanks
# declare a function pad
# string: a list generated by encoding a string with character_dictionary
# max_length: the number of items to pad the string out to
# blank: the integer code appended as padding (defaults to 70, the "blank" codon)
# returns: the same list object, padded in place with blank to length max_length

def pad(string, max_length, blank = 70):
    """Pad `string` in place with `blank` up to `max_length` items and return it.

    Lists that already hold max_length or more items are returned unchanged.
    """
    # a negative count yields an empty list, so over-long inputs are left alone
    string.extend([blank] * (max_length - len(string)))
    return string

In [210]:
## data generator to supply images and targets to the neural network
# declare a function molecule_image_generator
# SubstanceIDs: a list of SubstanceIDs from which to draw batches
# batch_size: the number of images included in each batch
# augment: a boolean flag indicating whether data augmentation should be applied (currently unused)
# aug: a function responsible for data augmentation (currently unused)
# returns: a data generator

def molecule_image_generator(SubstanceIDs, batch_size, augment = True, aug = None):
    """Yield ([images, SMILES prefixes], one-hot next characters) batches forever.

    Every molecule contributes max_length observations, one per character position:
    the image, the encoded SMILES prefix before that position (padded with the
    "blank" code), and the character at that position as the target. The last batch
    of each pass over SubstanceIDs is smaller when len(SubstanceIDs) is not a
    multiple of batch_size; an empty batch is never yielded.

    Raises ValueError if batch_size is not positive or SubstanceIDs is empty.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(SubstanceIDs) == 0:
        raise ValueError("SubstanceIDs must contain at least one SubstanceID")

    # read the codon codes and class count from the dictionary so the targets always
    # agree with the encoding
    start_code = character_dictionary["start"]
    stop_code = character_dictionary["stop"]
    blank_code = character_dictionary["blank"]
    n_classes = len(character_dictionary)

    # build the SubstanceID -> SMILES lookup once instead of querying the frame per image
    smiles_by_id = dict(zip(compounds["SubstanceID"], compounds["SMILES"]))

    # loop through batches indefinitely
    while True:
        for batch_start in range(0, len(SubstanceIDs), batch_size):
            # take batch_size observations if possible, otherwise as many as remain
            batch_ids = SubstanceIDs[batch_start:batch_start + batch_size]
            n_obs = len(batch_ids) * max_length

            # size the arrays to the molecules actually present so a short final batch
            # never carries uninitialised rows
            # image data, shape == (n_obs, xpixels, ypixels, channels)
            # NOTE(review): int8 truncates the values returned by mpimg.imread on
            # assignment - confirm this is the intended image encoding
            X1 = np.empty([n_obs, 300, 300, 1], dtype = "int8")

            # SMILES string input fragments, pre-filled with blanks, shape == (n_obs, max_length)
            X2 = np.full([n_obs, max_length], blank_code, dtype = "int8")

            # SMILES string target characters, shape == (n_obs, )
            Y = np.empty([n_obs], dtype = "int8")

            # generate X1, X2, Y for a batch of images
            for k, substance_id in enumerate(batch_ids):
                # encode the SMILES string for the current SubstanceID into a list of integers
                SMILES = str(smiles_by_id[substance_id]).strip()
                SMILES = [character_dictionary[character] for character in SMILES]

                # add start and stop codons, pad with blanks to max_length
                SMILES = [start_code] + SMILES + [stop_code]
                SMILES += [blank_code] * (max_length - len(SMILES))

                # read the image once and reuse it for every character position
                image = mpimg.imread("./molecule_drawings/" + str(substance_id) + ".png")
                first = k * max_length
                X1[first:first + max_length, :, :, 0] = image

                # split SMILES at each character into an initiating string and a target character
                for j in range(max_length):
                    X2[first + j, :j] = SMILES[:j]
                    Y[first + j] = SMILES[j]

            # one-hot encode Y with a fixed width so every batch has the same target shape
            Y = keras.utils.to_categorical(Y, num_classes = n_classes)

            # yield the generated data to the training function
            yield ([X1, X2], Y)

Our network will take two inputs:  
1) The images, which are run through a convolutional branch.  
2) The growing SMILES strings, which are run through an LSTM branch.

The outputs of the two branches are summed element-wise and passed to a feed-forward (FFNN) trunk whose softmax output predicts the next SMILES character.

In [195]:
## define the neural network architecture
# CNN layers for images
input_image = layers.Input(shape = (300, 300, 1))
conv_1 = layers.Conv2D(filters = 50, kernel_size = (3, 3), strides = (1, 1), activation = "relu")(input_image)
max_pool_1 = layers.MaxPool2D(pool_size = (3, 3))(conv_1)
conv_2 = layers.Conv2D(filters = 100, kernel_size = (5, 5), strides = (1, 1), activation = "relu")(max_pool_1)
max_pool_2 = layers.MaxPool2D(pool_size = (5, 5))(conv_2)
flatten = layers.Flatten()(max_pool_2)
dropout_CNN = layers.Dropout(rate = 0.2)(flatten)
# project the image features to the same width as the LSTM output so the two can be added
dense_CNN = layers.Dense(units = 256)(dropout_CNN)

# RNN layers for SMILES strings
input_SMILES = layers.Input(shape = (max_length, ))
# input_dim is the vocabulary size (largest token code + 1), not the sequence length;
# the encoded strings contain every code in character_dictionary, including the codons
embedding = layers.Embedding(input_dim = len(character_dictionary), output_dim = 200)(input_SMILES)
lstm = layers.LSTM(units = 256)(embedding)

# merge into FFNN layers
sum_inputs = layers.add([dense_CNN, lstm])
dense_output = layers.Dense(1000, activation = "relu")(sum_inputs)
dropout_output = layers.Dropout(rate = 0.2)(dense_output)
# one softmax unit per entry of character_dictionary
output = layers.Dense(units = len(character_dictionary), activation = "softmax")(dropout_output)

model = keras.Model(inputs = [input_image, input_SMILES], outputs = output)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_20 (InputLayer)           (None, 300, 300, 1)  0                                            
__________________________________________________________________________________________________
conv2d_25 (Conv2D)              (None, 298, 298, 50) 500         input_20[0][0]                   
__________________________________________________________________________________________________
max_pooling2d_24 (MaxPooling2D) (None, 99, 99, 50)   0           conv2d_25[0][0]                  
__________________________________________________________________________________________________
conv2d_26 (Conv2D)              (None, 95, 95, 100)  125100      max_pooling2d_24[0][0]           
__________________________________________________________________________________________________
max_poolin

In [213]:
## train the model
# set the number of images per batch
batch_size = 2

# compile the model
model.compile(loss = "categorical_crossentropy", optimizer = "adadelta")

# every generator batch already holds all max_length character positions for
# batch_size molecules, so roughly one pass over the training set takes
# ceil(len(train) / batch_size) generator steps
train_generator = molecule_image_generator(list(train.SubstanceID), batch_size)
steps_per_epoch = math.ceil(len(train) / batch_size)

# train the model (steps_per_epoch / epochs replace the Keras 1 arguments
# samples_per_epoch / nb_epoch, which counted samples rather than batches)
model.fit_generator(generator = train_generator, steps_per_epoch = steps_per_epoch, epochs = 1)

Epoch 1/1


ResourceExhaustedError: OOM when allocating tensor with shape[36100,256] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[node training/Adadelta/Variable_19/Assign (defined at C:\Anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:402) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Errors may have originated from an input operation.
Input Source operations connected to node training/Adadelta/Variable_19/Assign:
 training/Adadelta/zeros_19 (defined at C:\Anaconda\lib\site-packages\keras\backend\tensorflow_backend.py:702)

Original stack trace for 'training/Adadelta/Variable_19/Assign':
  File "C:\Anaconda\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Anaconda\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Anaconda\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Anaconda\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Anaconda\lib\site-packages\ipykernel\kernelapp.py", line 505, in start
    self.io_loop.start()
  File "C:\Anaconda\lib\site-packages\tornado\platform\asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "C:\Anaconda\lib\asyncio\base_events.py", line 539, in run_forever
    self._run_once()
  File "C:\Anaconda\lib\asyncio\base_events.py", line 1775, in _run_once
    handle._run()
  File "C:\Anaconda\lib\asyncio\events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "C:\Anaconda\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "C:\Anaconda\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
    ret = callback()
  File "C:\Anaconda\lib\site-packages\tornado\gen.py", line 781, in inner
    self.run()
  File "C:\Anaconda\lib\site-packages\tornado\gen.py", line 742, in run
    yielded = self.gen.send(value)
  File "C:\Anaconda\lib\site-packages\ipykernel\kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Anaconda\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Anaconda\lib\site-packages\ipykernel\kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Anaconda\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Anaconda\lib\site-packages\ipykernel\kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "C:\Anaconda\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Anaconda\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Anaconda\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 2848, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 2874, in _run_cell
    return runner(coro)
  File "C:\Anaconda\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3049, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3220, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "C:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-202-1eac64a94ae2>", line 9, in <module>
    model.fit_generator(generator = molecule_image_generator(list(train.SubstanceID), batch_size), samples_per_epoch = math.ceil(len(train) * max_length / batch_size), nb_epoch = 1)
  File "C:\Anaconda\lib\site-packages\keras\legacy\interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "C:\Anaconda\lib\site-packages\keras\engine\training.py", line 1418, in fit_generator
    initial_epoch=initial_epoch)
  File "C:\Anaconda\lib\site-packages\keras\engine\training_generator.py", line 40, in fit_generator
    model._make_train_function()
  File "C:\Anaconda\lib\site-packages\keras\engine\training.py", line 509, in _make_train_function
    loss=self.total_loss)
  File "C:\Anaconda\lib\site-packages\keras\legacy\interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "C:\Anaconda\lib\site-packages\keras\optimizers.py", line 398, in get_updates
    delta_accumulators = [K.zeros(shape) for shape in shapes]
  File "C:\Anaconda\lib\site-packages\keras\optimizers.py", line 398, in <listcomp>
    delta_accumulators = [K.zeros(shape) for shape in shapes]
  File "C:\Anaconda\lib\site-packages\keras\backend\tensorflow_backend.py", line 704, in zeros
    return variable(v, dtype=dtype, name=name)
  File "C:\Anaconda\lib\site-packages\keras\backend\tensorflow_backend.py", line 402, in variable
    v = tf.Variable(value, dtype=tf.as_dtype(dtype), name=name)
  File "C:\Anaconda\lib\site-packages\tensorflow\python\ops\variables.py", line 259, in __call__
    return cls._variable_v1_call(*args, **kwargs)
  File "C:\Anaconda\lib\site-packages\tensorflow\python\ops\variables.py", line 220, in _variable_v1_call
    shape=shape)
  File "C:\Anaconda\lib\site-packages\tensorflow\python\ops\variables.py", line 198, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "C:\Anaconda\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2511, in default_variable_creator
    shape=shape)
  File "C:\Anaconda\lib\site-packages\tensorflow\python\ops\variables.py", line 263, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "C:\Anaconda\lib\site-packages\tensorflow\python\ops\variables.py", line 1568, in __init__
    shape=shape)
  File "C:\Anaconda\lib\site-packages\tensorflow\python\ops\variables.py", line 1745, in _init_from_args
    validate_shape=validate_shape).op
  File "C:\Anaconda\lib\site-packages\tensorflow\python\ops\state_ops.py", line 227, in assign
    validate_shape=validate_shape)
  File "C:\Anaconda\lib\site-packages\tensorflow\python\ops\gen_state_ops.py", line 69, in assign
    use_locking=use_locking, name=name)
  File "C:\Anaconda\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "C:\Anaconda\lib\site-packages\tensorflow\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "C:\Anaconda\lib\site-packages\tensorflow\python\framework\ops.py", line 3616, in create_op
    op_def=op_def)
  File "C:\Anaconda\lib\site-packages\tensorflow\python\framework\ops.py", line 2005, in __init__
    self._traceback = tf_stack.extract_stack()
