In [1]:
from rdkit import Chem
import numpy as np

In [2]:
SMILES_CHAR = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'l', 'n', 'o', 'p', 'r', 's', 't', 'u',
               'A', 'B', 'C', 'G', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 
               'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
               '#', '%', '(', ')', '[', ']', '-', '+', '=', '.', '/', '\\', '@']

char_to_ind = {c : i for i, c in enumerate(SMILES_CHAR)}

In [3]:
def smiles_to_array(smiles: str, max_len: int) -> np.array:
    mol_array = np.zeros((max_len, len(SMILES_CHAR)))
    for i, c in enumerate(smiles):
        mol_array[i][char_to_ind[c]] = 1
    return mol_array

def array_to_smiles(mol_array: np.array) -> str:
    smiles = ''
    for l in mol_array:
        j = np.argmax(l)
        if l[j] == 0:
            break
        smiles += SMILES_CHAR[j]
    return smiles

In [4]:
smiles = 'CCN(CC)C(=O)C1=CC=CC(=C1)C'
ar = smiles_to_array(smiles, 100)
array_to_smiles(ar)

'CCN(CC)C(=O)C1=CC=CC(=C1)C'

In [5]:
import os
import rapidjson
import cv2
from sklearn.model_selection import train_test_split

def create_filename(num: int) -> str:
    return str(num) + '.png'

FOLDER_NAME = 'images'
OUTPUT_FILENAME = 'num_of_images.json'

with open(OUTPUT_FILENAME, 'r') as json_file:
    data = rapidjson.load(json_file)

max_len = max([len(datum['smiles']) for datum in data])
pictures = []
smileses = []
for datum in data:
    img = cv2.imread(
        os.path.join(FOLDER_NAME, create_filename(datum['image_num'])),
        cv2.IMREAD_GRAYSCALE
    )
    img = img.astype('float32')
    img /= 255
    pictures.append(img)
    smileses.append(np.concatenate(smiles_to_array(datum['smiles'], max_len)))
pictures = np.expand_dims(np.array(pictures), axis=3)
smileses = np.array(smileses)
print(pictures.shape)
print(smileses.shape)
p_train, p_test, s_train, s_test = train_test_split(pictures, smileses, test_size=0.2)

(2815, 128, 128, 1)
(2815, 1798)


In [10]:
import numpy as np
np.random.seed(123)  # for reproducibility
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras.datasets import mnist
import tensorflow as tf
from matplotlib import pyplot as plt

model = Sequential()

model.add(Conv2D(filters=16,kernel_size=2,padding='same',activation='relu',input_shape=(128,128,1)))
model.add(MaxPooling2D(pool_size=2))
model.add(Conv2D(filters=32,kernel_size=2,padding='same',activation ='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Conv2D(filters=64,kernel_size=2,padding='same',activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Flatten())
model.add(Dense(1798,activation='relu'))

model.summary()

opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_9 (Conv2D)            (None, 128, 128, 16)      80        
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 64, 64, 16)        0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 64, 64, 32)        2080      
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 32, 32, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 32, 32, 64)        8256      
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 16, 16, 64)        0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 16384)            

In [11]:
model.fit(p_train, s_train, 
          batch_size=32, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1900e84c0>

In [12]:
model.evaluate(X_test, Y_test)

NameError: name 'X_test' is not defined