references : 
https://github.com/YunjaeChoi/vaemols

In [4]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [5]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split


In [6]:
from tensorflow.keras import layers


In [7]:
data = pd.read_csv('/content/drive/MyDrive/zinc-15.csv')

In [6]:
data

Unnamed: 0,zinc_id,smiles
0,ZINC000000008151,C[C@H]1[C@@H](O)[C@H](CO)O[C@@H](O)[C@@H]1N
1,ZINC000000008153,CC[C@@H]1[C@@H](N)[C@@H](O)O[C@@H](CO)[C@@H]1O
2,ZINC000000008155,CC1(C)[C@@H](N)[C@@H](O)O[C@@H](CO)[C@@H]1O
3,ZINC000000018276,CS[C@@H]1CN[C@@H](CO)[C@H](O)[C@H]1O
4,ZINC000000018279,CS[C@@H]1[C@@H](O)CN[C@@H](CO)[C@@H]1O
...,...,...
427851,ZINC000242463989,O[Cl+3](O)(O)O
427852,ZINC000247713634,O1[SiH2][SiH2]O[SiH2][SiH2]1
427853,ZINC000252581626,O[Si](O)(O)F
427854,ZINC000685945533,Cn1nnnc1S(=O)(=O)F


In [8]:
smiles_data = data['smiles']
smiles_data = np.array(smiles_data).reshape(-1)
smiles_data.shape

(427856,)

In [9]:
smiles_data

array(['C[C@H]1[C@@H](O)[C@H](CO)O[C@@H](O)[C@@H]1N',
       'CC[C@@H]1[C@@H](N)[C@@H](O)O[C@@H](CO)[C@@H]1O',
       'CC1(C)[C@@H](N)[C@@H](O)O[C@@H](CO)[C@@H]1O', ..., 'O[Si](O)(O)F',
       'Cn1nnnc1S(=O)(=O)F', 'Cn1nnc(S(=O)(=O)F)n1'], dtype=object)

In [10]:
print('Number of mols: '+str(len(smiles_data)))
idx = [i for i, x in enumerate(smiles_data) if len(x)<=20]
print('Number of valid mols: '+str(len(idx)))
smiles_data = smiles_data[idx]

Number of mols: 427856
Number of valid mols: 56767


In [11]:
char_set = set()
for i in tqdm(range(len(smiles_data))):
    smiles_data[i] = smiles_data[i].ljust(20)
    char_set = char_set.union(set(smiles_data[i]))
char_set_list = sorted(list(char_set))

100%|██████████| 56767/56767 [00:00<00:00, 426077.47it/s]


In [12]:
char_to_int = dict((c, i) for i, c in enumerate(char_set))
int_to_char = dict((i, c) for i, c in enumerate(char_set))

In [13]:
labeled_data = np.zeros((len(smiles_data), 120, 1), dtype=np.int32)
for i in tqdm(range(len(smiles_data))):
    for t, char in enumerate(smiles_data[i]):
        labeled_data[i, t, 0] = char_to_int[char]

100%|██████████| 56767/56767 [00:00<00:00, 155016.90it/s]


In [15]:
len(smiles_data)

56767

In [14]:
x_train, x_val_test = train_test_split(labeled_data, test_size=0.2)

In [15]:
x_val = x_val_test[:len(x_val_test)//2]
x_test = x_val_test[len(x_val_test)//2:]

In [16]:
num_classes = len(char_to_int)

In [17]:
class VAEDataGenerator(tf.keras.utils.Sequence):

    def __init__(self, x, num_classes, batch_size=32):
        self.x = x
        self.num_classes = num_classes
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x_one_hot = tf.keras.utils.to_categorical(batch_x, num_classes=self.num_classes)
        return (batch_x_one_hot, batch_x)

In [18]:
x_train_gen = VAEDataGenerator(x_train, num_classes, batch_size=32)
x_val_gen = VAEDataGenerator(x_val, num_classes, batch_size=32)

In [19]:
class EncoderConv1D(tf.keras.Model):

    def __init__(self, latent_dim, num_samples=16, name='encoder_conv1d'):
        super(EncoderConv1D, self).__init__(name=name)
        self.latent_dim = latent_dim
        self.num_samples = num_samples

        self.conv1 = tf.keras.layers.Conv1D(filters=32, kernel_size=7, strides=3)
        self.norm1 = tf.keras.layers.BatchNormalization()
        self.act1 = tf.keras.layers.Activation('relu')

        self.conv2 = tf.keras.layers.Conv1D(filters=64, kernel_size=7, strides=3)
        self.norm2 = tf.keras.layers.BatchNormalization()
        self.act2 = tf.keras.layers.Activation('relu')

        self.conv3 = tf.keras.layers.Conv1D(filters=128, kernel_size=7, strides=3)
        self.norm3 = tf.keras.layers.BatchNormalization()
        self.act3 = tf.keras.layers.Activation('relu')

        self.dense1 = tf.keras.layers.Dense(512)
        self.dense_norm1 = tf.keras.layers.BatchNormalization()
        self.dense_act1 = tf.keras.layers.Activation('relu')

        self.z_mean_dense = tf.keras.layers.Dense(self.latent_dim)
        self.z_log_var_dense = tf.keras.layers.Dense(self.latent_dim)

    def call(self, inputs):
        with tf.compat.v1.variable_scope(self.name, reuse=tf.compat.v1.AUTO_REUSE):
            x = self.conv1(inputs)
            x = self.norm1(x)
            x = self.act1(x)

            x = self.conv2(x)
            x = self.norm2(x)
            x = self.act2(x)

            x = self.conv3(x)
            x = self.norm3(x)
            x = self.act3(x)
            x = tf.keras.layers.Flatten()(x)

            z_mean = self.z_mean_dense(x)
            z_log_var = self.z_log_var_dense(x)

            self.dist = tf.compat.v1.distributions.Normal(loc=z_mean, scale=tf.exp(0.5*z_log_var))
            sampled = self.dist.sample(self.num_samples)
            z = tf.transpose(sampled, [1, 0, 2])
            return z, z_mean, z_log_var

class Decoderlstm(tf.keras.Model):

    def __init__(self, charset_length, max_length, name='decoder_lstm'):
        super(Decoderlstm, self).__init__(name=name)
        self.charset_length = charset_length
        self.max_length = max_length

        self.dense1 = tf.keras.layers.Dense(512)
        self.norm1 = tf.keras.layers.BatchNormalization()
        self.act1 = tf.keras.layers.Activation('relu')
        self.repeat = tf.keras.layers.RepeatVector(self.max_length)

        self.lstm1 = tf.compat.v1.keras.layers.LSTM(512, return_sequences=True)
        self.norm2 = tf.keras.layers.BatchNormalization()
        self.lstm2 = tf.compat.v1.keras.layers.LSTM(512, return_sequences=True)
        self.norm3 = tf.keras.layers.BatchNormalization()
        self.lstm3 = tf.compat.v1.keras.layers.LSTM(512, return_sequences=True)
        self.out_dense = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(self.charset_length))
        self.out_act = tf.keras.layers.Activation('softmax')


    def call(self, inputs):
        with tf.compat.v1.variable_scope(self.name, reuse=tf.compat.v1.AUTO_REUSE):
            x = self.dense1(inputs)
            x = self.norm1(x)
            x = self.act1(x)

            x = self.repeat(x)
            x = self.lstm1(x)
            x = self.norm2(x)
            x = self.lstm2(x)
            x = self.norm3(x)
            x = self.lstm3(x)
            outputs_logits = self.out_dense(x)
            outputs = self.out_act(outputs_logits)
            return outputs, outputs_logits

class VariationalAutoencoder(tf.keras.Model):

    def __init__(self, latent_dim, charset_length, max_length, num_samples=16, name='vae'):
        super(VariationalAutoencoder, self).__init__(name=name)
        self.latent_dim = latent_dim
        self.charset_length = charset_length
        self.max_length = max_length
        self.num_samples = num_samples

        self.encoder = EncoderConv1D(self.latent_dim, num_samples=self.num_samples)
        self.decoder = Decoderlstm(charset_length, max_length)

    def call(self, inputs):
        with tf.compat.v1.variable_scope(self.name, reuse=tf.compat.v1.AUTO_REUSE):
            z, self.z_mean, self.z_log_var = self.encoder(inputs)
            z_reshaped = tf.reshape(z, (-1, self.encoder.latent_dim))
            outputs, self.outputs_logits = self.decoder(z_reshaped)
            return outputs

    def vae_loss_func(self, y_true, y_pred):
        latent_loss = -0.5*tf.reduce_sum(1.0 + self.z_log_var - tf.square(self.z_mean) - tf.exp(self.z_log_var), 1)
        y_true_r = tf.reshape(y_true, [-1, 1, self.max_length])
        y_true_c = tf.cast(y_true_r, tf.int64)
        tiled = tf.tile(y_true_c, (1, self.num_samples, 1))
        y_true_rep = tf.reshape(tiled, (-1, self.max_length))
        recon_loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(y_true_rep, self.outputs_logits, reduction=tf.compat.v1.losses.Reduction.SUM)
        recon_loss = recon_loss/tf.cast(self.num_samples, tf.float32)
        vae_loss = latent_loss + recon_loss
        return vae_loss

    def sampled_data_acc(self, y_true, y_pred):
        y_true_r = tf.reshape(y_true, [-1, 1, self.max_length])
        y_true_c = tf.cast(y_true_r, tf.int64)
        tiled = tf.tile(y_true_c, (1, self.num_samples, 1))
        y_true_rep = tf.reshape(tiled, (-1, self.max_length))
        y_pred_class = tf.argmax(y_pred, axis=-1)
        acc = tf.reduce_mean(tf.cast(tf.equal(y_true_rep, y_pred_class), tf.float32))
        return acc

In [20]:
#model
inputs = tf.keras.layers.Input(shape=(120, num_classes))
vae = VariationalAutoencoder(256, num_classes, 120, num_samples=4)
outputs = vae(inputs)
optimizer = tf.keras.optimizers.Adam(lr=0.0003, clipvalue=0.1)
vae.compile(optimizer=optimizer, loss=vae.vae_loss_func, metrics=[vae.sampled_data_acc])

Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


In [21]:
#callbacks
ckpt = tf.keras.callbacks.ModelCheckpoint('saved_models/' + 'weights-{epoch:02d}-{val_loss:.4f}.ckpt',
                                              monitor='val_loss',
                                              verbose=1, save_best_only=True,
                                              mode='auto', save_weights_only=True)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1, min_lr=0.0)

In [37]:
vae.fit_generator(epochs=1, generator=x_train_gen, validation_data=x_val_gen,
                      use_multiprocessing=True, workers=4,
                      callbacks=[ckpt, reduce_lr])






Epoch 00001: val_loss improved from 609.04718 to 421.35138, saving model to saved_models/weights-01-421.3514.ckpt


<tensorflow.python.keras.callbacks.History at 0x7f75629d2240>

In [None]:
latent_dim = 256
inputs = tf.keras.layers.Input(shape=(120, num_classes))
vae = VariationalAutoencoder(latent_dim, num_classes, 120, num_samples=1)
outputs = vae(inputs)

decoder_inputs = tf.keras.layers.Input(shape=(latent_dim,))
decoder_outputs = vae.decoder(decoder_inputs)


In [26]:
input_data = tf.keras.utils.to_categorical(labeled_data[:1], num_classes=num_classes)

In [None]:
np.unique(input_data)

array([0., 1.], dtype=float32)

In [None]:

vae.encoder.predict(input_data)[1].shape

(1, 256)

In [None]:
restore_model_dir = 'saved_models/'
restore_model_path = tf.train.latest_checkpoint(restore_model_dir)

In [27]:
z_mean = vae.encoder.predict(input_data)[1]

In [28]:
num_samples = 4
std = 0.4
z_mean = np.tile(z_mean, (num_samples, 1))
z_samples = np.random.normal(loc=z_mean, scale=std, size=z_mean.shape)

In [32]:
outputs = vae.decoder.predict(z_samples)[0]

In [34]:
def smiles_to_labels(smiles_data, char_to_int, max_length):
    labeled_data = np.zeros((len(smiles_data), max_length, 1), dtype=np.int32)
    smiles_data = [d.ljust(max_length) for d in smiles_data]
    for i in range(len(smiles_data)):
        for t, char in enumerate(smiles_data[i]):
            labeled_data[i, t, 0] = char_to_int[char]
    return labeled_data

def labels_to_smiles(labeled_data, int_to_char):
    return np.array([''.join([int_to_char[label] for label in labels]).strip(' ')
                     for labels in labeled_data], dtype=np.str)

In [35]:
output_labels = np.argmax(outputs, axis=-1)
smiles = labels_to_smiles(output_labels, int_to_char)

In [36]:
smiles

array(['CCCC111cc](O)ccc)nn1OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO',
       'CCCc1cccc(())cc(N)n1OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO',
       'CCCc11nnc((O)ccc)nn1OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO',
       'CCCc11nnc((O)ccN)nn1OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO'],
      dtype='<U120')