In [None]:
import os
import numpy as np
import pandas as pd
from image import ImageObject, ImageSetObject, show_image

from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Train labels: read the CSV, remove the literal 'InChI=' marker from every
# target string, and index the frame by image id.
train_labels = (
    pd.read_csv("bms-molecular-translation/train_labels.csv")
    # Vectorised, literal (non-regex) replace; unlike a per-row lambda calling
    # str.replace, missing values pass through as NaN instead of raising.
    .assign(InChI=lambda labels: labels['InChI'].str.replace('InChI=', '', regex=False))
    .set_index("image_id")
)
print(f"Size training set: {len(train_labels)}")

In [None]:
# Load sample_submission.csv into a DataFrame.
# NOTE(review): no other cell visible here reads `sample_submission`.
sample_submission = pd.read_csv("bms-molecular-translation/sample_submission.csv")

In [None]:
# Directory whose entries are used as the image file names.
path = 'bms-molecular-translation/train/0/0/0/'
# os.listdir returns entries in arbitrary order; sort them so that the sample
# taken below (and the order-based train/validation split later in the
# notebook) is the same on every run and on every machine.
list_names = sorted(os.listdir(path))

# sample: keep only the first 16 file names
list_names = list_names[:16]
# All sampled files come from the same directory, so repeat it once per name.
list_paths = [path] * len(list_names)

In [None]:
# Build the image set from the sampled file names and their directories.
ImageSet = ImageSetObject(list_names, list_paths)
# presumably reads the image files into memory — confirm in image.py
ImageSet.load_set()
# presumably brings the images to a common size — confirm in image.py
ImageSet.resize_images()
# Attach the InChI target string of every image, looked up by image id.
ImageSet.list_targets = list(train_labels.loc[ImageSet.image_ids, 'InChI'])

In [None]:
# Reshape the image array to the set's first three dimensions plus a trailing
# axis of length 1 (presumably the channel axis consumed by the Conv2D layers).
data = ImageSet.array.reshape(
    (
        ImageSet.shape[0],
        ImageSet.shape[1],
        ImageSet.shape[2],
        1,
    )
)

In [None]:
# Drop the labels DataFrame; the targets needed later were already copied into
# ImageSet.list_targets. Re-run the label-loading cell before re-running any
# cell that reads `train_labels`.
del train_labels

In [None]:
# Text processing: concatenate every target string into one corpus
text = ''.join(ImageSet.list_targets)

# Vocab: ' ' comes first so it owns index 0 (presumably to line up with the
# padding value used for the targets), followed by every distinct character
# of the corpus in sorted order
vocab = [' ', *sorted(set(text))]
vocab_size = len(vocab)

# Mapping: character -> index as a dict, index -> character as an array
char2idx = dict(zip(vocab, range(vocab_size)))
idx2char = np.array(vocab)

In [None]:
# Targets: encode every InChI string as a list of vocabulary indices, then
# right-pad all sequences to the length of the longest one
targets = pad_sequences(
    [[char2idx[char] for char in inchi] for inchi in ImageSet.list_targets],
    padding='post',
)
# pad_sequences returns a 2-D array, so the second axis is the common
# (maximum) sequence length
max_len = targets.shape[1]

In [None]:
from model import loss_function, plot_history
from tensorflow.keras import Sequential
from tensorflow.keras.layers import (
    InputLayer,
    Dense,
    Flatten,
    Reshape,
    RepeatVector,
    Conv2D,
    MaxPool2D,
    GRU,
    BatchNormalization
)



def get_model(image_data_shape, max_len, vocab, filters=4, latent_dim=8):
    """Build the image-to-sequence Keras model and print its summary.

    The network is a Sequential stack: three 3x3 convolutions with stride 2
    and tanh activation, each followed by max pooling (batch normalisation is
    inserted after the first convolution only), then a flatten, a dense
    projection to ``latent_dim`` units, a repetition of that vector
    ``max_len`` times and a GRU with ``len(vocab)`` units that returns the
    whole sequence.

    Args:
        image_data_shape: Shape of the image array; the first entry is
            dropped and the remainder is used as the input shape.
        max_len: Number of time steps the latent vector is repeated for.
        vocab: Vocabulary; only its length is used (number of GRU units).
        filters: Number of filters in every convolution.
        latent_dim: Number of units of the dense bottleneck.

    Returns:
        The (uncompiled) Sequential model.
    """
    def strided_conv():
        # One fresh convolution layer per call; all three share these settings.
        return Conv2D(filters=filters, kernel_size=3,
                      strides=(2, 2), activation='tanh')

    # Layers are instantiated in the same order in which they are stacked.
    layers = [InputLayer(input_shape=image_data_shape[1:])]
    layers += [strided_conv(), BatchNormalization(), MaxPool2D()]
    layers += [strided_conv(), MaxPool2D()]
    layers += [strided_conv(), MaxPool2D()]
    layers += [
        Flatten(),
        Dense(latent_dim),
        RepeatVector(max_len),
        GRU(len(vocab), return_sequences=True),
    ]

    model = Sequential(layers)
    model.summary()
    return model


In [None]:
# Random seed
# NOTE(review): no cell visible here passes `random_state` to anything or
# seeds numpy/TensorFlow with it — confirm whether it is meant to be applied.
random_state = 0

In [None]:
# Parameters
batch_size = 1  # samples per gradient step; also used to trim the split sizes

# Training budget: upper bound on epochs and early-stopping patience
epochs, patience = 1000, 10
# Learning rate handed to the Adam optimizer
lr = 1e-3

In [None]:
# Split train/test: 80% / 20% of the samples, each rounded down to a whole
# number of batches (floor-divide then multiply drops the partial batch)
TRAIN_BUF = int(data.shape[0] * 0.8) // batch_size * batch_size
TEST_BUF = int(data.shape[0] * 0.2) // batch_size * batch_size

# The split is positional: the first TRAIN_BUF rows train the model, the
# next TEST_BUF rows validate it; any rows beyond that are left unused
data_train, data_validation = data[:TRAIN_BUF], data[TRAIN_BUF:TRAIN_BUF + TEST_BUF]
targets_train, targets_validation = targets[:TRAIN_BUF], targets[TRAIN_BUF:TRAIN_BUF + TEST_BUF]

In [None]:
# Create tf model
model = get_model(
    data.shape,
    max_len,
    vocab,
    filters=4,
    latent_dim=8,
)
# Run name; used to build the checkpoint file name
name = 'gsk'

In [None]:
# Callbacks and compile
# File that receives the weights with the lowest validation loss so far.
checkpoint_path = f'outputs/{name}.h5'
# Create the checkpoint directory up front so saving cannot fail on a missing
# folder. NOTE(review): some Keras versions create it themselves when saving
# HDF5, others do not — this makes the cell independent of that.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Stop once val_loss has not improved for `patience` epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)
# Save only when val_loss improves.
mc = ModelCheckpoint(checkpoint_path, monitor='val_loss', mode='min', verbose=1, save_best_only=True)

optimizer = Adam(learning_rate=lr)
model.compile(optimizer=optimizer, loss=loss_function)

In [None]:
# Train
# verbose=0 turns off the per-epoch progress output; the early-stopping and
# checkpoint callbacks were created with verbose=1 and still report.
history = model.fit(
    x=data_train,
    y=targets_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=0,
    callbacks=[es, mc],
    validation_data=(data_validation, targets_validation),
)

In [None]:
# Plot history: hand the History object returned by model.fit to the project's
# plotting helper (presumably draws the training/validation loss curves —
# confirm in model.py).
plot_history(history)