In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

import plotly.graph_objects as go
from tensorflow.keras import Sequential
from tensorflow.keras.layers import InputLayer, Dense, RepeatVector, Reshape, Flatten, Dropout, TimeDistributed, Conv2D, MaxPool2D, GRU, LSTM, BatchNormalization

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from target import TargetSet
from image import ImageSetObject, show_image
from model import VAE, loss_function, get_text_from_predict, score

In [None]:
def get_predictive_network(max_len, vocab_size):
    """Assemble the image-to-character-sequence Keras model (not built, not compiled).

    Three identical Conv2D -> MaxPool2D -> BatchNormalization stages are
    followed by a Flatten layer. The flattened vector is repeated ``max_len``
    times and fed to an LSTM, dropout and batch normalisation, and finally a
    per-timestep softmax Dense layer with ``vocab_size`` units.

    Args:
        max_len: number of output timesteps (argument of ``RepeatVector``).
        vocab_size: number of units of the final softmax ``Dense`` layer.

    Returns:
        A ``Sequential`` model; building and compiling are left to the caller.
    """
    stack = []

    # Convolutional feature extractor: three stages with identical settings
    for _ in range(3):
        stack.extend([
            Conv2D(filters=32, kernel_size=8, strides=(2, 2), activation='relu'),
            MaxPool2D(),
            BatchNormalization(),
        ])

    # Sequence head: one copy of the flattened features per output timestep
    stack.extend([
        Flatten(),
        RepeatVector(max_len),
        LSTM(128, return_sequences=True, activation='relu'),
        Dropout(0.25),
        BatchNormalization(),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])

    return Sequential(stack)

In [None]:
# Folders: the characters used, one per directory level, to build the image
# paths '<root>/{dataset}/{i}/{j}/{k}/' in the loading loops of this notebook
folders = '0123456789abcdef'

# Random seed (intended for shuffling / reproducibility)
random_state=0

# Parameters: the two spatial dimensions images are resized to by
# ImageSetObject.prepare_data; also used as the network input shape
# (None, new_shape[0], new_shape[1], 1)
new_shape=[256, 256]

# Load target file

In [None]:
# Project-local target container; the cells below read one collection of
# target strings per target name from TargetBMS.targets[<name_target>]
TargetBMS = TargetSet()

In [None]:
# Per-target encoding metadata, keyed by target name:
#   vocab_size, max_len, idx2char (index -> character), char2idx (character -> index)
dico_targets_shape = {}

for name_target in ('target_A', 'target_B', 'target_C'):
    df_targets = TargetBMS.targets[name_target]

    # All target strings glued together, only used to enumerate the characters
    text = ''.join(df_targets.values)

    # Vocabulary: ' ' is forced to index 0, then every observed character in sorted order
    vocab = [' '] + sorted(set(text))
    vocab_size = len(vocab)

    # Lookup tables in both directions
    char2idx = dict(zip(vocab, range(vocab_size)))
    idx2char = np.array(vocab)

    # Length of the longest target string
    max_len = max(map(len, df_targets))

    print(f"{name_target} > vocab_size={vocab_size} | max_len={max_len}")

    dico_targets_shape[name_target] = dict(
        vocab_size=vocab_size,
        max_len=max_len,
        idx2char=idx2char,
        char2idx=char2idx,
    )

# Load image data

In [None]:
# # Images data 
# dataset = 'train'
# i = folders[0]
# j = folders[0]

# list_data = []
# list_id = []

# for k in tqdm(folders):

#     path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

#     # Files
#     list_names = os.listdir(path)
#     list_path = [path]*len(list_names)

#     # Image data
#     ImageSet = ImageSetObject(list_names, list_path)
#     ImageSet.prepare_data(new_shape, filtering=False, adjust=False)
#     data = ImageSet.X
#     list_data.append(data)
#     list_id.append(ImageSet.image_ids)

# # Select data
# data = np.concatenate(list_data)
# list_id = np.concatenate(list_id)

In [None]:
# # Show
# n = 5
# fig = make_subplots(rows=1, cols=n)
# for i in range(n):
#     fig.add_trace(show_image(data[i, :, :, 0]).data[0], row = 1, col = i+1)
# fig.update_layout(
#     height=150,
#     coloraxis_showscale=False, 
#     margin={"l": 0, "r": 0, "t": 0, "b": 0}
# )
# fig.update_xaxes(showticklabels=False)
# fig.update_yaxes(showticklabels=False)

# Training

## Target A

In [None]:
name_target = 'target_A'

# Target parameters: the target strings plus the encoding metadata computed earlier
df_targets = TargetBMS.targets[name_target]
max_len, vocab_size, char2idx, idx2char = (
    dico_targets_shape[name_target][key]
    for key in ('max_len', 'vocab_size', 'char2idx', 'idx2char')
)

# Training hyper-parameters
big_epochs = 3    # full passes over the training folders
epochs = 1        # Keras epochs per fit() call
batch_size = 128
patience = 10
lr = 1e-4
name = f'BMS_predictive_end_to_end_{name_target}'
optimizer = Adam(learning_rate=lr)

# Create model: single-channel images with spatial size new_shape
predictive_network = get_predictive_network(max_len, vocab_size)
predictive_network.build(input_shape=(None, *new_shape, 1))
predictive_network.compile(optimizer=optimizer, loss=loss_function)
predictive_network.summary()

In [None]:
# Train the network chunk by chunk: one fit() call per {i}/{j} folder pair,
# each chunk being the concatenation of its 16 {k} sub-folders.
dataset = 'train'
for _ in tqdm(range(big_epochs)):
    # Only the first four top-level folders are used for training
    for i in tqdm(folders[:4]):
        for j in tqdm(folders):
            list_data = []
            list_id = []

            for k in folders:
                path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

                # Files
                list_names = os.listdir(path)
                list_path = [path] * len(list_names)

                # Image data
                ImageSet = ImageSetObject(list_names, list_path)
                ImageSet.prepare_data(new_shape, filtering=False, adjust=False)
                list_data.append(ImageSet.X)
                list_id.append(ImageSet.image_ids)

            # Select data
            data = np.concatenate(list_data)
            list_id = np.concatenate(list_id)

            # Text targets: characters -> vocabulary indices, right-padded to max_len
            targets = df_targets[list_id].values
            targets = [[char2idx[x] for x in target] for target in targets]
            targets = pad_sequences(targets, padding='post', maxlen=max_len)

            # Shuffle images and targets together (seed from the config cell)
            data, targets = shuffle(data, targets, random_state=random_state)

            # No validation split and no callbacks: the whole chunk is used for fitting
            predictive_network.fit(
                data,
                targets,
                epochs=epochs,
                batch_size=batch_size,
                verbose=1,
            )

    # Save after every full pass so an interrupted run keeps the latest model
    predictive_network.save(f"outputs/{name}")

In [None]:
# Save the current model under outputs/<name> (no file extension is given)
predictive_network.save(f"outputs/{name}")

In [None]:
# predictive_network.load_weights(f"outputs/{name}")

In [None]:
# Images data: one leaf folder ('0/0/0') of the train set, which lies inside
# the folders[:4] range covered by the training loop
dataset = 'train'
i = j = k = folders[0]

path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

# Files
list_names = os.listdir(path)
list_path = [path for _ in list_names]

# Image data
ImageSet = ImageSetObject(list_names, list_path)
ImageSet.prepare_data(new_shape, filtering=False, adjust=False)
list_data = [ImageSet.X]
list_id = [ImageSet.image_ids]

# Select data
data = np.concatenate(list_data)
list_id = np.concatenate(list_id)

# Text targets: characters -> vocabulary indices, right-padded to max_len
targets = pad_sequences(
    [[char2idx[ch] for ch in label] for label in df_targets[list_id].values],
    padding='post',
    maxlen=max_len,
)

In [None]:
# Train Score: decode the first `limit` encoded targets back to text and
# compare them with the network predictions for the same images
limit = 128
y_true = [''.join(idx2char[idx] for idx in row) for row in targets[:limit]]
y_predict = get_text_from_predict(predictive_network, data[:limit], idx2char)
print(f"\t> Train Score: {score(y_true, y_predict)}")
[f"{truth} -> {guess}" for truth, guess in zip(y_true[:8], y_predict[:8])]

In [None]:
# Images data: one leaf folder ('5/0/0') of the train set; top-level folder '5'
# is outside the folders[:4] range used by the training loop, so it acts as held-out data
dataset = 'train'
i = folders[5]
j = k = folders[0]

path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

# Files
list_names = os.listdir(path)
list_path = [path for _ in list_names]

# Image data
ImageSet = ImageSetObject(list_names, list_path)
ImageSet.prepare_data(new_shape, filtering=False, adjust=False)
list_data = [ImageSet.X]
list_id = [ImageSet.image_ids]

# Select data
data = np.concatenate(list_data)
list_id = np.concatenate(list_id)

# Text targets: characters -> vocabulary indices, right-padded to max_len
targets = pad_sequences(
    [[char2idx[ch] for ch in label] for label in df_targets[list_id].values],
    padding='post',
    maxlen=max_len,
)

In [None]:
# Test Score: same evaluation as the train score, on the held-out sample
# (`limit` comes from the train-score cell)
y_true = [''.join(idx2char[idx] for idx in row) for row in targets[:limit]]
y_predict = get_text_from_predict(predictive_network, data[:limit], idx2char)
print(f"\t> Test Score: {score(y_true, y_predict)}")
[f"{truth} -> {guess}" for truth, guess in zip(y_true[:8], y_predict[:8])]

# Predict and export submission

In [None]:
# Images data
dataset = 'test'

list_data = []

# Sample_submission: read the existing file, fill it in folder by folder, write it back
submission = pd.read_csv("outputs/submission.csv")
submission = submission.set_index('image_id')

# Build each target's network and load its weights ONCE, before walking the
# image folders: nothing here depends on the folder being processed, and
# rebuilding the model for each of the 16*16*16 leaf folders is wasted work.
# NOTE(review): weights are read from 'outputs/<name>.h5', whereas the training
# cells of this notebook save to 'outputs/<name>' without extension - confirm
# the .h5 file exists and is the intended one.
predictors = {}
for name_target in ['target_A']: #, 'target_B', 'target_C']:

    name = f'BMS_predictive_end_to_end_{name_target}'

    max_len = dico_targets_shape[name_target]['max_len']
    vocab_size = dico_targets_shape[name_target]['vocab_size']
    idx2char = dico_targets_shape[name_target]['idx2char']

    predictive_network = get_predictive_network(max_len, vocab_size)
    predictive_network.build(input_shape=(None, new_shape[0], new_shape[1], 1))
    predictive_network.load_weights(f"outputs/{name}.h5")

    # (model, index -> char table, text of an all-padding prediction)
    predictors[name_target] = (predictive_network, idx2char, max_len * ' ')

for i in tqdm(folders):
    for j in tqdm(folders):
        for k in folders:
            path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

            # Files
            list_names = os.listdir(path)
            list_path = [path] * len(list_names)

            # Image data
            ImageSet = ImageSetObject(list_names, list_path)
            ImageSet.prepare_data(new_shape, filtering=False, adjust=False)
            data = ImageSet.X
            list_id = ImageSet.image_ids

            # Every prediction starts with the fixed prefix; each target appends '/<text>'
            output = ['InChI=1S'] * len(data)
            for name_target, (network, target_idx2char, blank) in predictors.items():

                y = get_text_from_predict(network, data, target_idx2char)

                # Skip predictions made only of padding, then strip padding spaces
                output = [o + f'/{y[idx]}' if y[idx] != blank else o for idx, o in enumerate(output)]
                output = [o.replace(' ', '') for o in output]

            # Prepare df
            df_output = pd.DataFrame({'image_id': list_id, 'InChI': output}).set_index('image_id')
            submission.loc[df_output.index, 'InChI'] = df_output['InChI']

        # Export after each {i}/{j} pair so partial results survive an interruption
        submission.reset_index().to_csv('outputs/submission.csv', index=False)