In [None]:
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from target import TargetSet
from image import ImageSetObject, show_image
from model import get_predictive_network, loss_function, get_text_from_predict, score

In [None]:
# Notebook-wide configuration.

# Hex characters used as the names of the nested dataset sub-directories
# (paths are built as <dataset>/<i>/<j>/<k>/ with i, j, k drawn from this string).
folders = '0123456789abcdef'

# Random seed. It is applied to every RNG the notebook can touch so that weight
# initialisation and Keras batch shuffling are repeatable after a kernel restart.
random_state=0
random.seed(random_state)
np.random.seed(random_state)
tf.random.set_seed(random_state)

# Parameters
# Height/width handed to ImageSetObject.prepare_data and used for the model input shape.
new_shape=[256, 256]

batch_size=128
lr=1e-3
# Default optimizer instance.
# NOTE(review): a Keras optimizer keeps per-variable state, so one instance
# should serve a single model; build a new Adam(learning_rate=lr) per network.
optimizer = Adam(learning_rate=lr)

# Load target file

In [None]:
# Instantiate the project's target container. Later cells read one entry per
# target name from its `.targets` mapping ('target_A', 'target_B', 'target_C').
TargetBMS = TargetSet()

In [None]:
# For every target, derive the character vocabulary, the two lookup tables and
# the longest label length; everything is stored under the target's name.
dico_targets_shape = {}

for name_target in ('target_A', 'target_B', 'target_C'):
    df_targets = TargetBMS.targets[name_target]

    # All labels glued together, only used to collect the distinct characters
    text = ''.join(df_targets.values)

    # Index 0 is the blank character, followed by the sorted distinct characters.
    # NOTE(review): if a label itself contains a space, ' ' appears twice in
    # `vocab` and char2idx[' '] points at the later entry - confirm labels
    # never contain spaces.
    vocab = [' ', *sorted(set(text))]
    vocab_size = len(vocab)

    # character -> index and index -> character tables
    char2idx = dict(zip(vocab, range(vocab_size)))
    idx2char = np.array(vocab)

    # Longest label, in characters
    max_len = max(map(len, df_targets))

    print(f"{name_target} > vocab_size={vocab_size} | max_len={max_len}")

    dico_targets_shape[name_target] = dict(
        vocab_size=vocab_size,
        max_len=max_len,
        idx2char=idx2char,
        char2idx=char2idx,
    )

# Training

## Target A

In [None]:
name_target = 'target_A'

# Target parameters: label series plus the vocabulary data computed earlier
df_targets = TargetBMS.targets[name_target]
max_len = dico_targets_shape[name_target]['max_len']
vocab_size = dico_targets_shape[name_target]['vocab_size']
char2idx = dico_targets_shape[name_target]['char2idx']
idx2char = dico_targets_shape[name_target]['idx2char']

# Create model; `name` is also the checkpoint folder name under outputs/
name=f'BMS_predictive_end_to_end_{name_target}'

predictive_network_A = get_predictive_network(max_len, vocab_size)
# Single-channel images of size new_shape, any batch size
predictive_network_A.build(input_shape=(None, new_shape[0], new_shape[1], 1))
# Compile with a dedicated Adam instance: an optimizer keeps state tied to the
# variables of the model it trains, so it must not be shared between networks.
predictive_network_A.compile(optimizer=Adam(learning_rate=lr), loss=loss_function)
predictive_network_A.summary()

In [None]:
# Optional: uncomment to resume from the checkpoint that the training cell
# writes to outputs/ instead of training the freshly built network.
# # Load model
# predictive_network_A = tf.keras.models.load_model(f"outputs/BMS_predictive_end_to_end_target_A", compile=False)
# predictive_network_A.compile(optimizer=optimizer, loss=loss_function)

In [None]:
dataset = 'train'
big_epochs = 15
epochs = 1

# Each outer pass walks the first top-level folder, fitting on one second-level
# folder (16 leaf folders) at a time, then checkpoints the model.
for _ in tqdm(range(big_epochs)):
    for i in folders[:1]:
        for j in folders:
            image_chunks, id_chunks = [], []

            for k in folders:
                path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

                # Every file of the leaf folder, paired with its directory
                list_names = os.listdir(path)
                list_path = [path for _ in list_names]

                # Build the image set for this leaf folder; the pixel array is
                # cast to float16 before being accumulated
                ImageSet = ImageSetObject(list_names, list_path)
                ImageSet.prepare_data(new_shape, filtering=False, adjust=False)
                image_chunks.append(ImageSet.X.astype('float16'))
                id_chunks.append(ImageSet.image_ids)

            # One array of images and one array of ids for the whole j folder
            data = np.concatenate(image_chunks)
            list_id = np.concatenate(id_chunks)

            # Encode each label as character indices, right-padded to max_len
            encoded = [[char2idx[ch] for ch in label] for label in df_targets[list_id].values]
            targets = pad_sequences(encoded, maxlen=max_len, padding='post')

            # Shuffle images and labels in unison (fixed seed); no split is made
            data, targets = shuffle(data, targets, random_state=0)

            predictive_network_A.fit(
                data,
                targets,
                batch_size=batch_size,
                epochs=epochs,
                verbose=1,
            )
    predictive_network_A.save(f"outputs/{name}")

### Show images

In [None]:
# Preview the first `n` images of the `data` array left in memory by the
# preceding cell, side by side on a single row.
n = 5
fig = make_subplots(rows=1, cols=n)
for col in range(1, n + 1):
    # show_image returns a figure; only its first trace is reused here
    trace = show_image(data[col - 1, :, :, 0]).data[0]
    fig.add_trace(trace, row=1, col=col)
fig.update_layout(
    margin=dict(l=0, r=0, t=0, b=0),
    coloraxis_showscale=False,
    height=150,
)
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)

### Examples

In [None]:
# Load a single leaf folder (train/0/0/0) to evaluate the model on.
# NOTE(review): unlike the training loop, the images are not cast to float16
# here - confirm this difference is intended.
dataset = 'train'
i = j = k = folders[0]

path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

# Every file of the folder, paired with its directory
list_names = os.listdir(path)
list_path = [path for _ in list_names]

# Image data
ImageSet = ImageSetObject(list_names, list_path)
ImageSet.prepare_data(new_shape, filtering=False, adjust=False)
list_data = [ImageSet.X]
list_id = [ImageSet.image_ids]

# Collapse the one-element lists into arrays
data = np.concatenate(list_data)
list_id = np.concatenate(list_id)

# Encode each label as character indices, right-padded to max_len
encoded = [[char2idx[ch] for ch in label] for label in df_targets[list_id].values]
targets = pad_sequences(encoded, maxlen=max_len, padding='post')

In [None]:
# Score the model on the first `limit` samples of the loaded folder, then list
# a few "truth -> prediction" pairs as the cell output.
limit = 128
y_true = []
for row in targets[:limit]:
    # Decode index by index; padding index 0 decodes to the blank character
    y_true.append(''.join(idx2char[token] for token in row))
y_predict = get_text_from_predict(predictive_network_A, data[:limit], idx2char)
print(f"\t> Train Score: {score(y_true, y_predict)}")
[f"{x} -> {y}" for x, y in zip(y_true[:8], y_predict[:8])]

## Target B

In [None]:
name_target = 'target_B'

# Target parameters: label series plus the vocabulary data computed earlier
df_targets = TargetBMS.targets[name_target]
target_spec = dico_targets_shape[name_target]
max_len, vocab_size, char2idx, idx2char = (
    target_spec[key] for key in ('max_len', 'vocab_size', 'char2idx', 'idx2char')
)

# Model name, also used as the checkpoint folder name under outputs/
name = f'BMS_predictive_end_to_end_{name_target}'

In [None]:
# Create the predictive network for target B.
predictive_network_B = get_predictive_network(max_len, vocab_size)
# Compile with a dedicated Adam instance: an optimizer keeps state tied to the
# variables of the model it trains, so it must not be shared between networks.
predictive_network_B.compile(optimizer=Adam(learning_rate=lr), loss=loss_function)
# Single-channel images of size new_shape, any batch size
predictive_network_B.build(input_shape=(None, new_shape[0], new_shape[1], 1))
predictive_network_B.summary()

In [None]:
# Optional: uncomment to resume from the checkpoint that the training cell
# writes to outputs/ instead of training the freshly built network.
# # Load model
# predictive_network_B = tf.keras.models.load_model(f"outputs/BMS_predictive_end_to_end_target_B", compile=False)
# predictive_network_B.compile(optimizer=optimizer, loss=loss_function)

In [None]:
dataset = 'train'
big_epochs = 15
epochs = 1

# Each outer pass walks the first top-level folder, fitting on one second-level
# folder (16 leaf folders) at a time, then checkpoints the model.
for _ in tqdm(range(big_epochs)):
    for i in folders[:1]:
        for j in folders:
            image_chunks, id_chunks = [], []

            for k in folders:
                path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

                # Every file of the leaf folder, paired with its directory
                list_names = os.listdir(path)
                list_path = [path for _ in list_names]

                # Build the image set for this leaf folder; the pixel array is
                # cast to float16 before being accumulated
                ImageSet = ImageSetObject(list_names, list_path)
                ImageSet.prepare_data(new_shape, filtering=False, adjust=False)
                image_chunks.append(ImageSet.X.astype('float16'))
                id_chunks.append(ImageSet.image_ids)

            # One array of images and one array of ids for the whole j folder
            data = np.concatenate(image_chunks)
            list_id = np.concatenate(id_chunks)

            # Encode each label as character indices, right-padded to max_len
            encoded = [[char2idx[ch] for ch in label] for label in df_targets[list_id].values]
            targets = pad_sequences(encoded, maxlen=max_len, padding='post')

            # Shuffle images and labels in unison (fixed seed); no split is made
            data, targets = shuffle(data, targets, random_state=0)

            predictive_network_B.fit(
                data,
                targets,
                batch_size=batch_size,
                epochs=epochs,
                verbose=1,
            )
    predictive_network_B.save(f"outputs/{name}")

### Examples

In [None]:
# Load a single leaf folder (train/0/0/0) to evaluate the model on.
# NOTE(review): unlike the training loop, the images are not cast to float16
# here - confirm this difference is intended.
dataset = 'train'
i = j = k = folders[0]

path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

# Every file of the folder, paired with its directory
list_names = os.listdir(path)
list_path = [path for _ in list_names]

# Image data
ImageSet = ImageSetObject(list_names, list_path)
ImageSet.prepare_data(new_shape, filtering=False, adjust=False)
list_data = [ImageSet.X]
list_id = [ImageSet.image_ids]

# Collapse the one-element lists into arrays
data = np.concatenate(list_data)
list_id = np.concatenate(list_id)

# Encode each label as character indices, right-padded to max_len
encoded = [[char2idx[ch] for ch in label] for label in df_targets[list_id].values]
targets = pad_sequences(encoded, maxlen=max_len, padding='post')

In [None]:
# Score the model on the first `limit` samples of the loaded folder, then list
# a few "truth -> prediction" pairs as the cell output.
limit = 128
y_true = []
for row in targets[:limit]:
    # Decode index by index; padding index 0 decodes to the blank character
    y_true.append(''.join(idx2char[token] for token in row))
y_predict = get_text_from_predict(predictive_network_B, data[:limit], idx2char)
print(f"\t> Train Score: {score(y_true, y_predict)}")
[f"{x} -> {y}" for x, y in zip(y_true[:8], y_predict[:8])]

## Target C

In [None]:
name_target = 'target_C'

# Target parameters: label series plus the vocabulary data computed earlier
df_targets = TargetBMS.targets[name_target]
target_spec = dico_targets_shape[name_target]
max_len, vocab_size, char2idx, idx2char = (
    target_spec[key] for key in ('max_len', 'vocab_size', 'char2idx', 'idx2char')
)

# Model name, also used as the checkpoint folder name under outputs/
name = f'BMS_predictive_end_to_end_{name_target}'

In [None]:
# Create the predictive network for target C.
predictive_network_C = get_predictive_network(max_len, vocab_size)
# Compile with a dedicated Adam instance: an optimizer keeps state tied to the
# variables of the model it trains, so it must not be shared between networks.
predictive_network_C.compile(optimizer=Adam(learning_rate=lr), loss=loss_function)
# Single-channel images of size new_shape, any batch size
predictive_network_C.build(input_shape=(None, new_shape[0], new_shape[1], 1))
predictive_network_C.summary()

In [None]:
# Optional: uncomment to resume from the checkpoint that the training cell
# writes to outputs/ instead of training the freshly built network.
# # Load model
# predictive_network_C = tf.keras.models.load_model(f"outputs/BMS_predictive_end_to_end_target_C", compile=False)
# predictive_network_C.compile(optimizer=optimizer, loss=loss_function)

In [None]:
dataset = 'train'
big_epochs = 15
epochs = 1

# Each outer pass walks the first top-level folder, fitting on one second-level
# folder (16 leaf folders) at a time, then checkpoints the model.
for _ in tqdm(range(big_epochs)):
    for i in folders[0:1]:
        for j in folders:
            image_chunks, id_chunks = [], []

            for k in folders:
                path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

                # Every file of the leaf folder, paired with its directory
                list_names = os.listdir(path)
                list_path = [path for _ in list_names]

                # Build the image set for this leaf folder; the pixel array is
                # cast to float16 before being accumulated
                ImageSet = ImageSetObject(list_names, list_path)
                ImageSet.prepare_data(new_shape, filtering=False, adjust=False)
                image_chunks.append(ImageSet.X.astype('float16'))
                id_chunks.append(ImageSet.image_ids)

            # One array of images and one array of ids for the whole j folder
            data = np.concatenate(image_chunks)
            list_id = np.concatenate(id_chunks)

            # Encode each label as character indices, right-padded to max_len
            encoded = [[char2idx[ch] for ch in label] for label in df_targets[list_id].values]
            targets = pad_sequences(encoded, maxlen=max_len, padding='post')

            # Shuffle images and labels in unison (fixed seed); no split is made
            data, targets = shuffle(data, targets, random_state=0)

            predictive_network_C.fit(
                data,
                targets,
                batch_size=batch_size,
                epochs=epochs,
                verbose=1,
            )
    predictive_network_C.save(f"outputs/{name}")

### Examples

In [None]:
# Load a single leaf folder (train/0/0/0) to evaluate the model on.
# NOTE(review): unlike the training loop, the images are not cast to float16
# here - confirm this difference is intended.
dataset = 'train'
i = j = k = folders[0]

path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

# Every file of the folder, paired with its directory
list_names = os.listdir(path)
list_path = [path for _ in list_names]

# Image data
ImageSet = ImageSetObject(list_names, list_path)
ImageSet.prepare_data(new_shape, filtering=False, adjust=False)
list_data = [ImageSet.X]
list_id = [ImageSet.image_ids]

# Collapse the one-element lists into arrays
data = np.concatenate(list_data)
list_id = np.concatenate(list_id)

# Encode each label as character indices, right-padded to max_len
encoded = [[char2idx[ch] for ch in label] for label in df_targets[list_id].values]
targets = pad_sequences(encoded, maxlen=max_len, padding='post')

In [None]:
# Score the model on the first `limit` samples of the loaded folder, then list
# a few "truth -> prediction" pairs as the cell output.
limit = 128
y_true = []
for row in targets[:limit]:
    # Decode index by index; padding index 0 decodes to the blank character
    y_true.append(''.join(idx2char[token] for token in row))
y_predict = get_text_from_predict(predictive_network_C, data[:limit], idx2char)
print(f"\t> Train Score: {score(y_true, y_predict)}")
[f"{x} -> {y}" for x, y in zip(y_true[:8], y_predict[:8])]

# Predict and export submission

In [None]:
# Load the three trained checkpoints. compile=False because the models are only
# used for inference from here on.
# The target-A model is bound to `predictive_network_A`, the name the prediction
# cell reads; previously it was bound to `predictive_network`, so on a fresh
# kernel the prediction cell failed with a NameError (or silently used the
# in-memory model instead of the checkpoint).
predictive_network_A = tf.keras.models.load_model("outputs/BMS_predictive_end_to_end_target_A", compile=False)
predictive_network_B = tf.keras.models.load_model("outputs/BMS_predictive_end_to_end_target_B", compile=False)
predictive_network_C = tf.keras.models.load_model("outputs/BMS_predictive_end_to_end_target_C", compile=False)

# Backward-compatible alias for code that still refers to the old name
predictive_network = predictive_network_A

# Sample submission, indexed by image id so predictions can be written by label
submission = pd.read_csv("outputs/sample_submission.csv")
submission = submission.set_index('image_id')

In [None]:
dataset = 'test'

# (model, target name) pairs, in the order their decoded text is appended to
# the 'InChI=1S' prefix
layer_predictors = (
    (predictive_network_A, 'target_A'),
    (predictive_network_B, 'target_B'),
    (predictive_network_C, 'target_C'),
)

for i in tqdm(folders):
    for j in tqdm(folders):
        for k in folders:

            path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

            # Every file of the leaf folder, paired with its directory
            list_names = os.listdir(path)
            list_path = [path for _ in list_names]

            # Image data, cast to float16 as in the training loops
            ImageSet = ImageSetObject(list_names, list_path)
            ImageSet.prepare_data(new_shape, filtering=False, adjust=False)
            data = ImageSet.X.astype('float16')
            list_id = ImageSet.image_ids

            # Prediction: start from the fixed prefix, then append one
            # '/'-separated part per model
            output = ['InChI=1S' for _ in range(len(data))]
            for network, target_key in layer_predictors:
                y = get_text_from_predict(network, data, dico_targets_shape[target_key]['idx2char'])
                output = [prefix + f'/{y[pos]}' for pos, prefix in enumerate(output)]

            # Drop the blank characters from the final strings
            output = [o.replace(' ', '') for o in output]

            # Write this folder's predictions into the submission table by image id
            df_output = (
                pd.DataFrame([list_id, output], index=['image_id', 'InChI'])
                .transpose()
                .set_index('image_id')
            )
            submission.loc[df_output.index, 'InChI'] = df_output['InChI']

# Export
submission.reset_index().to_csv(f'outputs/submission_final.csv', index=False)