In [None]:
import os
import random

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

from image import ImageSetObject, show_image
from model import get_model, get_text_from_predict, score, loss_function

# Load .csv file

In [None]:
# Training labels, keyed by image id.
# The literal "InChI=" is stripped from every label here; the submission cell
# puts it back in front of each prediction.
train_labels = (
    pd.read_csv("bms-molecular-translation/train_labels.csv")
    .assign(InChI=lambda frame: frame['InChI'].map(lambda inchi: inchi.replace('InChI=', '')))
    .set_index("image_id")
)
print(f"Size training set: {len(train_labels)}")

In [None]:
# Character-level vocabulary: all training labels concatenated into one string
text = ''.join(train_labels['InChI'])

# Slot 0 holds a blank; the rest are the distinct characters of `text`, sorted
vocab = [' ', *sorted(set(text))]
vocab_size = len(vocab)

# Lookup tables in both directions: character -> index and index -> character
char2idx = dict(zip(vocab, range(vocab_size)))
idx2char = np.array(vocab)

# Length of the longest label; used as the padded target length
max_len = max(map(len, train_labels['InChI']))

# Load image data, create target and model

In [None]:
# Folders: the image tree is three levels deep, each level named by one of
# these characters (see the `{i}/{j}/{k}` paths built further down)
folders = '0123456789abcdef'

# Random seed, applied to every RNG this notebook has access to so that a
# Restart & Run All gives repeatable results
random_state = 0
random.seed(random_state)
np.random.seed(random_state)
tf.random.set_seed(random_state)

# Parameters
epochs = 1000            # passed to model.fit
batch_size = 128         # passed to model.fit
lr = 1e-3                # Adam learning rate
name = 'gsk'             # stem of the weights file written to outputs/
new_shape = [128, 128]   # passed to ImageSetObject.prepare_data

# Optimizer
optimizer = Adam(learning_rate=lr)

In [None]:
# Model
# Build and compile the network. Every later cell that trains, scores or loads
# weights reads `model`, so this cell has to run on a fresh kernel.
model = get_model(max_len, vocab)
model.compile(optimizer=optimizer, loss=loss_function)
model.summary()

In [None]:
import time

In [None]:
# Measure how long it takes to list and prepare the images of one folder
start = time.time()

dataset = 'train'
i = j = k = folders[0]

print(f"### {i} - {j} - {k} ###")

path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

# One entry per file, each paired with the same parent path
list_names = os.listdir(path)
list_path = [path for _ in list_names]

# Image data
ImageSet = ImageSetObject(list_names, list_path)
ImageSet.prepare_data(new_shape, filtering=False, adjust=True)
data = ImageSet.X

# Elapsed wall-clock time in seconds
t = time.time() - start
print(t)

In [None]:
# Train

In [None]:
# Train on the images of a single folder of the training set
dataset = 'train'

i = folders[0]
j = folders[0]
k = folders[0]

print(f"### {i} - {j} - {k} ###")

path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

# Make sure the folder the weights are written to exists
os.makedirs('outputs', exist_ok=True)

# Kept outside the loop so the printed mean covers every pass so far,
# not just the latest one
score_list = []

for e in range(1):

    print(f"# Epoch: {e}")

    # Files
    list_names = os.listdir(path)
    list_path = [path]*len(list_names)

    # Image data
    ImageSet = ImageSetObject(list_names, list_path)
    ImageSet.prepare_data(new_shape, filtering=False, adjust=True)
    data = ImageSet.X

    # Text targets: file name up to the first dot is the image id; each label
    # is mapped to vocabulary indices and padded at the end up to max_len
    list_id = [x.split('.')[0] for x in ImageSet.list_names]
    targets = train_labels.loc[list_id, 'InChI'].values
    targets = [[char2idx[x] for x in target] for target in targets]
    targets = pad_sequences(targets, padding='post', maxlen=max_len)

    # Train, then persist the weights for the submission cells
    history = model.fit(data, targets, epochs=epochs, batch_size=batch_size, verbose=1)
    model.save_weights(f'outputs/{name}.h5')

    # Score: decode the padded index rows back to text and compare with the
    # text decoded from the model's predictions on the same images
    y_true=[''.join([idx2char[int(y)] for y in yy]) for yy in targets]
    y_predict=get_text_from_predict(model, data, idx2char)
    score_list.append(score(y_true, y_predict))
    print(f"\t> Score: {np.mean(score_list)}")

In [None]:
# Train score: turn each padded index row back into text, decode the model's
# predictions for the same images, and show the resulting score
y_true = [''.join(idx2char[int(idx)] for idx in row) for row in targets]
y_predict = get_text_from_predict(model, data, idx2char)
score(y_true, y_predict)

# Validation

In [None]:
# Validation uses a different folder of the labelled 'train' set
dataset = 'train'
i = j = k = folders[5]

print(f"### {i} - {j} - {k} ###")

path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

In [None]:
# Files in the validation folder, each paired with its parent path
list_names = os.listdir(path)
list_path = [path]*len(list_names)

# Image data
ImageSet = ImageSetObject(list_names, list_path)
ImageSet.prepare_data(new_shape, filtering=False, adjust=True)
data_validation = ImageSet.X

# Text targets, stored as `targets_validation`: that is the name the
# prediction cell reads, and it leaves the training `targets` untouched
list_id = [x.split('.')[0] for x in ImageSet.list_names]
targets_validation = train_labels.loc[list_id, 'InChI'].values
targets_validation = [[char2idx[x] for x in target] for target in targets_validation]
targets_validation = pad_sequences(targets_validation, padding='post', maxlen=max_len)

In [None]:
# Validation score: decode the padded validation targets, decode the model's
# predictions for the validation images, and show the resulting score
y_val_true = [''.join(idx2char[int(idx)] for idx in row) for row in targets_validation]
y_val_predict = get_text_from_predict(model, data_validation, idx2char)
score(y_val_true, y_val_predict)

# Submission

In [None]:
# Restore the weights from the file the training cell writes, then compile again
weights_file = f'outputs/{name}.h5'
model.load_weights(weights_file)
print("Loaded model from disk")
model.compile(optimizer=optimizer, loss=loss_function)

In [None]:
# Submission template, keyed by image id so predictions can be written in by label
sample_submission = pd.read_csv(
    "bms-molecular-translation/sample_submission.csv"
).set_index('image_id')

In [None]:
# Predict the test images one folder at a time and write the decoded text
# into the matching rows of the submission table
dataset = 'test'

# Only the first value of the two outer levels is visited; the innermost
# level runs over every folder, with one progress bar per outer pair
outer_levels = [(i, j) for i in folders[0:1] for j in folders[0:1]]

for i, j in outer_levels:
    for k in tqdm(folders):

        path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

        # Files, each paired with the same parent path
        list_names = os.listdir(path)
        list_path = [path for _ in list_names]

        # Image data
        ImageSet = ImageSetObject(list_names, list_path)
        ImageSet.prepare_data(new_shape, filtering=False, adjust=True)
        data_test = ImageSet.X

        # Image ids: file name up to the first dot
        list_id = [file_name.split('.')[0] for file_name in ImageSet.list_names]

        # Predict, then put back the "InChI=" marker that was stripped from the labels
        y_test_predict = get_text_from_predict(model, data_test, idx2char)
        y_test_predict = ['InChI=' + decoded for decoded in y_test_predict]

        # Two rows (ids, predictions) flipped into two columns, indexed by image id
        df_y_test_predict = pd.DataFrame(
            [list_id, y_test_predict], index=['image_id', 'InChI']
        ).T.set_index('image_id')

        sample_submission.loc[df_y_test_predict.index, 'InChI'] = df_y_test_predict['InChI']

In [None]:
# Export: the image_id index is written out as the first column of the file
sample_submission.to_csv('outputs/submission.csv', index=True)

In [None]:
# Show the submission table as the cell output for a visual check
sample_submission