In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from target import TargetSet
from image import ImageSetObject, show_image
from model import VAE, loss_function, get_text_from_predict, score

# Load .csv file

In [None]:
# Instantiate the project-local target container; its `targets` attribute is indexed
# by target name in the next cell. Presumably this reads the labels .csv mentioned in
# the section header — confirm in target.py.
TargetBMS = TargetSet()

In [None]:
# Choice of the target
# The selected entry's `.values` are joined into a single string further down, so each
# element is expected to be a string (presumably a pandas Series — TODO confirm in target.py).
df_targets = TargetBMS.targets['target_A']

In [None]:
# Text processing: flatten every target string into one corpus
text = ''.join(df_targets.values)

# Vocab: a leading ' ' entry takes index 0, followed by each distinct corpus character in sorted order
vocab = [' ', *sorted(set(text))]
vocab_size = len(vocab)

# Mapping: character -> index (dict) and index -> character (array lookup)
char2idx = dict(zip(vocab, range(vocab_size)))
idx2char = np.array(vocab)

# Max length: size of the longest individual target string
max_len = max(map(len, df_targets))

# Load image data, create features from VAE

In [None]:
# Folders: the characters used as directory names when building image paths below
folders = '0123456789abcdef'

# Random seed
random_state = 0

# Parameters
lr = 1e-3                # learning rate handed to Adam
name = 'BMS_VAE'         # model name handed to the VAE constructor
new_shape = [128, 128]   # target shape handed to ImageSet.prepare_data
latent_dim = 64          # size of the latent vector

# Optimizer
optimizer = Adam(learning_rate=lr)

In [None]:
# VAE Model
# Four-entry input shape: a leading None, the two `new_shape` sizes, and a trailing 1
# (presumably batch, height, width, single channel — TODO confirm in model.py).
input_shape = [None, new_shape[0], new_shape[1], 1]
model = VAE(name, latent_dim, input_shape)
# Project-local call; presumably restores previously saved weights for `name` — verify in model.py.
model.load_model(batch_size=None)

In [None]:
# Images data
# For each selected leaf folder: load the images, encode them with the VAE, and build
# the padded integer targets. All three lists are appended in lockstep so that entry k
# of each list refers to the same folder.
dataset = 'train'
i = folders[0]
j = folders[0]

list_data = []
list_data_z = []
list_targets = []

for k in tqdm(folders[0:1]):

    path = f'bms-molecular-translation/{dataset}/{i}/{j}/{k}/'

    # Files: os.listdir returns entries in arbitrary order, so sort them to keep the
    # sample order (and therefore the later seeded shuffle/split) reproducible.
    list_names = sorted(os.listdir(path))
    list_path = [path]*len(list_names)

    # Image data
    ImageSet = ImageSetObject(list_names, list_path)
    ImageSet.prepare_data(new_shape, filtering=False, adjust=True)
    data = ImageSet.X
    list_data.append(data)

    # Latent features from the VAE encoder (second return value is unused here)
    Z, _ = model.encode(data)
    list_data_z.append(Z)

    # Text targets: without these, list_targets stays empty and the training cells
    # below cannot concatenate it.
    # NOTE(review): assumes ImageSet.list_names is in the same order as the rows of
    # ImageSet.X and that df_targets is indexed by image id (file name without
    # extension) — confirm in image.py / target.py.
    list_id = [x.split('.')[0] for x in ImageSet.list_names]
    targets = df_targets[list_id].values
    targets = [[char2idx[x] for x in target] for target in targets]
    # Post-pad with 0, which decodes to the ' ' placed at vocab index 0
    targets = pad_sequences(targets, padding='post', maxlen=max_len)
    list_targets.append(targets)

In [None]:
# Select one loaded folder (by position in the lists) and decode its latent codes
# back through the VAE so originals and reconstructions can be compared.
k = 0
data = list_data[k]
data_decoded = model.decode(list_data_z[k])

In [None]:
# Show: first n originals on the top row, their VAE reconstructions on the bottom row
n = 5
fig = make_subplots(rows=2, cols=n)
for i in range(n):
    # Row 1 is filled from `data`, row 2 from `data_decoded`, for the same sample index
    for row, images in enumerate((data, data_decoded), start=1):
        fig.add_trace(show_image(images[i, :, :, 0]).data[0], row=row, col=i + 1)
fig.update_layout(coloraxis_showscale=False, margin=dict(l=0, r=0, t=0, b=0))
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)

# Train predictive model

In [None]:
import plotly.graph_objects as go
from tensorflow.keras import Sequential
from tensorflow.keras.layers import InputLayer, Dense, Reshape, Conv1DTranspose, LSTM, RepeatVector, Dropout

def get_model_predict_after_vae(max_len, vocab, latent_dim):
    '''Build the dense network mapping a latent vector to per-position character probabilities.

    Args:
        max_len: number of character positions in the output sequence.
        vocab: sequence of characters; only its length is used.
        latent_dim: size of the input latent vector.

    Returns:
        A built (uncompiled) Sequential model. Its output is reshaped to
        (max_len, len(vocab)) per sample, with a softmax over the last axis.
    '''
    vocab_size = len(vocab)
    model = Sequential(
        [
            # The shape must be a tuple: `(latent_dim)` is just an int, which Keras
            # cannot convert to a shape; the trailing comma makes it a 1-tuple.
            InputLayer((latent_dim,)),
            Dense(256, activation='relu'),
            Dropout(0.5),
            # One unit per (position, character) pair, reshaped into a grid below
            Dense(max_len*vocab_size, activation='relu'),
            Dropout(0.5),
            Reshape((max_len, vocab_size)),
            # Applied along the last axis: a distribution over the vocab at each position
            Dense(vocab_size, activation='softmax')
        ]
    )
    model.build()
    return model

def plot_history(history):
    '''Return a plotly figure with the per-epoch training and validation loss curves.

    `history` is the object returned by `fit`; its `history` dict must contain
    the 'loss' and 'val_loss' series.
    '''
    curves = (
        ('loss', 'training loss'),
        ('val_loss', 'validation loss'),
    )

    fig = go.Figure()
    for key, label in curves:
        fig.add_trace(go.Scatter(y=history.history[key], name=label))

    fig.update_layout(
        title="Training history",
        xaxis_title="Epochs",
        yaxis_title="Loss",
    )
    return fig

In [None]:
# Assemble the training arrays from the per-folder lists built by the image-loading cell.
# Fail early with an actionable message instead of numpy's generic empty-list error.
if not list_data_z or not list_targets:
    raise ValueError(
        'list_data_z and list_targets must both be populated by the image-loading '
        'cell before the training arrays can be assembled'
    )

# Data
data_z = np.concatenate(list_data_z)

# Targets
targets = np.concatenate(list_targets)

# Every latent vector needs exactly one encoded target before shuffling and splitting
assert data_z.shape[0] == targets.shape[0], (
    f'{data_z.shape[0]} latent vectors but {targets.shape[0]} targets'
)

In [None]:
# Train
epochs = 1000     # upper bound; early stopping normally ends training sooner
batch_size = 64
patience = 10     # epochs without val_loss improvement before stopping
# NOTE: this rebinds `name`, which the VAE construction cell also reads; re-running
# that cell after this one would pass this value to VAE instead of the original one.
name = 'gsk_VAE_predictive'

In [None]:
# Split train/test
# Shuffle into new names rather than overwriting data_z/targets, so re-running this
# cell always starts from the same unshuffled arrays and yields the same split.
# The seed comes from the `random_state` constant declared with the other settings.
data_shuffled, targets_shuffled = shuffle(data_z, targets, random_state=random_state)

# First 80% of the shuffled samples train the model, the remainder validates it
TRAIN_BUF = int(data_shuffled.shape[0]*0.8)
data_train = data_shuffled[:TRAIN_BUF]
data_validation = data_shuffled[TRAIN_BUF:]

targets_train = targets_shuffled[:TRAIN_BUF]
targets_validation = targets_shuffled[TRAIN_BUF:]

In [None]:
# Build the latent-vector -> character-sequence network sized by the longest target,
# the vocabulary, and the VAE latent dimension.
predictive_model = get_model_predict_after_vae(max_len, vocab, latent_dim)

In [None]:
# Callbacks and compile
# Stop once val_loss has not improved for `patience` epochs
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)

# Keep only the best-val_loss weights on disk; make sure the target directory exists
# so the first checkpoint write cannot fail on a fresh checkout.
os.makedirs('outputs', exist_ok=True)
mc = ModelCheckpoint(f'outputs/{name}.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

# Fresh optimizer instance for this model
optimizer = Adam(learning_rate=lr)
predictive_model.compile(optimizer=optimizer, loss=loss_function)
predictive_model.summary()

In [None]:
# Training options gathered in one place; per-epoch logging is silenced (verbose=0),
# while the two callbacks were created with their own verbose=1 messages.
fit_options = dict(
    validation_data=(data_validation, targets_validation),
    epochs=epochs,
    batch_size=batch_size,
    verbose=0,
    callbacks=[es, mc],
)
history = predictive_model.fit(data_train, targets_train, **fit_options)

In [None]:
# Display the training vs. validation loss curves recorded during fit
plot_history(history)

In [None]:
def _truth_and_prediction(features, encoded_targets, n_samples):
    '''Decode the first `n_samples` encoded targets and predict text for the matching features.'''
    truth = [''.join([idx2char[idx] for idx in row]) for row in encoded_targets[0:n_samples]]
    predicted = get_text_from_predict(predictive_model, features[0:n_samples], idx2char)
    return truth, predicted

# Train Score (first batch_size training samples only)
y_true, y_predict = _truth_and_prediction(data_train, targets_train, batch_size)
print(f"\t> Train Score: {score(y_true, y_predict)}")

# Validation score (first batch_size validation samples only); y_true / y_predict
# stay bound to these validation values for the cells that follow
y_true, y_predict = _truth_and_prediction(data_validation, targets_validation, batch_size)
print(f"\t> Validation Score: {score(y_true, y_predict)}")

In [None]:
# Spot-check one decoded ground-truth string; y_true was last bound to the validation batch
y_true[20]

In [None]:
# Predicted strings for the same validation batch (last value assigned in the scoring cell)
y_predict