In [1]:
# tf.__version__ == 2.4.1
# tf.keras.__version__ == 2.4.0
# np.__version__ == 1.19.5
# tf.executing_eagerly() == True
# py version == 3.7.9
 
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Conv2D, Input, MaxPool2D, BatchNormalization, LSTM, concatenate, Softmax, RNN
import numpy as np
import random
import os
import cv2
from bleu_score import sentence_bleu, corpus_bleu
from datetime import datetime
# from nltk.translate.bleu_score import sentence_bleu
# import nltk

# Report the library versions this notebook is running with, so they can be
# compared against the versions listed in the comment at the top of the cell.
print(tf.__version__)
print(tf.keras.__version__)
print(np.__version__)

# Last expression of the cell: its value (whether TensorFlow runs eagerly)
# is shown as the cell output.
tf.executing_eagerly()

2.4.1
2.4.0
1.19.5


True

In [2]:
# Model hyper-parameters shared by the cells below.
# H and W are the image height/width used for the inference-model Input;
# None leaves them unspecified (the trailing numbers are presumably a
# previously used fixed size -- TODO confirm).
H = None #700
W = None #1500
C = 1  # number of image channels in the model Input
vocab_size = 503  # token vocabulary size (embedding input_dim and one-hot depth)
embedding_dim = 80  # output dimension of the token embedding
ENC_DIM = 256 # Hidden state dimension of encoder RNN
DEC_DIM = 512 # Hidden state dimension of decoder RNN

# **Define all layers in the model**

In [3]:
# Registry of the shared layer objects, keyed by name.  The entries below form
# the convolutional feature extractor, listed in the order they are applied.
# Every convolution is 3x3, 'same'-padded and ReLU-activated; only the filter
# count varies.  maxpool3 halves the second spatial axis only and maxpool4 the
# first spatial axis only.
layers = {
    'conv1': Conv2D(64, (3, 3), padding='same', activation='relu'),
    'maxpool1': MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    'conv2': Conv2D(128, (3, 3), padding='same', activation='relu'),
    'maxpool2': MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    'conv3': Conv2D(256, (3, 3), padding='same', activation='relu'),
    'bn1': BatchNormalization(),
    'conv4': Conv2D(256, (3, 3), padding='same', activation='relu'),
    'maxpool3': MaxPool2D(pool_size=(1, 2), strides=(1, 2)),
    'conv5': Conv2D(512, (3, 3), padding='same', activation='relu'),
    'bn2': BatchNormalization(),
    'maxpool4': MaxPool2D(pool_size=(2, 1), strides=(2, 1)),
    'conv6': Conv2D(512, (3, 3), padding='same', activation='relu'),
    'bn3': BatchNormalization(),
}


class EncoderCell(keras.layers.Layer):
    """RNN cell that runs a wrapped sequence layer on each step's input.

    Intended to be driven by ``keras.layers.RNN`` over the height axis
    (dim == 1) of the convolution output, so that the wrapped ``encoder``
    processes one cross section of the feature map per step.

    Args:
        encoder: layer applied to the input of every step.
        state_size: state shape advertised to the RNN wrapper.
        output_size: per-step output shape advertised to the RNN wrapper.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, encoder, state_size, output_size, **kwargs):
        # Attributes are assigned before the base initialiser runs, matching
        # the ordering this notebook has always used.
        self.encoder = encoder
        self.state_size = state_size
        self.output_size = output_size
        super().__init__(**kwargs)

    def build(self, input_shape):
        # The cell adds no weights of its own; it is simply flagged as built.
        self.built = True

    def call(self, inputs, states):
        # The incoming states are handed back unchanged alongside the
        # wrapped layer's output.
        return self.encoder(inputs), states


# Forward and backward row encoders.  Each cell wraps an LSTM and declares a
# one-element state, which EncoderCell passes through unchanged.
encoder_fw_cell = EncoderCell(
    LSTM(ENC_DIM, return_sequences=True),
    state_size=tf.TensorShape([1]),
    output_size=tf.TensorShape([None, ENC_DIM]),
)
# NOTE(review): Keras documents `go_backwards=True` as returning the reversed
# sequence, so the backward outputs are presumably in the opposite order to
# the forward ones when they are later concatenated -- confirm this is
# intended.
encoder_bw_cell = EncoderCell(
    LSTM(ENC_DIM, return_sequences=True, go_backwards=True),
    state_size=tf.TensorShape([1]),
    output_size=tf.TensorShape([None, ENC_DIM]),
)

layers.update(
    encoder_fw=RNN(encoder_fw_cell, return_sequences=True),
    encoder_bw=RNN(encoder_bw_cell, return_sequences=True),
)


class AttentionCell(keras.layers.Layer):
    """Attention + output-projection step, used as the cell of a Keras ``RNN``.

    At every step the cell receives one decoder hidden state ``ht`` as input
    and the encoder hidden states ``hs`` as its (single) recurrent state.  It
    scores every encoder position against ``ht``, builds a context vector from
    the softmax-normalised scores, combines context and ``ht`` through a tanh
    layer and projects the result to a softmax distribution.  The encoder
    states are returned unchanged as the next state.

    Args:
        decoder_out_shape: shape whose element ``[1]`` is the decoder hidden
            size (``dec_dim``).
        state_size: shape whose element ``[1]`` is the encoder hidden size
            (``enc_dim``); also the state shape advertised to the RNN wrapper.
        output_size: shape whose element ``[0]`` is the number of output
            classes.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, decoder_out_shape, state_size, output_size, **kwargs):
        self.decoder_out_shape = decoder_out_shape
        self.state_size = state_size # encoder_hid_st_shape
        self.output_size = output_size
        super(AttentionCell, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the attention, combination and output-projection weights."""
        dec_dim = self.decoder_out_shape[1]
        enc_dim = self.state_size[1]
        n_classes = self.output_size[0]

        # Score projection.  Wa maps a decoder state into encoder space so it
        # can be multiplied with hs; sizing it (enc_dim, dec_dim) gives the same
        # shapes as before whenever enc_dim == dec_dim and additionally allows
        # the two sizes to differ.
        self.Wa = self.add_weight(shape=(enc_dim, dec_dim),
                                  initializer='uniform',
                                  trainable=True,
                                  name='Wa')
        self.Ba = self.add_weight(shape=(enc_dim, 1),
                                  initializer='zeros',
                                  trainable=True,
                                  name='Ba')

        # Combination of the context vector and the decoder state.
        self.Wc = self.add_weight(shape=(enc_dim + dec_dim, dec_dim),
                                  initializer='uniform',
                                  trainable=True,
                                  name='Wc')
        self.Bc = self.add_weight(shape=(1, dec_dim),
                                  initializer='zeros',
                                  trainable=True,
                                  name='Bc')

        # Projection onto the output classes.
        self.Ws = self.add_weight(shape=(dec_dim, n_classes),
                                  initializer='uniform',
                                  trainable=True,
                                  name='Ws')
        self.Bs = self.add_weight(shape=(1, n_classes),
                                  initializer='zeros',
                                  trainable=True,
                                  name='Bs')

        self.built = True

    def call(self, inputs, states):
        """Run one attention step.

        Shape comments assume ``inputs`` is (batch, dec_dim) and ``states[0]``
        is (batch, positions, enc_dim), which is what the weight shapes and
        matmuls below require.

        Returns:
            ``(output, [hs])`` where ``output`` is the softmax over the output
            classes and ``hs`` is the unchanged encoder state.
        """
        hs = states[0]

        # Attention scores: hs . (Wa . ht + Ba)
        ht_col = tf.expand_dims(inputs, axis=-1)                    # (batch, dec_dim, 1)
        projected = tf.linalg.matmul(self.Wa, ht_col) + self.Ba     # (batch, enc_dim, 1)
        score = tf.squeeze(tf.linalg.matmul(hs, projected), axis=-1)  # (batch, positions)

        # tf.nn.softmax is used directly instead of constructing a new Keras
        # Softmax layer object on every call.
        at = tf.nn.softmax(score, axis=-1)

        # Context vector: attention-weighted sum of the encoder states.
        ct = tf.linalg.matmul(tf.expand_dims(at, axis=-2), hs)      # (batch, 1, enc_dim)

        # Attentional hidden state from [context ; decoder state].
        ht_row = tf.expand_dims(inputs, axis=-2)                    # (batch, 1, dec_dim)
        ht_bar = tf.math.tanh(
            tf.linalg.matmul(tf.concat([ct, ht_row], axis=-1), self.Wc) + self.Bc)

        # Class distribution.
        Ws_ht_bar = tf.linalg.matmul(ht_bar, self.Ws) + self.Bs
        Ws_ht_bar = tf.squeeze(Ws_ht_bar, axis=-2)                  # (batch, n_classes)
        output = tf.nn.softmax(Ws_ht_bar, axis=-1)

        return output, [hs]


# Decoder-side layers: token embedding, decoder LSTM, and the attention cell
# wrapped in an RNN so that it is applied to every decoder time step.
layers['embedding'] = keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
)
layers['decoder'] = LSTM(DEC_DIM, return_sequences=True)
cell = AttentionCell(
    decoder_out_shape=tf.TensorShape([None, DEC_DIM]),
    state_size=tf.TensorShape([None, ENC_DIM*2]),  # forward + backward encoder features
    output_size=tf.TensorShape([vocab_size]),
)
layers['attention_layer'] = RNN(cell, return_sequences=True)

# Build the model with the layers

In [4]:
def build_model(image, latex_seq, encoder_hid_st_input=None):
    """Wire the shared `layers` into Keras model(s).

    Because the layer objects come from the module-level `layers` dict, every
    model built by this function uses the same layer instances.

    Args:
        image: Keras input tensor for the image.
        latex_seq: Keras input tensor for the token sequence fed to the decoder.
        encoder_hid_st_input: optional Keras input tensor standing in for the
            encoder hidden states.  When given, separate encoder and decoder
            models are built instead of a single end-to-end model.

    Returns:
        If `encoder_hid_st_input` is None: one model mapping
        `[image, latex_seq]` to the attention layer's output.
        Otherwise: a tuple `(encoder_model, decoder_model)` where the encoder
        maps `image` to the encoder hidden states and the decoder maps
        `[latex_seq, encoder_hid_st_input]` to the attention layer's output.
    """
    def _flatten_positions(feature_map):
        # Collapse every axis between the first and the last into one.
        batch = tf.shape(feature_map)[0]
        depth = tf.shape(feature_map)[-1]
        return tf.reshape(feature_map, [batch, -1, depth])

    # encoder
    x = (image - 128) / 128

    # Spatial size / channels after each stage (from the original notes):
    #   maxpool1 -> (H/2, W/2, 64)    maxpool2 -> (H/4, W/4, 128)
    #   bn1      -> (H/4, W/4, 256)   maxpool3 -> (H/4, W/8, 256)
    #   maxpool4 -> (H/8, W/8, 512)   bn3      -> (H/8, W/8, 512)
    conv_stack = ('conv1', 'maxpool1',
                  'conv2', 'maxpool2',
                  'conv3', 'bn1',
                  'conv4', 'maxpool3',
                  'conv5', 'bn2', 'maxpool4',
                  'conv6', 'bn3')
    for layer_name in conv_stack:
        x = layers[layer_name](x)

    encoder_fw_hid_st = _flatten_positions(layers['encoder_fw'](x))
    encoder_bw_hid_st = _flatten_positions(layers['encoder_bw'](x))
    # NOTE(review): the backward encoder is built with go_backwards=True, which
    # presumably leaves its outputs in reversed order relative to the forward
    # ones -- confirm the feature-wise concatenation below is intended.
    encoder_hid_st = concatenate([encoder_fw_hid_st, encoder_bw_hid_st], axis=-1)

    # decoder (identical for both variants up to the attention layer)
    latex_emb = layers['embedding'](latex_seq)
    decoder_hid_st = layers['decoder'](latex_emb)

    if encoder_hid_st_input is not None:
        # Split variant: attention reads the externally supplied encoder states.
        # NOTE(review): the decoder LSTM here has no explicit state inputs or
        # outputs -- confirm how state is meant to carry across decoding calls.
        latex_pred = layers['attention_layer'](decoder_hid_st, encoder_hid_st_input)
        encoder_model = keras.Model(inputs=image, outputs=encoder_hid_st)
        decoder_model = keras.Model(inputs=[latex_seq, encoder_hid_st_input], outputs=latex_pred)
        return encoder_model, decoder_model

    # End-to-end variant: attention reads the encoder states computed above.
    latex_pred = layers['attention_layer'](decoder_hid_st, encoder_hid_st)
    print(latex_pred)
    return keras.Model(inputs=[image, latex_seq], outputs=latex_pred)


In [5]:
# Ask interactively whether to restore weights from the saved checkpoint.
load_model = input('Load model? (y/n):') == 'y'

# End-to-end training model: variable-size images, variable-length sequences.
stage2_training_model = build_model(Input(shape=(None, None, C)),
                                    Input(shape=tf.TensorShape([None])),
                                    None)
if load_model:
    stage2_training_model.load_weights('model_checkpoints_7/cp-0002.ckpt')
    print("x----------Model loaded----------x")

lr = 0.1
clipnorm = 5.0
# optimizer = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.0, nesterov=False, name='SGD', clipnorm=clipnorm)
# Use the `clipnorm` variable rather than repeating the literal, so the value
# printed below and the value actually applied cannot drift apart.
optimizer = tf.keras.optimizers.Adam(learning_rate=lr, clipnorm=clipnorm)
stage2_training_model.compile(optimizer=optimizer, loss=tf.keras.losses.CategoricalCrossentropy(), metrics=['accuracy', 'crossentropy'])
stage2_training_model.optimizer.learning_rate.assign(lr)
print('lr:', stage2_training_model.optimizer.learning_rate.numpy())
print('clipnorm:', stage2_training_model.optimizer.clipnorm)
print('optimizer:', stage2_training_model.optimizer)

# Inference models built from the same layer objects: batch size 1, one token
# per decoder call.  The encoder-state input width is ENC_DIM * 2 because the
# forward and backward encoder outputs are concatenated in build_model.
stage2_inference_encoder_model, stage2_inference_decoder_model = build_model(Input(shape=(H, W, C), batch_size=1),
                                                                             Input(shape=tf.TensorShape([1]), batch_size=1),
                                                                             Input(shape=(None, ENC_DIM * 2), batch_size=1))
stage2_inference_encoder_model.compile()
stage2_inference_decoder_model.compile()
print("x----------Model built----------x")

Load model? (y/n): y


KerasTensor(type_spec=TensorSpec(shape=(None, None, 503), dtype=tf.float32, name=None), name='rnn_2/transpose_1:0', description="created by layer 'rnn_2'")
x----------Model loaded----------x
lr: 0.1
clipnorm: 5.0
optimizer: <tensorflow.python.keras.optimizer_v2.adam.Adam object at 0x7f22180636d0>
x----------Model built----------x


# Check model with dummy input

In [None]:
# y = np.ones((4, 10))
# y = tf.one_hot(y, depth=vocab_size,axis=1)
# y = tf.transpose(y, [0, 2, 1])
# stage2_training_model.fit(x=[np.ones((4, 256, 256, 1)), np.ones((4, 10))], y=y, steps_per_epoch=2, epochs=1)

In [None]:
# batch_dummy = 1
# img_dummy = np.ones((batch_dummy, 256, 256, 1))
# latex_seq_dummy = np.ones((batch_dummy, 1))
# train_output = stage2_training_model.predict(x=[img_dummy, latex_seq_dummy], batch_size=batch_dummy)
# enc_output = stage2_inference_encoder_model.predict(x=img_dummy, batch_size=batch_dummy)
# dec_output = stage2_inference_decoder_model.predict(x=[latex_seq_dummy, enc_output])

# # print(train_output)
# # print(dec_output)
# print((train_output == dec_output).all())

# Train the model

In [6]:
# Batch size used for the train/val/test datasets and for the step counts.
train_batch = 20

# Feature spec for parsing the TFRecords.  Every field is declared as a
# sequence feature with allow_missing=True; only the dtype differs per field.
_tfr_feature_dtypes = {
    'image': tf.string,
    'latex_seq_in': tf.int64,
    'latex_seq_out': tf.int64,
}
tfr_description = {
    feature_name: tf.io.FixedLenSequenceFeature([], dtype, allow_missing=True)
    for feature_name, dtype in _tfr_feature_dtypes.items()
}

def _parse_image_function(example_proto):
    """Parse one serialized tf.train.Example according to `tfr_description`.

    Returns the dict of parsed features produced by
    `tf.io.parse_single_example`.
    """
    parsed_features = tf.io.parse_single_example(example_proto, tfr_description)
    return parsed_features

def filter_long_seqs(sample_input, sample_output, max_len=120):
    """Dataset filter predicate: keep samples whose output sequence is short.

    Args:
        sample_input: the sample's model inputs (unused by the predicate).
        sample_output: the sample's output token sequence.
        max_len: exclusive upper bound on the size of the last axis of
            `sample_output`.  Defaults to 120, the limit previously
            hard-coded here.

    Returns:
        A boolean tensor, true when the sequence is shorter than `max_len`.
    """
    return tf.shape(sample_output)[-1] < max_len

def get_data(tfr_dir, subset, batch_size=None, filter_func=None):
    """Build the repeated, padded-batch dataset for one data subset.

    Args:
        tfr_dir: directory containing the TFRecord shards.
        subset: shard-name prefix (the files matched are
            `<subset>_100K_??of??.tfrecord`).
        batch_size: number of samples per padded batch.  Required; the
            `None` default is kept only for signature compatibility.
        filter_func: optional predicate `(sample_input, sample_output) -> bool`
            applied to individual samples before batching.

    Returns:
        `(dataset, c)` where `dataset` yields
        `((image_batch, latex_seq_in_batch), one_hot_latex_seq_out_batch)`
        indefinitely, and `c` is a scalar int64 tensor holding the number of
        samples that passed the filter.  Computing `c` takes one full pass
        over the (decoded) data.

    Raises:
        ValueError: if `batch_size` is None.
    """
    if batch_size is None:
        # padded_batch cannot work without a batch size; fail with a clear
        # message instead of an obscure tensor-conversion error.
        raise ValueError("get_data() requires an explicit batch_size")

    def _to_model_sample(sample):
        # Only the first entry of the 'image' feature is decoded.
        image = tf.image.decode_jpeg(sample['image'][0])
        return (image, sample['latex_seq_in']), sample['latex_seq_out']

    def _one_hot_targets(model_inputs, latex_seq_out):
        return model_inputs, tf.one_hot(latex_seq_out, depth=vocab_size, axis=-1)

    files = tf.io.matching_files(os.path.join(tfr_dir, '{}_100K_??of??.tfrecord'.format(subset)))
    dataset = tf.data.TFRecordDataset(files)
    dataset = dataset.map(_parse_image_function)
    dataset = dataset.map(_to_model_sample)
    if filter_func is not None:
        dataset = dataset.filter(filter_func)

    # Count the surviving samples before the dataset is made infinite.
    c = dataset.reduce(np.int64(0), lambda count, _: count + 1)

    # Images are padded with 255 and both token sequences with 0.
    padding_values = ((np.array(255, dtype=np.uint8), np.array(0, dtype=np.int64)),
                      np.array(0, dtype=np.int64))
    dataset = dataset.repeat().padded_batch(batch_size, padding_values=padding_values)
    dataset = dataset.map(_one_hot_targets)
    return dataset, c

# Build the three splits from the same TFRecord directory.  Only the training
# split drops long sequences.
tfrecords_dir = './100K_tfrecords_2'
train_dataset, train_size = get_data(tfrecords_dir, 'train', batch_size=train_batch, filter_func=filter_long_seqs)
val_dataset, val_size = get_data(tfrecords_dir, 'val', batch_size=train_batch)
test_dataset, test_size = get_data(tfrecords_dir, 'test', batch_size=train_batch)

# Show each dataset's structure together with its sample count.
for split_dataset, split_size in ((train_dataset, train_size),
                                  (val_dataset, val_size),
                                  (test_dataset, test_size)):
    print(split_dataset, "samples:", split_size.numpy())

<MapDataset shapes: (((None, None, None, None), (None, None)), (None, None, 503)), types: ((tf.uint8, tf.int64), tf.float32)> samples: 76154
<MapDataset shapes: (((None, None, None, None), (None, None)), (None, None, 503)), types: ((tf.uint8, tf.int64), tf.float32)> samples: 9297
<MapDataset shapes: (((None, None, None, None), (None, None)), (None, None, 503)), types: ((tf.uint8, tf.int64), tf.float32)> samples: 10325


In [7]:
# Epoch index at which the next fit() call starts; it is passed as
# `initial_epoch` and advanced after each training run.
# NOTE(review): presumably 1 because training resumes from the cp-0002
# checkpoint -- confirm against the checkpoint actually loaded.
initial_epoch = 1

In [10]:
# Weights-only checkpoints; the file name carries the epoch number.
checkpoint_path = r"./model_checkpoints_7/cp-{epoch:04d}.ckpt"
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=0,
    save_freq=200,  # integer => batch-based saving; alternative: 'epoch'
)

# TensorBoard logging with the same 200-batch update frequency; histograms,
# graph writing, profiling and embeddings are switched off.
tbcallback = tf.keras.callbacks.TensorBoard(
    log_dir='./tb_logs_7',
    histogram_freq=0,
    write_graph=False,
    write_images=True,
    update_freq=200,
    profile_batch=0,
    embeddings_freq=0,
    embeddings_metadata=None,
)


class CustomCallback(keras.callbacks.Callback):
    """Per-epoch loss/perplexity report with halve-on-no-improvement LR decay.

    The callback collects the `loss` value reported after every training and
    validation batch.  At the end of each epoch it prints the mean loss and
    the mean of `exp(loss)` ("perplexity") for both, and halves the
    optimizer's learning rate whenever the validation perplexity did not
    improve on the best value seen so far.

    NOTE(review): Keras is understood to report `logs['loss']` in batch hooks
    as a running average over the epoch so far -- confirm; if so, the
    collected values are not independent per-batch losses.

    Args:
        best_perp: initial "best" validation perplexity to beat.  Defaults to
            the value previously hard-coded here (carried over from an
            earlier run); pass a large number to start from scratch.
        **kwargs: forwarded to `keras.callbacks.Callback`.
    """

    def __init__(self, best_perp=724.6295594167904, **kwargs):
        super(CustomCallback, self).__init__(**kwargs)
        self.train_losses = []
        self.val_losses = []
        self.best_perp = best_perp

    @staticmethod
    def _mean_loss_and_perplexity(losses):
        """Return (mean loss, mean exp(loss)); (nan, nan) when `losses` is empty."""
        if not losses:
            return float('nan'), float('nan')
        loss_values = np.asarray(losses, dtype=np.float64)
        return np.mean(loss_values), np.mean(np.exp(loss_values))

    def on_epoch_begin(self, epoch, logs=None):
        print(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
        print('lr =', self.model.optimizer.learning_rate.numpy(), 'optimizer =', self.model.optimizer)

    def on_epoch_end(self, epoch, logs=None):
        mean_loss_train, mean_perp_train = self._mean_loss_and_perplexity(self.train_losses)
        print("Mean train loss:", mean_loss_train, ",Mean train perplexity:", mean_perp_train)
        mean_loss_val, mean_perp_val = self._mean_loss_and_perplexity(self.val_losses)
        print("Mean val loss:", mean_loss_val, ",Mean val perplexity:", mean_perp_val)

        # Only adapt the learning rate when validation actually ran; without
        # this guard a NaN perplexity would always count as "no improvement".
        if self.val_losses:
            if mean_perp_val < self.best_perp:
                self.best_perp = mean_perp_val
            else:
                learning_rate = self.model.optimizer.learning_rate
                learning_rate.assign(learning_rate.numpy() / 2)
        print("Best perplexity:", self.best_perp)

        # Start the next epoch with empty accumulators.
        self.train_losses = []
        self.val_losses = []

    def on_train_batch_end(self, batch, logs=None):
        if logs and 'loss' in logs:
            self.train_losses.append(logs['loss'])

    def on_test_batch_end(self, batch, logs=None):
        if logs and 'loss' in logs:
            self.val_losses.append(logs['loss'])
        

In [11]:
# Train for `epochs` more epochs, continuing the epoch numbering from
# `initial_epoch`; step counts are whole batches per split.
epochs = 15
final_epoch = initial_epoch + epochs
stage2_training_model.fit(
    train_dataset,
    steps_per_epoch=np.array(train_size // train_batch, dtype=np.int64),
    epochs=final_epoch,
    initial_epoch=initial_epoch,
    validation_data=val_dataset,
    validation_steps=np.array(val_size // train_batch, dtype=np.int64),
    callbacks=[cp_callback, CustomCallback(), tbcallback],
)
# Remember where this run stopped so the next run of this cell resumes there.
initial_epoch = final_epoch

In [79]:
def bleu_metric(y_true, y_pred):
    """Corpus BLEU between the arg-max tokens of the targets and predictions.

    Both arrays are reduced with `argmax` over their last axis.  Each target
    sequence is then wrapped in a one-element list of references (an axis is
    inserted at position 1) before being passed to `corpus_bleu` together
    with the predicted sequences.
    """
    reference_ids = np.expand_dims(np.argmax(y_true, axis=-1), axis=1)
    hypothesis_ids = np.argmax(y_pred, axis=-1)
    return corpus_bleu(reference_ids.tolist(), hypothesis_ids.tolist())
    

# Score the training model on whole test batches and average the per-batch
# corpus BLEU values.
bleu_list = []
test_itr = iter(test_dataset)

num_test_batches = test_size // train_batch
for _ in range(num_test_batches):
    model_inputs, targets = next(test_itr)
    prediction = stage2_training_model.predict(model_inputs)
    bleu_list.append(bleu_metric(targets.numpy(), prediction))

bleu = np.mean(bleu_list)
print(bleu)

0.575162558445461


# Save the inference models

In [None]:
# tf.saved_model.save(stage2_inference_encoder_model, "stage2_inference_encoder_model")

In [None]:
# converter = tf.lite.TFLiteConverter.from_keras_model(stage2_inference_decoder_model)
# stage2_inference_decoder_model_tflite = converter.convert()
# with open('stage2_inference_decoder_model_tflite.tflite', 'wb') as f:
#   f.write(stage2_inference_decoder_model_tflite)