In [2]:
from model import load_from_pickle, Model
import os
import tensorflow as tf

In [6]:
# Read the data bundle and fix the location a saved model is looked up at.
data = load_from_pickle('./data')
model_path = './output/lipReaderModel.keras'

# Restore the saved model when the file is present; otherwise build a new one
# whose first argument is the number of entries in the idx2word mapping.
if not os.path.exists(model_path):
    model = Model(len(data["idx2word"]), hidden_size=32, window_size=20)
    print("Creating new model")
else:
    model = tf.keras.models.load_model(model_path)
    print("Loading saved model")

# Compile is applied in both cases, so a restored model is re-compiled too.
model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.005))

Creating new model


In [11]:
epochs = 3


def run_pass(step, captions_key, videos_key, mappings_key):
    """
    Run one pass of `step` (model.train or model.test) over one data split.

    The captions and videos stored in `data` under the given keys are turned
    into tensors on every call, so they only live for the duration of the pass.
    The pad-token id and a batch size of 15 are forwarded unchanged.
    Returns whatever `step` returns.
    """
    captions = tf.convert_to_tensor(data[captions_key])
    videos = tf.convert_to_tensor(data[videos_key])
    return step(
        captions,
        videos,
        data[mappings_key],
        data["word2idx"]['<pad>'],
        batch_size=15,
    )


# One training pass followed by one evaluation pass per epoch.
# NOTE(review): the recorded output shows both calls returning a 3-tuple that
# looks like (loss, accuracy, perplexity) despite the *_acc names - confirm.
for e in range(epochs):
    train_acc = run_pass(model.train, "train_captions", "train_videos", "train_video_mappings")
    print(f"epoch:{e}, train_acc:{train_acc}")
    test_acc = run_pass(model.test, "test_captions", "test_videos", "test_video_mappings")
    print(f"epoch {e}, test acc{test_acc}")

[Valid 14/14]	 loss=35281.285	 acc: 0.034	 perp: inf
epoch:0, train_acc:(35281.28515625, 0.03356596827507019, inf)
[Valid 14/14]	 loss=41533.199	 acc: 0.040	 perp: inf
epoch 0, test acc(41533.19921875, 0.040163375437259674, inf)
[Valid 14/14]	 loss=25854.203	 acc: 0.059	 perp: inf
epoch:1, train_acc:(25854.203125, 0.05898033827543259, inf)
[Valid 14/14]	 loss=24323.879	 acc: 0.225	 perp: inf
epoch 1, test acc(24323.87890625, 0.22543352842330933, inf)
[Valid 14/14]	 loss=14758.586	 acc: 0.210	 perp: inf
epoch:2, train_acc:(14758.5859375, 0.20967741310596466, inf)
[Valid 14/14]	 loss=13142.505	 acc: 0.275	 perp: inf
epoch 2, test acc(13142.5048828125, 0.27460962533950806, inf)


In [10]:
import numpy as np

def gen_caption_temperature(model, image_embedding, wordToIds, padID, temp, window_length, rng=None):
    """
    Sample a caption token by token from `model` using temperature scaling.

    Generation starts from '<start>'. At each step the partial caption is
    right-padded with `padID` to `window_length`, passed to the model together
    with `image_embedding` (a leading batch axis of size 1 is added), and the
    next token is drawn from the softmax of the logits at the current position
    divided by `temp`. Generation stops once '<end>' is produced or the caption
    holds `window_length` tokens.

    Args:
        model: callable invoked as ``model(embedding_batch, caption_batch)``;
            its result is indexed as ``result[0][position]`` to obtain one
            logits vector over the vocabulary.
        image_embedding: input features for a single example (no batch axis).
        wordToIds: dict mapping token string -> token id; must contain
            '<start>', '<end>' and '<unk>'.
        padID: token id used to right-pad the partial caption.
        temp: softmax temperature; must be positive.
        window_length: maximum number of tokens, counting '<start>'.
        rng: optional ``numpy.random.Generator`` for reproducible sampling.
            When None, the global ``np.random`` state is used.

    Returns:
        The generated words joined by single spaces, without '<start>' and
        without a trailing '<end>'.

    Raises:
        ValueError: if `temp` is not positive.
    """
    if temp <= 0:
        raise ValueError(f"temp must be positive, got {temp}")

    ids_to_words = {token_id: word for word, token_id in wordToIds.items()}
    start_token = wordToIds['<start>']
    end_token = wordToIds['<end>']
    unk_token = wordToIds['<unk>']
    draw = np.random.choice if rng is None else rng.choice

    caption_so_far = [start_token]
    while len(caption_so_far) < window_length and caption_so_far[-1] != end_token:
        padding = (window_length - len(caption_so_far)) * [padID]
        caption_input = np.array([caption_so_far + padding])
        logits = model(np.expand_dims(image_embedding, 0), caption_input)

        # Logits predicting the token that follows the last one generated so far.
        step_logits = np.asarray(logits[0][len(caption_so_far) - 1], dtype=np.float64)

        # Temperature softmax in float64; subtracting the max avoids overflow
        # in exp() without changing the resulting distribution.
        scaled = step_logits / temp
        scaled -= scaled.max()
        probs = np.exp(scaled)
        probs /= probs.sum()

        # Re-draw up to five times to avoid '<unk>'; if every draw is '<unk>'
        # it is kept, so generation always advances.
        next_token = unk_token
        attempts = 0
        while next_token == unk_token and attempts < 5:
            next_token = int(draw(len(probs), p=probs))
            attempts += 1
        caption_so_far.append(next_token)

    # Drop '<start>'. Only drop the final token when it really is '<end>':
    # when the window fills up first, the last token is a real word.
    generated = caption_so_far[1:]
    if generated and generated[-1] == end_token:
        generated = generated[:-1]
    return ' '.join(ids_to_words[token] for token in generated)


INDEED A INDEED PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE
INDEED A INDEED PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE
INDEED A INDEED PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE
INDEED A INDEED PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE PLACE
