In [1]:
import numpy as np
import cv2
import tensorflow as tf
import time

import configparser
config = configparser.ConfigParser()
config.read("config.ini")

# from test import *
# from models.utilities import *
from models.subclasses import *
from models.predict import *

In [2]:
# Static values
PATH = 'testVideos/IMG_9367.MOV'

image, _ = load_image("testVideos/35506150_cbdb630f4f.jpg")

#how many frame to play until pause
SHOW_FRAME = 60

save_path = config["config"]["save_path"]

units = int(config['config']['units'])
embedding_dim = int(config['config']['embedding_dim'])

In [3]:
img_name_train, cap_train, img_name_val, cap_val, vocabulary = load_dataset()
word_to_index, index_to_word = index_vocab(vocabulary)

In [4]:
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, len(vocabulary))

optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [5]:
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
ckpt.restore(ckpt_manager.latest_checkpoint)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x122e21cacd0>

In [6]:
_, _, image_features_extract_model = load_models()



In [7]:
def predict_all(frame):
    img = tf.keras.layers.Resizing(224, 224)(frame)
    img = tf.keras.applications.resnet50.preprocess_input(img)
    result = predict_image(img, encoder, decoder,
            image_features_extract_model,
            word_to_index, index_to_word)
    temp = Image.fromarray(frame, 'RGB')
#     temp.show()
    print(result)
    return result

## Run the app

In [8]:
#loop over all frames  'space' = next frame | 'q' = quit
cap = cv2.VideoCapture(PATH)
if (cap.isOpened()== False):
    print("Error opening video stream or file")
else:
    print("Statred capturing")

total_frames = 0
while cap.isOpened():
    HasFrames, frame = cap.read()
    #if vidoe is not done do
    if HasFrames:
        total_frames += 1
        cv2.imshow('Video', frame)
        #when you reach the pause frame do
        if((total_frames % SHOW_FRAME) == 0):
            #Caption 'frame'
            start = time.time()
            # print(frame.shape)
            predict_all(frame)
            print(f'Time taken for 1 image {time.time()-start:.4f} sec\n')
            #press 'E' to get next frame
            if(cv2.waitKey(5000) == ord('e')):
                continue

        if(cv2.waitKey(25) == ord('q')):
            break    

    else:
        break

cap.release()
cv2.destroyAllWindows()

Statred capturing
['a', 'car', 'at', 'a', 'city', 'street', 'with', 'a', 'city', 'at', 'a', 'city', 'outside', 'a', 'city']
Time taken for 1 image 2.3515 sec

['a', 'silver', 'building.', '<end>']
Time taken for 1 image 0.1220 sec

['a', 'lot', 'of', 'motorcycles', 'are', 'parked', 'on', 'the', 'wall', 'next', 'to', 'the', 'beginning', 'of', 'the']
Time taken for 1 image 0.2630 sec

