In [3]:
import configparser
import os
import pickle
import time

import cv2
import numpy as np
import tensorflow as tf
from gtts import gTTS
from playsound import playsound

config = configparser.ConfigParser()
config.read("config.ini")

# from test import *
# from models.utilities import *
from models.subclasses import *
from models.predict import *

In [4]:
# Static values

# Still images loaded through the project helper load_image; the second
# returned value is discarded. They are not referenced by the other visible cells.
test_image, _ = load_image("testVideos/35506150_cbdb630f4f.jpg")

test_image2, _ = load_image("testVideos/IMG_3004.jpg")

# Number of video frames to show between two captioning passes.
SHOW_FRAME = 60 * 1

# Model hyper-parameters read from the [config] section of config.ini.
units = int(config['config']['units'])
embedding_dim = int(config['config']['embedding_dim'])

vocabulary_size = int(config['config']['vocabulary_size'])


# bool() of any non-empty string is True, so bool("False") would still enable
# GloVe. getboolean() parses the usual 1/0, yes/no, true/false, on/off spellings
# and raises ValueError on anything else.
use_glove = config.getboolean('config', 'use_glove')
glove_dim = int(config['config']['glove_dim'])

In [5]:
# Load the vocabulary through the project helper; later cells take its length
# and iterate over it to build the word/index lookups.
vocabulary = load_vocab()

In [6]:
# Maps word -> embedding vector; stays empty when GloVe is disabled.
embeddings_index = {}

if use_glove:
    # Extra vectors stored as a pickle (presumably fine-tuned — confirm with the
    # training notebook). They override the stock GloVe entries further down.
    new_glove_path = f"./dataset/glove.6B/new_glove.6B.{glove_dim}d.pkl"
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # produced by this project.
    with open(new_glove_path, "rb") as pickle_file:
        tuned_glove = pickle.load(pickle_file)

    glove_path = f"./dataset/glove.6B/glove.6B.{glove_dim}d.txt"

    # Each line of the text file is "<word> <float> <float> ...".
    with open(glove_path, encoding="utf-8") as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, "f", sep=" ")
            embeddings_index[word] = coefs

    # Entries from the pickle win over the stock vectors for the same word.
    embeddings_index.update(tuned_glove)

    print("Found %s word vectors." % len(embeddings_index))


    word_index = dict(zip(vocabulary, range(len(vocabulary))))

    num_tokens = len(vocabulary)
    # The matrix width must equal the dimension of the loaded vectors; a
    # hard-coded 100 only works when glove_dim happens to be 100.
    embedding_dim = glove_dim
    hits = 0
    misses = 0

    # Prepare embedding matrix
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            # Words not found in the embedding index keep their all-zeros row.
            misses += 1
    print("Converted %d words (%d misses)" % (hits, misses))

Found 401581 word vectors.
Converted 23871 words (5771 misses)


In [7]:
# Unpack the two lookups returned by index_vocab; both are passed to
# predict_image when captioning a frame.
word_to_index, index_to_word = index_vocab(vocabulary)

In [8]:
# Build the encoder/decoder pair and restore their weights from the latest
# training checkpoint.
encoder = CNN_Encoder(embedding_dim)
if use_glove:
    decoder = RNN_Decoder(embedding_dim, units, num_tokens, embedding_matrix)
else:
    # `tokenizer_train` is not defined in any visible cell, so this branch could
    # only work with leftover kernel state. Use the size of the loaded
    # vocabulary instead, which is what the GloVe branch uses for num_tokens.
    decoder = RNN_Decoder(embedding_dim, units, len(vocabulary), None)

image_features_extract_model = get_feature_extractor()

# The optimizer is part of the checkpoint object below, so it is created even
# though this notebook only runs inference.
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
if ckpt_manager.latest_checkpoint is None:
    # restore(None) restores nothing and does not raise, so say so explicitly.
    print(f"WARNING: no checkpoint found in {checkpoint_path}; "
          "the models keep their initial weights.")
ckpt.restore(ckpt_manager.latest_checkpoint)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x20f91dd0190>

In [9]:
def predict_from_image(frame):
    """Caption a single image and return the predicted tokens.

    The image is resized to 256x256, passed through the ResNet
    ``preprocess_input`` and handed to ``predict_image`` together with the
    notebook-level encoder, decoder, feature extractor and vocabulary lookups.

    NOTE(review): ``tf.keras.applications.resnet.preprocess_input`` is
    documented as taking RGB input, while frames read with
    ``cv2.VideoCapture`` are BGR — confirm whether a BGR->RGB conversion is
    needed to match how the model was trained.

    Args:
        frame: Image array accepted by ``tf.keras.layers.Resizing``
            (assumed height x width x channels — TODO confirm).

    Returns:
        The token list produced by ``predict_image`` with a trailing
        ``'<end>'`` marker removed, if present.
    """
    img = tf.keras.layers.Resizing(256, 256)(frame)
    img = tf.keras.applications.resnet.preprocess_input(img)
    result = predict_image(img, encoder, decoder,
            image_features_extract_model,
            word_to_index, index_to_word)

    # Drop only the trailing end marker: list.remove() would delete the first
    # occurrence instead, and indexing [-1] on an empty result raises IndexError.
    if len(result) > 0 and result[-1] == '<end>':
        result.pop()

    return result

In [10]:
def speak(text, fName, count, remove = False, override= True):
    """Convert ``text`` to speech, save it as an MP3 and play it.

    The audio is written to ``TestSamples/{fName}_{count}.mp3``.

    Args:
        text: Sentence to speak. Empty or whitespace-only text is skipped,
            because gTTS refuses to synthesise it.
        fName: Prefix of the output file name.
        count: Running number appended to the file name.
        remove: When True, delete the MP3 after playback (also when playback
            fails).
        override: When True, an existing file with the same name is replaced.
            When False, an existing file is left untouched and nothing is
            played.

    Returns:
        None.
    """
    # Nothing to synthesise; returning here keeps the caller's loop alive
    # instead of letting gTTS raise.
    if not text or not text.strip():
        return

    filename = f"TestSamples/{fName}_{count}.mp3"
    if os.path.exists(filename):
        if not override:
            print("File exist")
            return
        os.remove(filename)
    else:
        # tts.save() does not create missing directories.
        os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Built only after the early returns so skipped calls do no work.
    tts = gTTS(text=text, lang="en")
    tts.save(filename)
    try:
        playsound(filename)
    finally:
        # Clean up the temporary file even when playback raises.
        if remove and os.path.exists(filename):
            os.remove(filename)

## Run the app

In [48]:
# Video file to caption; the part of its file name before the first dot is
# used as the prefix of the generated audio files.
PATH = 'testVideos/Hare - 81204.mp4'

In [49]:
# Play the video frame by frame. Every SHOW_FRAME-th frame is captioned and the
# caption is spoken aloud, followed by a pause of up to 5 seconds.
# Keys: 'q' quits during normal playback; any key ends the pause early, and 'e'
# additionally skips the quit check for that frame.
cap = cv2.VideoCapture(PATH)
if not cap.isOpened():
    print("Error opening video stream or file")
else:
    print("Statred capturing")

# Base name of the video without directory or extension; used as the prefix of
# the generated audio files.
fName = os.path.splitext(os.path.basename(PATH))[0]
count = 0
total_frames = 0
try:
    while cap.isOpened():
        HasFrames, frame = cap.read()
        # Stop when the video has no more frames.
        if not HasFrames:
            break

        total_frames += 1
        cv2.imshow('Video', frame)

        # Caption the current frame once every SHOW_FRAME frames.
        if total_frames % SHOW_FRAME == 0:
            start = time.time()
            result = predict_from_image(frame)
            speak(' '.join(result), fName, count)
            count += 1
            # Measured time covers prediction, speech synthesis and playback.
            elapsed = time.time() - start
            print(f'Time taken for 1 image {elapsed:.4f} sec\n')
            # Mask to the low byte: some platforms set higher bits in the
            # value returned by waitKey.
            if (cv2.waitKey(5000) & 0xFF) == ord('e'):
                continue

        if (cv2.waitKey(25) & 0xFF) == ord('q'):
            break
finally:
    # Release the capture and close the window even if captioning or speech
    # raised, so the video file and the GUI window are not left hanging.
    cap.release()
    cv2.destroyAllWindows()

Statred capturing
Time taken for 1 image 3.7919 sec

Time taken for 1 image 3.6179 sec

Time taken for 1 image 6.4213 sec

Time taken for 1 image 4.5719 sec

Time taken for 1 image 4.6402 sec

Time taken for 1 image 4.3169 sec

Time taken for 1 image 3.9394 sec

