In [1]:
import sys
import tensorflow as tf
# physical_devices = tf.config.list_physical_devices('GPU') 
# tf.config.experimental.set_memory_growth(physical_devices[1], True)
# print("Num GPUs Available: ", tf.config.list_physical_devices())
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from skimage.metrics import structural_similarity
import matplotlib.pyplot as plt

import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image, ImageOps
import pickle
import cv2

In [2]:
def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, image_path

In [3]:
image_features_extract_model = tf.saved_model.load('features_extract_model')

In [4]:
BATCH_SIZE = 64
BUFFER_SIZE = 1000
embedding_dim = 256
units = 512
#vocab_size = top_k + 1
vocab_size = 5000 + 1

#num_steps = len(img_name_train) // BATCH_SIZE
# Shape of the vector extracted from InceptionV3 is (64, 2048)
features_shape = 2048
attention_features_shape = 64
max_length = 52

In [5]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features1, hidden1):
        # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)

        # hidden shape == (batch_size, hidden_size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden1, 1)

        # score shape == (batch_size, 64, hidden_size)
        score = tf.nn.tanh(self.W1(features1) + self.W2(hidden_with_time_axis))

        # attention_weights shape == (batch_size, 64, 1)
        # you get 1 at the last axis because you are applying score to self.V
        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features1
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [6]:
class CNN_Encoder(tf.keras.Model):
    # This encoder passes the extracted features through a Fully connected layer
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        # shape after fc == (batch_size, 64, embedding_dim)
        self.fc = tf.keras.layers.Dense(embedding_dim)
        
    def call(self, x):
        inp = x
        y = self.fc(inp)
        z = tf.nn.relu(y)
        return z

In [7]:
class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.units)

    #@tf.function(input_signature = [tf.TensorSpec(shape=[64, 1], dtype=tf.int32), tf.TensorSpec(shape=[64, 64, 256], dtype=tf.float32),tf.TensorSpec(shape=[64, 512], dtype=tf.float32)])
    @tf.function
    def __call__(self, x, features1, hidden):
        # defining attention as a separate model
        hidden1 = hidden
        context_vector, attention_weights = self.attention(features1, hidden)
    
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
    
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
    
        # shape == (batch_size, max_length, hidden_size)
        x = self.fc1(output)
    
        # x shape == (batch_size * max_length, hidden_size)
        x = tf.reshape(x, (-1, x.shape[2]))
    
        # output shape == (batch_size * max_length, vocab)
        x = self.fc2(x)
    
        return x, state, attention_weights

    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))

In [8]:
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

optimizer = tf.keras.optimizers.legacy.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [9]:
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

ckpt.restore(ckpt_manager.latest_checkpoint)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1b82fd0b550>

In [10]:
print(encoder.trainable_weights)

[]


In [11]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
    
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

In [12]:
def evaluate(image):
    attention_plot = np.zeros((max_length, attention_features_shape))

    hidden = decoder.reset_state(batch_size=1)

    temp_input = tf.expand_dims(load_image(image)[0], 0)
    img_tensor_val = image_features_extract_model(temp_input)
    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))
    
    features = encoder(img_tensor_val)

    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    result = []
    
    for i in range(max_length):
        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)

        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()

        predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
        result.append(tokenizer.index_word[predicted_id])

        if tokenizer.index_word[predicted_id] == '<end>':
            return result, attention_plot

        dec_input = tf.expand_dims([predicted_id], 0)

    attention_plot = attention_plot[:len(result), :]
    return result, attention_plot

In [13]:
def plot_attention(image, result, attention_plot):
    temp_image = np.array(Image.open(image))

    fig = plt.figure(figsize=(10, 10))

    len_result = len(result)
    for l in range(len_result):
        temp_att = np.resize(attention_plot[l], (8, 8))
        ax = fig.add_subplot(len_result//2, len_result//2, l+1)
        ax.set_title(result[l])
        img = ax.imshow(temp_image)
        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())

    plt.tight_layout()
    plt.show()

In [14]:
def similarity(X, Y):
    # tokenization
    X_list = word_tokenize(X) 
    Y_list = word_tokenize(Y)

    # sw contains the list of stopwords
    sw = stopwords.words('english')
    sw.append("<end>")
    
    l1 =[];l2 =[]

    # remove stop words from the string
    X_set = {w for w in X_list if not w in sw} 
    Y_set = {w for w in Y_list if not w in sw}

    # form a set containing keywords of both strings 
    rvector = X_set.union(Y_set) 
    for w in rvector:
        if w in X_set: l1.append(1) # create a vector
        else: l1.append(0)
        if w in Y_set: l2.append(1)
        else: l2.append(0)
    c = 0

    # cosine formula 
    for i in range(len(rvector)):
            c+= l1[i]*l2[i]
    cosine = c / float((sum(l1)*sum(l2))**0.5)
    return cosine

In [15]:
def mse(imageA, imageB):
    err = np.sum((imageA.astype("float") - imageB.astype("float")) ** 2)
    err /= float(imageA.shape[0] * imageA.shape[1])
    # return the MSE, the lower the error, the more "similar"
    # the two images are
    return err

def compare_images(imageA, imageB):
    imageA = cv2.cvtColor(imageA, cv2.COLOR_BGR2GRAY)
    imageB = cv2.cvtColor(imageB, cv2.COLOR_BGR2GRAY)

    #m = mse(imageA, imageB)
    s = structural_similarity(imageA, imageB)
    return s

In [16]:
filenames = glob("../keyFrames/summ/*.png")
filenames.sort()

caption = []
for img in filenames:
    result, attention_plot = evaluate(img)
    result = " ".join(result)
    if len(caption) == 0:
        caption.append(result)
        old_img = img
        continue
    if compare_images(cv2.imread(old_img), cv2.imread(img)) > 0.9:
        old_img = img
        continue
    if similarity(result, caption[-1]) < 0.8:
        caption.append(result)
    old_img = img
    
ind = 1
remove_words = ["<end>"]
caption_final = []
for i in caption:
    for word in remove_words:
        i = i.replace(word, '')
        caption_final.append(i)
        
    print("{}) {}".format(ind, i))
    ind += 1

1) the man is posing next to a drink 
2) a close up of a leap up looking at a television that's in them between 4 
3) a close up of two laptops sitting on a table 
4) about to make a arrow sign next to a yellow tray with a sign 
5) two very big images of a car and several ties 
6) a man wearing a man and white backside of a microphone 
7) a man is in a suit with a couple takes pictures in a suit watching <unk> to a school behind it 
8) a girl on a ground in front of a <unk> 
9) there are two men standing besides a business suit 
10) a man talking on a cell phone outside an elegant business suits 
11) a woman talking on a cell phone in the background 
12) a man in various safety outfits behind him in an bins 
13) a person riding a bike in the street 
14) a man talking on in front of microphone and skateboarding tricks 
15) a man with a man flying over the bottom of microphones 
16) a woman cutting while eating a cake 
17) a man in formal formal <unk> <unk> for the camera 
18) a group of

In [17]:
import glob
import cv2

filenames = glob.glob("../keyFrames/summ/*.png")
filenames.sort()

caption = []
for img in filenames:
    result, attention_plot = evaluate(img)
    result = " ".join(result)
    if len(caption) == 0:
        caption.append(result)
        old_img = img
        continue
    if compare_images(cv2.imread(old_img), cv2.imread(img)) > 0.9:
        old_img = img
        continue
    if similarity(result, caption[-1]) < 0.8:
        caption.append(result)
    old_img = img

ind = 1
remove_words = ["<end>"]
caption_final = []
for i in caption:
    for word in remove_words:
        i = i.replace(word, '')
        caption_final.append(i)

    print("{}) {}".format(ind, i))
    ind += 1

# Save captions to caption.txt file
with open("captions.txt", "w") as file:
    for caption in caption_final:
        file.write(caption + "\n")

        

1) two males standing woman is talking to touch a man 
2) a picture of computer mouses in the room 
3) a lap that has a <unk> picture with his pink head 
4) a woman with curved signs on a skateboard 
5) several men wearing costumes with hats 
6) several young men with both one is showing their cell phones 
7) a man adjusts his suit and tie 
8) a very young woman walking near a lunch display case smiling 
9) a person wearing a dress shirt pose for the <unk> 
10) a man wearing a nice looking wine by a vest 
11) a man is talking on a cellphone 
12) a woman holding a video floor and a <unk> in <unk> puts his cell phone 
13) a man on a crowded street in their hands 
14) a man in the blue shirt pulls a skateboard on a bus with two bags 
15) there is a man holding an <unk> showing of people in the sky 
16) a woman is <unk> some bedding 
17) a man blow <unk> <unk> next to the hat holding a laptop with a slice sits on a chair and keyboard with birthday <unk> their own toy 
18) a mother helping 

# Timestamp

In [18]:
def getTime(frame, videoFile):
    cap = cv2.VideoCapture(videoFile)
    fps = (cap.get(5))
    return (int) (frame / fps)

In [19]:
import glob

In [23]:
filenames = glob.glob("../keyFrames/time/*.png")
filenames.sort()

caption = []
for img in filenames:
    result, attention_plot = evaluate(img)
    result = " ".join(result)
    if len(caption) == 0:
        caption.append(result)
        old_img = img
        continue
    if compare_images(cv2.imread(old_img), cv2.imread(img)) > 0.9:
        old_img = img
        continue
    if similarity(result, caption[-1]) < 0.8:
        caption.append(result)
    old_img = img
    
remove_words = ["<end>"]
event_final = []

print("Seconds\t\tEvent")
for img, event in zip(filenames, caption):
    if len(img) == 33:
        frame = img[33:-4]
    else:
        frame = img[34:-4]

    try:
        sec = getTime(int(frame), "../videos/NYTravel.mp4")
    except ValueError:
        print("Invalid frame number:", frame)
        sec = -1  # Assign a default or placeholder value

    for word in remove_words:
        event = event.replace(word, '')
    event_final.append(event)
    print("{}\t\t{}".format(sec, event))



Seconds		Event
104		a man and a woman wearing a blue suit and tie talking in front of a trophy and smiling 
115		a man in an orange dress talking on the phone while he holds a <unk> picture in front of a white wall 
119		a woman and her cell phone 
139		a man sits on the statue 
141		a town that sits on a street next to man by ball next to bicycles 
144		smiling as she talks on a cell phone 
175		a man standing next to it 
185		a man holding a cake 
197		some dressed in costumes posing in a old dress party gear to a cake 
199		a sink with two pictures and a bathroom sink 
202		a man and holding their wedding cake on <unk> with some ice cream from a cake 
205		a woman has <unk> a tie in a room 
220		women are wearing then a woman standing next to the sons 
3		a man and hat is being cast over on the building 
225		asian woman taking a picture 
226		a man standing next to doughnut 
227		girl wearing a hat and a cigarette 
229		a long haired woman is in a decorative hat stands dimly lit in

In [24]:
import glob
import cv2

filenames = glob.glob("../keyFrames/time/*.png")
filenames.sort()

caption = []
for img in filenames:
    result, attention_plot = evaluate(img)
    result = " ".join(result)
    if len(caption) == 0:
        caption.append(result)
        old_img = img
        continue
    if compare_images(cv2.imread(old_img), cv2.imread(img)) > 0.9:
        old_img = img
        continue
    if similarity(result, caption[-1]) < 0.8:
        caption.append(result)
    old_img = img

remove_words = ["<end>"]
event_final = []

with open("captions.txt", "a") as file:
    file.write("Event\n")
    for img, event in zip(filenames, caption):
        if len(img) == 33:
            frame = img[33:-4]
        else:
            frame = img[34:-4]
        sec = getTime(int(frame), "../videos/NYTravel.mp4")
        for word in remove_words:
            event = event.replace(word, '')
            event_final.append(event)
        
        file.write("{}\n".format(event))


In [1]:
import cv2
import numpy as np
from collections import Counter

from ultralytics import YOLO
import supervision as sv

ZONE_POLYGON = np.array([
    [0, 0],
    [0.5, 0],
    [0.5, 1],
    [0, 1]
])


def main(video_path):
    frame_width, frame_height = [1280, 720] 

    cap = cv2.VideoCapture(video_path)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, frame_width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, frame_height)

    model = YOLO("yolov8x.pt") 

    box_annotator = sv.BoxAnnotator(
        thickness=2,
        text_thickness=2,
        text_scale=1
    )

    zone_polygon = (ZONE_POLYGON * np.array([frame_width, frame_height])).astype(int)
    zone = sv.PolygonZone(polygon=zone_polygon, frame_resolution_wh=(frame_width, frame_height))
    zone_annotator = sv.PolygonZoneAnnotator(
        zone=zone, 
        color=sv.Color.red(),
        thickness=2,
        text_thickness=4,
        text_scale=2
    )
    
    classes_detected = []
    existing_detections = []
    frame_count = 0
    start_time = cv2.getTickCount()
    frame_interval = 10

    while True:
        ret, frame = cap.read()

        if not ret:
            break

        if frame_count % frame_interval == 0:
            result = model(frame, agnostic_nms=True)[0]
            detections = sv.Detections.from_yolov8(result)
            labels = [
                f"{model.model.names[class_id]} {confidence:0.2f}"
                for _, confidence, class_id, _
                in detections
            ]
            frame = box_annotator.annotate(
                scene=frame, 
                detections=detections, 
                labels=labels
            )

            zone.trigger(detections=detections)
            frame = zone_annotator.annotate(scene=frame)      

            cv2.imshow("yolov8x", frame)

            if cv2.waitKey(30) == 27:
                break

        filtered_detections = []
        for detection in detections:
            if detection[1] > 0.35 and similarity_score(detection[0], existing_detections) < 0.67:
                filtered_detections.append(detection)
                existing_detections.append(detection[0])
                classes_detected.append(model.model.names[detection[2]])

        frame_count += 1

    cv2.destroyAllWindows()

    # Count the frequencies of detected classes
    class_counts = Counter(classes_detected)

    # Append the results to the captions.txt file
    captions_file = open('captions.txt', 'a')
    captions_file.write("Total classes detected: " + str(len(class_counts)) + "\n")
    for class_name, count in class_counts.items():
        captions_file.write(f"The count for {class_name} is {count}\n")
    captions_file.close()


def similarity_score(new_detection, existing_detections):
    new_bbox = np.array(new_detection)
    existing_bboxes = [np.array(det) for det in existing_detections]

    ious = [calculate_iou(new_bbox, bbox) for bbox in existing_bboxes]
    max_iou = max(ious) if ious else 0.0

    return max_iou


def calculate_iou(bbox1, bbox2):
    x1 = max(bbox1[0], bbox2[0])
    y1 = max(bbox1[1], bbox2[1])
    x2 = min(bbox1[2], bbox2[2])
    y2 = min(bbox1[3], bbox2[3])
    intersection_area = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1)


    bbox1_area = (bbox1[2] - bbox1[0] + 1) * (bbox1[3] - bbox1[1] + 1)
    bbox2_area = (bbox2[2] - bbox2[0] + 1) * (bbox2[3] - bbox2[1] + 1)
    union_area = bbox1_area + bbox2_area - intersection_area


    iou = intersection_area / union_area

    return iou


if __name__ == "__main__":
    video_path = "../videos/NYTravel.mp4"  # Specify the path to your video file
    main(video_path)



0: 384x640 (no detections), 845.5ms
Speed: 15.6ms preprocess, 845.5ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 boats, 831.9ms
Speed: 2.0ms preprocess, 831.9ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 boats, 813.7ms
Speed: 7.2ms preprocess, 813.7ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 boats, 817.2ms
Speed: 17.2ms preprocess, 817.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 boats, 841.7ms
Speed: 2.0ms preprocess, 841.7ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 boats, 808.3ms
Speed: 0.0ms preprocess, 808.3ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 boats, 886.1ms
Speed: 0.0ms preprocess, 886.1ms inference, 7.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 boats, 845.0ms
Speed: 0.2ms preprocess, 845.0ms inference, 2.0ms postprocess per image 


0: 384x640 (no detections), 862.7ms
Speed: 2.0ms preprocess, 862.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 839.0ms
Speed: 2.0ms preprocess, 839.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 840.0ms
Speed: 5.0ms preprocess, 840.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 cake, 844.0ms
Speed: 2.0ms preprocess, 844.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 bicycle, 1 backpack, 1 potted plant, 1 book, 832.0ms
Speed: 2.0ms preprocess, 832.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bicycle, 1 backpack, 1 potted plant, 816.0ms
Speed: 1.0ms preprocess, 816.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bicycle, 1 backpack, 1 potted plant, 1 book, 884.0ms
Speed: 2.0ms preprocess, 884.0ms inference, 2.0


0: 384x640 7 persons, 1 bicycle, 1 backpack, 1 handbag, 862.0ms
Speed: 2.0ms preprocess, 862.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 bicycle, 1 backpack, 1 handbag, 885.0ms
Speed: 2.0ms preprocess, 885.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 bicycle, 2 handbags, 874.0ms
Speed: 3.0ms preprocess, 874.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bicycle, 1 bench, 1 backpack, 1 handbag, 887.0ms
Speed: 2.0ms preprocess, 887.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bicycle, 2 handbags, 807.0ms
Speed: 3.0ms preprocess, 807.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bicycle, 2 handbags, 848.0ms
Speed: 3.0ms preprocess, 848.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bicycle, 2 handbags, 


0: 384x640 1 person, 1 stop sign, 1 tie, 846.0ms
Speed: 2.0ms preprocess, 846.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 tie, 863.0ms
Speed: 2.0ms preprocess, 863.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 tie, 883.0ms
Speed: 3.0ms preprocess, 883.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 car, 857.1ms
Speed: 2.0ms preprocess, 857.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 suitcase, 851.0ms
Speed: 3.0ms preprocess, 851.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 814.0ms
Speed: 3.0ms preprocess, 814.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 836.0ms
Speed: 3.0ms preprocess, 836.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 836.5ms
Speed

Speed: 3.0ms preprocess, 804.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 2 bicycles, 1 handbag, 1 tie, 808.4ms
Speed: 4.0ms preprocess, 808.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 bicycle, 1 handbag, 1 potted plant, 852.8ms
Speed: 2.0ms preprocess, 852.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 bicycle, 1 tie, 1 potted plant, 877.0ms
Speed: 2.0ms preprocess, 877.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 car, 875.0ms
Speed: 2.0ms preprocess, 875.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 cars, 833.3ms
Speed: 2.0ms preprocess, 833.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 cars, 818.0ms
Speed: 2.0ms preprocess, 818.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)




0: 384x640 3 persons, 1 tie, 845.0ms
Speed: 2.0ms preprocess, 845.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 tie, 894.0ms
Speed: 3.0ms preprocess, 894.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 2 ties, 881.1ms
Speed: 3.0ms preprocess, 881.1ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 2 ties, 878.5ms
Speed: 2.0ms preprocess, 878.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 scissors, 866.0ms
Speed: 3.0ms preprocess, 866.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1 scissors, 862.0ms
Speed: 2.0ms preprocess, 862.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1 scissors, 862.0ms
Speed: 2.0ms preprocess, 862.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1 cup, 1

Speed: 2.0ms preprocess, 876.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 4 chairs, 1 cell phone, 840.0ms
Speed: 2.0ms preprocess, 840.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 2 chairs, 1 potted plant, 859.0ms
Speed: 3.0ms preprocess, 859.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 chair, 1 couch, 1 potted plant, 835.0ms
Speed: 2.0ms preprocess, 835.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 cup, 3 chairs, 1 dining table, 851.0ms
Speed: 2.0ms preprocess, 851.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 3 cups, 2 chairs, 1 dining table, 849.5ms
Speed: 3.0ms preprocess, 849.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 cup, 2 chairs, 1 dining table, 798.5ms
Speed: 3.0ms preprocess, 798.5ms inference


0: 384x640 15 persons, 1 tie, 1 cup, 10 chairs, 1 potted plant, 2 dining tables, 852.0ms
Speed: 3.0ms preprocess, 852.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 1 cup, 6 chairs, 1 potted plant, 2 dining tables, 1 laptop, 875.8ms
Speed: 4.0ms preprocess, 875.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 2 bottles, 7 chairs, 1 potted plant, 1 dining table, 1 laptop, 843.0ms
Speed: 3.0ms preprocess, 843.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 2 chairs, 1 dining table, 1 laptop, 1000.0ms
Speed: 2.0ms preprocess, 1000.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 bicycle, 1 cell phone, 911.9ms
Speed: 2.0ms preprocess, 911.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 bicycle, 1 cell phone, 852.5ms
Speed: 3.0ms preprocess, 852.5ms inference, 2.0ms p


0: 384x640 2 persons, 2 wine glasss, 868.0ms
Speed: 2.0ms preprocess, 868.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 tie, 2 wine glasss, 1 vase, 817.5ms
Speed: 2.0ms preprocess, 817.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 chair, 2 dining tables, 2 vases, 837.0ms
Speed: 2.0ms preprocess, 837.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 dining table, 1 vase, 851.1ms
Speed: 2.0ms preprocess, 851.1ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 vase, 846.0ms
Speed: 2.0ms preprocess, 846.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 1 tie, 2 vases, 906.5ms
Speed: 2.0ms preprocess, 906.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 2 wine glasss, 2 vases, 897.0ms
Speed: 2.0ms preprocess, 897.0ms inference,


0: 384x640 3 persons, 1 bicycle, 815.9ms
Speed: 2.0ms preprocess, 815.9ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 836.8ms
Speed: 2.0ms preprocess, 836.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 798.0ms
Speed: 3.0ms preprocess, 798.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 1 bicycle, 814.0ms
Speed: 2.0ms preprocess, 814.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 2 bicycles, 1 sink, 861.7ms
Speed: 2.0ms preprocess, 861.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 2 bicycles, 1 tie, 1 sink, 875.0ms
Speed: 2.0ms preprocess, 875.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 2 vases, 875.0ms
Speed: 2.0ms preprocess, 875.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person


0: 384x640 9 persons, 1 car, 2 handbags, 6 chairs, 2 potted plants, 1 dining table, 869.0ms
Speed: 2.0ms preprocess, 869.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1 car, 5 chairs, 1 potted plant, 1 dining table, 889.0ms
Speed: 3.0ms preprocess, 889.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 handbag, 6 chairs, 2 potted plants, 3 dining tables, 862.0ms
Speed: 2.0ms preprocess, 862.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 cup, 6 chairs, 1 dining table, 945.0ms
Speed: 2.0ms preprocess, 945.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 car, 6 chairs, 2 potted plants, 2 dining tables, 813.9ms
Speed: 3.0ms preprocess, 813.9ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 10 persons, 1 car, 4 chairs, 2 dining tables, 991.1ms
Speed: 2.0ms preprocess, 991.1ms infere


0: 384x640 3 persons, 1 handbag, 1 cup, 1 laptop, 1 vase, 845.0ms
Speed: 2.0ms preprocess, 845.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 bottles, 845.0ms
Speed: 2.0ms preprocess, 845.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 bottle, 1 couch, 848.0ms
Speed: 2.0ms preprocess, 848.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 bottle, 1 couch, 1 laptop, 1 book, 828.0ms
Speed: 3.0ms preprocess, 828.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 bottle, 1 donut, 1 couch, 1 laptop, 845.0ms
Speed: 2.0ms preprocess, 845.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 motorcycle, 1 backpack, 1 handbag, 1 cell phone, 872.0ms
Speed: 2.0ms preprocess, 872.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 bananas, 855.0ms
Speed: 


0: 384x640 7 persons, 3 cups, 3 chairs, 5 dining tables, 880.0ms
Speed: 3.0ms preprocess, 880.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 1 backpack, 2 bottles, 17 chairs, 4 potted plants, 12 dining tables, 857.0ms
Speed: 2.0ms preprocess, 857.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 1 backpack, 1 cup, 15 chairs, 4 potted plants, 8 dining tables, 961.0ms
Speed: 3.0ms preprocess, 961.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 1 backpack, 1 cup, 14 chairs, 4 potted plants, 7 dining tables, 840.7ms
Speed: 5.8ms preprocess, 840.7ms inference, 15.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 14 persons, 1 backpack, 9 chairs, 2 potted plants, 9 dining tables, 854.0ms
Speed: 2.0ms preprocess, 854.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 persons, 1 backpack, 1 cup, 12 chairs, 2 potted


0: 384x640 2 persons, 840.2ms
Speed: 2.0ms preprocess, 840.2ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 880.5ms
Speed: 2.0ms preprocess, 880.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 865.0ms
Speed: 2.0ms preprocess, 865.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 846.0ms
Speed: 2.0ms preprocess, 846.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 886.0ms
Speed: 2.0ms preprocess, 886.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 857.0ms
Speed: 2.0ms preprocess, 857.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 tie, 961.0ms
Speed: 3.0ms preprocess, 961.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 827.0ms
Speed: 2.0ms preprocess, 827.0ms inference, 2.0ms postproce

In [2]:
!pip install -q imageio
!pip install -q opencv-python
!pip install -q git+https://github.com/tensorflow/docs

In [3]:
#@title Import the necessary modules
# TensorFlow and TF-Hub modules.
from absl import logging

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow_docs.vis import embed

logging.set_verbosity(logging.ERROR)

# Some modules to help with reading the UCF101 dataset.
import random
import re
import os
import tempfile
import ssl
import cv2
import numpy as np

# Some modules to display an animation using imageio.
import imageio
from IPython import display

from urllib import request  # requires python3

In [4]:
#@title Helper functions for the UCF101 dataset

# Utilities to fetch videos from UCF101 dataset
UCF_ROOT = "https://www.crcv.ucf.edu/THUMOS14/UCF101/UCF101/"
_VIDEO_LIST = None
_CACHE_DIR = tempfile.mkdtemp()

unverified_context = ssl._create_unverified_context()

def list_ucf_videos():
  """Lists videos available in UCF101 dataset."""
  global _VIDEO_LIST
  if not _VIDEO_LIST:
    index = request.urlopen(UCF_ROOT, context=unverified_context).read().decode("utf-8")
    videos = re.findall("(v_[\w_]+\.avi)", index)
    _VIDEO_LIST = sorted(set(videos))
  return list(_VIDEO_LIST)

def fetch_ucf_video(video):
  """Fetchs a video and cache into local filesystem."""
  cache_path = os.path.join(_CACHE_DIR, video)
  if not os.path.exists(cache_path):
    urlpath = request.urljoin(UCF_ROOT, video)
    print("Fetching %s => %s" % (urlpath, cache_path))
    data = request.urlopen(urlpath, context=unverified_context).read()
    open(cache_path, "wb").write(data)
  return cache_path

# Utilities to open video files using CV2
def crop_center_square(frame):
  y, x = frame.shape[0:2]
  min_dim = min(y, x)
  start_x = (x // 2) - (min_dim // 2)
  start_y = (y // 2) - (min_dim // 2)
  return frame[start_y:start_y+min_dim,start_x:start_x+min_dim]

def load_video(path, max_frames=0, resize=(224, 224), frame_skip=4):
    cap = cv2.VideoCapture(path)
    frames = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    if max_frames == 0 or max_frames > total_frames:
        max_frames = total_frames

    try:
        frame_count = 0
        while frame_count < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_skip == 0:
                frame = crop_center_square(frame)
                frame = cv2.resize(frame, resize)
                frame = frame[:, :, [2, 1, 0]]
                frames.append(frame)
                frame_count += 1

            # Skip frames based on desired frame rate (e.g., 15 frames per second)
            skip_frames = int(fps / 15) - 1
            for _ in range(skip_frames):
                cap.read()

            frame_count += skip_frames + 1

    finally:
        cap.release()

    return np.array(frames) / 255.0



def to_gif(images):
  converted_images = np.clip(images * 255, 0, 255).astype(np.uint8)
  imageio.mimsave('./animation.gif', converted_images, duration=40)
  return embed.embed_file('./animation.gif')

In [5]:
labels = ['abseiling',
 'air drumming',
 'answering questions',
 'applauding',
 'applying cream',
 'archery',
 'arm wrestling',
 'arranging flowers',
 'assembling computer',
 'auctioning',
 'baby waking up',
 'baking cookies',
 'balloon blowing',
 'bandaging',
 'barbequing',
 'bartending',
 'beatboxing',
 'bee keeping',
 'belly dancing',
 'bench pressing',
 'bending back',
 'bending metal',
 'biking through snow',
 'blasting sand',
 'blowing glass',
 'blowing leaves',
 'blowing nose',
 'blowing out candles',
 'bobsledding',
 'bookbinding',
 'bouncing on trampoline',
 'bowling',
 'braiding hair',
 'breading or breadcrumbing',
 'breakdancing',
 'brush painting',
 'brushing hair',
 'brushing teeth',
 'building cabinet',
 'building shed',
 'bungee jumping',
 'busking',
 'canoeing or kayaking',
 'capoeira',
 'carrying baby',
 'cartwheeling',
 'carving pumpkin',
 'catching fish',
 'catching or throwing baseball',
 'catching or throwing frisbee',
 'catching or throwing softball',
 'celebrating',
 'changing oil',
 'changing wheel',
 'checking tires',
 'cheerleading',
 'chopping wood',
 'clapping',
 'clay pottery making',
 'clean and jerk',
 'cleaning floor',
 'cleaning gutters',
 'cleaning pool',
 'cleaning shoes',
 'cleaning toilet',
 'cleaning windows',
 'climbing a rope',
 'climbing ladder',
 'climbing tree',
 'contact juggling',
 'cooking chicken',
 'cooking egg',
 'cooking on campfire',
 'cooking sausages',
 'counting money',
 'country line dancing',
 'cracking neck',
 'crawling baby',
 'crossing river',
 'crying',
 'curling hair',
 'cutting nails',
 'cutting pineapple',
 'cutting watermelon',
 'dancing ballet',
 'dancing charleston',
 'dancing gangnam style',
 'dancing macarena',
 'deadlifting',
 'decorating the christmas tree',
 'digging',
 'dining',
 'disc golfing',
 'diving cliff',
 'dodgeball',
 'doing aerobics',
 'doing laundry',
 'doing nails',
 'drawing',
 'dribbling basketball',
 'drinking',
 'drinking beer',
 'drinking shots',
 'driving car',
 'driving tractor',
 'drop kicking',
 'drumming fingers',
 'dunking basketball',
 'dying hair',
 'eating burger',
 'eating cake',
 'eating carrots',
 'eating chips',
 'eating doughnuts',
 'eating hotdog',
 'eating ice cream',
 'eating spaghetti',
 'eating watermelon',
 'egg hunting',
 'exercising arm',
 'exercising with an exercise ball',
 'extinguishing fire',
 'faceplanting',
 'feeding birds',
 'feeding fish',
 'feeding goats',
 'filling eyebrows',
 'finger snapping',
 'fixing hair',
 'flipping pancake',
 'flying kite',
 'folding clothes',
 'folding napkins',
 'folding paper',
 'front raises',
 'frying vegetables',
 'garbage collecting',
 'gargling',
 'getting a haircut',
 'getting a tattoo',
 'giving or receiving award',
 'golf chipping',
 'golf driving',
 'golf putting',
 'grinding meat',
 'grooming dog',
 'grooming horse',
 'gymnastics tumbling',
 'hammer throw',
 'headbanging',
 'headbutting',
 'high jump',
 'high kick',
 'hitting baseball',
 'hockey stop',
 'holding snake',
 'hopscotch',
 'hoverboarding',
 'hugging',
 'hula hooping',
 'hurdling',
 'hurling (sport)',
 'ice climbing',
 'ice fishing',
 'ice skating',
 'ironing',
 'javelin throw',
 'jetskiing',
 'jogging',
 'juggling balls',
 'juggling fire',
 'juggling soccer ball',
 'jumping into pool',
 'jumpstyle dancing',
 'kicking field goal',
 'kicking soccer ball',
 'kissing',
 'kitesurfing',
 'knitting',
 'krumping',
 'laughing',
 'laying bricks',
 'long jump',
 'lunge',
 'making a cake',
 'making a sandwich',
 'making bed',
 'making jewelry',
 'making pizza',
 'making snowman',
 'making sushi',
 'making tea',
 'marching',
 'massaging back',
 'massaging feet',
 'massaging legs',
 "massaging person's head",
 'milking cow',
 'mopping floor',
 'motorcycling',
 'moving furniture',
 'mowing lawn',
 'news anchoring',
 'opening bottle',
 'opening present',
 'paragliding',
 'parasailing',
 'parkour',
 'passing American football (in game)',
 'passing American football (not in game)',
 'peeling apples',
 'peeling potatoes',
 'petting animal (not cat)',
 'petting cat',
 'picking fruit',
 'planting trees',
 'plastering',
 'playing accordion',
 'playing badminton',
 'playing bagpipes',
 'playing basketball',
 'playing bass guitar',
 'playing cards',
 'playing cello',
 'playing chess',
 'playing clarinet',
 'playing controller',
 'playing cricket',
 'playing cymbals',
 'playing didgeridoo',
 'playing drums',
 'playing flute',
 'playing guitar',
 'playing harmonica',
 'playing harp',
 'playing ice hockey',
 'playing keyboard',
 'playing kickball',
 'playing monopoly',
 'playing organ',
 'playing paintball',
 'playing piano',
 'playing poker',
 'playing recorder',
 'playing saxophone',
 'playing squash or racquetball',
 'playing tennis',
 'playing trombone',
 'playing trumpet',
 'playing ukulele',
 'playing violin',
 'playing volleyball',
 'playing xylophone',
 'pole vault',
 'presenting weather forecast',
 'pull ups',
 'pumping fist',
 'pumping gas',
 'punching bag',
 'punching person (boxing)',
 'push up',
 'pushing car',
 'pushing cart',
 'pushing wheelchair',
 'reading book',
 'reading newspaper',
 'recording music',
 'riding a bike',
 'riding camel',
 'riding elephant',
 'riding mechanical bull',
 'riding mountain bike',
 'riding mule',
 'riding or walking with horse',
 'riding scooter',
 'riding unicycle',
 'ripping paper',
 'robot dancing',
 'rock climbing',
 'rock scissors paper',
 'roller skating',
 'running on treadmill',
 'sailing',
 'salsa dancing',
 'sanding floor',
 'scrambling eggs',
 'scuba diving',
 'setting table',
 'shaking hands',
 'shaking head',
 'sharpening knives',
 'sharpening pencil',
 'shaving head',
 'shaving legs',
 'shearing sheep',
 'shining shoes',
 'shooting basketball',
 'shooting goal (soccer)',
 'shot put',
 'shoveling snow',
 'shredding paper',
 'shuffling cards',
 'side kick',
 'sign language interpreting',
 'singing',
 'situp',
 'skateboarding',
 'ski jumping',
 'skiing (not slalom or crosscountry)',
 'skiing crosscountry',
 'skiing slalom',
 'skipping rope',
 'skydiving',
 'slacklining',
 'slapping',
 'sled dog racing',
 'smoking',
 'smoking hookah',
 'snatch weight lifting',
 'sneezing',
 'sniffing',
 'snorkeling',
 'snowboarding',
 'snowkiting',
 'snowmobiling',
 'somersaulting',
 'spinning poi',
 'spray painting',
 'spraying',
 'springboard diving',
 'squat',
 'sticking tongue out',
 'stomping grapes',
 'stretching arm',
 'stretching leg',
 'strumming guitar',
 'surfing crowd',
 'surfing water',
 'sweeping floor',
 'swimming backstroke',
 'swimming breast stroke',
 'swimming butterfly stroke',
 'swing dancing',
 'swinging legs',
 'swinging on something',
 'sword fighting',
 'tai chi',
 'taking a shower',
 'tango dancing',
 'tap dancing',
 'tapping guitar',
 'tapping pen',
 'tasting beer',
 'tasting food',
 'testifying',
 'texting',
 'throwing axe',
 'throwing ball',
 'throwing discus',
 'tickling',
 'tobogganing',
 'tossing coin',
 'tossing salad',
 'training dog',
 'trapezing',
 'trimming or shaving beard',
 'trimming trees',
 'triple jump',
 'tying bow tie',
 'tying knot (not on a tie)',
 'tying tie',
 'unboxing',
 'unloading truck',
 'using computer',
 'using remote controller (not gaming)',
 'using segway',
 'vault',
 'waiting in line',
 'walking the dog',
 'washing dishes',
 'washing feet',
 'washing hair',
 'washing hands',
 'water skiing',
 'water sliding',
 'watering plants',
 'waxing back',
 'waxing chest',
 'waxing eyebrows',
 'waxing legs',
 'weaving basket',
 'welding',
 'whistling',
 'windsurfing',
 'wrapping present',
 'wrestling',
 'writing',
 'yawning',
 'yoga',
 'zumba']

In [6]:
# Get the list of videos in the dataset.
ucf_videos = list_ucf_videos()

categories = {}
for video in ucf_videos:
  category = video[2:-12]
  if category not in categories:
    categories[category] = []
  categories[category].append(video)
print("Found %d videos in %d categories." % (len(ucf_videos), len(categories)))

for category, sequences in categories.items():
  summary = ", ".join(sequences[:2])
  print("%-20s %4d videos (%s, ...)" % (category, len(sequences), summary))


Found 13320 videos in 101 categories.
ApplyEyeMakeup        145 videos (v_ApplyEyeMakeup_g01_c01.avi, v_ApplyEyeMakeup_g01_c02.avi, ...)
ApplyLipstick         114 videos (v_ApplyLipstick_g01_c01.avi, v_ApplyLipstick_g01_c02.avi, ...)
Archery               145 videos (v_Archery_g01_c01.avi, v_Archery_g01_c02.avi, ...)
BabyCrawling          132 videos (v_BabyCrawling_g01_c01.avi, v_BabyCrawling_g01_c02.avi, ...)
BalanceBeam           108 videos (v_BalanceBeam_g01_c01.avi, v_BalanceBeam_g01_c02.avi, ...)
BandMarching          155 videos (v_BandMarching_g01_c01.avi, v_BandMarching_g01_c02.avi, ...)
BaseballPitch         150 videos (v_BaseballPitch_g01_c01.avi, v_BaseballPitch_g01_c02.avi, ...)
BasketballDunk        131 videos (v_BasketballDunk_g01_c01.avi, v_BasketballDunk_g01_c02.avi, ...)
Basketball            134 videos (v_Basketball_g01_c01.avi, v_Basketball_g01_c02.avi, ...)
BenchPress            160 videos (v_BenchPress_g01_c01.avi, v_BenchPress_g01_c02.avi, ...)
Biking              

In [7]:
i3d = hub.load("https://tfhub.dev/deepmind/i3d-kinetics-400/1").signatures['default']

In [8]:
def predict(sample_video, batch_size):
  # Add a batch axis to the sample video.
  model_input = tf.constant(sample_video, dtype=tf.float32)
  model_input = tf.expand_dims(model_input, axis=0)

  logits = i3d(model_input)['default'][0]
  probabilities = tf.nn.softmax(logits)

  print("Top 5 actions:")
  for i in np.argsort(probabilities)[::-1][:5]:
    print(f"  {labels[i]:22}: {probabilities[i] * 100:5.2f}%")

In [9]:
import glob

In [10]:
video_path = "../videos/NYTravel.mp4"

In [11]:
video_path

'../videos/NYTravel.mp4'

In [13]:
import cv2
import numpy as np
import tensorflow as tf

def load_video(path, max_frames=0, resize=(224, 224), frame_skip=4):
    cap = cv2.VideoCapture(path)
    frames = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    if max_frames == 0 or max_frames > total_frames:
        max_frames = total_frames

    try:
        frame_count = 0
        while frame_count < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_skip == 0:
                frame = crop_center_square(frame)
                frame = cv2.resize(frame, resize)
                frame = frame[:, :, [2, 1, 0]]
                frames.append(frame)
                frame_count += 1

          
            skip_frames = int(fps / 15) - 1
            for _ in range(skip_frames):
                cap.read()

            frame_count += skip_frames + 1

    finally:
        cap.release()

    return np.array(frames) / 255.0

def predict_in_batches(video_path, batch_size, threshold):
  
    sample_video = load_video(video_path)
    to_gif(sample_video)

   
    num_frames = sample_video.shape[0]
    num_batches = (num_frames + batch_size - 1) // batch_size

    with open('captions.txt', 'a') as file:
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = min(start_idx + batch_size, num_frames)

          
            batch = sample_video[start_idx:end_idx]

    
            model_input = tf.constant(batch, dtype=tf.float32)
            model_input = tf.expand_dims(model_input, axis=0)

            logits = i3d(model_input)['default'][0]
            probabilities = tf.nn.softmax(logits)
            print("Top 5 actions:")
            for i in np.argsort(probabilities)[::-1][:5]:
                print(f"  {labels[i]:22}: {probabilities[i] * 100:5.2f}%")
            for j, score in enumerate(probabilities):
                if score*100 > threshold:
                    score = score*100
                    action = labels[j]
                    timestamp = start_idx + j
                    file.write(f"Detected  {action} motion with percentage {score}% in video at timestamp {timestamp} \n")
            
                    print(f"{timestamp}: {action}")

batch_size = 10
threshold = 50

predict_in_batches(video_path, batch_size, threshold)


Top 5 actions:
  hopscotch             : 46.59%
  skydiving             : 22.52%
  ice fishing           :  4.64%
  walking the dog       :  3.99%
  kitesurfing           :  3.35%
Top 5 actions:
  hopscotch             : 99.90%
  skydiving             :  0.06%
  catching or throwing frisbee:  0.01%
  playing cricket       :  0.00%
  playing squash or racquetball:  0.00%
166: hopscotch
Top 5 actions:
  hopscotch             : 99.99%
  skydiving             :  0.00%
  presenting weather forecast:  0.00%
  folding napkins       :  0.00%
  catching or throwing frisbee:  0.00%
176: hopscotch
Top 5 actions:
  hopscotch             : 99.97%
  skydiving             :  0.01%
  cleaning pool         :  0.01%
  catching or throwing frisbee:  0.00%
  checking tires        :  0.00%
186: hopscotch
Top 5 actions:
  hopscotch             : 99.88%
  cleaning pool         :  0.05%
  checking tires        :  0.03%
  skydiving             :  0.01%
  playing monopoly      :  0.00%
196: hopscotch
Top 5 acti

Top 5 actions:
  answering questions   : 36.73%
  tasting food          :  4.15%
  laughing              :  3.42%
  wrapping present      :  2.84%
  texting               :  2.82%
Top 5 actions:
  tasting food          : 23.69%
  trimming or shaving beard:  7.86%
  answering questions   :  5.74%
  eating ice cream      :  4.63%
  doing nails           :  3.70%
Top 5 actions:
  trimming or shaving beard: 12.36%
  giving or receiving award:  6.03%
  brushing teeth        :  5.80%
  washing hands         :  4.96%
  tying bow tie         :  4.61%
Top 5 actions:
  spraying              : 13.83%
  spray painting        :  7.75%
  flying kite           :  4.13%
  catching or throwing frisbee:  2.76%
  answering questions   :  2.55%
Top 5 actions:
  balloon blowing       : 13.41%
  kissing               : 12.94%
  flying kite           :  9.14%
  smoking               :  4.86%
  sniffing              :  4.61%
Top 5 actions:
  laughing              : 15.83%
  gargling              :  8.17%
  te

Top 5 actions:
  cleaning windows      : 19.69%
  washing hands         :  6.10%
  taking a shower       :  5.14%
  brushing teeth        :  5.07%
  fixing hair           :  3.80%
Top 5 actions:
  trimming or shaving beard: 34.32%
  washing hands         :  9.94%
  giving or receiving award:  9.24%
  cleaning windows      :  8.14%
  grooming dog          :  3.96%
Top 5 actions:
  playing trumpet       : 33.28%
  drinking beer         : 21.13%
  playing harmonica     : 11.43%
  drinking              : 10.07%
  playing recorder      :  4.89%
Top 5 actions:
  plastering            : 15.23%
  doing nails           : 12.45%
  trimming or shaving beard: 12.19%
  applying cream        :  8.89%
  contact juggling      :  8.19%
Top 5 actions:
  busking               :  4.99%
  cleaning windows      :  4.97%
  playing bagpipes      :  4.72%
  giving or receiving award:  4.25%
  stretching arm        :  4.09%
Top 5 actions:
  cleaning windows      : 15.54%
  stretching leg        :  7.02%
  washi

Top 5 actions:
  walking the dog       :  4.87%
  stretching arm        :  3.99%
  catching or throwing frisbee:  3.67%
  shaking hands         :  3.39%
  jogging               :  3.06%
Top 5 actions:
  washing hands         :  5.83%
  answering questions   :  3.75%
  surfing crowd         :  3.30%
  eating hotdog         :  3.22%
  catching or throwing frisbee:  2.32%
Top 5 actions:
  washing hands         : 34.93%
  flipping pancake      :  2.79%
  playing basketball    :  2.67%
  dancing macarena      :  2.63%
  playing volleyball    :  2.61%
Top 5 actions:
  applauding            : 10.72%
  dining                :  7.20%
  celebrating           :  5.53%
  giving or receiving award:  4.70%
  eating hotdog         :  3.50%
Top 5 actions:
  tossing coin          : 21.71%
  giving or receiving award:  4.43%
  rock scissors paper   :  4.09%
  unboxing              :  3.80%
  reading newspaper     :  3.51%
Top 5 actions:
  playing squash or racquetball: 35.63%
  folding napkins       : 1

Top 5 actions:
  washing hands         : 22.35%
  cleaning pool         :  6.01%
  doing nails           :  2.72%
  cleaning windows      :  2.53%
  eating hotdog         :  2.17%
Top 5 actions:
  yoga                  : 12.66%
  stretching arm        : 11.79%
  stretching leg        :  7.57%
  washing hands         :  4.46%
  counting money        :  2.78%
Top 5 actions:
  playing monopoly      :  4.62%
  cleaning pool         :  4.40%
  reading newspaper     :  3.91%
  cleaning windows      :  2.73%
  spray painting        :  2.39%
Top 5 actions:
  washing hands         :  7.46%
  catching or throwing baseball:  5.64%
  reading newspaper     :  4.37%
  eating hotdog         :  3.98%
  eating ice cream      :  3.82%
Top 5 actions:
  hopscotch             : 11.96%
  playing volleyball    :  5.00%
  catching or throwing frisbee:  3.36%
  training dog          :  3.20%
  washing hands         :  3.19%
Top 5 actions:
  cleaning windows      :  5.10%
  washing hands         :  4.86%
  jogg