**This file allows one to run our system with a pre-loaded video and pre-trained image captioning model. This does not include our tests, performance benchmarks, model training, etc... for the various components.**

*These can be found in video_to_frames.ipynb, Image_captioning_with_transformers_final.ipynb, and image_captions_to_story.py, respectively*

When running this file, make sure a video is loaded in the directory or runtime environment, and change the video variable in the last cell to the video file name

## Imports

In [1]:
!pip install "deeplake<4"
!pip install --upgrade transformers
!pip install nltk
!pip install torchvision
!pip install pycocoevalcap
!pip install --upgrade opencv-python opencv-python-headless
!pip install wordfreq



In [3]:
import torch
import torch.nn as nn

import deeplake
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import cv2
import os

from torchvision import transforms
from transformers import BertTokenizer, BertModel
import nltk
import random

from torch.utils.data import Dataset
from PIL import Image
from transformers import pipeline
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
import math
from wordfreq import word_frequency
import re
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joshdeleeuw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/joshdeleeuw/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Part I : Video to Frames

In [4]:
def extract_frames_with_opencv(video_path, output_dir, gap):
    """
    Extracts and saves frames from a video at specified intervals.

    Parameters:
    - video_path (str): Path to the video file.
    - output_dir (str): Directory to save the extracted frames.
    - gap (int): Interval between frames to save (e.g., every 10th frame).

    Returns:
    - list of str: Paths to the saved frame images, or an empty list if the video cannot be opened.

    The function captures frames from the specified video file, saving every nth frame defined by the gap parameter.
    It logs the process, including the number of frames processed and saved. Errors in opening the video are also logged.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Cannot open video file {video_path}")
        return []

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    print(f"Video FPS: {fps}, Total frames: {total_frames}")

    frame_count = 0
    saved_count = 0
    image_paths = []

    while True:
        ret, frame = cap.read()
        if not ret:
            print(f"End of video or frame read failed at frame {frame_count}.")
            break

        if frame_count % gap == 0:
            image_path = os.path.join(output_dir, f"frame_{saved_count*gap}.jpg")
            cv2.imwrite(image_path, frame)
            image_paths.append(image_path)
            saved_count += 1
            print(f"Saved frame {frame_count} to {image_path}")

        frame_count += 1

    cap.release()
    print(f"Total frames processed: {frame_count}")
    print(f"Total frames saved: {saved_count}")
    return image_paths

# Part II : Image Captioning

## Preprocessing Data, Tokenizing

In [5]:
ds = deeplake.load('hub://activeloop/flickr30k')



Opening dataset in read-only mode as you don't have write permissions.


|

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/flickr30k



/

hub://activeloop/flickr30k loaded successfully.



-

In [6]:
images = ds.image
captions = ds.caption_0 ## NextSteps: currently only training on caption0, could include other set of captions

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [8]:
caption_max_length = 20
vocab_size = len(tokenizer)

def process_image(image):
  """
  Image transformation functions
  Resizes images to 224x224, converts to tensor, and normalizes
  The normalization parameters are mean, SD of of r, g, b pixel values
  """
  image_transform = transforms.Compose([
      transforms.Resize((224, 224)),
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
  ])
  image = image.numpy()
  image = Image.fromarray(image)
  image = image_transform(image)
  return image


def process_caption(caption_text):
  """
  Tokenizes captions
  Returns tensor of tokenized captions, with a max length of 20,
  padding until this length, and truncating if the caption is longer
  The individual entries in the tensor are integers
  """

  tokenized = tokenizer(caption_text,
                        max_length=caption_max_length,
                        padding='max_length',
                        truncation=True,
                        return_tensors='pt')

  input_ids = tokenized['input_ids']
  attention_mask = tokenized['attention_mask']
  return input_ids

def untokenize(tokenized_text):
    """
    Untokenizes a tensor
    Takes a tensor of integers, and according to tokenizer, returns
    original words
    """
    return tokenizer.decode(tokenized_text, skip_special_tokens=True)


## Transformer-Based Image Captioning
Idea from : https://www.tensorflow.org/text/tutorials/image_captioning

Steps :  

1. Feature Extraction of Image (CNN)

2. Word Embedding : word + positional embedding

3. Decode Layer : Self attention, Cross Attention, Feed Forward Neural Network

4. Output : Post Processing of probability vector

## Feature Extraction

In [9]:
class CNN_feature_extraction(nn.Module):
  def __init__(self):
    """
    Initializes convolutional neural network
    """
    super(CNN_feature_extraction, self).__init__()
    self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1).to(device) # (224,224,64)
    self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1).to(device)# (224,224,128)
    self.conv3 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1).to(device)# (224,224,256)

    self.relu = nn.ReLU()
    self.fc = nn.Linear(256 * 28 * 28, 512).to(device)
    self.fc2 = nn.Linear(512, 256).to(device)
    self.flatten = nn.Flatten()

    self.maxPool = nn.MaxPool2d(2, 2).to(device)

    self.batchNorm1 = nn.BatchNorm2d(64).to(device)
    self.batchNorm2 = nn.BatchNorm2d(128).to(device)
    self.batchNorm3 = nn.BatchNorm2d(256).to(device)

  def forward(self, x):
    """
    Forward pass of convolutional neural network
    Note: Output is already flattened
    """
    # for each layer, run convolutional layer, batch normalization layer,
    # then maxPool layer
    layer1 = self.maxPool(self.relu(self.batchNorm1(self.conv1(x)))) # batch_size, 64, 112, 112 (b, c , h, w )
    layer2 = self.maxPool(self.relu(self.batchNorm2(self.conv2(layer1)))) # batch_size, 128, 56, 56
    layer3 = self.maxPool(self.relu(self.batchNorm3(self.conv3(layer2)))) # batch_size, 256, 28, 28
    x1 = self.flatten(layer3)  # 1, 200704 = 256*28*28
    hidden = self.relu(self.fc(x1))
    x = self.fc2(hidden) # 1, 256
    return x

## Embedding Layer
Embedding = Word + Positional Embedding.

In [10]:
## Hyperparameter
embed_dim = 256

In [11]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, max_length, dim):
        """
        Combines token and positional embeddings to produce sequence embeddings for Transformers.
        Input:
          vocab_size: size of vocab. Used for token embedding
          max_length: max length of caption. Used for positional embedding
          dim: Dimension of embedding vectors. Dimension of both token and positional Embedding
        """

        super().__init__()
        self.pos_embedding = nn.Embedding(num_embeddings=max_length, embedding_dim=dim).to(device)
        self.token_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim, padding_idx=0).to(device) # ie padding_idx that = 0 will not be trained

    def forward(self, caption):
        """
          Input: Caption (batch, max_length = 20)
          Output: Embedding (batch, max_length, dim)
        """
        _, max_length = caption.shape

        token_embed = self.token_embedding(caption)

        # Create a positional array with length = caption length (20)
        positional_indices = torch.arange(max_length, device=caption.device).unsqueeze(0).to(device) # 1, 20
        position_embed = self.pos_embedding(positional_indices)

        return token_embed + position_embed

## Decoder
Contains self attention, cross attention, feed forward neural network

In [12]:
## Hyperparameters
num_heads = 1
dropout =  0.1

### Self Attention Layer

In [13]:
class SelfAttention(nn.Module):
    """
    Self-Attention mechanism for token embeddings. Capture dependencies between words.

    Inputs:
        embed_dim: Embedding dimension = embed_dim
        num_heads: The number of attention heads. Each head learns different aspects of the relationships between tokens.
        dropout: Dropout rate used to prevent overfitting
    """
    def __init__(self,  num_heads = 1, embed_dim = embed_dim, dropout=0.1):
        super(SelfAttention, self).__init__()


        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout).to(device)
        self.layernorm = nn.LayerNorm(embed_dim).to(device)

    def forward(self, x_token):
        """
        input shape : (batch, max_length, dim)
        output shape : (batch, max_length, dim)
        """
        attn_output, attn_output_weights = self.attention(query=x_token, key=x_token, value=x_token)
        x = x_token + attn_output  #Residual connection prevent vanishing grad
        return self.layernorm(x)

### Cross Attention Layer

In [14]:
class CrossAttention(nn.Module):
    """
    Cross Attention between caption and image.

    Input:
        Caption Embedding : (batch, seq_length, embed_dim)
        Image Feature Extraction : (batch, 256)
    """

    def __init__(self, embed_dim, num_heads=1, dropout=0.1):
        super(CrossAttention, self).__init__()
        self.mha = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout).to(device)
        self.layernorm = nn.LayerNorm(embed_dim).to(device)

    def forward(self, caption, image):
        batch_size_c, seq_length, embed_dim = caption.shape
        batch_size_i, image_embed = image.shape
        assert batch_size_c == batch_size_i, "Batch Dimension of image and caption does not match"
        assert image_embed == 256, "Image Shape Incorrect"
        assert image_embed == embed_dim , "Image dimension does not match Token dimension"

        # create the same image for seq_length
        image_broadcasted = image.unsqueeze(1).repeat(1, seq_length, 1) # batch, seq_length, image_dim
        attn_output, attention_scores = self.mha(query=caption, key=image_broadcasted, value=image_broadcasted)
        caption = caption + attn_output  # Residual connection
        self.last_attention_scores = attention_scores
        return self.layernorm(caption)  #(batch, max_length, image_dim)

### Feed Forward Neural Network

In [15]:
class FeedForward(nn.Module):
    def __init__(self, embed_dim, dropout_rate=0.1):
        super(FeedForward, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(embed_dim, 2 * embed_dim).to(device),
            nn.ReLU(),
            nn.Linear(2 * embed_dim, embed_dim).to(device),
            nn.Dropout(dropout_rate)
        )
        self.layernorm = nn.LayerNorm(embed_dim).to(device)

    def forward(self, x):
        x = x + self.layer(x).to(device)
        return self.layernorm(x)


### Decoder Layer

In [16]:
class DecoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads=1, dropout=0.1):
        super(DecoderLayer, self).__init__()

        self.self_attention = SelfAttention(num_heads, embed_dim, dropout)
        self.cross_attention = CrossAttention(embed_dim, num_heads, dropout)
        self.ffnn = FeedForward(embed_dim, dropout)

    def forward(self, inputs):
        image, caption = inputs
        caption_1 = self.self_attention(caption)
        output_seq = self.cross_attention(caption_1, image)
        out_seq = self.ffnn(output_seq)

        self.last_attention_scores = self.cross_attention.last_attention_scores

        return out_seq # batch, seq_length, embed_dim

## PostProcessing

1. Problem: Some tokens occur much more frequently than others in the dataset.
Without adjustment, the model may develop a bias toward predicting these frequent tokens, regardless of context.
Incorporate token frequency information in the logits using the bias term.
For instance, less frequent tokens are given a higher weight, encouraging the model to predict them when appropriate.

2. Problem: Some tokens, like [UNK] or [START], are placeholders or special tokens not intended for prediction.
Solution:
Add a large negative bias (-1e9) to these tokens' logits to make their probabilities effectively zero after softmax.
This guarantees they are not predicted during decoding.

In [17]:
import collections
from tqdm import tqdm

In [18]:
class PostProcessing(nn.Module):
    def __init__(self, embed_dim, tokenizer, banned_tokens=('[UNK]', '[PAD]', '[SEP]', '[CLS]', '[MASK]', 'the', 'a', 'in')):

        super(PostProcessing, self).__init__()
        self.vocab_size = tokenizer.vocab_size
        self.tokenizer = tokenizer
        self.fc = nn.Linear(embed_dim, out_features=tokenizer.vocab_size).to(device) ## Goal is to map to vocab size # batch, seq_length, vocab_size
        self.banned_tokens = banned_tokens

        self.bias = None

    def adapt(self, dataset):
        """
            vocab_dict: a dictionary to map vocab word to assigned index (from 0 to vocab_size)
            counts : a dictionary holding frequency of each word(token), key: id value :freq
        """
        counts = collections.Counter()
        vocab_dict = {name: id for id, name in enumerate(self.tokenizer.vocab)} # map name to id


        for tokens in tqdm(dataset):
            counts.update(tokens.numpy().flatten())  # Update freq for token in the sentence

        # Turn Frequency into an array of all vocabulary
        # fill vocab array with freq
        # array indices = token_id
        counts_arr = np.zeros(shape=(self.vocab_size,))
        indices = np.array(list(counts.keys()), dtype=np.int32)
        counts_arr[indices] = list(counts.values())


        for token in self.banned_tokens:
            id = vocab_dict[token]
            counts_arr[id] = 0

        total = counts_arr.sum()
        p = counts_arr / total
        p[counts_arr == 0] = 1.0  # Prevent log(0)
        log_p = np.log(p)  # log(1) == 0

        entropy = -(log_p * p).sum()

        # print(f"\nUniform entropy: {np.log(self.tokenizer.vocab_size):0.2f}")
        # print(f"Marginal entropy: {entropy:0.2f}")

        self.bias = torch.tensor(log_p, dtype=torch.float32)
        self.bias[counts_arr == 0] = -1e9  # Set large negative values for banned tokens

    def forward(self, input):
        input = input.float()
        x = self.fc(input)
        # x = x + self.bias.to(x.device)  # Ensure bias is on the same device as the input
        return x

## Image Captioner : Put it all together

In [19]:
banned_tokens=('[UNK]', '[PAD]', '[SEP]', '[CLS]', '[MASK]', 'the', 'a', 'in')

class ImageCaption(nn.Module):
    def __init__(self, tokenizer, vocab_size, num_layers=1, embed_dim=256, max_length=20, num_heads=2, dropout=0.1):
        super(ImageCaption, self).__init__()

        self.feature_extractor_model = CNN_feature_extraction()
        self.embedding = Embedding(vocab_size , max_length, embed_dim)
        self.decoder_layer = DecoderLayer(embed_dim)
        self.post_processing_model = PostProcessing(256, tokenizer, banned_tokens = banned_tokens)

        self.tokenizer = tokenizer

        # self.decoder_layers = nn.ModuleList([
        #     DecoderLayer(embed_dim)
        #     for _ in range(num_layers)
        # ])


    def forward(self, inputs):
        image, caption = inputs

        # Step 1: Extract Features
        extracted_image = self.feature_extractor_model(image)

        # Step 2: Embed the tokens
        tokens = self.embedding(caption)

        # Step 3: Decode
        token_output = self.decoder_layer((extracted_image, tokens))

        output = self.post_processing_model(token_output)

        return output

## Predicting

In [20]:
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def predict_one(model, dataset, image, idx, device, max_length=20):
    """
    Predict the caption for an image from the dataset at a given index.
    Args:
        model: The trained ImageCaption model.
        dataset: The deeplake dataset containing images. (not needed anymore)
        idx: The index of the image in the dataset. (not needed anymore)
        device: The device (CPU or GPU) to run the model on.
        max_length: Maximum length of the generated caption.
    Returns:
        The predicted caption as a string.
    """
    # Load and preprocess the image
    # image = dataset.image[idx].numpy()
    # processed_image = process_image(image).unsqueeze(0).to(device)
    image = image.to(device)

    # Start the generation process
    with torch.no_grad():
        # Step 1: Extract image features

        extracted_image = model.feature_extractor_model(image)

        # Step 2: Start with the <START> token
        captions = torch.full(
            (1, 1), tokenizer.convert_tokens_to_ids("<START>"),
            dtype=torch.long, device=device
        )

        # Step 3: Autoregressively generate caption
        for _ in range(max_length):
            # Embed tokens
            token_embeddings = model.embedding(captions)

            # Decode with cross-attention
            token_output = model.decoder_layer((extracted_image, token_embeddings))

            # Post-process to predict the next token
            logits = model.post_processing_model(token_output[:, -1, :])  # Use the last token's output
            next_token = logits.argmax(dim=-1).unsqueeze(1)  # Select the most probable token

            # Append the next token to the captions
            captions = torch.cat([captions, next_token], dim=1)

            # Stop if the <END> token is generated
            if next_token.item() == tokenizer.convert_tokens_to_ids("<END>"):
                break

    # Untokenize the generated tokens
    return untokenize(captions.squeeze(0).tolist())

# Part III: Caption to Story

In [21]:
def captions_to_story_pretrained(captions):
    """
    Function to take in captions, and output a story (a summary of the
    video, with overall insights)
    """
    long_text = '.'.join(captions)
    summarizer = pipeline("summarization")
    result = summarizer(long_text)
    print (result)


def preprocessing(captions):
    """
    Takes in a list of captions (or just a long string of text), returns
    1. tokenized matrix of words in each sentence (list of lists)
    2. the actual sentences (list of strings)
    """
    # handles strings, splitting into list
    if type(captions) == str:
        captions = re.split(r"(?<=[.?!])", captions)
    # ensure every caption ends in a period
    for i in range(len(captions)):
        if len(captions[i]) == 0:
          continue
        if captions[i][-1] not in  ".?!":
            captions[i] = captions[i] + "."
    paragraph = ' '.join(captions)
    sentences = nltk.sent_tokenize(paragraph)
    res = []
    for sentence in sentences:
        res.append(nltk.word_tokenize(sentence))
    return res, sentences

def build_frequency_table(tokens, stops):
    """
    Builds frequency table of words in a sentence (a mapping of words
    to the number of times they appear). Does not include "stop" words
    (e.g. , or . or ! or ?)
    """
    table = {}
    for word in tokens:
        if word not in stops:
            to_add = word.lower()
            if to_add in table:
                table[to_add] += 1
            else:
                table[to_add] = 1
    return table

def build_frequency_matrix(sentences, stops):
    """
    Builds frequency matrix. For each sentence, a mapping from words to the
    number of times they appear in the setence. Excludes words in "stops" (
    , or . or ! or ?)
    """
    matrix = {}
    for i in range(len(sentences)):
        table = build_frequency_table(sentences[i], stops)
        matrix[i] = table
    return matrix

def build_tf_matrix(matrix):
    """
    Takes in word frequency matrix, outputs TF matrix (a matrix of
    word probabilities in each sentence). Essentially, normalizing by the
    number of words in each sentence, so the sum of each row is 1
    """
    res = {}
    for key, table in matrix.items():
        tot = sum(table.values())
        freq_table = {}
        for word, count in table.items():
            freq_table[word] = count/tot
        res[key] = freq_table
    return res

def build_appearing_matrix(matrix):
    """
    Takes in word frequency (or tf) matrix - works with either - and produces
    a mapping from word to number of sentences it appears in. This is needed
    to produce idf matrix
    """
    appearances = {}
    for key, sentence in matrix.items():
        for word in sentence:
            if word in appearances:
                appearances[word] += 1
            else:
                appearances[word] = 1
    return appearances

def build_idf_matrix(freq_matrix, appearing_matrix):
    """
    Takes in frequence of each word in each sentence (freq matrix), and the
    number of sentences each word appears in (appearing_matrix) and produces
    the idf matrix
    """
    idf_matrix = {}
    num_sentences = len(freq_matrix)
    for key, sentence in freq_matrix.items():
        idf_table = {}
        for word in sentence:
            idf_table[word] = math.log(num_sentences/appearing_matrix[word])
        idf_matrix[key] = idf_table
    return idf_matrix

def build_tf_idf_matrix(tf_matrix, idf_matrix):
    """
    Builds tf-idf matrix for tf-idf algorithm, using both tf and idf
    matrices
    """
    tf_idf_matrix = {}
    for key, sentence in tf_matrix.items():
        idf_sentence = idf_matrix[key]
        tf_idf_table = {}
        for word in idf_sentence:
            tf_idf_table[word] = sentence[word] * idf_sentence[word]
        tf_idf_matrix[key] = tf_idf_table
    return tf_idf_matrix



In [22]:
def sentence_scoring(tf_idf_matrix):
    """
    Returns average tf-idf score of words in a sentence. To be used
    when selecting sentences in the tf-idf algorithm
    """
    scores = {}
    for key, sentence in tf_idf_matrix.items():
        avg_score = np.mean(list(sentence.values()))
        scores[key] = avg_score
    return scores

def choose_sentences_tfidf(sentence_scores, threshold):
    """
    Takes in sentences scores (as calculated by
    tf-idf algo) and a threshold for inclusion (e.g. 1.3). Returns
    sentences to be included in summary
    """
    scores = list(sentence_scores.values())
    average = np.mean(scores)
    sd = np.std(scores)
    res = []
    for key in sentence_scores:
        if sentence_scores[key] > (average + sd * threshold):
            res.append(key)
    return res

def choose_sentences_tfidf_modified(sentence_scores, idf_matrix, threshold):
    """
    Modified version of the tf-idf algorithm. Chooses sentences both according
    to tf-idf, and sentences with relatively uncommon english words that appear
    frequently in the text
    """
    # first, find tf-idf sentences to include
    scores = list(sentence_scores.values())
    average = np.mean(scores)
    sd = np.std(scores)
    res = []
    for key in sentence_scores:
        if sentence_scores[key] > (average + threshold * sd):
            res.append(key)
    # now, find relatively unique words that appear frequently
    # include their sentences in the summary as well
    rare_words_used = set()
    for key, sentence in idf_matrix.items():
        average_idf = sum(sentence.values())/len(sentence)
        for word in sentence:
            if word not in rare_words_used:
              if word_frequency(word, "en") < 5e-5 and sentence[word] < 0.8 * average_idf:
                rare_words_used.add(word)
                res.append(key)
                break
    res.sort()
    return res

In [23]:
def write_summary(sentences, lst):
    """
    Takes in the actual sentences, and a list of indices to be included,
    creates text summary
    """
    if len(lst) == 0:
        return ""
    summary = ""
    for key in lst:
        summary += sentences[key]
        summary += " "
    return summary

def print_caption(caption):
    """
    Utility function fo printing captions
    """
    if type(caption) == str:
        print (caption)
        return
    for i in range(len(caption)):
        if len(caption) == 0 or caption[i][-1] not in ".!?":
            caption[i] += "."
    res = ' '.join(caption)
    print (res)

def full_pipeline(caption):
    """
    Takes a caption (either string or list of strings) and returns it summary,
    according to a modified version of the tf-idf algorithm
    """
    tokens, sentences = preprocessing(caption)
    # build relevant matrices
    freq_matrix = build_frequency_matrix(tokens, ",.!?")
    tf_matrix = build_tf_matrix(freq_matrix)
    appearing_matrix = build_appearing_matrix(freq_matrix)
    idf_matrix = build_idf_matrix(freq_matrix, appearing_matrix)
    tf_idf_matrix = build_tf_idf_matrix(tf_matrix, idf_matrix)
    # score sentences according to tf-idf metrics and modifications
    sentence_scores = sentence_scoring(tf_idf_matrix)
    # choose sentences to include, write summary
    to_include = choose_sentences_tfidf_modified(sentence_scores, idf_matrix, 0.5)
    summary = write_summary(sentences, to_include)
    return summary

# Part IV : Entire System

In [38]:
video_path = "try.mp4" # change to video title!
checkpoint_path = "checkpoint_final.pth"

# video to frames
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ImageCaption(tokenizer, vocab_size)
if torch.cuda.is_available():
  model.load_state_dict(torch.load(checkpoint_path)['model_state_dict'])
else:
  model.load_state_dict(torch.load(checkpoint_path, map_location=torch.device('cpu'))['model_state_dict'])
image_directory = "extracted_images"
gap = 20
extracted_images = extract_frames_with_opencv(video_path, image_directory, gap)

# image retreival, processing, captioning
image_files = [file for file in os.listdir(image_directory) if file.endswith(".jpg")]

caption_list = []
for file in image_files:
  image = Image.open(os.path.join(image_directory, file))
  image = image.convert('RGB')
  transformation = transforms.ToTensor()
  image_tensor = transformation(image)
  image_tensor = image_tensor.permute(1, 2, 0)
  image_tensor_fixed = image_tensor.to(torch.uint8)

  processed_image = process_image(image_tensor_fixed).unsqueeze(0).to(device)
  predicted_caption = predict_one(model, None, processed_image, -1, device)
  # predicted_caption = predict(cnn_feature_extraction, embedder, decoder, post_processor, processed_test_image, device)
  words = predicted_caption.split()
  new_words = list(filter(lambda x : x not in ".!?", words))
  for i in range(len(new_words)):
    if new_words[i][-1] in ".!?":
      new_words[i] = new_words[i][:-1]
  caption_list.append(' '.join(new_words))
  caption_list[-1] = caption_list[-1] + "."

print (caption_list)

print ()
print ("ORIGINAL TEXT IS: \n")
print_caption(caption_list)
print ()
summary = full_pipeline(caption_list)
print ("SUMMARIZED TEXT IS: \n")
print (summary)
print ()


Video FPS: 25, Total frames: 216
Saved frame 0 to extracted_images/frame_0.jpg
Saved frame 20 to extracted_images/frame_20.jpg
Saved frame 40 to extracted_images/frame_40.jpg
Saved frame 60 to extracted_images/frame_60.jpg
Saved frame 80 to extracted_images/frame_80.jpg
Saved frame 100 to extracted_images/frame_100.jpg
Saved frame 120 to extracted_images/frame_120.jpg
Saved frame 140 to extracted_images/frame_140.jpg
Saved frame 160 to extracted_images/frame_160.jpg
Saved frame 180 to extracted_images/frame_180.jpg
Saved frame 200 to extracted_images/frame_200.jpg
End of video or frame read failed at frame 216.
Total frames processed: 216
Total frames saved: 11
['a man in a blue shirt and a blue and a few others are standing in a red.', 'a man in a white shirt is standing in the street with a firetr,.', 'a man in a blue shirt is cooking with a blue shirt, while standing in a cloth.', 'a man in a blue shirt and a blue and a few of a white shirt are standing.', 'a man in a blue shirt is 