In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import torch
print(torch.cuda.is_available())

True


In [None]:
# Install the required libraries
!pip install numpy torch torchvision nltk



In [None]:
#video and json split

import json
import os
import random
import shutil

def split_json_data(input_json_path, videos_dir, output_dir, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=None):
    """Shuffle the videos listed in a caption JSON and write train/val/test splits.

    For every split, ``<output_dir>/<split>/videos`` receives copies of the
    ``<video_id>.avi`` files found in ``videos_dir`` and
    ``<output_dir>/<split>/<split>_captions.json`` receives the matching
    "videos" and "sentences" entries.

    Files copied by an earlier run are never removed, so re-running with a
    different shuffle leaves stale videos in the split folders. Pass a fixed
    ``seed`` (or start from an empty ``output_dir``) to keep runs consistent.

    Args:
        input_json_path (str): JSON file with top-level "videos" and "sentences"
            lists whose entries carry a "video_id" key.
        videos_dir (str): Directory holding the source ``.avi`` files.
        output_dir (str): Root directory under which the split folders are created.
        train_ratio (float): Fraction of videos assigned to the train split.
        val_ratio (float): Fraction of videos assigned to the val split.
        test_ratio (float): Kept for interface compatibility only; the test split
            always receives whatever remains after train and val.
        seed (int | None): Seed for a private random generator. ``None`` keeps the
            previous behaviour of shuffling with the global ``random`` state.

    Raises:
        ValueError: If a ratio is negative or train_ratio + val_ratio exceeds 1.
    """
    # Small tolerance so float sums such as 0.9 + 0.1 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1 "
            f"(got train_ratio={train_ratio}, val_ratio={val_ratio})"
        )

    # Load JSON data
    with open(input_json_path, 'r') as f:
        data = json.load(f)

    # Shuffle a copy so the loaded structure is left untouched.
    video_entries = list(data['videos'])
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(video_entries)

    # Calculate the split sizes; test takes the remainder so no video is dropped.
    total_videos = len(video_entries)
    train_size = int(total_videos * train_ratio)
    val_size = int(total_videos * val_ratio)

    # Split the data
    train_videos = video_entries[:train_size]
    val_videos = video_entries[train_size:train_size + val_size]
    test_videos = video_entries[train_size + val_size:]

    # Group sentences by video_id; captions for unknown videos cannot belong
    # to any split, so they are skipped instead of raising KeyError.
    video_sentences = {video['video_id']: [] for video in video_entries}
    orphan_captions = 0
    for sentence in data['sentences']:
        bucket = video_sentences.get(sentence['video_id'])
        if bucket is None:
            orphan_captions += 1
            continue
        bucket.append(sentence)
    if orphan_captions:
        print(f"Warning: {orphan_captions} caption(s) reference video ids missing from 'videos'; skipped.")

    def create_split_data(split_videos, split_name):
        """Copy the split's video files and write its caption JSON."""
        split_data = {
            "videos": split_videos,
            "sentences": []
        }
        split_videos_dir = os.path.join(output_dir, split_name, "videos")
        os.makedirs(split_videos_dir, exist_ok=True)

        # Add captions and copy video files
        for video in split_videos:
            video_id = video['video_id']
            split_data['sentences'].extend(video_sentences[video_id])

            # Copy video file to the split directory. A missing source file only
            # triggers a warning; its JSON entries are still written.
            video_filename = f"{video_id}.avi"
            src_video_path = os.path.join(videos_dir, video_filename)
            dst_video_path = os.path.join(split_videos_dir, video_filename)
            if os.path.exists(src_video_path):
                shutil.copy(src_video_path, dst_video_path)
            else:
                print(f"Warning: Video file {video_filename} not found in {videos_dir}.")

        # Save the JSON file for the split
        split_json_path = os.path.join(output_dir, split_name, f"{split_name}_captions.json")
        with open(split_json_path, 'w') as f:
            json.dump(split_data, f, indent=4)

    # Create each split
    create_split_data(train_videos, "train")
    create_split_data(val_videos, "val")
    create_split_data(test_videos, "test")

    print("Data split and saved successfully.")

# Define paths based on your directory structure
input_json_path = '/content/drive/MyDrive/msvd_captions.json'  # Path to the original JSON file
videos_dir = '/content/drive/MyDrive/YouTubeClips'  # Directory where video files are stored
output_dir = '/content/drive/MyDrive/msvd_split'  # Directory where you want to save the splits

# Run the split function
split_json_data(input_json_path, videos_dir, output_dir)

Data split and saved successfully.


In [None]:
#verification for split

import os
import json

def verify_split_integrity(split_name, base_dir):
    """
    Cross-check one split's video folder against its caption JSON and print the result.

    Four comparisons are reported, in this order: folder vs. JSON "videos",
    JSON "videos" vs. folder, folder vs. JSON "sentences", and JSON "sentences"
    vs. folder. Nothing is modified or returned.

    Args:
        split_name (str): Name of the split (e.g., 'train', 'val', 'test').
        base_dir (str): Base directory where splits are stored (e.g., '/content/drive/MyDrive/msvd_split/').
    """
    split_root = os.path.join(base_dir, split_name)
    captions_file = os.path.join(split_root, f"{split_name}_captions.json")
    clips_folder = os.path.join(split_root, 'videos')

    with open(captions_file, 'r') as handle:
        manifest = json.load(handle)

    # Video ids on disk are the .avi file names without their extension.
    ids_on_disk = set()
    for entry in os.listdir(clips_folder):
        if entry.endswith('.avi'):
            ids_on_disk.add(os.path.splitext(entry)[0])

    ids_in_videos = set(item['video_id'] for item in manifest['videos'])

    # Folder -> JSON "videos"
    unlisted = ids_on_disk - ids_in_videos
    print(
        f"[{split_name}] Videos present in folder but missing in JSON 'videos' section: {unlisted}"
        if unlisted else
        f"[{split_name}] All videos in folder have matching entries in JSON 'videos' section."
    )

    # JSON "videos" -> folder
    absent_files = ids_in_videos - ids_on_disk
    print(
        f"[{split_name}] Entries in JSON 'videos' section but missing video files: {absent_files}"
        if absent_files else
        f"[{split_name}] All entries in JSON 'videos' section have matching video files in folder."
    )

    ids_in_sentences = set(item['video_id'] for item in manifest['sentences'])

    # Folder -> JSON "sentences"
    uncaptioned = ids_on_disk - ids_in_sentences
    print(
        f"[{split_name}] Videos present in folder but missing captions in JSON 'sentences' section: {uncaptioned}"
        if uncaptioned else
        f"[{split_name}] All videos in folder have matching captions in JSON 'sentences' section."
    )

    # JSON "sentences" -> folder
    captions_without_file = ids_in_sentences - ids_on_disk
    print(
        f"[{split_name}] Captions in JSON 'sentences' section but missing video files: {captions_without_file}"
        if captions_without_file else
        f"[{split_name}] All captions in JSON 'sentences' section have matching video files in folder."
    )

# Run verification for each split
base_dir = '/content/drive/MyDrive/msvd_split'  # Change to your base directory if different
for split in ['train', 'val', 'test']:
    verify_split_integrity(split, base_dir)

[train] Videos present in folder but missing in JSON 'videos' section: {'vZa13vJugGU_0_30', 'O_NWtDShLeg_21_25', 'BApIQn69EVE_10_16', '_WRC7HXBJpU_395_401', 'emblM4a76jg_5_15', 'VahnQw2gTQY_315_320', 'IHIa75B9AhI_2_26', '7NNg0_n-bS8_21_30', '9Q0JfdP36kI_167_170', 'ItFqogTmAvQ_389_395', 'lb8J2zCQTlo_3_8', '0SMOK2ql7Pg_5_12', 'ub-aYLzCF_Q_1_10', 'IhwPQL9dFYc_78_88', 'DKgHYLDebx0_0_8', '9IrWyZ0KZuk_160_166', 'EiylMb_mWk4_2_20', 'jjl2ZMdFCsw_17_35', 'm1NR0uNNs5Y_160_166', '6t0BpjwYKco_118_127', 'NMlKMfiHSho_1_15', 'R8FDJgVW3Vc_0_4', 'bLqmf8x7rLI_2_8', 'JK1R9k1WDpc_6_15', 'RMznbCn5sQs_0_10', 'De815YpTBic_41_48', 'ACOmKiJDkA4_121_128', 'UgUFP5baQ9Y_0_7', 'z0Si1XxMibg_0_30', 'zbAk0gX7kas_16_24', 'akXjIEoecNs_2_12', 'W6_XuNhgtrM_2_5', 'q5ZRMvjzhXQ_15_29', 'ywHBKayhyvQ_19_28', '9OvXb1fot74_21_29', 'PeUHy0A1GF0_68_73', 'B4foOe9kUgY_0_8', '4z3b4mnw5y4_56_60', 'sWqi41wyXcQ_68_79', 'cwkjJrGpoaU_30_41', 'HM-ZDoRWiH4_0_5', 'kWLNZzuo3do_217_222', 'fF89MasBFLw_321_326', '8MVo7fje_oE_125_130', 'jxdubZzQ

In [None]:
#removing caption of which video is not present

import os
import json

def clean_json_for_existing_videos(json_path, video_dir):
    """Rewrite a caption JSON in place, keeping only entries whose .avi file exists.

    Args:
        json_path (str): Caption JSON with "videos" and "sentences" lists; it is
            overwritten with the filtered content.
        video_dir (str): Folder whose ``.avi`` file names (minus extension) define
            the video ids that are kept.
    """
    with open(json_path, 'r') as source:
        payload = json.load(source)

    # Ids that actually have a video file on disk.
    available_ids = set()
    for entry in os.listdir(video_dir):
        if entry.endswith('.avi'):
            available_ids.add(os.path.splitext(entry)[0])

    def has_video(record):
        return record['video_id'] in available_ids

    # Drop every video and caption record that points at a missing file.
    payload['videos'] = list(filter(has_video, payload['videos']))
    payload['sentences'] = list(filter(has_video, payload['sentences']))

    with open(json_path, 'w') as target:
        json.dump(payload, target, indent=4)

    print(f"Cleaned JSON file saved to {json_path}")

# Define paths to each split's JSON file and video folder
base_dir = '/content/drive/MyDrive/msvd_split'

# One entry per split: where its caption JSON lives and where its videos live.
splits = {
    split_name: {
        'json_path': os.path.join(base_dir, split_name, f'{split_name}_captions.json'),
        'video_dir': os.path.join(base_dir, split_name, 'videos'),
    }
    for split_name in ('train', 'val', 'test')
}

# Prune each split's JSON down to the videos that are really in its folder.
for split, paths in splits.items():
    print(f"Cleaning {split} JSON file...")
    clean_json_for_existing_videos(paths['json_path'], paths['video_dir'])
    print(f"{split} JSON file cleaned.\n")

Cleaning train JSON file...
Cleaned JSON file saved to /content/drive/MyDrive/msvd_split/train/train_captions.json
train JSON file cleaned.

Cleaning val JSON file...
Cleaned JSON file saved to /content/drive/MyDrive/msvd_split/val/val_captions.json
val JSON file cleaned.

Cleaning test JSON file...
Cleaned JSON file saved to /content/drive/MyDrive/msvd_split/test/test_captions.json
test JSON file cleaned.



In [None]:
#vocabulary building

import json
from collections import Counter

def build_vocabulary(json_paths, min_freq=1, save_path='/content/drive/MyDrive/msvd_split/vocab.json'):
    """Build a word-to-index vocabulary from caption JSON files and save it as JSON.

    Captions are lower-cased and split on whitespace. Indices 0-3 are reserved
    for the special tokens; remaining words follow in order of first appearance.

    Args:
        json_paths (list[str]): Caption JSON files, each with a "sentences" list
            whose items have a "caption" string.
        min_freq (int): Minimum total count a word needs to be included.
        save_path (str): Destination of the vocabulary JSON file.
    """
    counts = Counter()

    # Tally every token across all files.
    for path in json_paths:
        with open(path, 'r') as handle:
            entries = json.load(handle)['sentences']
            counts.update(
                token
                for entry in entries
                for token in entry['caption'].lower().split()
            )

    # Special tokens first, then every word that is frequent enough.
    vocab = {}
    for token in ('<PAD>', '<SOS>', '<EOS>', '<UNK>'):
        vocab[token] = len(vocab)
    for word in [w for w, n in counts.items() if n >= min_freq]:
        vocab[word] = len(vocab)

    with open(save_path, 'w') as out:
        json.dump(vocab, out, indent=4)

    print(f"Vocabulary built with {len(vocab)} words. Saved to {save_path}")

# Define paths to JSON files
json_paths = [
    '/content/drive/MyDrive/msvd_split/train/train_captions.json',
    '/content/drive/MyDrive/msvd_split/val/val_captions.json',
    '/content/drive/MyDrive/msvd_split/test/test_captions.json'
]

# Build vocabulary
build_vocabulary(json_paths)

Vocabulary built with 10109 words. Saved to /content/drive/MyDrive/msvd_split/vocab.json


In [None]:
!pip install torchvision
!pip install opencv-python



In [None]:
!pip install pretrainedmodels



In [None]:
#feature extraction

import os
import shutil
import subprocess
import glob
import numpy as np
import json
from tqdm import tqdm
import torch
import pretrainedmodels
from pretrainedmodels import utils

# Constants for input dimensions
C, H, W = 3, 224, 224

# Function to extract frames from a video
def extract_frames(video_path, dst):
    """Dump the frames of ``video_path`` into a freshly created ``dst`` folder.

    Any existing ``dst`` is removed first. ffmpeg is invoked with a 400x300
    scale filter and writes ``000001.jpg``, ``000002.jpg``, ... into ``dst``;
    its output is discarded and its exit status is not checked.
    """
    # Start from an empty directory so frames of earlier runs cannot mix in.
    if os.path.exists(dst):
        shutil.rmtree(dst)
    os.makedirs(dst)

    frame_pattern = f"{dst}/%06d.jpg"
    ffmpeg_args = ["ffmpeg", '-y']
    ffmpeg_args += ['-i', video_path]
    ffmpeg_args += ['-vf', "scale=400:300"]
    ffmpeg_args += ['-qscale:v', "2"]
    ffmpeg_args.append(frame_pattern)

    subprocess.call(ffmpeg_args, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

# Function to extract features from frames
def extract_feats(params, model, load_image_fn, split):
    """Extract per-frame features for every video of one split and save them as .npy.

    For each ``.avi`` in ``<params['video_path']>/<split>/videos`` that is also
    listed in the split's caption JSON, frames are dumped with ``extract_frames``,
    ``params['n_frame_steps']`` evenly spaced frames are picked (frames repeat when
    the clip is shorter), passed through ``model`` on the GPU, and the result is
    written to ``<params['output_dir']>/<split>/features/<video_id>.npy`` with one
    row per sampled frame.

    Args:
        params (dict): Reads 'output_dir', 'video_path', 'tmp_dir',
            'n_frame_steps' and '<split>_json'.
        model: Feature extractor called on a float batch of shape (N, C, H, W),
            where C, H, W are the module-level constants.
        load_image_fn: Callable mapping a frame path to a tensor that fits one
            slot of that batch.
        split (str): Split name, e.g. 'train', 'val' or 'test'.
    """
    model.eval()
    dir_fc = os.path.join(params['output_dir'], split, 'features')  # Store features in respective split folder
    os.makedirs(dir_fc, exist_ok=True)

    # Load video list from JSON file for the split
    json_path = params[f'{split}_json']
    with open(json_path, 'r') as f:
        data = json.load(f)
        video_ids = {video['video_id'] for video in data['videos']}

    # Process each video in the specified directory
    video_dir = os.path.join(params['video_path'], split, 'videos')  # Use respective split folder
    video_list = glob.glob(os.path.join(video_dir, '*.avi'))
    for video in tqdm(video_list, desc=f"Processing {split} videos"):
        video_id = os.path.splitext(os.path.basename(video))[0]
        if video_id not in video_ids:
            continue

        dst = os.path.join(params['tmp_dir'], video_id)
        try:
            # Extract frames
            extract_frames(video, dst)

            image_list = sorted(glob.glob(os.path.join(dst, '*.jpg')))
            if not image_list:
                # ffmpeg produced nothing (unreadable clip); sampling below would
                # index an empty list, so skip this video instead of crashing.
                print(f"[WARNING] No frames extracted for video {video_id}. Skipping...")
                continue

            # Evenly spaced frame indices over the whole clip
            samples = np.round(np.linspace(0, len(image_list) - 1, params['n_frame_steps'])).astype(int)
            image_list = [image_list[sample] for sample in samples]
            images = torch.zeros((len(image_list), C, H, W))

            for i, img_path in enumerate(image_list):
                img = load_image_fn(img_path)
                images[i] = img

            # Move images to GPU for feature extraction
            images = images.cuda()
            with torch.no_grad():
                fc_feats = model(images).cpu()
            # Flatten per frame but keep the frame axis, even when only one
            # frame was sampled (a bare squeeze() would drop it).
            fc_feats = fc_feats.reshape(fc_feats.shape[0], -1)

            # Save features
            outfile = os.path.join(dir_fc, f"{video_id}.npy")
            np.save(outfile, fc_feats.numpy())
        finally:
            # Clean up the temporary frames on success, skip and error alike
            shutil.rmtree(dst, ignore_errors=True)

    print(f"Feature extraction for {split} set is complete.")

# Main code setup with hardcoded parameters
# Main code setup with hardcoded parameters.
# extract_feats reads 'output_dir', 'video_path', 'tmp_dir', 'n_frame_steps'
# and the per-split '<split>_json' entries; 'model' is only used by the
# model-selection chain below.
params = {
    'output_dir': '/content/drive/MyDrive/msvd_split',
    'video_path': '/content/drive/MyDrive/msvd_split',
    'n_frame_steps': 40,
    'tmp_dir': '/content/tmp_frames',
    'train_json': '/content/drive/MyDrive/msvd_split/train/train_captions.json',
    'val_json': '/content/drive/MyDrive/msvd_split/val/val_captions.json',
    'test_json': '/content/drive/MyDrive/msvd_split/test/test_captions.json',
    'model': 'resnet152'  # Set your model choice here (resnet152, inception_v3, or inception_v4)
}

# Set up model and image loader.
# C, H, W are module-level names that extract_feats reads when it allocates
# the image batch, so each branch sets them alongside the model it builds.
if params['model'] == 'inception_v3':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionv3(pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
elif params['model'] == 'resnet152':
    C, H, W = 3, 224, 224
    model = pretrainedmodels.resnet152(pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
elif params['model'] == 'inception_v4':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
else:
    raise ValueError(f"Model {params['model']} is not supported")

model.last_linear = utils.Identity()  # Remove final classification layer
model = model.cuda()  # Use GPU

# Extract features for each split (requires a CUDA device: extract_feats
# moves every image batch to the GPU).
for split in ['train', 'val', 'test']:
    extract_feats(params, model, load_image_fn, split)


Downloading: "https://download.pytorch.org/models/resnet152-b121ed2d.pth" to /root/.cache/torch/hub/checkpoints/resnet152-b121ed2d.pth
100%|██████████| 230M/230M [00:01<00:00, 204MB/s]
Processing train videos:   0%|          | 0/1535 [00:00<?, ?it/s]

In [None]:
#verifying feature extracted correctly or not

import os
import json

# Function to verify features extraction for each split by cross-checking both video files and JSON entries
def verify_features_extraction(split, params):
    """Report, for one split, which videos lack features and which features are stray.

    Three checks are printed: .avi files without a matching .npy, JSON "videos"
    entries without a matching .npy, and files in the features folder whose name
    matches neither a video file nor a JSON entry. Nothing is modified.

    Args:
        split (str): Split name, e.g. 'train', 'val' or 'test'.
        params (dict): Must provide 'output_dir', the root of the split folders.
    """
    split_root = os.path.join(params['output_dir'], split)
    clips_dir = os.path.join(split_root, 'videos')
    feats_dir = os.path.join(split_root, 'features')
    captions_file = os.path.join(split_root, f"{split}_captions.json")

    with open(captions_file, 'r') as handle:
        manifest = json.load(handle)
        listed_ids = {entry['video_id'] for entry in manifest['videos']}

    def lacks_feature(vid):
        return not os.path.exists(os.path.join(feats_dir, f"{vid}.npy"))

    # Videos on disk that have no feature file
    clip_ids = {os.path.splitext(name)[0] for name in os.listdir(clips_dir) if name.endswith('.avi')}
    clips_without_feats = [vid for vid in clip_ids if lacks_feature(vid)]

    # JSON entries that have no feature file
    listed_without_feats = [vid for vid in listed_ids if lacks_feature(vid)]

    # Feature files that belong to no known video
    stray_feats = []
    for name in os.listdir(feats_dir):
        stem = os.path.splitext(name)[0]
        if not (stem in clip_ids or stem in listed_ids):
            stray_feats.append(stem)

    # Print results
    if not clips_without_feats:
        print(f"[{split}] All video files have corresponding features.")
    else:
        print(f"[{split}] Missing features for videos in the video folder: {len(clips_without_feats)}")
        print("Missing video IDs from video files:", clips_without_feats)

    if not listed_without_feats:
        print(f"[{split}] All JSON entries have corresponding features.")
    else:
        print(f"[{split}] Missing features for videos in the JSON file: {len(listed_without_feats)}")
        print("Missing video IDs from JSON:", listed_without_feats)

    if not stray_feats:
        print(f"[{split}] No extra feature files found.")
    else:
        print(f"[{split}] Extra feature files found that do not match any video or JSON entry: {len(stray_feats)}")
        print("Extra feature file video IDs:", stray_feats)

# Define your parameters
params = {
    'output_dir': '/content/drive/MyDrive/msvd_split',
}

# Verify for each split
for split in ['train', 'val', 'test']:
    verify_features_extraction(split, params)

[train] All video files have corresponding features.
[train] All JSON entries have corresponding features.
[train] No extra feature files found.
[val] All video files have corresponding features.
[val] All JSON entries have corresponding features.
[val] No extra feature files found.
[test] All video files have corresponding features.
[test] All JSON entries have corresponding features.
[test] No extra feature files found.


In [None]:
#video and json split

import json
import os
import random
import shutil

def split_json_data(input_json_path, videos_dir, output_dir, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=None):
    """Shuffle the videos listed in a caption JSON and write train/val/test splits.

    For every split, ``<output_dir>/<split>/videos`` receives copies of the
    ``<video_id>.avi`` files found in ``videos_dir`` and
    ``<output_dir>/<split>/<split>_captions.json`` receives the matching
    "videos" and "sentences" entries.

    Files copied by an earlier run are never removed, so re-running with a
    different shuffle leaves stale videos in the split folders. Pass a fixed
    ``seed`` (or start from an empty ``output_dir``) to keep runs consistent.

    Args:
        input_json_path (str): JSON file with top-level "videos" and "sentences"
            lists whose entries carry a "video_id" key.
        videos_dir (str): Directory holding the source ``.avi`` files.
        output_dir (str): Root directory under which the split folders are created.
        train_ratio (float): Fraction of videos assigned to the train split.
        val_ratio (float): Fraction of videos assigned to the val split.
        test_ratio (float): Kept for interface compatibility only; the test split
            always receives whatever remains after train and val.
        seed (int | None): Seed for a private random generator. ``None`` keeps the
            previous behaviour of shuffling with the global ``random`` state.

    Raises:
        ValueError: If a ratio is negative or train_ratio + val_ratio exceeds 1.
    """
    # Small tolerance so float sums such as 0.9 + 0.1 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1 "
            f"(got train_ratio={train_ratio}, val_ratio={val_ratio})"
        )

    # Load JSON data
    with open(input_json_path, 'r') as f:
        data = json.load(f)

    # Shuffle a copy so the loaded structure is left untouched.
    video_entries = list(data['videos'])
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(video_entries)

    # Calculate the split sizes; test takes the remainder so no video is dropped.
    total_videos = len(video_entries)
    train_size = int(total_videos * train_ratio)
    val_size = int(total_videos * val_ratio)

    # Split the data
    train_videos = video_entries[:train_size]
    val_videos = video_entries[train_size:train_size + val_size]
    test_videos = video_entries[train_size + val_size:]

    # Group sentences by video_id; captions for unknown videos cannot belong
    # to any split, so they are skipped instead of raising KeyError.
    video_sentences = {video['video_id']: [] for video in video_entries}
    orphan_captions = 0
    for sentence in data['sentences']:
        bucket = video_sentences.get(sentence['video_id'])
        if bucket is None:
            orphan_captions += 1
            continue
        bucket.append(sentence)
    if orphan_captions:
        print(f"Warning: {orphan_captions} caption(s) reference video ids missing from 'videos'; skipped.")

    def create_split_data(split_videos, split_name):
        """Copy the split's video files and write its caption JSON."""
        split_data = {
            "videos": split_videos,
            "sentences": []
        }
        split_videos_dir = os.path.join(output_dir, split_name, "videos")
        os.makedirs(split_videos_dir, exist_ok=True)

        # Add captions and copy video files
        for video in split_videos:
            video_id = video['video_id']
            split_data['sentences'].extend(video_sentences[video_id])

            # Copy video file to the split directory. A missing source file only
            # triggers a warning; its JSON entries are still written.
            video_filename = f"{video_id}.avi"
            src_video_path = os.path.join(videos_dir, video_filename)
            dst_video_path = os.path.join(split_videos_dir, video_filename)
            if os.path.exists(src_video_path):
                shutil.copy(src_video_path, dst_video_path)
            else:
                print(f"Warning: Video file {video_filename} not found in {videos_dir}.")

        # Save the JSON file for the split
        split_json_path = os.path.join(output_dir, split_name, f"{split_name}_captions.json")
        with open(split_json_path, 'w') as f:
            json.dump(split_data, f, indent=4)

    # Create each split
    create_split_data(train_videos, "train")
    create_split_data(val_videos, "val")
    create_split_data(test_videos, "test")

    print("Data split and saved successfully.")

# Define paths based on your directory structure
input_json_path = '/content/drive/MyDrive/msvd_captions.json'  # Path to the original JSON file
videos_dir = '/content/drive/MyDrive/YouTubeClips'  # Directory where video files are stored
output_dir = '/content/drive/MyDrive/msvd_split'  # Directory where you want to save the splits

# Run the split function
split_json_data(input_json_path, videos_dir, output_dir)

In [None]:
import json
from collections import Counter

def build_vocabulary(
    json_paths,
    min_freq=1,
    save_path='/content/drive/MyDrive/msvd_split/vocab.json',
    reverse_path='/content/drive/MyDrive/msvd_split/vocab_rev.json',
    top_k_preview=10
):
    """Build word->index and index->word vocabularies from caption JSON files.

    Captions are lower-cased and split on whitespace. Indices 0-3 go to the
    special tokens; the remaining words are numbered by descending frequency.
    Both mappings are written as JSON and a short summary is printed.

    Args:
        json_paths (list[str]): Caption JSON files, each with a "sentences" list
            whose items have a "caption" string.
        min_freq (int): Minimum total count a word needs to be included.
        save_path (str): Destination of the word->index JSON.
        reverse_path (str): Destination of the index->word JSON.
        top_k_preview (int): How many of the most frequent words to print.
    """
    word_counter = Counter()

    # Tally every token across all files.
    for path in json_paths:
        with open(path, 'r') as handle:
            entries = json.load(handle)['sentences']
            for entry in entries:
                word_counter.update(entry['caption'].lower().split())

    # Special tokens take the first indices.
    vocab = {}
    for token in ('<PAD>', '<SOS>', '<EOS>', '<UNK>'):
        vocab[token] = len(vocab)

    # Frequent words follow, most common first; never overwrite an existing key.
    ranked = word_counter.most_common()
    for word, freq in ranked:
        if freq < min_freq or word in vocab:
            continue
        vocab[word] = len(vocab)

    # Inverse mapping (index -> word)
    vocab_rev = dict(zip(vocab.values(), vocab.keys()))

    for target, mapping in ((save_path, vocab), (reverse_path, vocab_rev)):
        with open(target, 'w') as out:
            json.dump(mapping, out, indent=4)

    # Summary
    print(f"✅ Vocabulary built with {len(vocab)} tokens.")
    print(f"📁 Saved vocab to: {save_path}")
    print(f"📁 Saved reverse vocab to: {reverse_path}")
    print(f"📊 Top {top_k_preview} frequent words:")
    preview = word_counter.most_common(top_k_preview)
    for word, freq in preview:
        print(f"   {word:<15} : {freq}")

# === Paths to your cleaned split JSON files
json_paths = [
    '/content/drive/MyDrive/msvd_split/train/train_captions.json',
    '/content/drive/MyDrive/msvd_split/val/val_captions.json',
    '/content/drive/MyDrive/msvd_split/test/test_captions.json'
]

# === Run vocabulary builder
build_vocabulary(json_paths, min_freq=1)


✅ Vocabulary built with 10109 tokens.
📁 Saved vocab to: /content/drive/MyDrive/msvd_split/vocab.json
📁 Saved reverse vocab to: /content/drive/MyDrive/msvd_split/vocab_rev.json
📊 Top 10 frequent words:
   a               : 60338
   is              : 29601
   the             : 19526
   man             : 15648
   woman           : 6619
   on              : 6274
   in              : 6214
   playing         : 5341
   are             : 4795
   of              : 4436


In [None]:
#feature extraction

import os
import shutil
import subprocess
import glob
import numpy as np
import json
from tqdm import tqdm
import torch
import pretrainedmodels
from pretrainedmodels import utils

# Constants for input dimensions
C, H, W = 3, 224, 224

# Function to extract frames from a video
def extract_frames(video_path, dst):
    """Dump the frames of ``video_path`` into a freshly created ``dst`` folder.

    Any existing ``dst`` is removed first. ffmpeg is invoked with a 400x300
    scale filter and writes ``000001.jpg``, ``000002.jpg``, ... into ``dst``;
    its output is discarded and its exit status is not checked.
    """
    # Start from an empty directory so frames of earlier runs cannot mix in.
    if os.path.exists(dst):
        shutil.rmtree(dst)
    os.makedirs(dst)

    input_opts = ['-i', video_path]
    filter_opts = ['-vf', "scale=400:300"]
    quality_opts = ['-qscale:v', "2"]
    ffmpeg_args = ["ffmpeg", '-y', *input_opts, *filter_opts, *quality_opts, f"{dst}/%06d.jpg"]

    subprocess.call(ffmpeg_args, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

# Function to extract features from frames
def extract_feats(params, model, load_image_fn, split):
    """Extract per-frame features for every video of one split and save them as .npy.

    For each ``.avi`` in ``<params['video_path']>/<split>/videos`` that is also
    listed in the split's caption JSON, frames are dumped with ``extract_frames``,
    up to ``params['n_frame_steps']`` evenly spaced frames are picked, passed
    through ``model`` on the GPU, and the result is written to
    ``<params['output_dir']>/<split>/features/<video_id>.npy`` with one row per
    sampled frame. Clips with fewer frames than ``n_frame_steps`` therefore
    produce fewer rows; clips with no decodable frame are skipped with a warning.

    Args:
        params (dict): Reads 'output_dir', 'video_path', 'tmp_dir',
            'n_frame_steps' and '<split>_json'.
        model: Feature extractor called on a float batch of shape (N, C, H, W),
            where C, H, W are the module-level constants.
        load_image_fn: Callable mapping a frame path to a tensor that fits one
            slot of that batch.
        split (str): Split name, e.g. 'train', 'val' or 'test'.
    """
    model.eval()
    dir_fc = os.path.join(params['output_dir'], split, 'features')  # Store features in respective split folder
    os.makedirs(dir_fc, exist_ok=True)

    # Load video list from JSON file for the split
    json_path = params[f'{split}_json']
    with open(json_path, 'r') as f:
        data = json.load(f)
        video_ids = {video['video_id'] for video in data['videos']}

    # Process each video in the specified directory
    video_dir = os.path.join(params['video_path'], split, 'videos')  # Use respective split folder
    video_list = glob.glob(os.path.join(video_dir, '*.avi'))
    for video in tqdm(video_list, desc=f"Processing {split} videos"):
        video_id = os.path.splitext(os.path.basename(video))[0]
        if video_id not in video_ids:
            continue

        dst = os.path.join(params['tmp_dir'], video_id)
        try:
            # Extract frames
            extract_frames(video, dst)

            image_list = sorted(glob.glob(os.path.join(dst, '*.jpg')))
            if len(image_list) == 0:
                print(f"[WARNING] No frames extracted for video {video_id}. Skipping...")
                continue  # Skip to next video

            # Safe sampling: never ask for more frames than the clip has
            n_frames = len(image_list)
            sample_count = min(params['n_frame_steps'], n_frames)
            samples = np.round(np.linspace(0, n_frames - 1, sample_count)).astype(int)
            image_list = [image_list[sample] for sample in samples]

            images = torch.zeros((len(image_list), C, H, W))

            for i, img_path in enumerate(image_list):
                img = load_image_fn(img_path)
                images[i] = img

            # Move images to GPU for feature extraction
            images = images.cuda()
            with torch.no_grad():
                fc_feats = model(images).cpu()
            # Flatten per frame but keep the frame axis: a bare squeeze() would
            # turn a one-frame clip into a 1-D array.
            fc_feats = fc_feats.reshape(fc_feats.shape[0], -1)

            # Save features
            outfile = os.path.join(dir_fc, f"{video_id}.npy")
            np.save(outfile, fc_feats.numpy())
        finally:
            # Clean up the temporary frames on success, skip and error alike
            shutil.rmtree(dst, ignore_errors=True)

    print(f"Feature extraction for {split} set is complete.")

# Main code setup with hardcoded parameters
# Main code setup with hardcoded parameters.
# extract_feats reads 'output_dir', 'video_path', 'tmp_dir', 'n_frame_steps'
# and the per-split '<split>_json' entries; 'model' is only used by the
# model-selection chain below.
# NOTE(review): these paths sit under .../MyDrive/VideoCaptioning/msvd_split,
# while the split, vocabulary and clean-up cells in this notebook use
# .../MyDrive/msvd_split — confirm which location is the intended one.
params = {
    'output_dir': '/content/drive/MyDrive/VideoCaptioning/msvd_split',
    'video_path': '/content/drive/MyDrive/VideoCaptioning/msvd_split',
    'n_frame_steps': 40,
    'tmp_dir': '/content/tmp_frames',
    'train_json': '/content/drive/MyDrive/VideoCaptioning/msvd_split/train/train_captions.json',
    'val_json': '/content/drive/MyDrive/VideoCaptioning/msvd_split/val/val_captions.json',
    'test_json': '/content/drive/MyDrive/VideoCaptioning/msvd_split/test/test_captions.json',
    'model': 'resnet152'  # Set your model choice here (resnet152, inception_v3, or inception_v4)
}

# Set up model and image loader.
# C, H, W are module-level names that extract_feats reads when it allocates
# the image batch, so each branch sets them alongside the model it builds.
if params['model'] == 'inception_v3':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionv3(pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
elif params['model'] == 'resnet152':
    C, H, W = 3, 224, 224
    model = pretrainedmodels.resnet152(pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
elif params['model'] == 'inception_v4':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
else:
    raise ValueError(f"Model {params['model']} is not supported")

model.last_linear = utils.Identity()  # Remove final classification layer
model = model.cuda()  # Use GPU

# Extract features for each split (requires a CUDA device: extract_feats
# moves every image batch to the GPU).
for split in ['train', 'val', 'test']:
    extract_feats(params, model, load_image_fn, split)

Processing train videos: 100%|██████████| 1789/1789 [35:07<00:00,  1.18s/it]


Feature extraction for train set is complete.


Processing val videos: 100%|██████████| 542/542 [07:32<00:00,  1.20it/s]


Feature extraction for val set is complete.


Processing test videos: 100%|██████████| 546/546 [07:24<00:00,  1.23it/s]

Feature extraction for test set is complete.





In [None]:
#cleaning feature ,which is not match
import os
import json

def verify_and_fix_features(split, params):
    """Bring one split's videos, features and caption JSON back into agreement.

    Destructive: deletes ``.avi`` files that have no ``.npy`` feature file,
    rewrites the caption JSON keeping only entries that have a feature file, and
    deletes ``.npy`` files that match neither a video file nor a JSON entry.

    Args:
        split (str): Split name, e.g. 'train', 'val' or 'test'.
        params (dict): Must provide 'output_dir', the root of the split folders.
    """
    split_root = os.path.join(params['output_dir'], split)
    clips_dir = os.path.join(split_root, 'videos')
    feats_dir = os.path.join(split_root, 'features')
    captions_file = os.path.join(split_root, f"{split}_captions.json")

    # Load JSON
    with open(captions_file, 'r') as handle:
        payload = json.load(handle)
        listed_ids = {entry['video_id'] for entry in payload['videos']}

    def stems(folder, suffix):
        # File names in `folder` ending with `suffix`, without their extension.
        return {os.path.splitext(name)[0] for name in os.listdir(folder) if name.endswith(suffix)}

    clip_ids = stems(clips_dir, '.avi')
    feat_ids = stems(feats_dir, '.npy')

    # Work out both deletion sets up front, before anything is removed.
    clips_without_feats = clip_ids - feat_ids
    stray_feats = feat_ids - clip_ids - listed_ids

    # === ACTION 1: Delete corrupted video files (with no features)
    for vid in clips_without_feats:
        path = os.path.join(clips_dir, f"{vid}.avi")
        if not os.path.exists(path):
            continue
        os.remove(path)
        print(f"🗑️ Deleted incomplete video (no features): {path}")

    # === ACTION 2: Remove JSON entries with missing features
    videos_before = len(payload['videos'])
    captions_before = len(payload['sentences'])

    def has_features(record):
        return record['video_id'] in feat_ids

    payload['videos'] = list(filter(has_features, payload['videos']))
    payload['sentences'] = list(filter(has_features, payload['sentences']))

    with open(captions_file, 'w') as handle:
        json.dump(payload, handle, indent=4)

    print(f"✅ Cleaned JSON for [{split}]")
    print(f"🧮 Removed {videos_before - len(payload['videos'])} invalid video entries")
    print(f"🧾 Removed {captions_before - len(payload['sentences'])} invalid captions")

    # === ACTION 3: Delete stray feature files
    for vid in stray_feats:
        path = os.path.join(feats_dir, f"{vid}.npy")
        if not os.path.exists(path):
            continue
        os.remove(path)
        print(f"🧼 Deleted stray feature file: {path}")

    print(f"✅ [{split}] Verified and fixed. All valid videos now have features and JSON alignment.\n")

# Run for all splits
params = {
    'output_dir': '/content/drive/MyDrive/msvd_split',
}

for split in ['train', 'val', 'test']:
    verify_and_fix_features(split, params)

✅ Cleaned JSON for [train]
🧮 Removed 0 invalid video entries
🧾 Removed 0 invalid captions
✅ [train] Verified and fixed. All valid videos now have features and JSON alignment.

✅ Cleaned JSON for [val]
🧮 Removed 0 invalid video entries
🧾 Removed 0 invalid captions
✅ [val] Verified and fixed. All valid videos now have features and JSON alignment.

✅ Cleaned JSON for [test]
🧮 Removed 0 invalid video entries
🧾 Removed 0 invalid captions
✅ [test] Verified and fixed. All valid videos now have features and JSON alignment.



In [None]:
#dataset for captioning
import os
import json
import numpy as np
import torch
from torch.utils.data import Dataset

class VideoCaptionDataset(Dataset):
    """
    Dataset class for loading video features and their associated captions.

    Each item is a ``(video_tensor, caption_tensor)`` pair. The features are read
    from ``<feature_dir>/<video_id>.npy`` and the caption is picked at random
    (``np.random.choice``) among the captions listed for that video, so reading
    the same index twice may yield different captions.

    Captions are lower-cased, split on whitespace, wrapped in <SOS>/<EOS> and
    padded with <PAD> to exactly ``max_caption_length`` tokens. Captions that
    are too long are cut so that <EOS> is always kept as the final token.

    Args:
        feature_dir (str): Directory containing .npy feature files.
        json_path (str): Path to JSON file with "videos" and "sentences".
        vocab (dict): Vocabulary mapping words to indices. Must contain
            '<SOS>', '<EOS>', '<PAD>' and '<UNK>'.
        max_caption_length (int): Max length of tokenized captions (including <SOS> and <EOS>).
        verbose (bool): If True, prints sample-level debug info.
    """
    def __init__(self, feature_dir, json_path, vocab, max_caption_length=15, verbose=False):
        self.feature_dir = feature_dir
        self.vocab = vocab
        self.max_caption_length = max_caption_length
        self.verbose = verbose

        # Load JSON
        with open(json_path, 'r') as f:
            data = json.load(f)

        # Map video_id to all its captions
        self.video_captions = {}
        for item in data['sentences']:
            self.video_captions.setdefault(item['video_id'], []).append(item['caption'])

        # Keep video_ids that have both captions and feature files
        self.video_ids = [
            vid for vid in (v['video_id'] for v in data['videos'])
            if vid in self.video_captions and os.path.exists(self._feature_path(vid))
        ]

        if self.verbose:
            print("✅ Initialized VideoCaptionDataset")
            print(f"🧾 Total valid samples: {len(self.video_ids)}")

    def __len__(self):
        return len(self.video_ids)

    def _feature_path(self, video_id):
        """Return the path of the .npy feature file for ``video_id``."""
        return os.path.join(self.feature_dir, f"{video_id}.npy")

    def _encode_caption(self, caption):
        """Convert a caption string into a list of exactly ``max_caption_length`` token ids."""
        unk_id = self.vocab['<UNK>']
        # Reserve two slots for <SOS>/<EOS> so truncating a long caption never drops <EOS>.
        max_words = max(self.max_caption_length - 2, 0)
        word_ids = [self.vocab.get(word, unk_id) for word in caption.lower().split()[:max_words]]
        tokens = [self.vocab['<SOS>']] + word_ids + [self.vocab['<EOS>']]

        # Final guard for degenerate max_caption_length < 2, then pad to fixed length.
        tokens = tokens[:self.max_caption_length]
        tokens += [self.vocab['<PAD>']] * (self.max_caption_length - len(tokens))
        return tokens

    def __getitem__(self, idx):
        """
        Load one sample.

        Returns:
            tuple: ``(video_tensor, caption_tensor)`` — float32 features as stored
            in the .npy file, and a long tensor of ``max_caption_length`` token ids.

        Raises:
            RuntimeError: if the feature file cannot be loaded.
        """
        video_id = self.video_ids[idx]

        try:
            video_features = np.load(self._feature_path(video_id))
        except Exception as e:
            # Chain the original exception so the real cause stays in the traceback.
            raise RuntimeError(f"❌ Failed to load features for {video_id}: {e}") from e

        # Choose a random caption and tokenize
        caption = np.random.choice(self.video_captions[video_id])
        tokens = self._encode_caption(caption)

        caption_tensor = torch.tensor(tokens, dtype=torch.long)
        video_tensor = torch.tensor(video_features, dtype=torch.float32)

        # Debug preview (first sample only, to avoid flooding the output)
        if self.verbose and idx == 0:
            reverse_vocab = {v: k for k, v in self.vocab.items()}
            print(f"\n📦 Sample [{video_id}]")
            print("📝 Caption:", caption)
            print("🔢 Tokens :", tokens)
            print("🔠 Decoded:", [reverse_vocab.get(t, '<UNK>') for t in tokens])
            print("🎞️ Video Features Shape:", video_tensor.shape)
            print("🧠 Caption Tensor:", caption_tensor)

        return video_tensor, caption_tensor


In [None]:
# Smoke test: build the training dataset and look at its first sample
if __name__ == "__main__":
    # Inputs for the train split (features + captions) and the shared vocabulary
    feature_dir = '/content/drive/MyDrive/msvd_split/train/features'  # Path to train features directory
    json_path = '/content/drive/MyDrive/msvd_split/train/train_captions.json'  # Path to train captions JSON
    vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'  # Path to vocabulary JSON

    # Vocabulary: word -> index mapping stored as JSON
    with open(vocab_path, 'r') as f:
        vocab = json.load(f)

    # verbose=True makes the dataset print its own summary and a preview of sample 0
    dataset = VideoCaptionDataset(
        feature_dir=feature_dir,
        json_path=json_path,
        vocab=vocab,
        verbose=True,
    )
    print(f"Dataset size: {len(dataset)}")

    # Fetch the first item and report what came back
    video_features, caption_tensor = dataset[0]
    print("Sample Video Features Shape:", video_features.shape)
    print("Sample Caption Tensor:", caption_tensor)

✅ Initialized VideoCaptionDataset
🧾 Total valid samples: 965
Dataset size: 965

📦 Sample [1sffYOXq4Iw_23_49]
📝 Caption: a man is slicing a potato
🔢 Tokens : [1, 4, 7, 5, 33, 4, 65, 2, 0, 0, 0, 0, 0, 0, 0]
🔠 Decoded: ['<SOS>', 'a', 'man', 'is', 'slicing', 'a', 'potato', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
🎞️ Video Features Shape: torch.Size([40, 2048])
🧠 Caption Tensor: tensor([ 1,  4,  7,  5, 33,  4, 65,  2,  0,  0,  0,  0,  0,  0,  0])
Sample Video Features Shape: torch.Size([40, 2048])
Sample Caption Tensor: tensor([ 1,  4,  7,  5, 33,  4, 65,  2,  0,  0,  0,  0,  0,  0,  0])


In [None]:
if __name__ == "__main__":
    # Train-split inputs and shared vocabulary
    feature_dir = '/content/drive/MyDrive/msvd_split/train/features'
    json_path = '/content/drive/MyDrive/msvd_split/train/train_captions.json'
    vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'

    with open(vocab_path, 'r') as f:
        vocab = json.load(f)

    # Captions are padded/truncated to 15 tokens; verbose prints a preview of sample 0
    dataset = VideoCaptionDataset(feature_dir, json_path, vocab,
                                  max_caption_length=15, verbose=True)

    print(f"Dataset Size: {len(dataset)}")

    # Test sample access
    video_feat, cap_tensor = dataset[0]


✅ Initialized VideoCaptionDataset
🧾 Total valid samples: 965
Dataset Size: 965

📦 Sample [1sffYOXq4Iw_23_49]
📝 Caption: someone is slicing potatoes
🔢 Tokens : [1, 28, 5, 33, 189, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
🔠 Decoded: ['<SOS>', 'someone', 'is', 'slicing', 'potatoes', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
🎞️ Video Features Shape: torch.Size([40, 2048])
🧠 Caption Tensor: tensor([  1,  28,   5,  33, 189,   2,   0,   0,   0,   0,   0,   0,   0,   0,
          0])


In [None]:
# Vanilla S2VT Encoder-Decoder Model (No Attention)

import torch
import torch.nn as nn
import torch.nn.functional as F

class S2VTModel(nn.Module):
    """
    Two-RNN encoder-decoder for video captioning (no attention).

    ``rnn1`` encodes the frame features. ``rnn2`` decodes one word per step: its
    input is the last encoder output concatenated with the embedding of the
    previous word, and its initial state is the encoder's final state.

    Args:
        vocab_size (int): Number of entries in the vocabulary.
        max_len (int): Maximum caption length; at most ``max_len - 1`` decoding steps are run.
        dim_hidden (int): Hidden size of both RNNs.
        dim_word (int): Word-embedding size.
        dim_vid (int): Size of each frame feature vector.
        sos_id (int): Token id used to start decoding at inference time.
        eos_id (int): End-of-sentence token id (stored, not used by this class).
        n_layers (int): Number of stacked layers in each RNN.
        rnn_cell (str): 'lstm' (case-insensitive) for nn.LSTM; anything else selects nn.GRU.
        rnn_dropout_p (float): Dropout between stacked RNN layers (only applied when n_layers > 1).
    """
    def __init__(self, vocab_size, max_len=45, dim_hidden=1024, dim_word=512, dim_vid=2048,
                 sos_id=1, eos_id=0, n_layers=1, rnn_cell='lstm', rnn_dropout_p=0.3):
        super(S2VTModel, self).__init__()

        self.rnn_cell_type = rnn_cell.lower()
        self.rnn_cell = nn.LSTM if self.rnn_cell_type == 'lstm' else nn.GRU

        # PyTorch only applies RNN dropout between stacked layers and warns when a
        # non-zero value is given for a single layer, so pass 0 in that case.
        inter_layer_dropout = rnn_dropout_p if n_layers > 1 else 0.0

        self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers, batch_first=True,
                                  dropout=inter_layer_dropout)
        self.rnn2 = self.rnn_cell(dim_hidden + dim_word, dim_hidden, n_layers, batch_first=True,
                                  dropout=inter_layer_dropout)

        self.embedding = nn.Embedding(vocab_size, dim_word)
        self.out = nn.Linear(dim_hidden, vocab_size)

        self.dim_vid = dim_vid
        self.dim_hidden = dim_hidden
        self.dim_word = dim_word
        self.max_length = max_len
        self.vocab_size = vocab_size
        self.sos_id = sos_id
        self.eos_id = eos_id

    def forward(self, vid_feats, target_variable=None, mode='train', beam_width=3):
        """
        Run teacher-forced decoding (``mode='train'``) or greedy decoding (any other mode).

        Args:
            vid_feats (Tensor): Frame features of shape [B, T, dim_vid].
            target_variable (Tensor): Input token ids [B, L]; required when ``mode='train'``.
                At most the first ``max_len - 1`` columns are consumed.
            mode (str): 'train' for teacher forcing, otherwise greedy inference.
            beam_width (int): Accepted for interface compatibility; unused (decoding is greedy).

        Returns:
            tuple: ``(log_probs, None)`` in train mode, with log_probs of shape
            [B, steps, vocab_size]; ``(None, generated)`` otherwise, with generated
            token ids of shape [B, max_len] starting with ``sos_id``.

        Raises:
            ValueError: if ``mode='train'`` and ``target_variable`` is None.
        """
        batch_size, n_frames, _ = vid_feats.shape
        device = vid_feats.device

        encoder_outputs, state = self.rnn1(vid_feats)  # [B, T, H]
        # The decoder sees the same video summary at every step: the last encoder output.
        video_context = encoder_outputs[:, -1]  # [B, H]

        if mode == 'train':
            if target_variable is None:
                raise ValueError("target_variable is required when mode='train'")

            # Never index past the provided targets (shorter captions are allowed).
            n_steps = min(self.max_length - 1, target_variable.size(1))
            seq_probs = []
            for t in range(n_steps):
                current_word = self.embedding(target_variable[:, t])
                input2 = torch.cat((video_context, current_word), dim=1).unsqueeze(1)
                output2, state = self.rnn2(input2, state)
                log_probs = F.log_softmax(self.out(output2.squeeze(1)), dim=1)
                seq_probs.append(log_probs.unsqueeze(1))
            return torch.cat(seq_probs, dim=1), None

        # Inference mode: greedy decoding, always max_len - 1 steps.
        generated = torch.full((batch_size, 1), self.sos_id, dtype=torch.long, device=device)
        for _ in range(self.max_length - 1):
            emb = self.embedding(generated[:, -1])
            input2 = torch.cat((video_context, emb), dim=1).unsqueeze(1)
            output2, state = self.rnn2(input2, state)
            # argmax over logits equals argmax over log-softmax (monotonic transform).
            next_word = self.out(output2.squeeze(1)).argmax(dim=1, keepdim=True)
            generated = torch.cat([generated, next_word], dim=1)
        return None, generated


In [None]:
# Source code of the attention-based S2VTModel, kept as a string so that it can be
# written to Google Drive as a standalone module (S2VTModel.py). A later cell
# imports the class from that file (`from drive.MyDrive.S2VTModel import S2VTModel`).
# NOTE: this definition (BahdanauAttention + beam-search inference) is different
# from the vanilla S2VTModel class defined in the previous cell.
# NOTE(review): the beam-search branch builds a single 1-D token sequence and calls
# .item() on its last token, so it looks like it assumes batch size 1 — confirm.
model_code = '''import torch
from torch import nn
import torch.nn.functional as F

class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size, encoder_output_size):
        super(BahdanauAttention, self).__init__()
        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(encoder_output_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)

    def forward(self, hidden, encoder_outputs):
        hidden = hidden.unsqueeze(1)  # [B, 1, H]
        score = self.V(torch.tanh(self.W1(hidden) + self.W2(encoder_outputs)))  # [B, T, 1]
        attention_weights = F.softmax(score, dim=1)  # [B, T, 1]
        context = torch.sum(attention_weights * encoder_outputs, dim=1)  # [B, H]
        return context, attention_weights

class S2VTModel(nn.Module):
    def __init__(self, vocab_size, max_len=45, dim_hidden=1024, dim_word=512, dim_vid=2048,
                 sos_id=1, eos_id=0, n_layers=1, rnn_cell='lstm', rnn_dropout_p=0.3):
        super(S2VTModel, self).__init__()

        self.rnn_cell_type = rnn_cell.lower()
        self.rnn_cell = nn.LSTM if self.rnn_cell_type == 'lstm' else nn.GRU

        self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers, batch_first=True, dropout=rnn_dropout_p)
        self.rnn2 = self.rnn_cell(dim_hidden + dim_word, dim_hidden, n_layers, batch_first=True, dropout=rnn_dropout_p)

        self.attention = BahdanauAttention(dim_hidden, dim_hidden)

        self.embedding = nn.Embedding(vocab_size, dim_word)
        self.out = nn.Linear(dim_hidden, vocab_size)

        self.dim_vid = dim_vid
        self.dim_hidden = dim_hidden
        self.dim_word = dim_word
        self.max_length = max_len
        self.vocab_size = vocab_size
        self.sos_id = sos_id
        self.eos_id = eos_id

    def forward(self, vid_feats, target_variable=None, mode='train', beam_width=3):
        batch_size, n_frames, _ = vid_feats.shape
        device = vid_feats.device

        encoder_outputs, state1 = self.rnn1(vid_feats)  # [B, T, H]

        if mode == 'train':
            seq_probs = []
            for t in range(self.max_length - 1):
                current_word = self.embedding(target_variable[:, t])
                context, _ = self.attention(state1[0][-1] if self.rnn_cell_type == 'lstm' else state1[-1], encoder_outputs)
                input2 = torch.cat((context, current_word), dim=1).unsqueeze(1)
                output2, state2 = self.rnn2(input2, state1)
                logits = self.out(output2.squeeze(1))
                logits = F.log_softmax(logits, dim=1)
                seq_probs.append(logits.unsqueeze(1))
                state1 = state2
            return torch.cat(seq_probs, dim=1), None

        else:  # Beam Search Inference
            beams = [(torch.tensor([self.sos_id], device=device), 0.0, state1)]
            completed = []

            for _ in range(self.max_length - 1):
                new_beams = []
                for seq, score, state in beams:
                    last_word = seq[-1].unsqueeze(0)
                    if last_word.item() == self.eos_id:
                        completed.append((seq, score))
                        continue

                    emb = self.embedding(last_word).unsqueeze(0)  # [1, 1, D]
                    context, _ = self.attention(state[0][-1] if self.rnn_cell_type == 'lstm' else state[-1], encoder_outputs)
                    input2 = torch.cat((context, emb.squeeze(1)), dim=-1).unsqueeze(1)
                    output2, new_state = self.rnn2(input2, state)
                    logits = self.out(output2.squeeze(1))
                    log_probs = F.log_softmax(logits, dim=1)
                    topk_log_probs, topk_indices = torch.topk(log_probs, beam_width)

                    for k in range(beam_width):
                        new_seq = torch.cat([seq, topk_indices[0, k].unsqueeze(0)])
                        new_score = score + topk_log_probs[0, k].item()
                        new_beams.append((new_seq, new_score, new_state))

                beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

            best_seq = max(completed or beams, key=lambda x: x[1])[0]
            return None, best_seq.unsqueeze(0)

'''

# Write the module source to Google Drive (overwrites any existing S2VTModel.py)
with open('/content/drive/MyDrive/S2VTModel.py', 'w') as f:
    f.write(model_code)

print("✅ Model saved as S2VTModel.py in your Drive.")


✅ Model saved as S2VTModel.py in your Drive.


In [None]:
# Training loop with per-epoch checkpointing
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

def _caption_loss(model, criterion, video_feats, captions):
    """Teacher-forced loss for one batch: feed captions[:, :-1], score against captions[:, 1:]."""
    inputs = captions[:, :-1]
    targets = captions[:, 1:]
    outputs, _ = model(video_feats, target_variable=inputs, mode='train')  # [B, T, V]
    # reshape (not view) so non-contiguous model outputs are accepted too
    return criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))


def train_model(model, train_dataset, val_dataset, vocab, device,
                num_epochs=10, batch_size=8, learning_rate=1e-4, checkpoint_dir=None):
    """
    Train a captioning model with teacher forcing and validate after every epoch.

    Uses NLLLoss (ignoring '<PAD>' targets), Adam, and a StepLR schedule that
    halves the learning rate every 5 epochs. Per-epoch losses are printed.

    Args:
        model: Module called as ``model(video_feats, target_variable=..., mode='train')``
            and returning ``(log_probs, _)`` with log_probs of shape [B, T, V].
        train_dataset: Dataset yielding ``(video_features, caption_ids)`` pairs.
        val_dataset: Dataset with the same item format, used for validation.
        vocab (dict): Vocabulary; only ``vocab['<PAD>']`` is read here.
        device: torch device the model and batches are moved to.
        num_epochs (int): Number of passes over the training set.
        batch_size (int): Batch size for both loaders.
        learning_rate (float): Initial Adam learning rate.
        checkpoint_dir (str | None): If set, the directory is created when missing
            and ``checkpoint_epoch_<n>.pt`` is written there after every epoch.

    Raises:
        ValueError: if ``train_dataset`` yields no batches.
    """
    import os  # local import: the cell defining this function does not import os

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    if len(train_loader) == 0:
        raise ValueError("train_dataset is empty; nothing to train on")

    # Create the target directory up front so a missing folder does not cost
    # a whole epoch before torch.save fails.
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    model = model.to(device)
    criterion = nn.NLLLoss(ignore_index=vocab['<PAD>'])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_train_loss = 0

        print(f"\n🚀 Epoch [{epoch}/{num_epochs}]")

        for video_feats, captions in tqdm(train_loader, desc='Training'):
            video_feats, captions = video_feats.to(device), captions.to(device)

            optimizer.zero_grad()
            loss = _caption_loss(model, criterion, video_feats, captions)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)
        print(f"✅ Training Loss: {avg_train_loss:.4f}")

        # === Validation ===
        # mode='train' is still used inside the model so the loss is teacher-forced;
        # model.eval() only switches off dropout etc.
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for video_feats, captions in tqdm(val_loader, desc='Validation'):
                video_feats, captions = video_feats.to(device), captions.to(device)
                total_val_loss += _caption_loss(model, criterion, video_feats, captions).item()

        # An empty validation set yields NaN instead of a ZeroDivisionError.
        avg_val_loss = total_val_loss / len(val_loader) if len(val_loader) else float('nan')
        print(f"🧪 Validation Loss: {avg_val_loss:.4f}")

        scheduler.step()

        # Save checkpoint
        if checkpoint_dir:
            checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.pt")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                # needed to resume with the correct learning rate
                'scheduler_state_dict': scheduler.state_dict(),
                'train_loss': avg_train_loss,
                'val_loss': avg_val_loss,
            }, checkpoint_path)
            print(f"💾 Saved checkpoint to {checkpoint_path}")


In [None]:
from torch.utils.data import DataLoader
import json

# === Inputs: features dir + captions JSON per split, plus the shared vocabulary ===
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
feature_dir_train = '/content/drive/MyDrive/msvd_split/train/features'
json_train = '/content/drive/MyDrive/msvd_split/train/train_captions.json'
feature_dir_val = '/content/drive/MyDrive/msvd_split/val/features'
json_val = '/content/drive/MyDrive/msvd_split/val/val_captions.json'

# === Vocabulary (word -> index) ===
with open(vocab_path, 'r') as f:
    vocab = json.load(f)

# === Datasets: train first, then val; captions fixed to 45 tokens ===
train_dataset, val_dataset = (
    VideoCaptionDataset(
        feature_dir=split_features,
        json_path=split_json,
        vocab=vocab,
        max_caption_length=45,
    )
    for split_features, split_json in (
        (feature_dir_train, json_train),
        (feature_dir_val, json_val),
    )
)


In [None]:
from drive.MyDrive.S2VTModel import S2VTModel

# Prefer the GPU when one is visible to PyTorch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 2-layer LSTM captioner; special-token ids come from the loaded vocabulary
model = S2VTModel(
    vocab_size=len(vocab), max_len=45,
    dim_hidden=1024, dim_word=512, dim_vid=2048,
    sos_id=vocab['<SOS>'], eos_id=vocab['<EOS>'],
    n_layers=2, rnn_cell='lstm', rnn_dropout_p=0.2,
)

# 30 epochs, checkpoints written to Drive after every epoch
train_model(
    model=model,
    train_dataset=train_dataset, val_dataset=val_dataset,
    vocab=vocab, device=device,
    num_epochs=30, batch_size=8, learning_rate=1e-4,
    checkpoint_dir="/content/drive/MyDrive/msvd_split/checkpoints",
)



🚀 Epoch [1/30]


Training: 100%|██████████| 121/121 [09:53<00:00,  4.91s/it]


✅ Training Loss: 5.7049


Validation: 100%|██████████| 6/6 [00:20<00:00,  3.37s/it]


🧪 Validation Loss: 4.5177
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_1.pt

🚀 Epoch [2/30]


Training: 100%|██████████| 121/121 [00:55<00:00,  2.17it/s]


✅ Training Loss: 4.7778


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.52it/s]


🧪 Validation Loss: 4.3640
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_2.pt

🚀 Epoch [3/30]


Training: 100%|██████████| 121/121 [00:54<00:00,  2.21it/s]


✅ Training Loss: 4.5346


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.33it/s]


🧪 Validation Loss: 4.3532
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_3.pt

🚀 Epoch [4/30]


Training: 100%|██████████| 121/121 [00:55<00:00,  2.16it/s]


✅ Training Loss: 4.4049


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.42it/s]


🧪 Validation Loss: 4.1124
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_4.pt

🚀 Epoch [5/30]


Training: 100%|██████████| 121/121 [00:54<00:00,  2.23it/s]


✅ Training Loss: 4.1829


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.37it/s]


🧪 Validation Loss: 3.7684
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_5.pt

🚀 Epoch [6/30]


Training: 100%|██████████| 121/121 [00:54<00:00,  2.21it/s]


✅ Training Loss: 4.2357


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.76it/s]


🧪 Validation Loss: 3.8354
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_6.pt

🚀 Epoch [7/30]


Training: 100%|██████████| 121/121 [00:54<00:00,  2.21it/s]


✅ Training Loss: 4.1720


Validation: 100%|██████████| 6/6 [00:00<00:00, 11.97it/s]


🧪 Validation Loss: 3.6494
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_7.pt

🚀 Epoch [8/30]


Training: 100%|██████████| 121/121 [00:55<00:00,  2.20it/s]


✅ Training Loss: 4.1088


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.56it/s]


🧪 Validation Loss: 3.7368
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_8.pt

🚀 Epoch [9/30]


Training: 100%|██████████| 121/121 [00:55<00:00,  2.18it/s]


✅ Training Loss: 4.0950


Validation: 100%|██████████| 6/6 [00:00<00:00, 11.74it/s]


🧪 Validation Loss: 4.2432
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_9.pt

🚀 Epoch [10/30]


Training: 100%|██████████| 121/121 [00:55<00:00,  2.20it/s]


✅ Training Loss: 4.0317


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.59it/s]


🧪 Validation Loss: 3.8027
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_10.pt

🚀 Epoch [11/30]


Training: 100%|██████████| 121/121 [00:56<00:00,  2.14it/s]


✅ Training Loss: 3.9290


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.23it/s]


🧪 Validation Loss: 3.4410
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_11.pt

🚀 Epoch [12/30]


Training: 100%|██████████| 121/121 [00:55<00:00,  2.18it/s]


✅ Training Loss: 4.0207


Validation: 100%|██████████| 6/6 [00:00<00:00, 11.94it/s]


🧪 Validation Loss: 3.7534
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_12.pt

🚀 Epoch [13/30]


Training: 100%|██████████| 121/121 [00:57<00:00,  2.10it/s]


✅ Training Loss: 3.9466


Validation: 100%|██████████| 6/6 [00:00<00:00, 11.32it/s]


🧪 Validation Loss: 4.1549
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_13.pt

🚀 Epoch [14/30]


Training: 100%|██████████| 121/121 [00:57<00:00,  2.10it/s]


✅ Training Loss: 3.8821


Validation: 100%|██████████| 6/6 [00:00<00:00, 10.30it/s]


🧪 Validation Loss: 3.8751
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_14.pt

🚀 Epoch [15/30]


Training: 100%|██████████| 121/121 [00:55<00:00,  2.17it/s]


✅ Training Loss: 3.9862


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.18it/s]


🧪 Validation Loss: 3.1661
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_15.pt

🚀 Epoch [16/30]


Training: 100%|██████████| 121/121 [00:54<00:00,  2.21it/s]


✅ Training Loss: 3.9115


Validation: 100%|██████████| 6/6 [00:00<00:00, 11.15it/s]


🧪 Validation Loss: 3.7140
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_16.pt

🚀 Epoch [17/30]


Training: 100%|██████████| 121/121 [00:54<00:00,  2.22it/s]


✅ Training Loss: 3.8798


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.31it/s]


🧪 Validation Loss: 3.5871
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_17.pt

🚀 Epoch [18/30]


Training: 100%|██████████| 121/121 [00:54<00:00,  2.20it/s]


✅ Training Loss: 3.8448


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.18it/s]


🧪 Validation Loss: 3.6615
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_18.pt

🚀 Epoch [19/30]


Training: 100%|██████████| 121/121 [00:54<00:00,  2.21it/s]


✅ Training Loss: 3.8619


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.74it/s]


🧪 Validation Loss: 3.5195
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_19.pt

🚀 Epoch [20/30]


Training: 100%|██████████| 121/121 [00:55<00:00,  2.18it/s]


✅ Training Loss: 3.8771


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.19it/s]


🧪 Validation Loss: 3.4527
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_20.pt

🚀 Epoch [21/30]


Training: 100%|██████████| 121/121 [00:56<00:00,  2.15it/s]


✅ Training Loss: 3.7955


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.26it/s]


🧪 Validation Loss: 3.4630
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_21.pt

🚀 Epoch [22/30]


Training: 100%|██████████| 121/121 [00:54<00:00,  2.20it/s]


✅ Training Loss: 3.8904


Validation: 100%|██████████| 6/6 [00:00<00:00, 11.88it/s]


🧪 Validation Loss: 3.4151
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_22.pt

🚀 Epoch [23/30]


Training: 100%|██████████| 121/121 [01:00<00:00,  2.00it/s]


✅ Training Loss: 3.8360


Validation: 100%|██████████| 6/6 [00:00<00:00, 11.94it/s]


🧪 Validation Loss: 3.6488
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_23.pt

🚀 Epoch [24/30]


Training: 100%|██████████| 121/121 [00:57<00:00,  2.10it/s]


✅ Training Loss: 3.7513


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.62it/s]


🧪 Validation Loss: 3.2991
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_24.pt

🚀 Epoch [25/30]


Training: 100%|██████████| 121/121 [00:56<00:00,  2.13it/s]


✅ Training Loss: 3.8303


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.19it/s]


🧪 Validation Loss: 3.5951
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_25.pt

🚀 Epoch [26/30]


Training: 100%|██████████| 121/121 [00:54<00:00,  2.20it/s]


✅ Training Loss: 3.7781


Validation: 100%|██████████| 6/6 [00:00<00:00, 11.88it/s]


🧪 Validation Loss: 3.5658
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_26.pt

🚀 Epoch [27/30]


Training: 100%|██████████| 121/121 [00:57<00:00,  2.09it/s]


✅ Training Loss: 3.7780


Validation: 100%|██████████| 6/6 [00:00<00:00, 11.24it/s]


🧪 Validation Loss: 3.4334
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_27.pt

🚀 Epoch [28/30]


Training: 100%|██████████| 121/121 [00:54<00:00,  2.21it/s]


✅ Training Loss: 3.7344


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.21it/s]


🧪 Validation Loss: 3.7232
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_28.pt

🚀 Epoch [29/30]


Training: 100%|██████████| 121/121 [01:00<00:00,  1.99it/s]


✅ Training Loss: 3.7922


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.29it/s]


🧪 Validation Loss: 3.2920
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_29.pt

🚀 Epoch [30/30]


Training: 100%|██████████| 121/121 [00:55<00:00,  2.19it/s]


✅ Training Loss: 3.7795


Validation: 100%|██████████| 6/6 [00:00<00:00, 12.18it/s]


🧪 Validation Loss: 3.3980
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt


In [None]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=8bb20a4b5f3289096de90e719d4d9a73219f28b09dddfc6bfb2f3b121141716b
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
!git clone https://github.com/tylin/coco-caption
!cp -r coco-caption/pycocoevalcap /usr/local/lib/python3.*/dist-packages/


Cloning into 'coco-caption'...
remote: Enumerating objects: 736, done.[K
remote: Total 736 (delta 0), reused 0 (delta 0), pack-reused 736 (from 1)[K
Receiving objects: 100% (736/736), 130.04 MiB | 15.39 MiB/s, done.
Resolving deltas: 100% (390/390), done.
Updating files: 100% (47/47), done.


In [None]:
# === Install dependencies for metrics ===
!pip install nltk rouge-score
!git clone https://github.com/tylin/coco-caption
!pip install git+https://github.com/salaniz/pycocoevalcap


fatal: destination path 'coco-caption' already exists and is not an empty directory.
Collecting git+https://github.com/salaniz/pycocoevalcap
  Cloning https://github.com/salaniz/pycocoevalcap to /tmp/pip-req-build-6k2d6tmn
  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap /tmp/pip-req-build-6k2d6tmn
  Resolved https://github.com/salaniz/pycocoevalcap to commit a24f74c408c918f1f4ec34e9514bc8a76ce41ffd
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pycocoevalcap
  Building wheel for pycocoevalcap (setup.py) ... [?25l[?25hdone
  Created wheel for pycocoevalcap: filename=pycocoevalcap-1.2-py3-none-any.whl size=104312245 sha256=484bf33c8e0b25c47a4b7362cca15f8596a6d77d510b401f75d94653b4fb0ab7
  Stored in directory: /tmp/pip-ephem-wheel-cache-6m3_nd7r/wheels/d2/1f/44/6485e566f8ae3d42b56e7c05fd50a3bbb70a50b0e6e7c55212
Successfully built pycocoevalcap
Installing collected packages: pycocoevalcap
Succes

In [None]:
!git clone https://github.com/tylin/coco-caption
!pip install git+https://github.com/salaniz/pycocoevalcap


fatal: destination path 'coco-caption' already exists and is not an empty directory.
Collecting git+https://github.com/salaniz/pycocoevalcap
  Cloning https://github.com/salaniz/pycocoevalcap to /tmp/pip-req-build-8or2s3ui
  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap /tmp/pip-req-build-8or2s3ui
  Resolved https://github.com/salaniz/pycocoevalcap to commit a24f74c408c918f1f4ec34e9514bc8a76ce41ffd
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import json
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from collections import defaultdict
from tqdm import tqdm
from pycocoevalcap.cider.cider import Cider

def evaluate_all_metrics(pred_path, ref_json_path):
    """
    Score predicted captions against reference captions and print the results.

    Predictions whose video_id has no reference caption are skipped. All text is
    lower-cased and split on whitespace before scoring.

    Args:
        pred_path (str): JSON file mapping video_id -> predicted caption string.
        ref_json_path (str): JSON file with a "sentences" list whose items carry
            "video_id" and "caption" (the references).

    Returns:
        dict: metric name -> score, with keys "BLEU-1", "BLEU-2", "BLEU-3",
        "BLEU-4", "METEOR", "ROUGE-L" and "CIDEr".

    Raises:
        ValueError: if no prediction has at least one reference caption.
    """
    # Load predicted captions
    with open(pred_path, 'r') as f:
        preds = json.load(f)

    # Load ground truth captions
    with open(ref_json_path, 'r') as f:
        data = json.load(f)

    # Prepare refs dict: video_id → list of reference captions
    refs = defaultdict(list)
    for item in data['sentences']:
        refs[item['video_id']].append(item['caption'].lower())

    # Collect predictions and matching references
    gt, pr = [], []
    meteor_scores = []
    rouge_scores = []
    # Cider.compute_score expects dicts keyed by id: id -> [refs] and id -> [hypothesis]
    cider_refs = {}
    cider_hyps = {}

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth_fn = SmoothingFunction().method1

    for video_id, pred_caption in tqdm(preds.items(), desc="Evaluating"):
        pred_caption = pred_caption.lower()
        references = refs.get(video_id, [])
        if not references:
            continue

        ref_tokens = [ref.split() for ref in references]
        pred_tokens = pred_caption.split()
        gt.append(ref_tokens)
        pr.append(pred_tokens)

        # Recent NLTK releases require pre-tokenized references and hypothesis.
        meteor_scores.append(meteor_score(ref_tokens, pred_tokens))

        # Multi-reference ROUGE-L: best match over the individual references,
        # instead of scoring against all references glued into one string.
        rouge_scores.append(max(
            scorer.score(ref, pred_caption)['rougeL'].fmeasure for ref in references
        ))

        cider_refs[video_id] = references
        cider_hyps[video_id] = [pred_caption]

    if not pr:
        raise ValueError("No predictions matched any reference captions; nothing to evaluate")

    # Compute scores
    bleu1 = corpus_bleu(gt, pr, weights=(1, 0, 0, 0), smoothing_function=smooth_fn)
    bleu2 = corpus_bleu(gt, pr, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth_fn)
    bleu3 = corpus_bleu(gt, pr, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smooth_fn)
    bleu4 = corpus_bleu(gt, pr, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth_fn)
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    avg_rouge = sum(rouge_scores) / len(rouge_scores)
    cider_scorer = Cider()
    cider_score, _ = cider_scorer.compute_score(cider_refs, cider_hyps)

    results = {
        "BLEU-1": float(bleu1),
        "BLEU-2": float(bleu2),
        "BLEU-3": float(bleu3),
        "BLEU-4": float(bleu4),
        "METEOR": float(avg_meteor),
        "ROUGE-L": float(avg_rouge),
        "CIDEr": float(cider_score),
    }

    # Show Results
    print("\n📊 Evaluation Results:")
    for metric_name, value in results.items():
        # Left-align names to 8 columns so the colons line up
        print(f"{metric_name:<8}: {value:.4f}")

    return results


In [None]:
import torch
import json
from tqdm import tqdm

def generate_predictions(model, dataset, vocab_rev, device):
    """
    Generate one caption per dataset item with the model's inference mode.

    Args:
        model: Called as ``model(video_feats, mode='inference')``; must return
            ``(_, output_seq)`` where output_seq holds the generated token ids
            for a batch of one video.
        dataset: Indexable dataset returning ``(video_features, _)`` and exposing
            ``video_ids`` aligned with its indices.
        vocab_rev (dict): Token id -> word. Keys may be strings (as after a JSON
            round-trip) or ints; unknown ids decode to '<UNK>'.
        device: torch device the features are moved to.

    Returns:
        dict: video_id -> caption string. Decoding stops at the first '<EOS>' or
        '<PAD>'; '<SOS>' tokens are dropped.
    """
    model.eval()
    predictions = {}

    with torch.no_grad():
        for idx in tqdm(range(len(dataset)), desc="Generating Captions"):
            video_feats, _ = dataset[idx]
            video_feats = video_feats.unsqueeze(0).to(device)

            _, output_seq = model(video_feats, mode='inference')

            # reshape(-1) always yields a list, even for a single generated token
            # (squeeze() would collapse it to a 0-d tensor and tolist() to an int).
            tokens = output_seq.reshape(-1).tolist()
            caption = []
            for tok in tokens:
                # String keys first (original behaviour), then int keys.
                word = vocab_rev.get(str(tok), vocab_rev.get(tok, '<UNK>'))
                if word in ('<EOS>', '<PAD>'):
                    break
                if word != '<SOS>':
                    caption.append(word)

            predictions[dataset.video_ids[idx]] = ' '.join(caption)

    return predictions


In [None]:
import os

# Checkpoints go to Google Drive; the directory is created up front if it is missing.
checkpoint_path = "/content/drive/MyDrive/msvd_split/checkpoints"
os.makedirs(checkpoint_path, exist_ok=True)

# Then pass this to training
# NOTE(review): model, train_dataset, val_dataset, vocab and device are not defined in
# this cell; they must already exist in the kernel from earlier cells.
# NOTE(review): this call uses dataset/batch_size/learning_rate/checkpoint_dir keywords.
# The loader-based train_model(model, train_loader, val_loader, ...) defined further down
# in this notebook does not accept them, so this cell needs the earlier definition.
train_model(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    vocab=vocab,
    device=device,
    num_epochs=30,
    batch_size=8,
    learning_rate=1e-4,
    checkpoint_dir=checkpoint_path
)



🚀 Epoch [1/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.23it/s]


✅ Training Loss: 4.5071


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.67it/s]


🧪 Validation Loss: 4.1655
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_1.pt

🚀 Epoch [2/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.13it/s]


✅ Training Loss: 3.9500


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.54it/s]


🧪 Validation Loss: 3.8747
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_2.pt

🚀 Epoch [3/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.15it/s]


✅ Training Loss: 3.8790


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.44it/s]


🧪 Validation Loss: 4.3683
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_3.pt

🚀 Epoch [4/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.10it/s]


✅ Training Loss: 3.7757


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.49it/s]


🧪 Validation Loss: 4.1915
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_4.pt

🚀 Epoch [5/30]


Training: 100%|██████████| 157/157 [00:39<00:00,  4.01it/s]


✅ Training Loss: 3.6728


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.55it/s]


🧪 Validation Loss: 3.8768
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_5.pt

🚀 Epoch [6/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.14it/s]


✅ Training Loss: 3.5943


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.19it/s]


🧪 Validation Loss: 3.7895
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_6.pt

🚀 Epoch [7/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.11it/s]


✅ Training Loss: 3.5981


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.29it/s]


🧪 Validation Loss: 3.5976
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_7.pt

🚀 Epoch [8/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.11it/s]


✅ Training Loss: 3.4965


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.59it/s]


🧪 Validation Loss: 3.4450
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_8.pt

🚀 Epoch [9/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.05it/s]


✅ Training Loss: 3.4422


Validation: 100%|██████████| 12/12 [00:00<00:00, 13.74it/s]


🧪 Validation Loss: 3.5100
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_9.pt

🚀 Epoch [10/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.15it/s]


✅ Training Loss: 3.4471


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.70it/s]


🧪 Validation Loss: 3.7160
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_10.pt

🚀 Epoch [11/30]


Training: 100%|██████████| 157/157 [00:45<00:00,  3.47it/s]


✅ Training Loss: 3.3544


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.61it/s]


🧪 Validation Loss: 3.5924
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_11.pt

🚀 Epoch [12/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.05it/s]


✅ Training Loss: 3.4078


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.48it/s]


🧪 Validation Loss: 3.4277
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_12.pt

🚀 Epoch [13/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.12it/s]


✅ Training Loss: 3.3239


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.62it/s]


🧪 Validation Loss: 3.4064
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_13.pt

🚀 Epoch [14/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.14it/s]


✅ Training Loss: 3.3630


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.27it/s]


🧪 Validation Loss: 3.3922
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_14.pt

🚀 Epoch [15/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.16it/s]


✅ Training Loss: 3.3459


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.45it/s]


🧪 Validation Loss: 3.5659
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_15.pt

🚀 Epoch [16/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.10it/s]


✅ Training Loss: 3.2915


Validation: 100%|██████████| 12/12 [00:00<00:00, 13.20it/s]


🧪 Validation Loss: 3.5713
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_16.pt

🚀 Epoch [17/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.10it/s]


✅ Training Loss: 3.2815


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.48it/s]


🧪 Validation Loss: 3.5616
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_17.pt

🚀 Epoch [18/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.06it/s]


✅ Training Loss: 3.2674


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.81it/s]


🧪 Validation Loss: 3.6604
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_18.pt

🚀 Epoch [19/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.15it/s]


✅ Training Loss: 3.2831


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.44it/s]


🧪 Validation Loss: 3.4520
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_19.pt

🚀 Epoch [20/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.14it/s]


✅ Training Loss: 3.2817


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.23it/s]


🧪 Validation Loss: 3.5978
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_20.pt

🚀 Epoch [21/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.09it/s]


✅ Training Loss: 3.2260


Validation: 100%|██████████| 12/12 [00:00<00:00, 13.98it/s]


🧪 Validation Loss: 3.4773
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_21.pt

🚀 Epoch [22/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.11it/s]


✅ Training Loss: 3.1978


Validation: 100%|██████████| 12/12 [00:00<00:00, 13.49it/s]


🧪 Validation Loss: 3.3894
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_22.pt

🚀 Epoch [23/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.09it/s]


✅ Training Loss: 3.2879


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.55it/s]


🧪 Validation Loss: 3.3571
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_23.pt

🚀 Epoch [24/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.13it/s]


✅ Training Loss: 3.2479


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.48it/s]


🧪 Validation Loss: 3.6071
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_24.pt

🚀 Epoch [25/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.12it/s]


✅ Training Loss: 3.2296


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.60it/s]


🧪 Validation Loss: 3.2628
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_25.pt

🚀 Epoch [26/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.14it/s]


✅ Training Loss: 3.1683


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.70it/s]


🧪 Validation Loss: 3.4404
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_26.pt

🚀 Epoch [27/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.13it/s]


✅ Training Loss: 3.2643


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.57it/s]


🧪 Validation Loss: 3.4990
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_27.pt

🚀 Epoch [28/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.08it/s]


✅ Training Loss: 3.2574


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.48it/s]


🧪 Validation Loss: 3.3379
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_28.pt

🚀 Epoch [29/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.06it/s]


✅ Training Loss: 3.1511


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.47it/s]


🧪 Validation Loss: 3.5642
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_29.pt

🚀 Epoch [30/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.11it/s]


✅ Training Loss: 3.2400


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.07it/s]


🧪 Validation Loss: 3.4547
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt


In [None]:
# NOTE: this install fails. pip reports that the repository has neither setup.py nor
# pyproject.toml (see the error output below). The Cider scorer used later in the notebook
# is imported from pycocoevalcap, which a following cell installs.
!pip install git+https://github.com/vrama91/cider

Collecting git+https://github.com/vrama91/cider
  Cloning https://github.com/vrama91/cider to /tmp/pip-req-build-dam9k5rl
  Running command git clone --filter=blob:none --quiet https://github.com/vrama91/cider /tmp/pip-req-build-dam9k5rl
  Resolved https://github.com/vrama91/cider to commit f281464c60d496b8fabb89a7a8c120f655a3a2bd
[31mERROR: git+https://github.com/vrama91/cider does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [None]:
# ⬇️ Install pycocoevalcap from working fork
!git clone https://github.com/salaniz/pycocoevalcap.git
!pip install -e ./pycocoevalcap


Cloning into 'pycocoevalcap'...
remote: Enumerating objects: 821, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 821 (delta 4), reused 3 (delta 3), pack-reused 809 (from 2)[K
Receiving objects: 100% (821/821), 130.06 MiB | 25.70 MiB/s, done.
Resolving deltas: 100% (424/424), done.
Obtaining file:///content/pycocoevalcap
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: pycocoevalcap
  Running setup.py develop for pycocoevalcap
Successfully installed pycocoevalcap-1.2


In [None]:
from pycocoevalcap.cider.cider import Cider


In [None]:
import os, json, torch, torch.nn as nn, torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import single_meteor_score
from pycocoevalcap.cider.cider import Cider

In [None]:

import torch
import torch.nn as nn
import torch.nn.functional as F

class S2VTModel(nn.Module):
    """Two-RNN video captioner: one RNN encodes frame features, one decodes words.

    ``rnn1`` reads the frame features. At every decoding step ``rnn2`` receives
    the encoder output of the last frame concatenated with the embedding of the
    previous word, and a linear layer maps its output to vocabulary scores.

    Args:
        vocab_size: number of entries in the vocabulary (embedding rows / output units).
        max_len: maximum caption length; at most ``max_len - 1`` decoding steps are run.
        dim_hidden: hidden size of both RNNs.
        dim_word: word-embedding size.
        dim_vid: size of one frame feature vector.
        sos_id: token id fed to the decoder at the first inference step.
        eos_id: end-of-sentence token id (stored; decoding does not stop early on it).
        n_layers: number of stacked layers in each RNN.
        rnn_cell: ``'lstm'`` (case-insensitive) selects ``nn.LSTM``; any other value
            selects ``nn.GRU``.
        rnn_dropout_p: dropout between stacked RNN layers; only applied when
            ``n_layers > 1``.
    """

    def __init__(self, vocab_size, max_len=45, dim_hidden=1024, dim_word=512, dim_vid=2048,
                 sos_id=1, eos_id=0, n_layers=1, rnn_cell='lstm', rnn_dropout_p=0.3):
        super(S2VTModel, self).__init__()

        self.rnn_cell_type = rnn_cell.lower()
        self.rnn_cell = nn.LSTM if self.rnn_cell_type == 'lstm' else nn.GRU

        # PyTorch RNN dropout acts between stacked layers only; with a single layer a
        # non-zero value has no effect and just raises a UserWarning, so pass 0 there.
        inter_layer_dropout = rnn_dropout_p if n_layers > 1 else 0.0

        self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers,
                                  batch_first=True, dropout=inter_layer_dropout)
        self.rnn2 = self.rnn_cell(dim_hidden + dim_word, dim_hidden, n_layers,
                                  batch_first=True, dropout=inter_layer_dropout)

        self.embedding = nn.Embedding(vocab_size, dim_word)
        self.out = nn.Linear(dim_hidden, vocab_size)

        self.dim_vid = dim_vid
        self.dim_hidden = dim_hidden
        self.dim_word = dim_word
        self.max_length = max_len
        self.vocab_size = vocab_size
        self.sos_id = sos_id
        self.eos_id = eos_id

    def forward(self, vid_feats, target_variable=None, mode='train', beam_width=3):
        """Run teacher-forced training or greedy inference.

        Args:
            vid_feats: 3-D float tensor ``(batch, n_frames, dim_vid)``.
            target_variable: ``(batch, length)`` tensor of token ids; required when
                ``mode == 'train'``. Column ``t`` is fed as the input word of step ``t``.
            mode: ``'train'`` for teacher forcing; any other value runs greedy decoding.
            beam_width: accepted for interface compatibility; decoding is greedy and
                this value is not used.

        Returns:
            ``(log_probs, None)`` in train mode, where ``log_probs`` is
            ``(batch, steps, vocab_size)`` and
            ``steps = min(max_len - 1, target_variable.size(1))``;
            ``(None, token_ids)`` otherwise, where ``token_ids`` is ``(batch, max_len)``
            and starts with ``sos_id``.

        Raises:
            ValueError: if ``mode == 'train'`` and ``target_variable`` is None.
        """
        batch_size, _, _ = vid_feats.shape  # also rejects inputs that are not 3-D
        device = vid_feats.device

        encoder_outputs, state = self.rnn1(vid_feats)
        # The decoder is conditioned on the last encoder output at every step,
        # so slice it once instead of inside the loops.
        video_context = encoder_outputs[:, -1]

        if mode == 'train':
            if target_variable is None:
                raise ValueError("target_variable is required when mode='train'")

            # Never index past the end of a target shorter than max_len - 1.
            n_steps = min(self.max_length - 1, target_variable.size(1))
            seq_probs = []
            for t in range(n_steps):
                current_word = self.embedding(target_variable[:, t])
                step_input = torch.cat((video_context, current_word), dim=1).unsqueeze(1)
                output2, state = self.rnn2(step_input, state)
                log_probs = F.log_softmax(self.out(output2.squeeze(1)), dim=1)
                seq_probs.append(log_probs.unsqueeze(1))
            return torch.cat(seq_probs, dim=1), None

        generated = torch.full((batch_size, 1), self.sos_id, dtype=torch.long, device=device)
        for _ in range(self.max_length - 1):
            emb = self.embedding(generated[:, -1])
            step_input = torch.cat((video_context, emb), dim=1).unsqueeze(1)
            output2, state = self.rnn2(step_input, state)
            logits = self.out(output2.squeeze(1))
            # log_softmax is monotonic, so the argmax can be taken on the raw logits.
            next_word = logits.argmax(dim=1, keepdim=True)
            generated = torch.cat([generated, next_word], dim=1)
        return None, generated


In [None]:
class VideoCaptionDataset(Dataset):
    """(video features, caption token ids) pairs, one item per caption.

    Two JSON layouts are accepted:

    * ``{"annotations": [{"video_id": ..., "caption": ...}, ...]}``
    * ``{"videos": [{"video_id": ...}, ...], "sentences": [{"video_id": ..., "caption": ...}, ...]}``
      (the layout of the split files inspected in this notebook).

    Args:
        feature_dir: directory holding one ``<video_id>.npy`` feature file per video.
        json_path: path to the caption JSON file.
        vocab: mapping word -> token id; must contain ``<PAD>``, ``<SOS>``, ``<EOS>``
            and ``<UNK>``.
        max_caption_length: exact length of every returned caption tensor.
    """

    def __init__(self, feature_dir, json_path, vocab, max_caption_length=45):
        self.feature_dir = feature_dir
        self.vocab = vocab
        self.max_caption_length = max_caption_length
        with open(json_path, 'r') as f:
            data = json.load(f)

        if 'annotations' in data:
            self.annotations = data['annotations']
        else:
            # Group sentences by video, then emit one item per caption in video order.
            captions_by_video = {}
            for sent in data['sentences']:
                captions_by_video.setdefault(sent['video_id'], []).append(sent['caption'])
            self.annotations = [
                {'video_id': video['video_id'], 'caption': cap}
                for video in data['videos']
                for cap in captions_by_video.get(video['video_id'], [])
            ]

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return ``(features, caption)`` for item ``idx``.

        ``features`` is the float tensor loaded from ``<video_id>.npy``. ``caption`` is a
        long tensor of exactly ``max_caption_length`` ids: ``<SOS>``, the word ids
        (truncated if needed), ``<EOS>``, then ``<PAD>`` up to the fixed length, so the
        default DataLoader collate can stack a batch.
        """
        item = self.annotations[idx]
        feature_path = os.path.join(self.feature_dir, item['video_id'] + '.npy')
        features = torch.tensor(np.load(feature_path)).float()

        words = item['caption']
        if isinstance(words, str):
            # Iterating a str yields characters, not words. No case folding or
            # punctuation stripping is applied here - TODO confirm this matches how
            # vocab.json was built.
            words = words.split()

        unk_id = self.vocab['<UNK>']
        room_for_words = max(self.max_caption_length - 2, 0)  # keep space for SOS/EOS
        word_ids = [self.vocab.get(w, unk_id) for w in words][:room_for_words]

        ids = [self.vocab['<SOS>']] + word_ids + [self.vocab['<EOS>']]
        ids += [self.vocab['<PAD>']] * (self.max_caption_length - len(ids))
        caption = torch.tensor(ids[:self.max_caption_length], dtype=torch.long)
        return features, caption


In [None]:
def train_model(model, train_loader, val_loader, device, vocab, num_epochs=10, lr=1e-4):
    """Train a captioning model with teacher forcing and report per-epoch losses.

    Args:
        model: module called as ``model(video_feats, captions)`` that returns
            ``(log_probs, _)`` with ``log_probs`` shaped ``(batch, steps, vocab)``.
        train_loader: iterable of ``(video_feats, captions)`` batches; must support ``len``.
        val_loader: same format, or None to skip validation.
        device: device the model and batches are moved to.
        vocab: word -> id mapping; ``vocab['<PAD>']`` is excluded from the loss.
        num_epochs: number of passes over ``train_loader``.
        lr: Adam learning rate.

    Returns:
        list with one dict per epoch: ``{'epoch', 'train_loss', 'val_loss'}``
        (``val_loss`` is None when no validation batches are available).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.NLLLoss(ignore_index=vocab['<PAD>'])
    model.to(device)

    def _batch_loss(video_feats, captions):
        """NLL of the next-word predictions for one batch."""
        video_feats, captions = video_feats.to(device), captions.to(device)
        outputs, _ = model(video_feats, captions)
        # Step t predicts caption token t + 1. The model may emit more steps than there
        # are next-word targets (or fewer, for captions longer than its max length), so
        # compare only the overlapping part.
        steps = min(outputs.size(1), captions.size(1) - 1)
        log_probs = outputs[:, :steps].reshape(-1, outputs.size(-1))
        targets = captions[:, 1:1 + steps].reshape(-1)
        return criterion(log_probs, targets)

    history = []
    for epoch in range(num_epochs):
        model.train()  # re-enable training mode after the previous epoch's validation
        total_loss = 0.0
        for video_feats, captions in train_loader:
            optimizer.zero_grad()
            loss = _batch_loss(video_feats, captions)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        train_loss = total_loss / max(len(train_loader), 1)  # empty loader -> 0.0, not a crash
        print(f"Epoch {epoch+1} | Loss: {train_loss:.4f}")

        val_loss = None
        if val_loader is not None and len(val_loader) > 0:
            model.eval()
            with torch.no_grad():
                val_total = sum(_batch_loss(v, c).item() for v, c in val_loader)
            val_loss = val_total / len(val_loader)
            print(f"Epoch {epoch+1} | Val Loss: {val_loss:.4f}")

        history.append({'epoch': epoch + 1, 'train_loss': train_loss, 'val_loss': val_loss})

    return history


In [None]:
def evaluate_model(model, test_loader, vocab, device):
    """Greedy-decode every sample in ``test_loader`` and print BLEU-4, METEOR and CIDEr.

    Each sample is scored against the single reference caption that comes with it in
    the batch. If the loader yields one item per (video, caption) pair, a video with
    several captions is therefore scored once per caption.

    Args:
        model: captioning model called as ``model(video_feats, mode='inference')``;
            the second element of the returned pair holds the generated token ids.
        test_loader: iterable of ``(video_feats, captions)`` batches.
        vocab: word -> token id mapping.
        device: device the batches are moved to.

    Returns:
        dict with keys ``'BLEU-4'``, ``'METEOR'`` and ``'CIDEr'``, or None when the
        loader yields no samples.
    """
    vocab_rev = {v: k for k, v in vocab.items()}

    def _decode(token_ids):
        """Ids -> words, stopping at <EOS> and dropping <SOS>/<PAD>."""
        words = []
        for tok in token_ids:
            word = vocab_rev.get(int(tok), '<UNK>')
            if word == '<EOS>':
                break  # anything generated after the end marker is not part of the caption
            if word not in ('<SOS>', '<PAD>'):
                words.append(word)
        return words

    all_hypotheses, all_references = [], []
    model.eval()
    with torch.no_grad():
        for video_feats, captions in test_loader:
            video_feats, captions = video_feats.to(device), captions.to(device)
            _, predicted_ids = model(video_feats, mode='inference')
            # Score every item of the batch, not only the first one.
            for pred_row, ref_row in zip(predicted_ids, captions):
                all_hypotheses.append(_decode(pred_row))
                all_references.append([_decode(ref_row)])

    if not all_hypotheses:
        print("No samples to evaluate.")
        return None

    bleu = corpus_bleu(all_references, all_hypotheses)
    # Recent NLTK releases require pre-tokenized input for METEOR: pass the token
    # lists, not joined strings.
    meteor_scores = [single_meteor_score(ref[0], hyp)
                     for ref, hyp in zip(all_references, all_hypotheses)]
    meteor = sum(meteor_scores) / len(meteor_scores)
    cider_score, _ = Cider().compute_score(
        {i: [" ".join(r[0])] for i, r in enumerate(all_references)},
        {i: [" ".join(h)] for i, h in enumerate(all_hypotheses)},
    )
    print(f"BLEU-4: {bleu:.4f}\nMETEOR: {meteor:.4f}\nCIDEr: {cider_score:.4f}")
    return {'BLEU-4': float(bleu), 'METEOR': float(meteor), 'CIDEr': float(cider_score)}


In [None]:
# Peek at the top-level keys of the training caption file to learn its layout.
# NOTE(review): train_json_path is assigned in the "Sample Usage" cell further down, so
# on a fresh kernel this cell only works after that cell has been run.
with open(train_json_path, 'r') as f:
    data = json.load(f)

print(data.keys())


dict_keys(['videos', 'sentences'])


In [None]:
# === Sample Usage ===
# End-to-end run: load vocab, build datasets/loaders, train, then evaluate on the test set.
# Update these paths as needed
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
train_json_path = '/content/drive/MyDrive/msvd_split/train/train_captions.json'
val_json_path = '/content/drive/MyDrive/msvd_split/val/val_captions.json'
test_json_path = '/content/drive/MyDrive/msvd_split/test/test_captions.json'
train_feat_dir = '/content/drive/MyDrive/msvd_split/train/features'
val_feat_dir = '/content/drive/MyDrive/msvd_split/val/features'
test_feat_dir = '/content/drive/MyDrive/msvd_split/test/features'

with open(vocab_path, 'r') as f:
    vocab = json.load(f)

train_dataset = VideoCaptionDataset(train_feat_dir, train_json_path, vocab)
val_dataset = VideoCaptionDataset(val_feat_dir, val_json_path, vocab)
test_dataset = VideoCaptionDataset(test_feat_dir, test_json_path, vocab)

# The default collate function stacks the caption tensors of a batch, so every caption in
# a batch must have the same length. The recorded run below failed on exactly this
# ("stack expects each tensor to be equal size").
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=1)  # evaluate_model decodes row 0 of each batch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): these sizes (512 / 256 / 1024) differ from the 1024 / 512 / 2048 used when
# the saved checkpoint is loaded later in the notebook. dim_vid presumably has to equal the
# last dimension of the .npy features - TODO confirm against the extracted features.
model = S2VTModel(
    vocab_size=len(vocab),
    max_len=45,
    dim_hidden=512,
    dim_word=256,
    dim_vid=1024,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    n_layers=1,
    rnn_cell='lstm',
    rnn_dropout_p=0.2
)

train_model(model, train_loader, val_loader, device, vocab, num_epochs=30)
evaluate_model(model, test_loader, vocab, device)


RuntimeError: stack expects each tensor to be equal size, but got [42] at entry 0 and [21] at entry 1

In [None]:
class VideoCaptionDataset(Dataset):
    """(video features, caption token ids) pairs built from a videos/sentences JSON file.

    The JSON file must contain ``"videos"`` (entries with ``video_id``) and
    ``"sentences"`` (entries with ``video_id`` and ``caption``). One dataset item is
    created per caption, grouped in the order the videos are listed.

    Args:
        feature_dir: directory holding one ``<video_id>.npy`` feature file per video.
        json_path: path to the caption JSON file.
        vocab: mapping word -> token id; must contain ``<PAD>``, ``<SOS>``, ``<EOS>``
            and ``<UNK>``.
        max_caption_length: exact length of every returned caption tensor.
    """

    def __init__(self, feature_dir, json_path, vocab, max_caption_length=45):
        self.feature_dir = feature_dir
        self.vocab = vocab
        self.max_caption_length = max_caption_length

        with open(json_path, 'r') as f:
            data = json.load(f)

        # video_id -> all of its captions, in file order
        video_id_to_caption = {}
        for sent in data['sentences']:
            video_id_to_caption.setdefault(sent['video_id'], []).append(sent['caption'])

        # Sentences whose video is not listed under "videos" are dropped.
        self.annotations = [
            {'video_id': video['video_id'], 'caption': cap}
            for video in data['videos']
            for cap in video_id_to_caption.get(video['video_id'], [])
        ]

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return ``(features, caption)`` for item ``idx``.

        ``features`` is the float tensor loaded from ``<video_id>.npy``. ``caption`` is a
        long tensor of exactly ``max_caption_length`` ids: ``<SOS>``, the word ids
        (truncated if needed), ``<EOS>``, then ``<PAD>`` up to the fixed length, so the
        default DataLoader collate can stack a batch.
        """
        item = self.annotations[idx]
        feature_path = os.path.join(self.feature_dir, item['video_id'] + '.npy')
        features = torch.tensor(np.load(feature_path)).float()

        words = item['caption']
        if isinstance(words, str):
            # Iterating a str yields characters, not words. No case folding or
            # punctuation stripping is applied here - TODO confirm this matches how
            # vocab.json was built.
            words = words.split()

        unk_id = self.vocab['<UNK>']
        room_for_words = max(self.max_caption_length - 2, 0)  # keep space for SOS/EOS
        word_ids = [self.vocab.get(w, unk_id) for w in words][:room_for_words]

        ids = [self.vocab['<SOS>']] + word_ids + [self.vocab['<EOS>']]
        ids += [self.vocab['<PAD>']] * (self.max_caption_length - len(ids))
        caption = torch.tensor(ids[:self.max_caption_length], dtype=torch.long)
        return features, caption


In [None]:
import os
import json
import numpy as np
import torch
from torch.utils.data import DataLoader
from nltk.translate.bleu_score import corpus_bleu
from drive.MyDrive.S2VTModel import S2VTModel

# === Paths ===
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
test_json_path = '/content/drive/MyDrive/msvd_split/test/test_captions.json'
test_feature_dir = '/content/drive/MyDrive/msvd_split/test/features'
checkpoint_path = '/content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt'

# === Load vocab ===
with open(vocab_path, 'r') as f:
    vocab = json.load(f)
vocab_rev = {v: k for k, v in vocab.items()}  # token id -> word, used for decoding
vocab_size = len(vocab)

# === Load model (must match training config) ===
# n_layers has to equal the depth stored in the checkpoint. Loading with n_layers=1
# failed with unexpected "rnn1.*_l1" / "rnn2.*_l1" keys, i.e. the checkpoint holds a
# second layer for both RNNs.
model = S2VTModel(
    vocab_size=vocab_size,
    max_len=45,
    dim_hidden=1024,
    dim_word=512,
    dim_vid=2048,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    n_layers=2,
    rnn_cell='lstm',
    rnn_dropout_p=0.3
)

# === Load checkpoint ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
model.eval()

# === Load test data ===
test_dataset = VideoCaptionDataset(test_feature_dir, test_json_path, vocab, max_caption_length=45)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)


def _ids_to_words(token_ids):
    """Map token ids to words, stopping at <EOS> and skipping <SOS>/<PAD>."""
    words = []
    for tok in token_ids:
        word = vocab_rev.get(int(tok), '<UNK>')
        if word == '<EOS>':
            break
        if word not in ['<SOS>', '<PAD>']:
            words.append(word)
    return words


# === Inference + BLEU computation ===
print("Evaluating on test set...")

all_references = []
all_hypotheses = []

with torch.no_grad():
    for idx, (video_feats, captions) in enumerate(test_loader):
        video_feats, captions = video_feats.to(device), captions.to(device)

        # Model inference
        _, predicted_ids = model(video_feats, mode='inference')  # [1, T]

        # batch_size is 1, so row 0 is the only sample of the batch.
        pred_tokens = _ids_to_words(predicted_ids[0])
        ref_tokens = _ids_to_words(captions[0])

        all_hypotheses.append(pred_tokens)
        all_references.append([ref_tokens])  # list of references per hypothesis

        # Show the first few predictions as a sanity check.
        if idx < 5:
            print(f"\nExample {idx + 1}")
            print(f"Predicted    : {' '.join(pred_tokens)}")
            print(f"Ground Truth : {' '.join(ref_tokens)}")

# === Compute BLEU score ===
bleu_score = corpus_bleu(all_references, all_hypotheses)
print(f"\nFinal BLEU-4 Score: {bleu_score:.4f}")



RuntimeError: Error(s) in loading state_dict for S2VTModel:
	Unexpected key(s) in state_dict: "rnn1.weight_ih_l1", "rnn1.weight_hh_l1", "rnn1.bias_ih_l1", "rnn1.bias_hh_l1", "rnn2.weight_ih_l1", "rnn2.weight_hh_l1", "rnn2.bias_ih_l1", "rnn2.bias_hh_l1". 

In [None]:
import os
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
import subprocess
import tempfile
import glob
from PIL import Image
import numpy as np

from moviepy.editor import VideoFileClip

# Example clip stored on Drive.
# NOTE(review): the helper functions defined below all take video_path as a parameter,
# so this module-level value is only used if a caller passes it in explicitly.
video_path = '/content/drive/MyDrive/YouTubeClips/-8y1Q0rA3n8_108_115.avi'

def extract_frames_ffmpeg(video_path, output_dir, frame_rate=1):
    """Dump frames of a video as 224x224 JPEGs using the ``ffmpeg`` executable.

    Frames are written to ``output_dir`` as ``000001.jpg``, ``000002.jpg``, ...

    Args:
        video_path: path of the input video.
        output_dir: directory for the JPEGs; created if missing.
        frame_rate: value of the ffmpeg ``fps`` filter (frames kept per second).

    A non-zero ffmpeg exit status is reported with a ``RuntimeWarning`` rather than
    an exception, so frames written before a failure can still be used.
    """
    os.makedirs(output_dir, exist_ok=True)
    cmd = [
        # Global options first, the conventional placement on an ffmpeg command line.
        'ffmpeg', '-hide_banner', '-loglevel', 'error',
        '-i', video_path, '-vf', f"fps={frame_rate},scale=224:224",
        os.path.join(output_dir, '%06d.jpg')
    ]
    result = subprocess.run(cmd)
    if result.returncode != 0:
        import warnings  # local import: only needed on the failure path
        warnings.warn(
            f"ffmpeg exited with status {result.returncode} while processing {video_path!r}",
            RuntimeWarning,
        )

def load_and_sample_frames(frame_dir, n_frames=40):
    """Load up to ``n_frames`` evenly spaced JPEG frames from ``frame_dir`` as one tensor.

    The ``*.jpg`` files are taken in sorted filename order. Each selected frame is
    converted to RGB, turned into a tensor and normalized with the mean/std constants
    below, and the results are stacked along a new first dimension.

    Raises:
        ValueError: if ``frame_dir`` contains no ``*.jpg`` file.
    """
    to_normalized_tensor = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    frame_paths = sorted(glob.glob(os.path.join(frame_dir, '*.jpg')))
    n_available = len(frame_paths)
    if n_available == 0:
        raise ValueError("No frames extracted")

    # Evenly spaced positions from the first to the last frame; never more positions
    # than there are frames.
    n_keep = min(n_frames, n_available)
    picks = np.linspace(0, n_available - 1, n_keep).astype(int)

    tensors = []
    for pos in picks:
        rgb_image = Image.open(frame_paths[pos]).convert('RGB')
        tensors.append(to_normalized_tensor(rgb_image))
    return torch.stack(tensors)  # [T, 3, H, W]

def extract_video_tensor(video_path, n_frames=40):
    """Turn a video file into a stacked tensor of sampled frames.

    Frames are written to a temporary directory by ``extract_frames_ffmpeg`` and read
    back by ``load_and_sample_frames``; the directory is removed before returning.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        extract_frames_ffmpeg(video_path, scratch_dir)
        return load_and_sample_frames(scratch_dir, n_frames)

def generate_caption_from_video(video_path, model, feature_extractor, vocab_rev, device, max_len=45):
    """Caption a single video file end to end.

    Args:
        video_path: path of the video to caption.
        model: captioning model; put in eval mode, moved to ``device`` and called with
            ``mode='inference'``.
        feature_extractor: callable mapping the stacked frame tensor to per-frame features.
        vocab_rev: mapping from integer token id to word.
        device: device used for the frames and the model.
        max_len: accepted for interface compatibility; not used by this function.

    Returns:
        The caption as a space-joined string; decoding stops at the first ``<EOS>``
        and ``<SOS>``/``<PAD>`` tokens are dropped.
    """
    model.eval()
    model.to(device)

    frames = extract_video_tensor(video_path).to(device)  # [T, 3, 224, 224]
    with torch.no_grad():
        per_frame_feats = feature_extractor(frames)  # e.g. ResNet: [T, 2048]
        batched_feats = per_frame_feats.unsqueeze(0)  # add batch dim: [1, T, 2048]
        _, predicted_ids = model(batched_feats, mode='inference')

    words = []
    for token_id in predicted_ids[0]:
        token = vocab_rev.get(int(token_id), '<UNK>')
        if token == '<EOS>':
            break
        if token in ['<SOS>', '<PAD>']:
            continue
        words.append(token)

    return ' '.join(words)


  if event.key is 'enter':



In [None]:
# Install pretrainedmodels for ResNet152
!pip install pretrainedmodels

# Upload a video file from the local machine through Colab's upload helper
from google.colab import files
uploaded = files.upload()  # the recorded run uploaded an .mp4 (aa.mp4), not an .avi




Saving aa.mp4 to aa.mp4


In [None]:
!pip install pretrainedmodels




In [None]:
import os
import shutil
import subprocess

# NOTE(review): the upload cell above recorded "aa.mp4", while this cell reads t1.mp4 -
# confirm that /content/t1.mp4 actually exists before running.
video_path = '/content/t1.mp4'
frame_dir = '/content/frames_t1mp4'

# Start from an empty frame directory so frames of a previous run are not mixed in.
if os.path.exists(frame_dir):
    shutil.rmtree(frame_dir)
os.makedirs(frame_dir, exist_ok=True)

# Write frames as numbered JPEGs scaled to 400x300 (no fps filter is applied here).
# {video_path} and {frame_dir} are filled in from the Python variables above.
!ffmpeg -i {video_path} -vf "scale=400:300" -qscale:v 2 {frame_dir}/%06d.jpg


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [None]:
import pretrainedmodels
from pretrainedmodels import utils as pmutils

# Load pretrained ResNet-152 and remove classification layer
resnet = pretrainedmodels.resnet152(pretrained='imagenet')
# Replacing the final linear layer with Identity makes the network return the features
# that would otherwise feed the classifier.
resnet.last_linear = pmutils.Identity()
resnet = resnet.eval().cuda()  # inference mode, on the GPU (requires a CUDA runtime)

# Load image preprocessing function
# Called later with a frame path; built from the resnet object so it can use that
# model's input settings.
load_image_fn = pmutils.LoadTransformImage(resnet)


In [None]:
import pretrainedmodels
from pretrainedmodels import utils as pmutils

# NOTE(review): this cell repeats the ResNet-152 setup of the previous cell line for line;
# running both only reloads the same weights.
resnet = pretrainedmodels.resnet152(pretrained='imagenet')
resnet.last_linear = pmutils.Identity()  # drop the classifier so the model outputs features
resnet = resnet.eval().cuda()
load_image_fn = pmutils.LoadTransformImage(resnet)  # frame path -> preprocessed image tensor


In [None]:
import glob
import os
import numpy as np
import torch

frame_dir = '/content/frames_t1mp4'
frame_list = sorted(glob.glob(os.path.join(frame_dir, '*.jpg')))
# 40 evenly spaced frame indices from first to last; with fewer than 40 frames on disk some
# indices repeat. An empty frame_dir makes the indexing on the next line fail.
selected_indices = np.linspace(0, len(frame_list) - 1, 40, dtype=int)
selected_frames = [frame_list[i] for i in selected_indices]

# Feature extraction
# Despite its name, `features` is the batch of preprocessed input images; the actual
# features are computed by resnet below.
# assumes load_image_fn returns a 3x224x224 tensor per frame - TODO confirm
C, H, W = 3, 224, 224
features = torch.zeros((len(selected_frames), C, H, W)).cuda()
for i, frame_path in enumerate(selected_frames):
    img = load_image_fn(frame_path)
    features[i] = img

# One forward pass over all selected frames; unsqueeze(0) adds the batch dimension.
with torch.no_grad():
    video_feats = resnet(features).unsqueeze(0).cpu()  # [1, 40, 2048]


In [None]:
import os
import glob
import numpy as np
import torch

# NOTE(review): this cell repeats the previous feature-extraction cell; only the comments differ.
frame_dir = '/content/frames_t1mp4'  # directory the ffmpeg cell wrote the JPEG frames to

# Select 40 frames
# Evenly spaced indices from first to last frame; with fewer than 40 frames on disk some
# indices repeat, and an empty directory makes the indexing below fail.
frame_list = sorted(glob.glob(os.path.join(frame_dir, '*.jpg')))
selected_indices = np.linspace(0, len(frame_list) - 1, 40, dtype=int)
selected_frames = [frame_list[i] for i in selected_indices]

# Extract features
# `features` first holds the preprocessed input images; resnet turns them into features.
# assumes load_image_fn returns a 3x224x224 tensor per frame - TODO confirm
C, H, W = 3, 224, 224
features = torch.zeros((len(selected_frames), C, H, W)).cuda()
for i, frame_path in enumerate(selected_frames):
    img = load_image_fn(frame_path)
    features[i] = img

with torch.no_grad():
    video_feats = resnet(features).unsqueeze(0).cpu()  # [1, 40, 2048]


In [None]:
import json
from drive.MyDrive.S2VTModel_Attention import S2VTModel

# Load vocab
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
with open(vocab_path, 'r') as f:
    vocab = json.load(f)
vocab_rev = {v: k for k, v in vocab.items()}  # token id -> word, used when decoding

# Load model
# The constructor arguments must reproduce the architecture stored in the checkpoint,
# otherwise load_state_dict below rejects it. n_layers is not passed here, so the
# imported class's default is used.
model = S2VTModel(
    vocab_size=len(vocab),
    max_len=45,
    dim_hidden=1024,
    dim_word=512,
    dim_vid=2048,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    rnn_cell='lstm',
    rnn_dropout_p=0.3
)

# NOTE(review): torch is used here but not imported in this cell; it must already be
# imported by an earlier cell.
checkpoint_path = '/content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt'
checkpoint = torch.load(checkpoint_path, map_location='cuda')  # the recorded run shows a warning for this line
model.load_state_dict(checkpoint['model_state_dict'])
model = model.eval().cuda()



  checkpoint = torch.load(checkpoint_path, map_location='cuda')



In [None]:
# Greedy-decode a caption for the features extracted from `video_path`.
video_feats = video_feats.cuda()

with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Decode prediction: stop at the first <EOS>, drop <SOS>/<PAD>.
caption = []
for tok in predicted_ids[0]:
    word = vocab_rev.get(int(tok), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        caption.append(word)

# Label the output with the file that was actually processed instead of a hard-coded
# name left over from an earlier run.
print(f"🎬 Caption for {os.path.basename(video_path)}:\n", ' '.join(caption))


🎬 Caption for 04.mp4:
 a woman is slicing a potato


In [None]:
#avi video

# Step 1: Extract frames from AVI video
import os
import shutil
import subprocess

video_path = '/content/_9iG5Ge01PM_3_11.avi'  # input clip; change the filename if needed
frame_dir = '/content/frames_your_video'

# Start from an empty frame directory so frames of a previous run are not mixed in.
if os.path.exists(frame_dir):
    shutil.rmtree(frame_dir)
os.makedirs(frame_dir, exist_ok=True)

# Write frames as numbered JPEGs scaled to 400x300 (no fps filter is applied here).
# $video_path and $frame_dir are filled in from the Python variables above; the quotes
# keep paths containing spaces intact.
!ffmpeg -i "$video_path" -vf "scale=400:300" -qscale:v 2 "$frame_dir/%06d.jpg"



ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [None]:
!pip install pretrainedmodels




In [None]:
# Step 2: Load ResNet152 for feature extraction
import pretrainedmodels
from pretrainedmodels import utils as pmutils
import torch

# ImageNet-pretrained ResNet-152 with the final linear layer replaced by Identity, so a
# forward pass returns the features that would otherwise feed the classifier.
resnet = pretrainedmodels.resnet152(pretrained='imagenet')
resnet.last_linear = pmutils.Identity()
resnet = resnet.eval().cuda()  # inference mode, on the GPU (requires a CUDA runtime)
load_image_fn = pmutils.LoadTransformImage(resnet)  # frame path -> preprocessed image tensor






In [None]:
# Step 3: Extract features from 40 frames
import glob
import numpy as np

# NOTE(review): frame_dir and os come from the Step 1 cell; they are not defined in this cell.
frame_list = sorted(glob.glob(os.path.join(frame_dir, '*.jpg')))
# 40 evenly spaced frame indices from first to last; with fewer than 40 frames on disk some
# indices repeat. An empty frame_dir makes the indexing on the next line fail.
selected_indices = np.linspace(0, len(frame_list) - 1, 40, dtype=int)
selected_frames = [frame_list[i] for i in selected_indices]

# `features` first holds the preprocessed input images; resnet turns them into features.
# assumes load_image_fn returns a 3x224x224 tensor per frame - TODO confirm
C, H, W = 3, 224, 224
features = torch.zeros((len(selected_frames), C, H, W)).cuda()
for i, frame_path in enumerate(selected_frames):
    img = load_image_fn(frame_path)
    features[i] = img

# One forward pass over all selected frames; unsqueeze(0) adds the batch dimension.
with torch.no_grad():
    video_feats = resnet(features).unsqueeze(0).cpu()


In [None]:
# Step 4: Load trained S2VT model
import json
from drive.MyDrive.youtube_captioning_with_attention.S2VTModel_Attention import S2VTModel

# vocab maps token string -> integer id; vocab_rev is the inverse (id -> token),
# keyed by int because it is built in memory rather than read back from JSON.
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
with open(vocab_path, 'r') as f:
    vocab = json.load(f)
vocab_rev = {v: k for k, v in vocab.items()}

# Hyper-parameters must match the ones the checkpoint was trained with,
# otherwise load_state_dict below fails on missing/unexpected keys.
# NOTE(review): n_layers is left at the class default here while other cells
# pass n_layers=2 for a checkpoint with the same file name -- confirm the default.
model = S2VTModel(
    vocab_size=len(vocab),
    max_len=45,
    dim_hidden=1024,
    dim_word=512,
    dim_vid=2048,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    rnn_cell='lstm',
    rnn_dropout_p=0.3
)

# The checkpoint is a dict; only its 'model_state_dict' entry is used here.
checkpoint_path = '/content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt'
checkpoint = torch.load(checkpoint_path, map_location='cuda')
model.load_state_dict(checkpoint['model_state_dict'])
# eval() disables dropout so inference is deterministic.
model = model.eval().cuda()



  checkpoint = torch.load(checkpoint_path, map_location='cuda')



In [None]:
# Step 5: Generate caption
video_feats = video_feats.cuda()

# In 'inference' mode the model returns a pair; only the second element
# (the predicted token ids) is used.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Decode the first (only) sequence in the batch: stop at <EOS>, drop <SOS>/<PAD>,
# and map ids that are not in the vocabulary to <UNK>.
caption = []
for tok in predicted_ids[0]:
    word = vocab_rev.get(int(tok), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        caption.append(word)

print("🎬 Caption for your AVI video:\n", ' '.join(caption))


🎬 Caption for your AVI video:
 a cat is playing


In [None]:
!pip install nltk rouge-score
!git clone https://github.com/tylin/coco-caption
!pip install git+https://github.com/salaniz/pycocoevalcap


fatal: destination path 'coco-caption' already exists and is not an empty directory.
Collecting git+https://github.com/salaniz/pycocoevalcap
  Cloning https://github.com/salaniz/pycocoevalcap to /tmp/pip-req-build-xhip2uvw
  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap /tmp/pip-req-build-xhip2uvw
  Resolved https://github.com/salaniz/pycocoevalcap to commit a24f74c408c918f1f4ec34e9514bc8a76ce41ffd
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install nltk rouge-score
!git clone https://github.com/tylin/coco-caption
!pip install git+https://github.com/salaniz/pycocoevalcap


fatal: destination path 'coco-caption' already exists and is not an empty directory.
Collecting git+https://github.com/salaniz/pycocoevalcap
  Cloning https://github.com/salaniz/pycocoevalcap to /tmp/pip-req-build-bgswltcp
  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap /tmp/pip-req-build-bgswltcp
  Resolved https://github.com/salaniz/pycocoevalcap to commit a24f74c408c918f1f4ec34e9514bc8a76ce41ffd
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import sys
sys.path.append('/content/drive/MyDrive')

from S2VTModel import S2VTModel


In [None]:
import torch
import numpy as np
import os
import json
from torch.utils.data import Dataset

class VideoCaptionDataset(Dataset):
    """One item per video: its saved feature array plus one randomly chosen caption.

    Args:
        feature_dir: directory holding one ``<video_id>.npy`` feature file per video.
        json_path: JSON file with a ``'videos'`` list (records carrying ``'video_id'``)
            and a ``'sentences'`` list (records carrying ``'video_id'`` and ``'caption'``).
        vocab: mapping token -> id; must contain ``<SOS>``, ``<EOS>``, ``<PAD>`` and ``<UNK>``.
        max_caption_length: fixed length of the returned token tensor.
        verbose: stored on the instance; not used inside this class.

    Videos without any caption or without a feature file on disk are skipped.
    """

    def __init__(self, feature_dir, json_path, vocab, max_caption_length=15, verbose=False):
        self.feature_dir = feature_dir
        self.vocab = vocab
        self.max_caption_length = max_caption_length
        self.verbose = verbose

        with open(json_path, 'r') as f:
            data = json.load(f)

        # video_id -> list of raw caption strings
        self.video_captions = {}
        for item in data['sentences']:
            self.video_captions.setdefault(item['video_id'], []).append(item['caption'])

        # Keep the order of data['videos'], restricted to usable videos.
        self.video_ids = [
            video['video_id'] for video in data['videos']
            if video['video_id'] in self.video_captions
            and os.path.exists(os.path.join(self.feature_dir, f"{video['video_id']}.npy"))
        ]

    def __len__(self):
        return len(self.video_ids)

    def __getitem__(self, idx):
        """Return ``(features, tokens)`` as float32 / int64 tensors for video ``idx``."""
        video_id = self.video_ids[idx]
        video_features = np.load(os.path.join(self.feature_dir, f"{video_id}.npy"))

        # A different caption may be drawn on every access (uses NumPy's global RNG).
        caption = np.random.choice(self.video_captions[video_id])
        word_ids = [self.vocab.get(w, self.vocab['<UNK>']) for w in caption.lower().split()]

        # Reserve two slots for <SOS>/<EOS> so an over-long caption is cut in its
        # words rather than losing the <EOS> terminator.
        room = max(self.max_caption_length - 2, 0)
        tokens = [self.vocab['<SOS>']] + word_ids[:room] + [self.vocab['<EOS>']]
        tokens = tokens[:self.max_caption_length]
        tokens += [self.vocab['<PAD>']] * (self.max_caption_length - len(tokens))

        return torch.tensor(video_features, dtype=torch.float32), torch.tensor(tokens, dtype=torch.long)


In [None]:
import json

# vocab: token -> id.
with open('/content/drive/MyDrive/msvd_split/vocab.json', 'r') as f:
    vocab = json.load(f)

# vocab_rev: id -> token. Loaded from JSON, so its keys are strings
# (lookups must use str(token_id), not the int).
with open('/content/drive/MyDrive/msvd_split/vocab_rev.json', 'r') as f:
    vocab_rev = json.load(f)

# Test split: one item per video that has both captions and a feature file.
test_dataset = VideoCaptionDataset(
    feature_dir='/content/drive/MyDrive/msvd_split/test/features',
    json_path='/content/drive/MyDrive/msvd_split/test/test_captions.json',
    vocab=vocab,
    max_caption_length=45
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Architecture hyper-parameters must match the checkpoint being loaded.
# NOTE(review): rnn_dropout_p is 0.2 here but 0.3 in the other cells that load
# this checkpoint path -- confirm which value training used.
model = S2VTModel(
    vocab_size=len(vocab),
    max_len=45,
    dim_hidden=1024,
    dim_word=512,
    dim_vid=2048,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    n_layers=2,
    rnn_cell='lstm',
    rnn_dropout_p=0.2
)

# Load the weights on CPU first, then move the whole model to the chosen device.
ckpt = torch.load('/content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt', map_location='cpu')
model.load_state_dict(ckpt['model_state_dict'])
model = model.to(device)


In [None]:
from tqdm import tqdm

def generate_predictions(model, dataset, vocab_rev, device):
    """Caption every video in ``dataset`` with ``model``.

    Args:
        model: captioning model; called as ``model(feats, mode='inference')`` and
            expected to return a pair whose second element holds the token ids.
        dataset: indexable dataset yielding ``(features, _)`` and exposing
            ``video_ids`` aligned with its indices.
        vocab_rev: mapping from *string* token id to token (as loaded from JSON).
        device: torch device the features are moved to.

    Returns:
        dict mapping video_id -> predicted caption string.
    """
    model.eval()
    predictions = {}

    for idx in tqdm(range(len(dataset)), desc="Generating Captions"):
        video_feats, _ = dataset[idx]
        video_feats = video_feats.unsqueeze(0).to(device)  # add batch axis

        with torch.no_grad():
            _, output_seq = model(video_feats, mode='inference')

        # reshape(-1) always yields a 1-D sequence; squeeze() would collapse a
        # single-token output to a scalar that cannot be iterated.
        tokens = output_seq.reshape(-1).tolist()

        caption = []
        for tok in tokens:
            word = vocab_rev.get(str(tok), '<UNK>')
            if word in ['<EOS>', '<PAD>']:
                break  # end of sentence reached
            if word != '<SOS>':
                caption.append(word)

        video_id = dataset.video_ids[idx]
        predictions[video_id] = ' '.join(caption)

    return predictions

preds = generate_predictions(model, test_dataset, vocab_rev, device)

# Persist the predictions so the metric cells can be re-run without re-decoding.
with open('/content/drive/MyDrive/msvd_split/test/predicted_captions.json', 'w') as f:
    json.dump(preds, f, indent=4)

print("✅ Saved predictions.")


Generating Captions: 100%|██████████| 296/296 [06:19<00:00,  1.28s/it]

✅ Saved predictions.





In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from pycocoevalcap.cider.cider import Cider
from collections import defaultdict

def evaluate_all_metrics(pred_path, ref_json_path):
    """Print BLEU-1..4, METEOR, ROUGE-L and CIDEr for predicted captions.

    Args:
        pred_path: JSON file mapping video_id -> predicted caption string.
        ref_json_path: JSON file whose 'sentences' list holds
            {'video_id', 'caption'} reference records.

    Raises:
        ValueError: if no predicted video_id has a reference caption.

    Note: NLTK's METEOR needs the WordNet corpus to be available.
    """
    with open(pred_path, 'r') as f:
        preds = json.load(f)

    with open(ref_json_path, 'r') as f:
        data = json.load(f)

    # video_id -> list of lower-cased reference strings
    refs = defaultdict(list)
    for item in data['sentences']:
        refs[item['video_id']].append(item['caption'].lower())

    gt, pr = [], []
    meteor_scores = []
    rouge_scores = []
    # pycocoevalcap's Cider takes {id: [references]} and {id: [hypothesis]} dicts.
    cider_refs, cider_hyps = {}, {}

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth_fn = SmoothingFunction().method1

    for video_id, pred_caption in preds.items():
        pred_caption = pred_caption.lower()
        references = refs.get(video_id, [])
        if not references:
            continue

        ref_tokens = [ref.split() for ref in references]
        pred_tokens = pred_caption.split()
        gt.append(ref_tokens)
        pr.append(pred_tokens)

        # meteor_score rejects raw strings: references and hypothesis must be token lists.
        meteor_scores.append(meteor_score(ref_tokens, pred_tokens))

        # Score against each reference on its own and keep the best match;
        # concatenating all references into one string distorts precision/recall.
        rouge_scores.append(
            max(scorer.score(ref, pred_caption)['rougeL'].fmeasure for ref in references)
        )
        cider_refs[video_id] = references
        cider_hyps[video_id] = [pred_caption]

    if not pr:
        raise ValueError("No predicted video_id has reference captions; nothing to score.")

    # n-gram weights must sum to 1 (three 0.33s would slightly inflate BLEU-3).
    bleu1 = corpus_bleu(gt, pr, weights=(1, 0, 0, 0), smoothing_function=smooth_fn)
    bleu2 = corpus_bleu(gt, pr, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth_fn)
    bleu3 = corpus_bleu(gt, pr, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smooth_fn)
    bleu4 = corpus_bleu(gt, pr, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth_fn)
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    avg_rouge = sum(rouge_scores) / len(rouge_scores)
    cider_scorer = Cider()
    cider_score, _ = cider_scorer.compute_score(cider_refs, cider_hyps)

    print("\n📊 Evaluation Results:")
    print(f"BLEU-1  : {bleu1:.4f}")
    print(f"BLEU-2  : {bleu2:.4f}")
    print(f"BLEU-3  : {bleu3:.4f}")
    print(f"BLEU-4  : {bleu4:.4f}")
    print(f"METEOR  : {avg_meteor:.4f}")
    print(f"ROUGE-L : {avg_rouge:.4f}")
    print(f"CIDEr   : {cider_score:.4f}")


IndentationError: expected an indented block after 'for' statement on line 26 (<ipython-input-32-6a348d12c513>, line 27)

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from pycocoevalcap.cider.cider import Cider
from collections import defaultdict
import json

def evaluate_all_metrics(pred_path, ref_json_path):
    """Score predicted captions with BLEU-1..4, METEOR, ROUGE-L and CIDEr and print the results.

    Args:
        pred_path: JSON file mapping video_id -> predicted caption string.
        ref_json_path: JSON file whose 'sentences' list holds
            {'video_id', 'caption'} reference records.

    Raises:
        ValueError: if no predicted video_id has a reference caption.

    Note: NLTK's METEOR needs the WordNet corpus to be available.
    """
    with open(pred_path, 'r') as f:
        preds = json.load(f)

    with open(ref_json_path, 'r') as f:
        data = json.load(f)

    # video_id -> list of lower-cased reference strings
    refs = defaultdict(list)
    for item in data['sentences']:
        refs[item['video_id']].append(item['caption'].lower())

    gt, pr = [], []
    meteor_scores = []
    rouge_scores = []
    # Two separate containers (unpacking a single empty list raises ValueError).
    # pycocoevalcap's Cider expects dicts: {id: [references]} and {id: [hypothesis]}.
    cider_refs, cider_hyps = {}, {}

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth_fn = SmoothingFunction().method1

    for video_id, pred_caption in preds.items():
        pred_caption = pred_caption.lower()
        references = refs.get(video_id, [])
        if not references:
            continue

        ref_tokens = [ref.split() for ref in references]
        pred_tokens = pred_caption.split()
        gt.append(ref_tokens)
        pr.append(pred_tokens)

        # meteor_score needs pre-tokenised references as well as a tokenised hypothesis.
        meteor_scores.append(meteor_score(ref_tokens, pred_tokens))
        # Best ROUGE-L over the individual references, not over their concatenation.
        rouge_scores.append(
            max(scorer.score(ref, pred_caption)['rougeL'].fmeasure for ref in references)
        )
        cider_refs[video_id] = references
        cider_hyps[video_id] = [pred_caption]

    if not pr:
        raise ValueError("No predicted video_id has reference captions; nothing to score.")

    # n-gram weights must sum to 1 (three 0.33s would slightly inflate BLEU-3).
    bleu1 = corpus_bleu(gt, pr, weights=(1, 0, 0, 0), smoothing_function=smooth_fn)
    bleu2 = corpus_bleu(gt, pr, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth_fn)
    bleu3 = corpus_bleu(gt, pr, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smooth_fn)
    bleu4 = corpus_bleu(gt, pr, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth_fn)
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    avg_rouge = sum(rouge_scores) / len(rouge_scores)
    cider_scorer = Cider()
    cider_score, _ = cider_scorer.compute_score(cider_refs, cider_hyps)

    print("\n📊 Evaluation Results:")
    print(f"BLEU-1  : {bleu1:.4f}")
    print(f"BLEU-2  : {bleu2:.4f}")
    print(f"BLEU-3  : {bleu3:.4f}")
    print(f"BLEU-4  : {bleu4:.4f}")
    print(f"METEOR  : {avg_meteor:.4f}")
    print(f"ROUGE-L : {avg_rouge:.4f}")
    print(f"CIDEr   : {cider_score:.4f}")


In [None]:
import json
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from collections import defaultdict

def evaluate_bleu(pred_path, ref_json_path):
    """Print corpus-level BLEU-1..4 for predicted captions.

    Args:
        pred_path: JSON file mapping video_id -> predicted caption string.
        ref_json_path: JSON file whose 'sentences' list holds
            {'video_id', 'caption'} reference records.

    Predictions whose video_id has no reference are ignored. Captions are
    lower-cased and split on whitespace.
    """
    with open(pred_path, 'r') as f:
        preds = json.load(f)

    with open(ref_json_path, 'r') as f:
        refs_data = json.load(f)

    # Build references dict: video_id -> list of token lists
    references = defaultdict(list)
    for item in refs_data['sentences']:
        references[item['video_id']].append(item['caption'].lower().split())

    # Prepare ground truth and predicted lists, aligned by position
    gt, pr = [], []
    for vid, pred_caption in preds.items():
        if vid in references:
            gt.append(references[vid])
            pr.append(pred_caption.lower().split())

    # Compute BLEU. n-gram weights must sum to 1; three 0.33s sum to 0.99 and
    # slightly inflate BLEU-3, so exact thirds are used.
    smooth_fn = SmoothingFunction().method1
    bleu1 = corpus_bleu(gt, pr, weights=(1, 0, 0, 0), smoothing_function=smooth_fn)
    bleu2 = corpus_bleu(gt, pr, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth_fn)
    bleu3 = corpus_bleu(gt, pr, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smooth_fn)
    bleu4 = corpus_bleu(gt, pr, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth_fn)

    print("\n📊 BLEU Evaluation:")
    print(f"BLEU-1 : {bleu1:.4f}")
    print(f"BLEU-2 : {bleu2:.4f}")
    print(f"BLEU-3 : {bleu3:.4f}")
    print(f"BLEU-4 : {bleu4:.4f}")

# === Run It ===
evaluate_bleu(
    pred_path='/content/drive/MyDrive/msvd_split/test/predicted_captions.json',
    ref_json_path='/content/drive/MyDrive/msvd_split/test/test_captions.json'
)



📊 BLEU Evaluation:
BLEU-1 : 0.7170
BLEU-2 : 0.5550
BLEU-3 : 0.4579
BLEU-4 : 0.3424


In [None]:
!pip install rouge-score
!git clone https://github.com/tylin/coco-caption
!python3 setup.py install --cwd coco-caption


fatal: destination path 'coco-caption' already exists and is not an empty directory.
python3: can't open file '/content/setup.py': [Errno 2] No such file or directory


In [None]:
# Score the saved test-set predictions against the test reference captions.
# Uses whichever evaluate_all_metrics definition was executed most recently.
evaluate_all_metrics(
    pred_path='/content/drive/MyDrive/msvd_split/test/predicted_captions.json',
    ref_json_path='/content/drive/MyDrive/msvd_split/test/test_captions.json'
)


TypeError: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): a woman is cutting a potato

In [None]:
import json
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from collections import defaultdict

def evaluate_bleu(pred_path, ref_json_path):
    """Compute and print corpus BLEU-1 through BLEU-4.

    Args:
        pred_path: JSON file mapping video_id -> predicted caption string.
        ref_json_path: JSON file whose 'sentences' list holds
            {'video_id', 'caption'} reference records.

    Only videos present in both files are scored; text is lower-cased and
    whitespace-tokenised.
    """
    with open(pred_path, 'r') as f:
        preds = json.load(f)

    with open(ref_json_path, 'r') as f:
        refs_data = json.load(f)

    # Build references dict: video_id -> list of token lists
    references = defaultdict(list)
    for item in refs_data['sentences']:
        references[item['video_id']].append(item['caption'].lower().split())

    # Prepare ground truth and predicted lists, aligned by position
    scored = [(references[vid], text.lower().split()) for vid, text in preds.items() if vid in references]
    gt = [ref_lists for ref_lists, _ in scored]
    pr = [hyp for _, hyp in scored]

    # Compute BLEU. Exact thirds for BLEU-3: weights of 0.33 sum to 0.99 and
    # overstate the score.
    smooth_fn = SmoothingFunction().method1
    bleu1 = corpus_bleu(gt, pr, weights=(1, 0, 0, 0), smoothing_function=smooth_fn)
    bleu2 = corpus_bleu(gt, pr, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth_fn)
    bleu3 = corpus_bleu(gt, pr, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smooth_fn)
    bleu4 = corpus_bleu(gt, pr, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth_fn)

    print("\n📊 BLEU Evaluation:")
    print(f"BLEU-1 : {bleu1:.4f}")
    print(f"BLEU-2 : {bleu2:.4f}")
    print(f"BLEU-3 : {bleu3:.4f}")
    print(f"BLEU-4 : {bleu4:.4f}")

# === Run It ===
evaluate_bleu(
    pred_path='/content/drive/MyDrive/msvd_split/test/predicted_captions.json',
    ref_json_path='/content/drive/MyDrive/msvd_split/test/test_captions.json'
)



📊 BLEU Evaluation:
BLEU-1 : 0.7170
BLEU-2 : 0.5550
BLEU-3 : 0.4579
BLEU-4 : 0.3424


In [None]:
import json
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from pycocoevalcap.cider.cider import Cider
from collections import defaultdict

def evaluate_all_metrics(pred_path, ref_json_path):
    """Print BLEU-1..4, METEOR, ROUGE-L and CIDEr for predicted captions.

    Args:
        pred_path: JSON file mapping video_id -> predicted caption string.
        ref_json_path: JSON file whose 'sentences' list holds
            {'video_id', 'caption'} reference records.

    Raises:
        ValueError: if no predicted video_id has a reference caption.

    METEOR is averaged over a video's references; ROUGE-L takes the best
    reference. NLTK's METEOR needs the WordNet corpus to be available.
    """
    with open(pred_path, 'r') as f:
        preds = json.load(f)

    with open(ref_json_path, 'r') as f:
        refs_data = json.load(f)

    # Build references dict: video_id -> list of token lists
    references = defaultdict(list)
    for item in refs_data['sentences']:
        references[item['video_id']].append(item['caption'].lower().split())

    # Evaluation lists
    gt, pr = [], []
    meteor_scores, rouge_scores = [], []
    # pycocoevalcap's Cider expects dicts: {id: [references]} and {id: [hypothesis]}.
    cider_refs, cider_hyps = {}, {}

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth_fn = SmoothingFunction().method1

    for vid, pred_caption in preds.items():
        if vid not in references:
            continue

        ref_caps = references[vid]
        pred_tokens = pred_caption.lower().split()
        pred_text = ' '.join(pred_tokens)

        gt.append(ref_caps)
        pr.append(pred_tokens)

        # METEOR: average of all refs. meteor_score requires token lists on both
        # sides; passing joined strings raises TypeError.
        meteor_scores.append(
            sum(meteor_score([ref], pred_tokens) for ref in ref_caps) / len(ref_caps)
        )

        # ROUGE-L: best among refs (rouge_score works on plain strings)
        rouge_scores.append(
            max(scorer.score(' '.join(ref), pred_text)['rougeL'].fmeasure for ref in ref_caps)
        )

        cider_refs[vid] = [' '.join(ref) for ref in ref_caps]
        cider_hyps[vid] = [pred_text]

    if not pr:
        raise ValueError("No predicted video_id has reference captions; nothing to score.")

    # BLEU (n-gram weights must sum to 1, hence exact thirds for BLEU-3)
    bleu1 = corpus_bleu(gt, pr, weights=(1, 0, 0, 0), smoothing_function=smooth_fn)
    bleu2 = corpus_bleu(gt, pr, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth_fn)
    bleu3 = corpus_bleu(gt, pr, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smooth_fn)
    bleu4 = corpus_bleu(gt, pr, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth_fn)

    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    avg_rouge = sum(rouge_scores) / len(rouge_scores)
    cider_score, _ = Cider().compute_score(cider_refs, cider_hyps)

    # Print results
    print("\n📊 Evaluation Results:")
    print(f"BLEU-1  : {bleu1:.4f}")
    print(f"BLEU-2  : {bleu2:.4f}")
    print(f"BLEU-3  : {bleu3:.4f}")
    print(f"BLEU-4  : {bleu4:.4f}")
    print(f"METEOR  : {avg_meteor:.4f}")
    print(f"ROUGE-L : {avg_rouge:.4f}")
    print(f"CIDEr   : {cider_score:.4f}")


In [None]:
import json
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from collections import defaultdict

def evaluate_bleu(pred_path, ref_json_path):
    """Report corpus-level BLEU-1..4 of the predictions against the references.

    Args:
        pred_path: JSON file mapping video_id -> predicted caption string.
        ref_json_path: JSON file whose 'sentences' list holds
            {'video_id', 'caption'} reference records.

    Videos without references are skipped; captions are lower-cased and split
    on whitespace before scoring.
    """
    with open(pred_path, 'r') as f:
        preds = json.load(f)

    with open(ref_json_path, 'r') as f:
        refs_data = json.load(f)

    # === Build references dict: video_id -> list of token lists ===
    references = defaultdict(list)
    for item in refs_data['sentences']:
        references[item['video_id']].append(item['caption'].lower().split())

    # === Prepare reference (gt) and hypothesis (pr) lists ===
    gt, pr = [], []
    for vid, pred_caption in preds.items():
        if vid not in references:
            continue
        gt.append(references[vid])
        pr.append(pred_caption.lower().split())

    # === Compute BLEU scores ===
    # BLEU-3 uses exact thirds: (0.33, 0.33, 0.33) sums to 0.99 and inflates the score.
    smooth_fn = SmoothingFunction().method1
    bleu1 = corpus_bleu(gt, pr, weights=(1, 0, 0, 0), smoothing_function=smooth_fn)
    bleu2 = corpus_bleu(gt, pr, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth_fn)
    bleu3 = corpus_bleu(gt, pr, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smooth_fn)
    bleu4 = corpus_bleu(gt, pr, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth_fn)

    # === Output ===
    print("\n📊 BLEU Evaluation Results:")
    print(f"BLEU-1 : {bleu1:.4f}")
    print(f"BLEU-2 : {bleu2:.4f}")
    print(f"BLEU-3 : {bleu3:.4f}")
    print(f"BLEU-4 : {bleu4:.4f}")

# === Run BLEU Evaluation ===
evaluate_bleu(
    pred_path='/content/drive/MyDrive/msvd_split/test/predicted_captions.json',
    ref_json_path='/content/drive/MyDrive/msvd_split/test/test_captions.json'
)



📊 BLEU Evaluation Results:
BLEU-1 : 0.7170
BLEU-2 : 0.5550
BLEU-3 : 0.4579
BLEU-4 : 0.3424


In [None]:
from torch.utils.data import Dataset
import numpy as np
import torch
import os
import json

class VideoCaptionDataset(Dataset):
    """One item per reference sentence: the video's feature array plus that caption.

    NOTE: this redefines (and shadows) any earlier ``VideoCaptionDataset`` in the
    notebook; unlike a per-video dataset, a video appears once per caption.

    Args:
        feature_dir: directory holding one ``<video_id>.npy`` feature file per video.
        json_path: JSON file whose ``'sentences'`` list holds
            ``{'video_id', 'caption'}`` records.
        vocab: mapping token -> id; must contain ``<SOS>``, ``<EOS>``, ``<PAD>`` and ``<UNK>``.
        max_caption_length: fixed length of the returned caption tensor.
    """

    def __init__(self, feature_dir, json_path, vocab, max_caption_length=45):
        self.feature_dir = feature_dir
        self.vocab = vocab
        self.max_caption_length = max_caption_length

        with open(json_path, 'r') as f:
            data = json.load(f)

        self.samples = data['sentences']

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(video_feat, caption_tensor)`` as float32 / int64 tensors."""
        sample = self.samples[idx]
        video_id = sample['video_id']
        caption = sample['caption'].lower().split()

        # Convert caption to indices. Two slots are reserved for <SOS>/<EOS> so
        # an over-long caption loses trailing words, never its <EOS> terminator.
        word_ids = [self.vocab.get(word, self.vocab['<UNK>']) for word in caption]
        room = max(self.max_caption_length - 2, 0)
        caption_ids = [self.vocab['<SOS>']] + word_ids[:room] + [self.vocab['<EOS>']]
        caption_ids = caption_ids[:self.max_caption_length]
        caption_ids += [self.vocab['<PAD>']] * (self.max_caption_length - len(caption_ids))

        caption_tensor = torch.tensor(caption_ids, dtype=torch.long)

        # Load the pre-extracted features saved for this video.
        feat_path = os.path.join(self.feature_dir, video_id + '.npy')
        video_feat = torch.tensor(np.load(feat_path), dtype=torch.float32)

        return video_feat, caption_tensor


In [None]:
import os
import json
import numpy as np
import torch
from torch.utils.data import DataLoader
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from collections import defaultdict
from drive.MyDrive.S2VTModel import S2VTModel
# VideoCaptionDataset is the class defined in a notebook cell above; there is no
# importable `VideoCaptionDataset` module, so importing it raises ModuleNotFoundError.

# === Paths ===
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
test_json_path = '/content/drive/MyDrive/msvd_split/test/test_captions.json'
test_feature_dir = '/content/drive/MyDrive/msvd_split/test/features'
checkpoint_path = '/content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt'

# === Load vocab ===
with open(vocab_path, 'r') as f:
    vocab = json.load(f)
vocab_rev = {v: k for k, v in vocab.items()}  # id (int) -> token
vocab_size = len(vocab)

# === Load model ===
# n_layers must be 2: the checkpoint contains second-layer LSTM weights
# (rnn1.*_l1 / rnn2.*_l1), so a 1-layer model fails in load_state_dict.
model = S2VTModel(
    vocab_size=vocab_size,
    max_len=45,
    dim_hidden=1024,
    dim_word=512,
    dim_vid=2048,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    n_layers=2,
    rnn_cell='lstm',
    rnn_dropout_p=0.3
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
model.eval()

# === Load test dataset ===
test_dataset = VideoCaptionDataset(test_feature_dir, test_json_path, vocab, max_caption_length=45)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# === Inference + BLEU computation ===
# Each dataset item is scored against the single caption it was paired with.
print("Evaluating on test set...")
all_references = []
all_hypotheses = []

with torch.no_grad():
    for idx, (video_feats, captions) in enumerate(test_loader):
        video_feats, captions = video_feats.to(device), captions.to(device)

        # Generate prediction
        _, predicted_ids = model(video_feats, mode='inference')

        # Decode predicted caption
        pred_tokens = []
        for tok in predicted_ids[0]:
            word = vocab_rev.get(int(tok), '<UNK>')
            if word == '<EOS>': break
            if word not in ['<SOS>', '<PAD>']:
                pred_tokens.append(word)

        # Decode ground truth
        ref_tokens = []
        for tok in captions[0]:
            word = vocab_rev.get(int(tok), '<UNK>')
            if word == '<EOS>': break
            if word not in ['<SOS>', '<PAD>']:
                ref_tokens.append(word)

        all_hypotheses.append(pred_tokens)
        all_references.append([ref_tokens])

        if idx < 5:
            print(f"\nExample {idx + 1}")
            print(f"Predicted    : {' '.join(pred_tokens)}")
            print(f"Ground Truth : {' '.join(ref_tokens)}")

# === Compute BLEU Scores ===
# BLEU-3 uses exact thirds: (0.33, 0.33, 0.33) sums to 0.99 and inflates the score.
smooth_fn = SmoothingFunction().method1
bleu1 = corpus_bleu(all_references, all_hypotheses, weights=(1, 0, 0, 0), smoothing_function=smooth_fn)
bleu2 = corpus_bleu(all_references, all_hypotheses, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth_fn)
bleu3 = corpus_bleu(all_references, all_hypotheses, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smooth_fn)
bleu4 = corpus_bleu(all_references, all_hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth_fn)

# === Display ===
print("\n📊 BLEU Evaluation:")
print(f"BLEU-1 : {bleu1:.4f}")
print(f"BLEU-2 : {bleu2:.4f}")
print(f"BLEU-3 : {bleu3:.4f}")
print(f"BLEU-4 : {bleu4:.4f}")


ModuleNotFoundError: No module named 'VideoCaptionDataset'

In [None]:
!pip install rouge-score
!pip install git+https://github.com/tylin/coco-caption.git


Collecting git+https://github.com/tylin/coco-caption.git
  Cloning https://github.com/tylin/coco-caption.git to /tmp/pip-req-build-gwfu9rmh
  Running command git clone --filter=blob:none --quiet https://github.com/tylin/coco-caption.git /tmp/pip-req-build-gwfu9rmh
  Resolved https://github.com/tylin/coco-caption.git to commit 3a9afb2682141a03e1cdc02b0df6770d2c884f6f
[31mERROR: git+https://github.com/tylin/coco-caption.git does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [None]:
import json
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from pycocoevalcap.cider.cider import Cider
from collections import defaultdict

def evaluate_all_metrics(pred_path, ref_json_path):
    """Print BLEU-1..4, METEOR, ROUGE-L and CIDEr for predicted captions.

    Args:
        pred_path: JSON file mapping video_id -> predicted caption string.
        ref_json_path: JSON file whose 'sentences' list holds
            {'video_id', 'caption'} reference records.

    Raises:
        ValueError: if no predicted video_id has a reference caption.

    Note: NLTK's METEOR needs the WordNet corpus to be available.
    """
    with open(pred_path, 'r') as f:
        preds = json.load(f)

    with open(ref_json_path, 'r') as f:
        refs_data = json.load(f)

    # Build references dictionary: video_id -> list of token lists
    references = defaultdict(list)
    for item in refs_data['sentences']:
        references[item['video_id']].append(item['caption'].lower().split())

    gt, pr = [], []
    meteor_scores = []
    rouge_scores = []
    # pycocoevalcap's Cider expects dicts: {id: [references]} and {id: [hypothesis]}.
    cider_refs, cider_hyps = {}, {}

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smooth_fn = SmoothingFunction().method1

    for vid, pred in preds.items():
        if vid not in references:
            continue
        pred_tokens = pred.lower().split()
        pred_text = ' '.join(pred_tokens)
        ref_tokens = references[vid]
        references_joined = [' '.join(ref) for ref in ref_tokens]

        # BLEU
        gt.append(ref_tokens)
        pr.append(pred_tokens)

        # METEOR: requires token lists; joined strings raise
        # TypeError('"hypothesis" expects pre-tokenized hypothesis ...').
        meteor_scores.append(meteor_score(ref_tokens, pred_tokens))

        # ROUGE: best score over the individual references, rather than one
        # score against all references glued into a single string.
        rouge_scores.append(
            max(scorer.score(ref, pred_text)['rougeL'].fmeasure for ref in references_joined)
        )

        # CIDEr format
        cider_refs[vid] = references_joined
        cider_hyps[vid] = [pred_text]

    if not pr:
        raise ValueError("No predicted video_id has reference captions; nothing to score.")

    # BLEU (n-gram weights must sum to 1, hence exact thirds for BLEU-3)
    bleu1 = corpus_bleu(gt, pr, weights=(1, 0, 0, 0), smoothing_function=smooth_fn)
    bleu2 = corpus_bleu(gt, pr, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth_fn)
    bleu3 = corpus_bleu(gt, pr, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smooth_fn)
    bleu4 = corpus_bleu(gt, pr, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth_fn)

    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    avg_rouge = sum(rouge_scores) / len(rouge_scores)

    cider_scorer = Cider()
    cider_score, _ = cider_scorer.compute_score(cider_refs, cider_hyps)

    print("\n📊 Evaluation Metrics:")
    print(f"BLEU-1   : {bleu1:.4f}")
    print(f"BLEU-2   : {bleu2:.4f}")
    print(f"BLEU-3   : {bleu3:.4f}")
    print(f"BLEU-4   : {bleu4:.4f}")
    print(f"METEOR   : {avg_meteor:.4f}")
    print(f"ROUGE-L  : {avg_rouge:.4f}")
    print(f"CIDEr    : {cider_score:.4f}")

# ✅ Call this function
evaluate_all_metrics(
    pred_path='/content/drive/MyDrive/msvd_split/test/predicted_captions.json',
    ref_json_path='/content/drive/MyDrive/msvd_split/test/test_captions.json'
)


TypeError: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): a woman is cutting a potato

In [None]:
import json
from collections import defaultdict
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def evaluate_bleu_only(pred_path, ref_json_path):
    """Print corpus-level BLEU-1..4 for predicted captions.

    Args:
        pred_path: JSON file mapping video_id -> predicted caption string.
        ref_json_path: JSON file whose 'sentences' list holds
            {'video_id', 'caption'} reference records.

    Predictions without references are skipped; captions are lower-cased and
    split on whitespace.
    """
    with open(pred_path, 'r') as f:
        preds = json.load(f)

    with open(ref_json_path, 'r') as f:
        refs_data = json.load(f)

    # video_id -> list of tokenised reference captions
    references = defaultdict(list)
    for item in refs_data['sentences']:
        references[item['video_id']].append(item['caption'].lower().split())

    smooth_fn = SmoothingFunction().method1

    # Position-aligned reference sets and hypotheses for corpus_bleu.
    matched = [vid for vid in preds if vid in references]
    gt = [references[vid] for vid in matched]
    pr = [preds[vid].lower().split() for vid in matched]

    # BLEU-3 uses exact thirds: (0.33, 0.33, 0.33) sums to 0.99 and inflates the score.
    bleu1 = corpus_bleu(gt, pr, weights=(1, 0, 0, 0), smoothing_function=smooth_fn)
    bleu2 = corpus_bleu(gt, pr, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth_fn)
    bleu3 = corpus_bleu(gt, pr, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smooth_fn)
    bleu4 = corpus_bleu(gt, pr, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth_fn)

    print("\n📊 BLEU Score Evaluation:")
    print(f"BLEU-1 : {bleu1:.4f}")
    print(f"BLEU-2 : {bleu2:.4f}")
    print(f"BLEU-3 : {bleu3:.4f}")
    print(f"BLEU-4 : {bleu4:.4f}")

# ✅ Run it
evaluate_bleu_only(
    pred_path='/content/drive/MyDrive/msvd_split/test/predicted_captions.json',
    ref_json_path='/content/drive/MyDrive/msvd_split/test/test_captions.json'
)



📊 BLEU Score Evaluation:
BLEU-1 : 0.7170
BLEU-2 : 0.5550
BLEU-3 : 0.4579
BLEU-4 : 0.3424


In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torchvision.models as models
import torchvision.transforms as transforms
import torch
import cv2
import os
from PIL import Image

def extract_resnet_features(video_path):
    """Encode every frame of a video with an ImageNet-pretrained ResNet-152.

    Args:
        video_path: path to a video file readable by OpenCV.

    Returns:
        float tensor of shape [1, T, 2048], one feature row per decoded frame.

    Raises:
        IOError: if the video cannot be opened.
        ValueError: if no frame could be decoded.

    The network is rebuilt on every call and runs on the CPU.
    """
    model = models.resnet152(pretrained=True)
    model = torch.nn.Sequential(*list(model.children())[:-1])  # remove final fc
    model.eval()

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {video_path}")

    features = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # end of stream
            # OpenCV decodes to BGR; the transform pipeline expects an RGB PIL image.
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            tensor = transform(img).unsqueeze(0)  # [1, 3, 224, 224]
            with torch.no_grad():
                feat = model(tensor)  # [1, 2048, 1, 1]
            features.append(feat.flatten())  # [2048]
    finally:
        # Release the capture handle even if decoding or the forward pass fails.
        cap.release()

    if not features:
        raise ValueError(f"No frames could be read from video: {video_path}")

    # Stack tensors directly; building a tensor from a list of NumPy arrays is very slow.
    return torch.stack(features).unsqueeze(0)  # shape: [1, T, 2048]


In [None]:
from drive.MyDrive.S2VTModel import S2VTModel

# Load vocab
import json
import torch

with open('/content/drive/MyDrive/msvd_split/vocab.json') as f:
    vocab = json.load(f)
vocab_rev = {v: k for k, v in vocab.items()}  # id (int) -> token

# Load model
# n_layers must be 2: the checkpoint stores second-layer LSTM weights
# (rnn1.*_l1 / rnn2.*_l1), which a 1-layer model rejects as unexpected keys.
model = S2VTModel(
    vocab_size=len(vocab),
    max_len=45,
    dim_hidden=1024,
    dim_word=512,
    dim_vid=2048,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    n_layers=2,
    rnn_cell='lstm',
    rnn_dropout_p=0.3
)

checkpoint = torch.load('/content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt', map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()


RuntimeError: Error(s) in loading state_dict for S2VTModel:
	Unexpected key(s) in state_dict: "rnn1.weight_ih_l1", "rnn1.weight_hh_l1", "rnn1.bias_ih_l1", "rnn1.bias_hh_l1", "rnn2.weight_ih_l1", "rnn2.weight_hh_l1", "rnn2.bias_ih_l1", "rnn2.bias_hh_l1". 

In [None]:
import os
import json
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from drive.MyDrive.S2VTModel import S2VTModel  # ✅ Adjust if path differs

# === Parameters ===
# Everything a reader may need to change lives here.
max_len = 45
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
checkpoint_path = '/content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt'
video_feature_dir = '/content/drive/MyDrive/msvd_split/test/features'  # Adjust if needed

# === Load Vocabulary ===
# vocab maps token -> id; vocab_rev is the inverse used for decoding.
with open(vocab_path) as f:
    vocab = json.load(f)
vocab_rev = {token_id: token for token, token_id in vocab.items()}

# === Define Dataset ===
class VideoFeatureDataset(Dataset):
    """Dataset over the ``.npy`` feature files stored in one directory.

    Each item is a ``(features, video_id)`` pair: the array loaded from the
    file as a float32 tensor, and the file name without its ``.npy``
    extension.
    """

    def __init__(self, video_dir):
        """Index every ``.npy`` file directly inside ``video_dir``."""
        self.video_dir = video_dir
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # item order identical from run to run.
        self.video_files = sorted(
            f for f in os.listdir(video_dir) if f.endswith('.npy')
        )

    def __len__(self):
        """Return the number of feature files found."""
        return len(self.video_files)

    def __getitem__(self, idx):
        """Load the ``idx``-th feature file and return ``(tensor, video_id)``."""
        fname = self.video_files[idx]
        path = os.path.join(self.video_dir, fname)
        feature = np.load(path)
        # splitext removes only the trailing ".npy"; split('.')[0] would cut
        # an id at its first dot.
        video_id = os.path.splitext(fname)[0]
        return torch.tensor(feature, dtype=torch.float32), video_id

# === Load Dataset ===
dataset = VideoFeatureDataset(video_feature_dir)
# One clip per batch, without shuffling.
loader = DataLoader(dataset, batch_size=1, shuffle=False)

# === Load Model ===
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = S2VTModel(
    vocab_size=len(vocab),
    max_len=max_len,
    dim_hidden=1024,
    dim_word=512,
    dim_vid=2048,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    n_layers=2,  # must match the number of layers stored in the checkpoint
    rnn_cell='lstm',
    rnn_dropout_p=0.3
)
model = model.to(device)

# Restore the trained weights and switch to evaluation mode.
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# === Generate Captions ===
print("\n🎥 Generating Captions:\n")
with torch.no_grad():
    for video_feats, vid_id in loader:
        video_feats = video_feats.to(device)
        _, predicted_ids = model(video_feats, mode='inference')

        # Translate ids to words: stop at <EOS>, skip <SOS>/<PAD>; ids that
        # are missing from the vocabulary become <UNK>.
        pred_tokens = []
        for tok in predicted_ids[0]:
            word = vocab_rev.get(int(tok), '<UNK>')
            if word == '<EOS>':
                break
            if word in ('<SOS>', '<PAD>'):
                continue
            pred_tokens.append(word)

        caption = ' '.join(pred_tokens)
        print(f"{vid_id[0]}: {caption}")



🎥 Generating Captions:

qeKX-N1nKiM_68_72: a woman is cutting a potato
Sg5rTYrkpnU_35_48: a man is walking
C_DDjCRxTxQ_1_4: a woman is riding a horse
-wa0umYJVGg_271_276: a woman is cutting a potato
9LHg5RUGukI_58_63: a man is cutting a potato
PCXHuseKwDc_68_76: a man is playing a guitar
tBj4Ny19vfQ_54_59: a dog is playing
jW77z3-SrO4_56_63: a woman is cutting a potato
8PQiaurIiDM_247_255: a man is riding a horse
5XxshEdcfAM_96_107: a man is riding a horse
9zXqkHvs0po_59_65: a woman is cutting a potato
ok4cM6WTA5E_142_150: a woman is cutting a potato
FwCmcZpkk-k_22_32: a man is riding a horse
FWzsXeXCwuc_111_116: a man is playing a guitar
0lh_UWF9ZP4_82_87: a woman is cutting a potato
dJCtOz32dnw_40_60: a man is riding a horse
Ixw6wmoC_xg_116_126: a man is riding a horse
lcu-DwrnYY8_2_5: a man is walking
ZlX_Gy4HP2E_38_55: a man is riding a horse
5U3xz9Ovmhk_214_222: a woman is cutting a potato
BVilbVCo9sU_1_11: a cat is playing
ACOmKiJDkA4_130_144: a woman is cutting a potato
L6dEUQ6

In [None]:
def generate_caption_from_video(video_path):
    """Caption one video file with the global ``model``.

    Frame features come from ``extract_resnet_features``; the predicted ids
    are mapped to words through the global ``vocab_rev``.

    Args:
        video_path: Path of the video to caption.

    Returns:
        The caption as a single space-separated string. Decoding stops at the
        first ``<EOS>``; ``<SOS>`` and ``<PAD>`` are skipped and ids missing
        from ``vocab_rev`` become ``<UNK>``.
    """
    feats = extract_resnet_features(video_path)  # [1, T, 2048]

    # The features are produced on the CPU, whereas the model may have been
    # moved to the GPU; put the input wherever the model's weights live.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        feats = feats.to(first_param.device)

    with torch.no_grad():
        _, pred_ids = model(feats, mode='inference')

    caption = []
    for idx in pred_ids[0]:
        word = vocab_rev.get(int(idx), '<UNK>')
        if word == '<EOS>':
            break
        if word not in ['<SOS>', '<PAD>']:
            caption.append(word)
    return ' '.join(caption)


In [None]:
# Caption a single sample clip with generate_caption_from_video and print it.
video_path = "/content/sample_data/t1.mp4"
caption = generate_caption_from_video(video_path)
print("📝 Caption:", caption)


ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
# Caption every .mp4 / .avi file found directly inside /content/sample_data.
for file in os.listdir("/content/sample_data"):
    if not file.endswith((".mp4", ".avi")):
        continue
    cap = generate_caption_from_video(os.path.join("/content/sample_data", file))
    print(f"{file} ➤ {cap}")


In [None]:
!pip install opencv-python-headless
!pip install torch torchvision torchaudio




In [None]:
import os
import cv2
import json
import torch
import numpy as np
from torchvision import transforms
from torch import nn
from moviepy.editor import VideoFileClip

# Assuming your S2VTModel.py is in the same directory or mounted from Drive
from drive.MyDrive.S2VTModel import S2VTModel  # Adjust path as needed


In [None]:
# Load the token -> id vocabulary and build the id -> token inverse for decoding.
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
with open(vocab_path) as f:
    vocab = json.load(f)
vocab_rev = {token_id: token for token, token_id in vocab.items()}


In [None]:
import torchvision.models as models

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ResNet-152 with its last child module dropped, in eval mode on `device`.
resnet = nn.Sequential(*list(models.resnet152(pretrained=True).children())[:-1])
resnet.eval()
resnet.to(device)

# Per-frame preprocessing: resize to 224x224, convert to a tensor, normalise.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


In [None]:
def extract_features_from_video(video_path, num_frames=40):
    """Encode the first ``num_frames`` frames of a video with the global ``resnet``.

    Frames are read in order, converted from BGR to RGB, passed through the
    module-level ``transform`` and ``resnet`` and flattened to one vector
    each. Clips shorter than ``num_frames`` are padded at the end with zero
    vectors.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Length of the returned sequence. Defaults to 40, the
            value that used to be hard-coded.

    Returns:
        A float32 tensor of shape ``[1, num_frames, D]`` on ``device``, where
        ``D`` is the length of one flattened ``resnet`` output.

    Raises:
        OSError: If OpenCV cannot open ``video_path``.
        ValueError: If ``num_frames`` is smaller than 1, or if no frame can
            be decoded. Previously an unreadable file was silently turned
            into an all-zero feature tensor and captioned anyway.
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be at least 1, got {num_frames}")

    cap = cv2.VideoCapture(video_path)
    frame_features = []
    try:
        if not cap.isOpened():
            raise OSError(f"Could not open video: {video_path!r}")

        while len(frame_features) < num_frames:
            ret, frame = cap.read()
            if not ret:
                break
            img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = transform(Image.fromarray(img)).unsqueeze(0).to(device)
            with torch.no_grad():
                feat = resnet(img).reshape(-1).cpu().numpy()
            frame_features.append(feat)
    finally:
        # Release the decoder even when a frame fails to process.
        cap.release()

    if not frame_features:
        raise ValueError(f"No frames could be decoded from {video_path!r}")

    # Write the real features into a zero-initialised buffer: this pads short
    # clips and avoids the slow list-of-ndarrays -> tensor conversion.
    padded = np.zeros((num_frames, frame_features[0].shape[0]), dtype=np.float32)
    padded[:len(frame_features)] = np.stack(frame_features)
    return torch.from_numpy(padded).unsqueeze(0).float().to(device)


In [None]:
def extract_features_from_video(video_path, num_frames=40):
    """Encode the first ``num_frames`` frames of a video with the global ``resnet``.

    Frames are read in order, converted from BGR to RGB, passed through the
    module-level ``transform`` and ``resnet`` and flattened to one vector
    each. Clips shorter than ``num_frames`` are padded at the end with zero
    vectors.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Length of the returned sequence. Defaults to 40, the
            value that used to be hard-coded.

    Returns:
        A float32 tensor of shape ``[1, num_frames, D]`` on ``device``, where
        ``D`` is the length of one flattened ``resnet`` output.

    Raises:
        OSError: If OpenCV cannot open ``video_path``.
        ValueError: If ``num_frames`` is smaller than 1, or if no frame can
            be decoded. Previously an unreadable file was silently turned
            into an all-zero feature tensor and captioned anyway.
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be at least 1, got {num_frames}")

    cap = cv2.VideoCapture(video_path)
    frame_features = []
    try:
        if not cap.isOpened():
            raise OSError(f"Could not open video: {video_path!r}")

        while len(frame_features) < num_frames:
            ret, frame = cap.read()
            if not ret:
                break
            img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = transform(Image.fromarray(img)).unsqueeze(0).to(device)
            with torch.no_grad():
                feat = resnet(img).reshape(-1).cpu().numpy()
            frame_features.append(feat)
    finally:
        # Release the decoder even when a frame fails to process.
        cap.release()

    if not frame_features:
        raise ValueError(f"No frames could be decoded from {video_path!r}")

    # Write the real features into a zero-initialised buffer: this pads short
    # clips and avoids the slow list-of-ndarrays -> tensor conversion.
    padded = np.zeros((num_frames, frame_features[0].shape[0]), dtype=np.float32)
    padded[:len(frame_features)] = np.stack(frame_features)
    return torch.from_numpy(padded).unsqueeze(0).float().to(device)


In [None]:
from PIL import Image

# Clip to caption; point this at another file as needed.
video_path = '/content/sample_data/t1.mp4'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("📝 Generated Caption:", caption)


📝 Generated Caption: a man is playing


In [None]:
from PIL import Image

# Clip to caption; point this at another file as needed.
video_path = '/content/sample_data/t1.mp4'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("📝 Generated Caption:", caption)


📝 Generated Caption: a man is playing


In [None]:
from PIL import Image

# Clip to caption; point this at another file as needed.
video_path = '/content/sample_data/04.mp4'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("📝 Generated Caption:", caption)


📝 Generated Caption: a man is playing


In [None]:
# Clip that the following captioning cell reads through `video_path`.
video_path = '/content/_6OTzzK7t9Y_73_78.avi'


In [None]:
# Caption whatever clip `video_path` currently points at.
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)


🎬 Caption for AVI video: a woman is cutting a potato


In [None]:
# Clip to caption.
video_path = '/content/_1vy2HIN60A_32_40.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

🎬 Caption for AVI video: a man is playing a guitar


In [None]:
# Clip to caption.
video_path = '/content/04.mp4'

video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers; ids that
# are missing from the vocabulary become <UNK>.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        generated_caption.append(word)

caption = ' '.join(generated_caption)
# Label the result with the actual file: this input is an .mp4, so the old
# fixed "AVI video" wording was wrong.
print(f"🎬 Caption for {video_path}:", caption)

🎬 Caption for AVI video: a man is riding a horse


In [None]:
# Clip to caption.
video_path = '/content/_6OTzzK7t9Y_73_78.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

🎬 Caption for AVI video: a woman is cutting a potato


In [None]:
# Clip to caption.
video_path = '/content/aa.mp4'

video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers; ids that
# are missing from the vocabulary become <UNK>.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        generated_caption.append(word)

caption = ' '.join(generated_caption)
# Label the result with the actual file: this input is an .mp4, so the old
# fixed "AVI video" wording was wrong.
print(f"🎬 Caption for {video_path}:", caption)

🎬 Caption for AVI video: a man is dancing


In [None]:
# Clip to caption.
video_path = '/content/-_hbPLsZvvo_19_26.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

🎬 Caption for AVI video: a woman is cutting a potato


In [None]:
# Clip to caption.
video_path = '/content/-_hbPLsZvvo_19_26.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

🎬 Caption for AVI video: a man is playing


In [None]:
# Clip to caption.
video_path = '/content/6t0BpjwYKco_118_127.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

🎬 Caption for AVI video: a woman is playing a guitar


In [None]:
# Clip to caption.
video_path = '/content/video0.mp4'

video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers; ids that
# are missing from the vocabulary become <UNK>.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        generated_caption.append(word)

caption = ' '.join(generated_caption)
# Label the result with the actual file: this input is an .mp4, so the old
# fixed "AVI video" wording was wrong.
print(f"🎬 Caption for {video_path}:", caption)

🎬 Caption for AVI video: a man is riding a horse


In [None]:
# Clip to caption.
video_path = '/content/video1.mp4'

video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers; ids that
# are missing from the vocabulary become <UNK>.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        generated_caption.append(word)

caption = ' '.join(generated_caption)
# Label the result with the actual file: this input is an .mp4, so the old
# fixed "AVI video" wording was wrong.
print(f"🎬 Caption for {video_path}:", caption)

🎬 Caption for AVI video: a man is cutting a potato


In [None]:
# Clip to caption.
video_path = '/content/video2.mp4'

video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers; ids that
# are missing from the vocabulary become <UNK>.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        generated_caption.append(word)

caption = ' '.join(generated_caption)
# Label the result with the actual file: this input is an .mp4, so the old
# fixed "AVI video" wording was wrong.
print(f"🎬 Caption for {video_path}:", caption)

🎬 Caption for AVI video: a man is playing a guitar


In [None]:
# Clip to caption.
video_path = '/content/video3.mp4'

video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers; ids that
# are missing from the vocabulary become <UNK>.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        generated_caption.append(word)

caption = ' '.join(generated_caption)
# Label the result with the actual file: this input is an .mp4, so the old
# fixed "AVI video" wording was wrong.
print(f"🎬 Caption for {video_path}:", caption)

🎬 Caption for AVI video: a man is walking


In [None]:
# Clip to caption.
video_path = '/content/video4.mp4'

video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers; ids that
# are missing from the vocabulary become <UNK>.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        generated_caption.append(word)

caption = ' '.join(generated_caption)
# Label the result with the actual file: this input is an .mp4, so the old
# fixed "AVI video" wording was wrong.
print(f"🎬 Caption for {video_path}:", caption)

🎬 Caption for AVI video: a woman is cutting a potato


In [None]:
# Clip to caption.
video_path = '/content/video10.mp4'

video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers; ids that
# are missing from the vocabulary become <UNK>.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        generated_caption.append(word)

caption = ' '.join(generated_caption)
# Label the result with the actual file: this input is an .mp4, so the old
# fixed "AVI video" wording was wrong.
print(f"🎬 Caption for {video_path}:", caption)

In [None]:
from PIL import Image

# Clip to caption; point this at another file as needed.
video_path = '/content/6t0BpjwYKco_118_127.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("📝 Generated Caption:", caption)


NameError: name 'extract_features_from_video' is not defined

In [None]:
# Clip to caption.
video_path = '/content/6t0BpjwYKco_118_127.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

  return torch.tensor(frame_features).unsqueeze(0).float().to(device)



NameError: name 'model' is not defined

In [None]:
# Clip to caption.
video_path = '/content/6t0BpjwYKco_118_127.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

🎬 Caption for AVI video: a woman is playing a guitar


In [None]:
# Clip to caption.
video_path = '/content/_0nX-El-ySo_83_93.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

🎬 Caption for AVI video: a man is cutting a potato


In [None]:
# Clip to caption.
video_path = '/content/1.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

🎬 Caption for AVI video: a man is riding a horse


In [None]:
# Clip to caption.
video_path = '/content/2.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

🎬 Caption for AVI video: a man is riding a horse


In [None]:
# Clip to caption.
video_path = '/content/3.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

🎬 Caption for AVI video: a cat is playing


In [None]:
# Clip to caption.
video_path = '/content/4.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

🎬 Caption for AVI video: a dog is playing a guitar


In [None]:
# Clip to caption.
video_path = '/content/5.avi'
video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word in ('<SOS>', '<PAD>'):
        continue
    generated_caption.append(word)

caption = ' '.join(generated_caption)
print("🎬 Caption for AVI video:", caption)

🎬 Caption for AVI video: a man is cutting a potato


In [None]:
# Clip to caption.
video_path = '/content/video1.mp4'

video_feats = extract_features_from_video(video_path)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Turn ids into words: stop at <EOS>, skip the <SOS>/<PAD> markers; ids that
# are missing from the vocabulary become <UNK>.
generated_caption = []
for idx in predicted_ids[0]:
    word = vocab_rev.get(int(idx), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        generated_caption.append(word)

caption = ' '.join(generated_caption)
# Label the result with the actual file: this input is an .mp4, so the old
# fixed "AVI video" wording was wrong.
print(f"🎬 Caption for {video_path}:", caption)

🎬 Caption for AVI video: a man is cutting a potato
