In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import torch
print(torch.cuda.is_available())

False


In [None]:
# Install the required libraries
!pip install numpy torch torchvision nltk



In [None]:
#video and json split

import json
import os
import random
import shutil

def split_json_data(input_json_path, videos_dir, output_dir, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    # Load JSON data
    with open(input_json_path, 'r') as f:
        data = json.load(f)

    # Shuffle the video data to ensure randomness
    video_entries = data['videos']
    random.shuffle(video_entries)

    # Calculate the split sizes
    total_videos = len(video_entries)
    train_size = int(total_videos * train_ratio)
    val_size = int(total_videos * val_ratio)
    test_size = total_videos - train_size - val_size

    # Split the data
    train_videos = video_entries[:train_size]
    val_videos = video_entries[train_size:train_size + val_size]
    test_videos = video_entries[train_size + val_size:]

    # Organize sentences based on video_ids for each split
    video_sentences = {video['video_id']: [] for video in video_entries}
    for sentence in data['sentences']:
        video_sentences[sentence['video_id']].append(sentence)

    def create_split_data(split_videos, split_name):
        split_data = {
            "videos": split_videos,
            "sentences": []
        }
        split_videos_dir = os.path.join(output_dir, split_name, "videos")
        os.makedirs(split_videos_dir, exist_ok=True)

        # Add captions and copy video files
        for video in split_videos:
            video_id = video['video_id']
            split_data['sentences'].extend(video_sentences[video_id])

            # Copy video file to the split directory
            video_filename = f"{video_id}.avi"
            src_video_path = os.path.join(videos_dir, video_filename)
            dst_video_path = os.path.join(split_videos_dir, video_filename)
            if os.path.exists(src_video_path):
                shutil.copy(src_video_path, dst_video_path)
            else:
                print(f"Warning: Video file {video_filename} not found in {videos_dir}.")

        # Save the JSON file for the split
        split_json_path = os.path.join(output_dir, split_name, f"{split_name}_captions.json")
        with open(split_json_path, 'w') as f:
            json.dump(split_data, f, indent=4)

    # Create each split
    create_split_data(train_videos, "train")
    create_split_data(val_videos, "val")
    create_split_data(test_videos, "test")

    print("Data split and saved successfully.")

# Define paths based on your directory structure
input_json_path = '/content/drive/MyDrive/msvd_captions.json'  # Path to the original JSON file
videos_dir = '/content/drive/MyDrive/YouTubeClips'  # Directory where video files are stored
output_dir = '/content/drive/MyDrive/msvd_split'  # Directory where you want to save the splits

# Run the split function
split_json_data(input_json_path, videos_dir, output_dir)

Data split and saved successfully.


In [None]:
#verification for split

import os
import json

def verify_split_integrity(split_name, base_dir):
    """
    Verifies that each video in the split has a corresponding entry in the JSON file
    and each entry in the JSON file has a corresponding video file.

    Args:
        split_name (str): Name of the split (e.g., 'train', 'val', 'test').
        base_dir (str): Base directory where splits are stored (e.g., '/content/drive/MyDrive/msvd_split/').
    """
    video_dir = os.path.join(base_dir, split_name, 'videos')
    json_path = os.path.join(base_dir, split_name, f"{split_name}_captions.json")

    # Load JSON data
    with open(json_path, 'r') as f:
        json_data = json.load(f)

    # Get the list of video files and remove file extensions
    video_files = {os.path.splitext(file)[0] for file in os.listdir(video_dir) if file.endswith('.avi')}

    # Get the list of video_ids from JSON "videos" section
    json_video_ids = {video['video_id'] for video in json_data['videos']}

    # Check that each video file has a corresponding entry in JSON "videos" section
    missing_in_json_videos = video_files - json_video_ids
    if missing_in_json_videos:
        print(f"[{split_name}] Videos present in folder but missing in JSON 'videos' section: {missing_in_json_videos}")
    else:
        print(f"[{split_name}] All videos in folder have matching entries in JSON 'videos' section.")

    # Check that each JSON entry in "videos" section has a corresponding video file in the folder
    missing_in_videos_folder = json_video_ids - video_files
    if missing_in_videos_folder:
        print(f"[{split_name}] Entries in JSON 'videos' section but missing video files: {missing_in_videos_folder}")
    else:
        print(f"[{split_name}] All entries in JSON 'videos' section have matching video files in folder.")

    # Get the list of video_ids from JSON "sentences" section
    json_caption_video_ids = {sentence['video_id'] for sentence in json_data['sentences']}

    # Check that each video file has a corresponding caption in JSON "sentences" section
    missing_captions_for_videos = video_files - json_caption_video_ids
    if missing_captions_for_videos:
        print(f"[{split_name}] Videos present in folder but missing captions in JSON 'sentences' section: {missing_captions_for_videos}")
    else:
        print(f"[{split_name}] All videos in folder have matching captions in JSON 'sentences' section.")

    # Check that each JSON entry in "sentences" section has a corresponding video file in the folder
    missing_videos_for_captions = json_caption_video_ids - video_files
    if missing_videos_for_captions:
        print(f"[{split_name}] Captions in JSON 'sentences' section but missing video files: {missing_videos_for_captions}")
    else:
        print(f"[{split_name}] All captions in JSON 'sentences' section have matching video files in folder.")

# Run verification for each split
base_dir = '/content/drive/MyDrive/msvd_split'  # Change to your base directory if different
for split in ['train', 'val', 'test']:
    verify_split_integrity(split, base_dir)

In [None]:
#removing caption of which video is not present

import os
import json

def clean_json_for_existing_videos(json_path, video_dir):
    # Load JSON data
    with open(json_path, 'r') as f:
        data = json.load(f)

    # Get the set of video_ids from the actual video files in the folder
    video_files = {os.path.splitext(file)[0] for file in os.listdir(video_dir) if file.endswith('.avi')}

    # Filter out entries in 'videos' and 'sentences' that don't have a corresponding video file
    data['videos'] = [video for video in data['videos'] if video['video_id'] in video_files]
    data['sentences'] = [sentence for sentence in data['sentences'] if sentence['video_id'] in video_files]

    # Save the cleaned JSON data back to the file
    with open(json_path, 'w') as f:
        json.dump(data, f, indent=4)

    print(f"Cleaned JSON file saved to {json_path}")

# Define paths to each split's JSON file and video folder
base_dir = '/content/drive/MyDrive/msvd_split'
splits = {
    'train': {'json_path': os.path.join(base_dir, 'train', 'train_captions.json'),
              'video_dir': os.path.join(base_dir, 'train', 'videos')},
    'val': {'json_path': os.path.join(base_dir, 'val', 'val_captions.json'),
            'video_dir': os.path.join(base_dir, 'val', 'videos')},
    'test': {'json_path': os.path.join(base_dir, 'test', 'test_captions.json'),
             'video_dir': os.path.join(base_dir, 'test', 'videos')}
}

# Clean each JSON file based on existing video files
for split, paths in splits.items():
    print(f"Cleaning {split} JSON file...")
    clean_json_for_existing_videos(paths['json_path'], paths['video_dir'])
    print(f"{split} JSON file cleaned.\n")

Cleaning train JSON file...
Cleaned JSON file saved to /content/drive/MyDrive/msvd_split/train/train_captions.json
train JSON file cleaned.

Cleaning val JSON file...
Cleaned JSON file saved to /content/drive/MyDrive/msvd_split/val/val_captions.json
val JSON file cleaned.

Cleaning test JSON file...
Cleaned JSON file saved to /content/drive/MyDrive/msvd_split/test/test_captions.json
test JSON file cleaned.



In [None]:
#vocabulary building

import json
from collections import Counter

def build_vocabulary(json_paths, min_freq=1, save_path='/content/drive/MyDrive/msvd_split/vocab.json'):
    special_tokens = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
    word_counter = Counter()

    # Process each JSON file to gather word frequencies
    for json_path in json_paths:
        with open(json_path, 'r') as f:
            data = json.load(f)
            for sentence in data['sentences']:
                words = sentence['caption'].lower().split()
                word_counter.update(words)

    # Create vocabulary by adding special tokens and frequent words
    vocab = {token: idx for idx, token in enumerate(special_tokens)}
    for word, freq in word_counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)

    # Save vocabulary as a JSON file
    with open(save_path, 'w') as f:
        json.dump(vocab, f, indent=4)

    print(f"Vocabulary built with {len(vocab)} words. Saved to {save_path}")

# Define paths to JSON files
json_paths = [
    '/content/drive/MyDrive/msvd_split/train/train_captions.json',
    '/content/drive/MyDrive/msvd_split/val/val_captions.json',
    '/content/drive/MyDrive/msvd_split/test/test_captions.json'
]

# Build vocabulary
build_vocabulary(json_paths)

Vocabulary built with 10658 words. Saved to /content/drive/MyDrive/msvd_split/vocab.json


In [None]:
!pip install torchvision
!pip install opencv-python



In [None]:
!pip install pretrainedmodels



In [None]:
#feature extraction

import os
import shutil
import subprocess
import glob
import numpy as np
import json
from tqdm import tqdm
import torch
import pretrainedmodels
from pretrainedmodels import utils

# Constants for input dimensions
C, H, W = 3, 224, 224

# Function to extract frames from a video
def extract_frames(video_path, dst):
    if os.path.exists(dst):
        shutil.rmtree(dst)
    os.makedirs(dst)
    video_to_frames_command = [
        "ffmpeg",
        '-y',
        '-i', video_path,
        '-vf', "scale=400:300",
        '-qscale:v', "2",
        f"{dst}/%06d.jpg"
    ]
    subprocess.call(video_to_frames_command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

# Function to extract features from frames
def extract_feats(params, model, load_image_fn, split):
    model.eval()
    dir_fc = os.path.join(params['output_dir'], split, 'features')  # Store features in respective split folder
    os.makedirs(dir_fc, exist_ok=True)

    # Load video list from JSON file for the split
    json_path = params[f'{split}_json']
    with open(json_path, 'r') as f:
        data = json.load(f)
        video_ids = {video['video_id'] for video in data['videos']}

    # Process each video in the specified directory
    video_dir = os.path.join(params['video_path'], split, 'videos')  # Use respective split folder
    video_list = glob.glob(os.path.join(video_dir, '*.avi'))
    for video in tqdm(video_list, desc=f"Processing {split} videos"):
        video_id = os.path.splitext(os.path.basename(video))[0]
        if video_id not in video_ids:
            continue

        # Extract frames
        dst = os.path.join(params['tmp_dir'], video_id)
        extract_frames(video, dst)

        # Load frames and extract features
        image_list = sorted(glob.glob(os.path.join(dst, '*.jpg')))
        samples = np.round(np.linspace(0, len(image_list) - 1, params['n_frame_steps'])).astype(int)
        image_list = [image_list[sample] for sample in samples]
        images = torch.zeros((len(image_list), C, H, W))

        for i, img_path in enumerate(image_list):
            img = load_image_fn(img_path)
            images[i] = img

        # Move images to GPU for feature extraction
        images = images.cuda()
        with torch.no_grad():
            fc_feats = model(images).cpu().squeeze()

        # Save features
        outfile = os.path.join(dir_fc, f"{video_id}.npy")
        np.save(outfile, fc_feats.numpy())

        # Clean up
        shutil.rmtree(dst)

    print(f"Feature extraction for {split} set is complete.")

# Main code setup with hardcoded parameters
params = {
    'output_dir': '/content/drive/MyDrive/msvd_split',
    'video_path': '/content/drive/MyDrive/msvd_split',
    'n_frame_steps': 40,
    'tmp_dir': '/content/tmp_frames',
    'train_json': '/content/drive/MyDrive/msvd_split/train/train_captions.json',
    'val_json': '/content/drive/MyDrive/msvd_split/val/val_captions.json',
    'test_json': '/content/drive/MyDrive/msvd_split/test/test_captions.json',
    'model': 'resnet152'  # Set your model choice here (resnet152, inception_v3, or inception_v4)
}

# Set up model and image loader
if params['model'] == 'inception_v3':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionv3(pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
elif params['model'] == 'resnet152':
    C, H, W = 3, 224, 224
    model = pretrainedmodels.resnet152(pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
elif params['model'] == 'inception_v4':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
else:
    raise ValueError(f"Model {params['model']} is not supported")

model.last_linear = utils.Identity()  # Remove final classification layer
model = model.cuda()  # Use GPU

# Extract features for each split
for split in ['train', 'val', 'test']:
    extract_feats(params, model, load_image_fn, split)


Downloading: "https://download.pytorch.org/models/resnet152-b121ed2d.pth" to /root/.cache/torch/hub/checkpoints/resnet152-b121ed2d.pth
100%|██████████| 230M/230M [00:01<00:00, 204MB/s]
Processing train videos:   0%|          | 0/1535 [00:00<?, ?it/s]

In [None]:
#verifying feature extracted correctly or not

import os
import json

# Function to verify features extraction for each split by cross-checking both video files and JSON entries
def verify_features_extraction(split, params):
    # Paths for the split
    video_dir = os.path.join(params['output_dir'], split, 'videos')
    features_dir = os.path.join(params['output_dir'], split, 'features')
    json_path = os.path.join(params['output_dir'], split, f"{split}_captions.json")

    # Load JSON file for the split
    with open(json_path, 'r') as f:
        data = json.load(f)
        video_ids_in_json = {video['video_id'] for video in data['videos']}

    # Check from Video Files
    missing_features_from_videos = []
    extra_features_in_folder_from_videos = []
    video_files = {os.path.splitext(video)[0] for video in os.listdir(video_dir) if video.endswith('.avi')}
    for video_id in video_files:
        feature_path = os.path.join(features_dir, f"{video_id}.npy")
        if not os.path.exists(feature_path):
            missing_features_from_videos.append(video_id)

    # Check from JSON Entries
    missing_features_from_json = []
    for video_id in video_ids_in_json:
        feature_path = os.path.join(features_dir, f"{video_id}.npy")
        if not os.path.exists(feature_path):
            missing_features_from_json.append(video_id)

    # Check for extra feature files
    extra_features = []
    for feature_file in os.listdir(features_dir):
        video_id = os.path.splitext(feature_file)[0]
        if video_id not in video_files and video_id not in video_ids_in_json:
            extra_features.append(video_id)

    # Print results
    if missing_features_from_videos:
        print(f"[{split}] Missing features for videos in the video folder: {len(missing_features_from_videos)}")
        print("Missing video IDs from video files:", missing_features_from_videos)
    else:
        print(f"[{split}] All video files have corresponding features.")

    if missing_features_from_json:
        print(f"[{split}] Missing features for videos in the JSON file: {len(missing_features_from_json)}")
        print("Missing video IDs from JSON:", missing_features_from_json)
    else:
        print(f"[{split}] All JSON entries have corresponding features.")

    if extra_features:
        print(f"[{split}] Extra feature files found that do not match any video or JSON entry: {len(extra_features)}")
        print("Extra feature file video IDs:", extra_features)
    else:
        print(f"[{split}] No extra feature files found.")

# Define your parameters
params = {
    'output_dir': '/content/drive/MyDrive/msvd_split',
}

# Verify for each split
for split in ['train', 'val', 'test']:
    verify_features_extraction(split, params)

[train] All video files have corresponding features.
[train] All JSON entries have corresponding features.
[train] No extra feature files found.
[val] All video files have corresponding features.
[val] All JSON entries have corresponding features.
[val] No extra feature files found.
[test] All video files have corresponding features.
[test] All JSON entries have corresponding features.
[test] No extra feature files found.


In [None]:
#video and json split

import json
import os
import random
import shutil

def split_json_data(input_json_path, videos_dir, output_dir, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    # Load JSON data
    with open(input_json_path, 'r') as f:
        data = json.load(f)

    # Shuffle the video data to ensure randomness
    video_entries = data['videos']
    random.shuffle(video_entries)

    # Calculate the split sizes
    total_videos = len(video_entries)
    train_size = int(total_videos * train_ratio)
    val_size = int(total_videos * val_ratio)
    test_size = total_videos - train_size - val_size

    # Split the data
    train_videos = video_entries[:train_size]
    val_videos = video_entries[train_size:train_size + val_size]
    test_videos = video_entries[train_size + val_size:]

    # Organize sentences based on video_ids for each split
    video_sentences = {video['video_id']: [] for video in video_entries}
    for sentence in data['sentences']:
        video_sentences[sentence['video_id']].append(sentence)

    def create_split_data(split_videos, split_name):
        split_data = {
            "videos": split_videos,
            "sentences": []
        }
        split_videos_dir = os.path.join(output_dir, split_name, "videos")
        os.makedirs(split_videos_dir, exist_ok=True)

        # Add captions and copy video files
        for video in split_videos:
            video_id = video['video_id']
            split_data['sentences'].extend(video_sentences[video_id])

            # Copy video file to the split directory
            video_filename = f"{video_id}.avi"
            src_video_path = os.path.join(videos_dir, video_filename)
            dst_video_path = os.path.join(split_videos_dir, video_filename)
            if os.path.exists(src_video_path):
                shutil.copy(src_video_path, dst_video_path)
            else:
                print(f"Warning: Video file {video_filename} not found in {videos_dir}.")

        # Save the JSON file for the split
        split_json_path = os.path.join(output_dir, split_name, f"{split_name}_captions.json")
        with open(split_json_path, 'w') as f:
            json.dump(split_data, f, indent=4)

    # Create each split
    create_split_data(train_videos, "train")
    create_split_data(val_videos, "val")
    create_split_data(test_videos, "test")

    print("Data split and saved successfully.")

# Define paths based on your directory structure
input_json_path = '/content/drive/MyDrive/msvd_captions.json'  # Path to the original JSON file
videos_dir = '/content/drive/MyDrive/YouTubeClips'  # Directory where video files are stored
output_dir = '/content/drive/MyDrive/msvd_split'  # Directory where you want to save the splits

# Run the split function
split_json_data(input_json_path, videos_dir, output_dir)

In [None]:
import json
from collections import Counter

def build_vocabulary(
    json_paths,
    min_freq=1,
    save_path='/content/drive/MyDrive/msvd_split/vocab.json',
    reverse_path='/content/drive/MyDrive/msvd_split/vocab_rev.json',
    top_k_preview=10
):
    special_tokens = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
    word_counter = Counter()

    # Step 1: Count all words
    for json_path in json_paths:
        with open(json_path, 'r') as f:
            data = json.load(f)
            for sentence in data['sentences']:
                words = sentence['caption'].lower().split()
                word_counter.update(words)

    # Step 2: Initialize vocab with special tokens
    vocab = {token: idx for idx, token in enumerate(special_tokens)}

    # Step 3: Add frequent words sorted by frequency (descending)
    for word, freq in word_counter.most_common():
        if freq >= min_freq:
            if word not in vocab:  # Avoid conflict with special tokens
                vocab[word] = len(vocab)

    # Step 4: Create reverse vocab
    vocab_rev = {idx: word for word, idx in vocab.items()}

    # Step 5: Save both files
    with open(save_path, 'w') as f:
        json.dump(vocab, f, indent=4)
    with open(reverse_path, 'w') as f:
        json.dump(vocab_rev, f, indent=4)

    # Step 6: Show summary
    print(f"✅ Vocabulary built with {len(vocab)} tokens.")
    print(f"📁 Saved vocab to: {save_path}")
    print(f"📁 Saved reverse vocab to: {reverse_path}")
    print(f"📊 Top {top_k_preview} frequent words:")
    for word, freq in word_counter.most_common(top_k_preview):
        print(f"   {word:<15} : {freq}")

# === Paths to your cleaned split JSON files
json_paths = [
    '/content/drive/MyDrive/msvd_split/train/train_captions.json',
    '/content/drive/MyDrive/msvd_split/val/val_captions.json',
    '/content/drive/MyDrive/msvd_split/test/test_captions.json'
]

# === Run vocabulary builder
build_vocabulary(json_paths, min_freq=1)


✅ Vocabulary built with 10658 tokens.
📁 Saved vocab to: /content/drive/MyDrive/msvd_split/vocab.json
📁 Saved reverse vocab to: /content/drive/MyDrive/msvd_split/vocab_rev.json
📊 Top 10 frequent words:
   a               : 70529
   is              : 34539
   the             : 22887
   man             : 17914
   woman           : 7939
   on              : 7417
   in              : 7370
   playing         : 6272
   are             : 5718
   of              : 5240


In [None]:
#feature extraction

import os
import shutil
import subprocess
import glob
import numpy as np
import json
from tqdm import tqdm
import torch
import pretrainedmodels
from pretrainedmodels import utils

# Constants for input dimensions
C, H, W = 3, 224, 224

# Function to extract frames from a video
def extract_frames(video_path, dst):
    if os.path.exists(dst):
        shutil.rmtree(dst)
    os.makedirs(dst)
    video_to_frames_command = [
        "ffmpeg",
        '-y',
        '-i', video_path,
        '-vf', "scale=400:300",
        '-qscale:v', "2",
        f"{dst}/%06d.jpg"
    ]
    subprocess.call(video_to_frames_command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

# Function to extract features from frames
def extract_feats(params, model, load_image_fn, split):
    model.eval()
    dir_fc = os.path.join(params['output_dir'], split, 'features')  # Store features in respective split folder
    os.makedirs(dir_fc, exist_ok=True)

    # Load video list from JSON file for the split
    json_path = params[f'{split}_json']
    with open(json_path, 'r') as f:
        data = json.load(f)
        video_ids = {video['video_id'] for video in data['videos']}

    # Process each video in the specified directory
    video_dir = os.path.join(params['video_path'], split, 'videos')  # Use respective split folder
    video_list = glob.glob(os.path.join(video_dir, '*.avi'))
    for video in tqdm(video_list, desc=f"Processing {split} videos"):
        video_id = os.path.splitext(os.path.basename(video))[0]
        if video_id not in video_ids:
            continue

        # Extract frames
        dst = os.path.join(params['tmp_dir'], video_id)
        extract_frames(video, dst)

        # Load frames and extract features
        # image_list = sorted(glob.glob(os.path.join(dst, '*.jpg')))
        # samples = np.round(np.linspace(0, len(image_list) - 1, params['n_frame_steps'])).astype(int)
        # image_list = [image_list[sample] for sample in samples]
        image_list = sorted(glob.glob(os.path.join(dst, '*.jpg')))
        if len(image_list) == 0:
            print(f"[WARNING] No frames extracted for video {video_id}. Skipping...")
            shutil.rmtree(dst)
            continue  # Skip to next video

        # Safe sampling
        n_frames = len(image_list)
        sample_count = min(params['n_frame_steps'], n_frames)
        samples = np.round(np.linspace(0, n_frames - 1, sample_count)).astype(int)
        image_list = [image_list[sample] for sample in samples]

        images = torch.zeros((len(image_list), C, H, W))

        for i, img_path in enumerate(image_list):
            img = load_image_fn(img_path)
            images[i] = img

        # Move images to GPU for feature extraction
        images = images.cuda()
        with torch.no_grad():
            fc_feats = model(images).cpu().squeeze()

        # Save features
        outfile = os.path.join(dir_fc, f"{video_id}.npy")
        np.save(outfile, fc_feats.numpy())

        # Clean up
        shutil.rmtree(dst)

    print(f"Feature extraction for {split} set is complete.")

# Main code setup with hardcoded parameters
params = {
    'output_dir': '/content/drive/MyDrive/VideoCaptioning/msvd_split',
    'video_path': '/content/drive/MyDrive/VideoCaptioning/msvd_split',
    'n_frame_steps': 40,
    'tmp_dir': '/content/tmp_frames',
    'train_json': '/content/drive/MyDrive/VideoCaptioning/msvd_split/train/train_captions.json',
    'val_json': '/content/drive/MyDrive/VideoCaptioning/msvd_split/val/val_captions.json',
    'test_json': '/content/drive/MyDrive/VideoCaptioning/msvd_split/test/test_captions.json',
    'model': 'resnet152'  # Set your model choice here (resnet152, inception_v3, or inception_v4)
}

# Set up model and image loader
if params['model'] == 'inception_v3':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionv3(pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
elif params['model'] == 'resnet152':
    C, H, W = 3, 224, 224
    model = pretrainedmodels.resnet152(pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
elif params['model'] == 'inception_v4':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
    load_image_fn = utils.LoadTransformImage(model)
else:
    raise ValueError(f"Model {params['model']} is not supported")

model.last_linear = utils.Identity()  # Remove final classification layer
model = model.cuda()  # Use GPU

# Extract features for each split
for split in ['train', 'val', 'test']:
    extract_feats(params, model, load_image_fn, split)

Processing train videos: 100%|██████████| 1789/1789 [35:07<00:00,  1.18s/it]


Feature extraction for train set is complete.


Processing val videos: 100%|██████████| 542/542 [07:32<00:00,  1.20it/s]


Feature extraction for val set is complete.


Processing test videos: 100%|██████████| 546/546 [07:24<00:00,  1.23it/s]

Feature extraction for test set is complete.





In [None]:
#cleaning feature ,which is not match
import os
import json

def verify_and_fix_features(split, params):
    video_dir = os.path.join(params['output_dir'], split, 'videos')
    features_dir = os.path.join(params['output_dir'], split, 'features')
    json_path = os.path.join(params['output_dir'], split, f"{split}_captions.json")

    # Load JSON
    with open(json_path, 'r') as f:
        data = json.load(f)
        json_video_ids = {video['video_id'] for video in data['videos']}

    # All video and feature files
    video_files = {os.path.splitext(f)[0] for f in os.listdir(video_dir) if f.endswith('.avi')}
    feature_files = {os.path.splitext(f)[0] for f in os.listdir(features_dir) if f.endswith('.npy')}

    # Identify corrupted or extra items
    missing_feats_from_videos = video_files - feature_files
    missing_feats_from_json = json_video_ids - feature_files
    extra_feats = feature_files - video_files - json_video_ids

    # === ACTION 1: Delete corrupted video files (with no features)
    for vid in missing_feats_from_videos:
        path = os.path.join(video_dir, f"{vid}.avi")
        if os.path.exists(path):
            os.remove(path)
            print(f"🗑️ Deleted incomplete video (no features): {path}")

    # === ACTION 2: Remove JSON entries with missing features
    original_video_count = len(data['videos'])
    original_caption_count = len(data['sentences'])
    data['videos'] = [v for v in data['videos'] if v['video_id'] in feature_files]
    data['sentences'] = [s for s in data['sentences'] if s['video_id'] in feature_files]

    with open(json_path, 'w') as f:
        json.dump(data, f, indent=4)

    print(f"✅ Cleaned JSON for [{split}]")
    print(f"🧮 Removed {original_video_count - len(data['videos'])} invalid video entries")
    print(f"🧾 Removed {original_caption_count - len(data['sentences'])} invalid captions")

    # === ACTION 3: Delete stray feature files
    for vid in extra_feats:
        path = os.path.join(features_dir, f"{vid}.npy")
        if os.path.exists(path):
            os.remove(path)
            print(f"🧼 Deleted stray feature file: {path}")

    print(f"✅ [{split}] Verified and fixed. All valid videos now have features and JSON alignment.\n")

# Run for all splits
params = {
    'output_dir': '/content/drive/MyDrive/msvd_split',
}

for split in ['train', 'val', 'test']:
    verify_and_fix_features(split, params)

✅ Cleaned JSON for [train]
🧮 Removed 0 invalid video entries
🧾 Removed 0 invalid captions
✅ [train] Verified and fixed. All valid videos now have features and JSON alignment.

✅ Cleaned JSON for [val]
🧮 Removed 0 invalid video entries
🧾 Removed 0 invalid captions
✅ [val] Verified and fixed. All valid videos now have features and JSON alignment.

✅ Cleaned JSON for [test]
🧮 Removed 0 invalid video entries
🧾 Removed 0 invalid captions
✅ [test] Verified and fixed. All valid videos now have features and JSON alignment.



In [None]:
#data loader
import os
import json
import numpy as np
import torch
from torch.utils.data import Dataset

class VideoCaptionDataset(Dataset):
    """
    Dataset class for loading video features and their associated captions.

    Args:
        feature_dir (str): Directory containing .npy feature files.
        json_path (str): Path to JSON file with "videos" and "sentences".
        vocab (dict): Vocabulary mapping words to indices.
        max_caption_length (int): Max length of tokenized captions (including <SOS> and <EOS>).
        verbose (bool): If True, prints sample-level debug info.
    """
    def __init__(self, feature_dir, json_path, vocab, max_caption_length=15, verbose=False):
        self.feature_dir = feature_dir
        self.vocab = vocab
        self.max_caption_length = max_caption_length
        self.verbose = verbose

        # Load JSON
        with open(json_path, 'r') as f:
            data = json.load(f)

        # Map video_id to all its captions
        self.video_captions = {}
        for item in data['sentences']:
            vid = item['video_id']
            if vid in self.video_captions:
                self.video_captions[vid].append(item['caption'])
            else:
                self.video_captions[vid] = [item['caption']]

        # Keep video_ids that have both captions and feature files
        all_video_ids = [v['video_id'] for v in data['videos']]
        self.video_ids = [
            vid for vid in all_video_ids
            if vid in self.video_captions and os.path.exists(os.path.join(self.feature_dir, f"{vid}.npy"))
        ]

        if self.verbose:
            print("✅ Initialized VideoCaptionDataset")
            print(f"🧾 Total valid samples: {len(self.video_ids)}")

    def __len__(self):
        return len(self.video_ids)

    def __getitem__(self, idx):
        video_id = self.video_ids[idx]
        feature_path = os.path.join(self.feature_dir, f"{video_id}.npy")

        try:
            video_features = np.load(feature_path)
        except Exception as e:
            raise RuntimeError(f"❌ Failed to load features for {video_id}: {e}")

        # Choose a random caption and tokenize
        caption = np.random.choice(self.video_captions[video_id])
        tokens = [self.vocab['<SOS>']] + [
            self.vocab.get(word, self.vocab['<UNK>']) for word in caption.lower().split()
        ] + [self.vocab['<EOS>']]

        # Truncate and pad
        tokens = tokens[:self.max_caption_length]
        tokens += [self.vocab['<PAD>']] * (self.max_caption_length - len(tokens))

        caption_tensor = torch.tensor(tokens, dtype=torch.long)
        video_tensor = torch.tensor(video_features, dtype=torch.float32)

        # Debug preview
        if self.verbose and idx == 0:
            reverse_vocab = {v: k for k, v in self.vocab.items()}
            print(f"\n📦 Sample [{video_id}]")
            print("📝 Caption:", caption)
            print("🔢 Tokens :", tokens)
            print("🔠 Decoded:", [reverse_vocab.get(t, '<UNK>') for t in tokens])
            print("🎞️ Video Features Shape:", video_tensor.shape)
            print("🧠 Caption Tensor:", caption_tensor)

        return video_tensor, caption_tensor


In [None]:
# Example usage
if __name__ == "__main__":
    # Define directories based on your structure
    feature_dir = '/content/drive/MyDrive/msvd_split/train/features'  # Path to train features directory
    json_path = '/content/drive/MyDrive/msvd_split/train/train_captions.json'  # Path to train captions JSON
    vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'  # Path to vocabulary JSON

    # Load vocabulary
    with open(vocab_path, 'r') as f:
        vocab = json.load(f)

    # Initialize dataset
    dataset = VideoCaptionDataset(feature_dir, json_path, vocab, verbose=True)
    print(f"Dataset size: {len(dataset)}")

    # Access a sample item to verify
    video_features, caption_tensor = dataset[0]
    print("Sample Video Features Shape:", video_features.shape)
    print("Sample Caption Tensor:", caption_tensor)

✅ Initialized VideoCaptionDataset
🧾 Total valid samples: 1251
Dataset size: 1251

📦 Sample [p9g06ktIkJg_4_11]
📝 Caption: several lemurs are huddling together
🔢 Tokens : [1, 259, 2084, 12, 4031, 214, 2, 0, 0, 0, 0, 0, 0, 0, 0]
🔠 Decoded: ['<SOS>', 'several', 'lemurs', 'are', 'huddling', 'together', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
🎞️ Video Features Shape: torch.Size([40, 2048])
🧠 Caption Tensor: tensor([   1,  259, 2084,   12, 4031,  214,    2,    0,    0,    0,    0,    0,
           0,    0,    0])
Sample Video Features Shape: torch.Size([40, 2048])
Sample Caption Tensor: tensor([   1,  259, 2084,   12, 4031,  214,    2,    0,    0,    0,    0,    0,
           0,    0,    0])


In [None]:
if __name__ == "__main__":
    feature_dir = '/content/drive/MyDrive/msvd_split/train/features'
    json_path = '/content/drive/MyDrive/msvd_split/train/train_captions.json'
    vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'

    with open(vocab_path, 'r') as f:
        vocab = json.load(f)

    dataset = VideoCaptionDataset(
        feature_dir=feature_dir,
        json_path=json_path,
        vocab=vocab,
        max_caption_length=15,
        verbose=True
    )

    print(f"Dataset Size: {len(dataset)}")

    # Test sample access
    video_feat, cap_tensor = dataset[0]


✅ Initialized VideoCaptionDataset
🧾 Total valid samples: 1251
Dataset Size: 1251

📦 Sample [p9g06ktIkJg_4_11]
📝 Caption: several furry animals are huddled together
🔢 Tokens : [1, 259, 1970, 298, 12, 2866, 214, 2, 0, 0, 0, 0, 0, 0, 0]
🔠 Decoded: ['<SOS>', 'several', 'furry', 'animals', 'are', 'huddled', 'together', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
🎞️ Video Features Shape: torch.Size([40, 2048])
🧠 Caption Tensor: tensor([   1,  259, 1970,  298,   12, 2866,  214,    2,    0,    0,    0,    0,
           0,    0,    0])


In [None]:
import torch
from torch import nn
import torch.nn.functional as F

class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size, encoder_output_size):
        super(BahdanauAttention, self).__init__()
        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(encoder_output_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)

    def forward(self, hidden, encoder_outputs):
        hidden = hidden.unsqueeze(1)  # [B, 1, H]
        score = self.V(torch.tanh(self.W1(hidden) + self.W2(encoder_outputs)))  # [B, T, 1]
        attention_weights = F.softmax(score, dim=1)  # [B, T, 1]
        context = torch.sum(attention_weights * encoder_outputs, dim=1)  # [B, H]
        return context, attention_weights

class S2VTModel(nn.Module):
    def __init__(self, vocab_size, max_len=45, dim_hidden=1024, dim_word=512, dim_vid=2048,
                 sos_id=1, eos_id=0, n_layers=1, rnn_cell='lstm', rnn_dropout_p=0.3):
        super(S2VTModel, self).__init__()

        self.rnn_cell_type = rnn_cell.lower()
        self.rnn_cell = nn.LSTM if self.rnn_cell_type == 'lstm' else nn.GRU

        self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers, batch_first=True, dropout=rnn_dropout_p)
        self.rnn2 = self.rnn_cell(dim_hidden + dim_word, dim_hidden, n_layers, batch_first=True, dropout=rnn_dropout_p)

        self.attention = BahdanauAttention(dim_hidden, dim_hidden)

        self.embedding = nn.Embedding(vocab_size, dim_word)
        self.out = nn.Linear(dim_hidden, vocab_size)

        self.dim_vid = dim_vid
        self.dim_hidden = dim_hidden
        self.dim_word = dim_word
        self.max_length = max_len
        self.vocab_size = vocab_size
        self.sos_id = sos_id
        self.eos_id = eos_id

    def forward(self, vid_feats, target_variable=None, mode='train', beam_width=3):
        batch_size, n_frames, _ = vid_feats.shape
        device = vid_feats.device

        encoder_outputs, state1 = self.rnn1(vid_feats)  # [B, T, H]

        if mode == 'train':
            seq_probs = []
            for t in range(self.max_length - 1):
                current_word = self.embedding(target_variable[:, t])
                context, _ = self.attention(state1[0][-1] if self.rnn_cell_type == 'lstm' else state1[-1], encoder_outputs)
                input2 = torch.cat((context, current_word), dim=1).unsqueeze(1)
                output2, state2 = self.rnn2(input2, state1)
                logits = self.out(output2.squeeze(1))
                logits = F.log_softmax(logits, dim=1)
                seq_probs.append(logits.unsqueeze(1))
                state1 = state2
            return torch.cat(seq_probs, dim=1), None

        else:  # Beam Search Inference
            beams = [(torch.tensor([self.sos_id], device=device), 0.0, state1)]
            completed = []

            for _ in range(self.max_length - 1):
                new_beams = []
                for seq, score, state in beams:
                    last_word = seq[-1].unsqueeze(0)
                    if last_word.item() == self.eos_id:
                        completed.append((seq, score))
                        continue

                    emb = self.embedding(last_word).unsqueeze(0)  # [1, 1, D]
                    context, _ = self.attention(state[0][-1] if self.rnn_cell_type == 'lstm' else state[-1], encoder_outputs)
                    input2 = torch.cat((context, emb.squeeze(1)), dim=-1).unsqueeze(1)
                    output2, new_state = self.rnn2(input2, state)
                    logits = self.out(output2.squeeze(1))
                    log_probs = F.log_softmax(logits, dim=1)
                    topk_log_probs, topk_indices = torch.topk(log_probs, beam_width)

                    for k in range(beam_width):
                        new_seq = torch.cat([seq, topk_indices[0, k].unsqueeze(0)])
                        new_score = score + topk_log_probs[0, k].item()
                        new_beams.append((new_seq, new_score, new_state))

                beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

            best_seq = max(completed or beams, key=lambda x: x[1])[0]
            return None, best_seq.unsqueeze(0)


In [None]:
# Define the full model code as a string
model_code = '''import torch
from torch import nn
import torch.nn.functional as F

class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size, encoder_output_size):
        super(BahdanauAttention, self).__init__()
        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(encoder_output_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)

    def forward(self, hidden, encoder_outputs):
        hidden = hidden.unsqueeze(1)  # [B, 1, H]
        score = self.V(torch.tanh(self.W1(hidden) + self.W2(encoder_outputs)))  # [B, T, 1]
        attention_weights = F.softmax(score, dim=1)  # [B, T, 1]
        context = torch.sum(attention_weights * encoder_outputs, dim=1)  # [B, H]
        return context, attention_weights

class S2VTModel(nn.Module):
    def __init__(self, vocab_size, max_len=45, dim_hidden=1024, dim_word=512, dim_vid=2048,
                 sos_id=1, eos_id=0, n_layers=1, rnn_cell='lstm', rnn_dropout_p=0.3):
        super(S2VTModel, self).__init__()

        self.rnn_cell_type = rnn_cell.lower()
        self.rnn_cell = nn.LSTM if self.rnn_cell_type == 'lstm' else nn.GRU

        self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers, batch_first=True, dropout=rnn_dropout_p)
        self.rnn2 = self.rnn_cell(dim_hidden + dim_word, dim_hidden, n_layers, batch_first=True, dropout=rnn_dropout_p)

        self.attention = BahdanauAttention(dim_hidden, dim_hidden)

        self.embedding = nn.Embedding(vocab_size, dim_word)
        self.out = nn.Linear(dim_hidden, vocab_size)

        self.dim_vid = dim_vid
        self.dim_hidden = dim_hidden
        self.dim_word = dim_word
        self.max_length = max_len
        self.vocab_size = vocab_size
        self.sos_id = sos_id
        self.eos_id = eos_id

    def forward(self, vid_feats, target_variable=None, mode='train', beam_width=3):
        batch_size, n_frames, _ = vid_feats.shape
        device = vid_feats.device

        encoder_outputs, state1 = self.rnn1(vid_feats)  # [B, T, H]

        if mode == 'train':
            seq_probs = []
            for t in range(self.max_length - 1):
                current_word = self.embedding(target_variable[:, t])
                context, _ = self.attention(state1[0][-1] if self.rnn_cell_type == 'lstm' else state1[-1], encoder_outputs)
                input2 = torch.cat((context, current_word), dim=1).unsqueeze(1)
                output2, state2 = self.rnn2(input2, state1)
                logits = self.out(output2.squeeze(1))
                logits = F.log_softmax(logits, dim=1)
                seq_probs.append(logits.unsqueeze(1))
                state1 = state2
            return torch.cat(seq_probs, dim=1), None

        else:  # Beam Search Inference
            beams = [(torch.tensor([self.sos_id], device=device), 0.0, state1)]
            completed = []

            for _ in range(self.max_length - 1):
                new_beams = []
                for seq, score, state in beams:
                    last_word = seq[-1].unsqueeze(0)
                    if last_word.item() == self.eos_id:
                        completed.append((seq, score))
                        continue

                    emb = self.embedding(last_word).unsqueeze(0)  # [1, 1, D]
                    context, _ = self.attention(state[0][-1] if self.rnn_cell_type == 'lstm' else state[-1], encoder_outputs)
                    input2 = torch.cat((context, emb.squeeze(1)), dim=-1).unsqueeze(1)
                    output2, new_state = self.rnn2(input2, state)
                    logits = self.out(output2.squeeze(1))
                    log_probs = F.log_softmax(logits, dim=1)
                    topk_log_probs, topk_indices = torch.topk(log_probs, beam_width)

                    for k in range(beam_width):
                        new_seq = torch.cat([seq, topk_indices[0, k].unsqueeze(0)])
                        new_score = score + topk_log_probs[0, k].item()
                        new_beams.append((new_seq, new_score, new_state))

                beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

            best_seq = max(completed or beams, key=lambda x: x[1])[0]
            return None, best_seq.unsqueeze(0)

'''

# Save to Google Drive
with open('/content/drive/MyDrive/S2VTModel_Attention.py', 'w') as f:
    f.write(model_code)

print("✅ Model saved as S2VTModel_Attention.py in your Drive.")


✅ Model saved as S2VTModel_Attention.py in your Drive.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

def train_model(model, train_dataset, val_dataset, vocab, device,
                num_epochs=10, batch_size=8, learning_rate=1e-4, checkpoint_dir=None):

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = model.to(device)
    criterion = nn.NLLLoss(ignore_index=vocab['<PAD>'])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    for epoch in range(1, num_epochs + 1):
        model.train()
        total_train_loss = 0

        print(f"\n🚀 Epoch [{epoch}/{num_epochs}]")

        for video_feats, captions in tqdm(train_loader, desc='Training'):
            video_feats, captions = video_feats.to(device), captions.to(device)

            # Prepare input and target
            inputs = captions[:, :-1]
            targets = captions[:, 1:]

            optimizer.zero_grad()
            outputs, _ = model(video_feats, target_variable=inputs, mode='train')  # [B, T, V]

            outputs = outputs.view(-1, outputs.size(-1))       # [B*T, V]
            targets = targets.reshape(-1)                      # [B*T]

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)
        print(f"✅ Training Loss: {avg_train_loss:.4f}")

        # === Validation ===
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for video_feats, captions in tqdm(val_loader, desc='Validation'):
                video_feats, captions = video_feats.to(device), captions.to(device)
                inputs = captions[:, :-1]
                targets = captions[:, 1:]

                outputs, _ = model(video_feats, target_variable=inputs, mode='train')
                outputs = outputs.view(-1, outputs.size(-1))
                targets = targets.reshape(-1)

                loss = criterion(outputs, targets)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        print(f"🧪 Validation Loss: {avg_val_loss:.4f}")

        scheduler.step()

        # Save checkpoint
        if checkpoint_dir:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': avg_train_loss,
                'val_loss': avg_val_loss,
            }, f"{checkpoint_dir}/checkpoint_epoch_{epoch}.pt")
            print(f"💾 Saved checkpoint to {checkpoint_dir}/checkpoint_epoch_{epoch}.pt")


In [None]:
from torch.utils.data import DataLoader
import json

# === Paths ===
feature_dir_train = '/content/drive/MyDrive/msvd_split/train/features'
feature_dir_val = '/content/drive/MyDrive/msvd_split/val/features'
json_train = '/content/drive/MyDrive/msvd_split/train/train_captions.json'
json_val = '/content/drive/MyDrive/msvd_split/val/val_captions.json'
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'

# === Load vocab ===
with open(vocab_path, 'r') as f:
    vocab = json.load(f)

# === Initialize datasets ===
train_dataset = VideoCaptionDataset(
    feature_dir=feature_dir_train,
    json_path=json_train,
    vocab=vocab,
    max_caption_length=45
)

val_dataset = VideoCaptionDataset(
    feature_dir=feature_dir_val,
    json_path=json_val,
    vocab=vocab,
    max_caption_length=45
)


In [None]:
from drive.MyDrive.S2VTModel_Attention import S2VTModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = S2VTModel(
    vocab_size=len(vocab),
    max_len=45,
    dim_hidden=1024,
    dim_word=512,
    dim_vid=2048,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    n_layers=1,
    rnn_cell='lstm',
    rnn_dropout_p=0.3
)

train_model(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    vocab=vocab,
    device=device,
    num_epochs=30,
    batch_size=8,
    learning_rate=1e-4,
    checkpoint_dir="/content/drive/MyDrive/msvd_split/checkpoints"
)





🚀 Epoch [1/30]


Training: 100%|██████████| 157/157 [11:44<00:00,  4.48s/it]


✅ Training Loss: 5.4056


Validation: 100%|██████████| 12/12 [00:44<00:00,  3.68s/it]


🧪 Validation Loss: 4.7492
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_1.pt

🚀 Epoch [2/30]


Training: 100%|██████████| 157/157 [00:47<00:00,  3.31it/s]


✅ Training Loss: 4.5354


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.13it/s]


🧪 Validation Loss: 4.5767
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_2.pt

🚀 Epoch [3/30]


Training: 100%|██████████| 157/157 [00:47<00:00,  3.32it/s]


✅ Training Loss: 4.3235


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.21it/s]


🧪 Validation Loss: 4.4360
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_3.pt

🚀 Epoch [4/30]


Training: 100%|██████████| 157/157 [00:47<00:00,  3.32it/s]


✅ Training Loss: 4.1335


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.16it/s]


🧪 Validation Loss: 4.3701
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_4.pt

🚀 Epoch [5/30]


Training: 100%|██████████| 157/157 [00:46<00:00,  3.36it/s]


✅ Training Loss: 4.0684


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.23it/s]


🧪 Validation Loss: 4.1582
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_5.pt

🚀 Epoch [6/30]


Training: 100%|██████████| 157/157 [00:46<00:00,  3.39it/s]


✅ Training Loss: 3.9724


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.29it/s]


🧪 Validation Loss: 3.8608
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_6.pt

🚀 Epoch [7/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.21it/s]


✅ Training Loss: 3.8226


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.89it/s]


🧪 Validation Loss: 3.9018
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_7.pt

🚀 Epoch [8/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.14it/s]


✅ Training Loss: 3.8469


Validation: 100%|██████████| 12/12 [00:00<00:00, 13.75it/s]


🧪 Validation Loss: 3.6028
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_8.pt

🚀 Epoch [9/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.16it/s]


✅ Training Loss: 3.6757


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.53it/s]


🧪 Validation Loss: 3.8771
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_9.pt

🚀 Epoch [10/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.18it/s]


✅ Training Loss: 3.6954


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.56it/s]


🧪 Validation Loss: 4.2475
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_10.pt

🚀 Epoch [11/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.18it/s]


✅ Training Loss: 3.6614


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.52it/s]


🧪 Validation Loss: 3.9698
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_11.pt

🚀 Epoch [12/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.19it/s]


✅ Training Loss: 3.5523


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.62it/s]


🧪 Validation Loss: 3.8174
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_12.pt

🚀 Epoch [13/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.14it/s]


✅ Training Loss: 3.5984


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.74it/s]


🧪 Validation Loss: 3.7847
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_13.pt

🚀 Epoch [14/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.15it/s]


✅ Training Loss: 3.5181


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.50it/s]


🧪 Validation Loss: 3.4858
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_14.pt

🚀 Epoch [15/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.15it/s]


✅ Training Loss: 3.5702


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.56it/s]


🧪 Validation Loss: 3.7691
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_15.pt

🚀 Epoch [16/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.11it/s]


✅ Training Loss: 3.5573


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.13it/s]


🧪 Validation Loss: 3.5152
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_16.pt

🚀 Epoch [17/30]


Training: 100%|██████████| 157/157 [00:38<00:00,  4.11it/s]


✅ Training Loss: 3.5329


Validation: 100%|██████████| 12/12 [00:00<00:00, 14.21it/s]


🧪 Validation Loss: 3.3757
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_17.pt

🚀 Epoch [18/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.22it/s]


✅ Training Loss: 3.5281


Validation: 100%|██████████| 12/12 [00:00<00:00, 13.68it/s]


🧪 Validation Loss: 3.5658
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_18.pt

🚀 Epoch [19/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.22it/s]


✅ Training Loss: 3.4557


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.10it/s]


🧪 Validation Loss: 3.7165
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_19.pt

🚀 Epoch [20/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.23it/s]


✅ Training Loss: 3.4613


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.65it/s]


🧪 Validation Loss: 3.5408
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_20.pt

🚀 Epoch [21/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.22it/s]


✅ Training Loss: 3.4997


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.38it/s]


🧪 Validation Loss: 3.4975
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_21.pt

🚀 Epoch [22/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.19it/s]


✅ Training Loss: 3.4768


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.53it/s]


🧪 Validation Loss: 3.1841
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_22.pt

🚀 Epoch [23/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.21it/s]


✅ Training Loss: 3.4418


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.46it/s]


🧪 Validation Loss: 3.6346
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_23.pt

🚀 Epoch [24/30]


Training: 100%|██████████| 157/157 [00:36<00:00,  4.26it/s]


✅ Training Loss: 3.3959


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.09it/s]


🧪 Validation Loss: 3.5725
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_24.pt

🚀 Epoch [25/30]


Training: 100%|██████████| 157/157 [00:36<00:00,  4.25it/s]


✅ Training Loss: 3.4514


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.61it/s]


🧪 Validation Loss: 3.7555
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_25.pt

🚀 Epoch [26/30]


Training: 100%|██████████| 157/157 [00:36<00:00,  4.26it/s]


✅ Training Loss: 3.4606


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.42it/s]


🧪 Validation Loss: 3.6945
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_26.pt

🚀 Epoch [27/30]


Training: 100%|██████████| 157/157 [00:36<00:00,  4.25it/s]


✅ Training Loss: 3.4542


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.47it/s]


🧪 Validation Loss: 3.4339
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_27.pt

🚀 Epoch [28/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.23it/s]


✅ Training Loss: 3.4202


Validation: 100%|██████████| 12/12 [00:00<00:00, 13.19it/s]


🧪 Validation Loss: 3.8997
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_28.pt

🚀 Epoch [29/30]


Training: 100%|██████████| 157/157 [00:37<00:00,  4.24it/s]


✅ Training Loss: 3.5267


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.33it/s]


🧪 Validation Loss: 3.3861
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_29.pt

🚀 Epoch [30/30]


Training: 100%|██████████| 157/157 [00:43<00:00,  3.65it/s]


✅ Training Loss: 3.3833


Validation: 100%|██████████| 12/12 [00:00<00:00, 15.33it/s]


🧪 Validation Loss: 3.7119
💾 Saved checkpoint to /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt


In [None]:
import os

checkpoint_path = "/content/drive/MyDrive/msvd_split/checkpoints"
os.makedirs(checkpoint_path, exist_ok=True)

# Then pass this to training
train_model(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    vocab=vocab,
    device=device,
    num_epochs=30,
    batch_size=8,
    learning_rate=1e-4,
    checkpoint_dir=checkpoint_path
)



🚀 Epoch [1/30]


Training:  13%|█▎        | 20/157 [00:11<01:15,  1.81it/s]


KeyboardInterrupt: 

In [None]:
import os
import json
import numpy as np
import torch
from torch.utils.data import DataLoader
from nltk.translate.bleu_score import corpus_bleu
from drive.MyDrive.S2VTModel_Attention import S2VTModel

# === Paths ===
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
test_json_path = '/content/drive/MyDrive/msvd_split/test/test_captions.json'
test_feature_dir = '/content/drive/MyDrive/msvd_split/test/features'
checkpoint_path = '/content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt'

# === Load vocab ===
with open(vocab_path, 'r') as f:
    vocab = json.load(f)
vocab_rev = {v: k for k, v in vocab.items()}
vocab_size = len(vocab)

# === Load model (must match training config) ===
model = S2VTModel(
    vocab_size=vocab_size,
    max_len=45,
    dim_hidden=1024,
    dim_word=512,
    dim_vid=2048,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    n_layers=1,
    rnn_cell='lstm',
    rnn_dropout_p=0.3
)

# === Load checkpoint ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
model.eval()

# === Load test data ===
test_dataset = VideoCaptionDataset(test_feature_dir, test_json_path, vocab, max_caption_length=45)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# === Inference + BLEU computation ===
print("Evaluating on test set...")

all_references = []
all_hypotheses = []

with torch.no_grad():
    for idx, (video_feats, captions) in enumerate(test_loader):
        video_feats, captions = video_feats.to(device), captions.to(device)

        # Model inference
        _, predicted_ids = model(video_feats, mode='inference')  # [1, T]

        # Decode generated caption
        pred_tokens = []
        for tok in predicted_ids[0]:
            word = vocab_rev.get(int(tok), '<UNK>')
            if word == '<EOS>':
                break
            if word not in ['<SOS>', '<PAD>']:
                pred_tokens.append(word)
        all_hypotheses.append(pred_tokens)

        # Decode ground truth caption
        ref_tokens = []
        for tok in captions[0]:
            word = vocab_rev.get(int(tok), '<UNK>')
            if word == '<EOS>':
                break
            if word not in ['<SOS>', '<PAD>']:
                ref_tokens.append(word)
        all_references.append([ref_tokens])  # list of references per hypothesis

        if idx < 5:
            print(f"\nExample {idx + 1}")
            print(f"Predicted    : {' '.join(pred_tokens)}")
            print(f"Ground Truth : {' '.join(ref_tokens)}")

# === Compute BLEU score ===
bleu_score = corpus_bleu(all_references, all_hypotheses)
print(f"\nFinal BLEU-4 Score: {bleu_score:.4f}")



Evaluating on test set...

Example 1
Predicted    : a woman is cooking
Ground Truth : the lady added ingredients to the water in the bowl and whisked it

Example 2
Predicted    : a man is riding a horse
Ground Truth : a woman hits volleyballs at the beach

Example 3
Predicted    : a woman is dancing
Ground Truth : two couple are talking with each other

Example 4
Predicted    : a man is riding a horse
Ground Truth : a man was hosing down a jogger and the water turned black

Example 5
Predicted    : a man is riding a horse
Ground Truth : a guy is kicking a ball

Final BLEU-4 Score: 0.0709


In [None]:
import os
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
import subprocess
import tempfile
import glob
from PIL import Image
import numpy as np

from moviepy.editor import VideoFileClip

video_path = '/content/drive/MyDrive/YouTubeClips/-8y1Q0rA3n8_108_115.avi'

def extract_frames_ffmpeg(video_path, output_dir, frame_rate=1):
    os.makedirs(output_dir, exist_ok=True)
    cmd = [
        'ffmpeg', '-i', video_path, '-vf', f"fps={frame_rate},scale=224:224",
        os.path.join(output_dir, '%06d.jpg'), '-hide_banner', '-loglevel', 'error'
    ]
    subprocess.run(cmd)

def load_and_sample_frames(frame_dir, n_frames=40):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    frames = sorted(glob.glob(os.path.join(frame_dir, '*.jpg')))
    total = len(frames)
    if total == 0:
        raise ValueError("No frames extracted")

    indices = np.linspace(0, total - 1, min(n_frames, total)).astype(int)
    sampled = [frames[i] for i in indices]

    images = [transform(Image.open(f).convert('RGB')) for f in sampled]
    return torch.stack(images)  # [T, 3, H, W]

def extract_video_tensor(video_path, n_frames=40):
    with tempfile.TemporaryDirectory() as tmpdir:
        extract_frames_ffmpeg(video_path, tmpdir)
        tensor = load_and_sample_frames(tmpdir, n_frames)
    return tensor

def generate_caption_from_video(video_path, model, feature_extractor, vocab_rev, device, max_len=45):
    model.eval()
    model.to(device)

    video_tensor = extract_video_tensor(video_path).to(device)  # [T, 3, 224, 224]
    with torch.no_grad():
        features = feature_extractor(video_tensor)  # e.g. ResNet: [T, 2048]
        features = features.unsqueeze(0)  # Add batch dim: [1, T, 2048]
        _, predicted_ids = model(features, mode='inference')

    tokens = []
    for idx in predicted_ids[0]:
        word = vocab_rev.get(int(idx), '<UNK>')
        if word == '<EOS>':
            break
        if word not in ['<SOS>', '<PAD>']:
            tokens.append(word)

    return ' '.join(tokens)


  if event.key is 'enter':



In [None]:
import pretrainedmodels
from pretrainedmodels import utils
from drive.MyDrive.S2VTModel_Attention import S2VTModel
import torch
import json


# Load ResNet152 and remove final classification layer
resnet = pretrainedmodels.resnet152(pretrained='imagenet')
resnet.last_linear = utils.Identity()  # Remove classification head
feature_extractor = resnet.cuda().eval()
load_image_fn = utils.LoadTransformImage(resnet)



with open('/content/drive/MyDrive/msvd_split/vocab.json', 'r') as f:
    vocab = json.load(f)

vocab_rev = {v: k for k, v in vocab.items()}  # This is what you must pass



checkpoint = torch.load('/content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt', map_location='cuda')
model = S2VTModel(
    vocab_size=len(vocab),
    max_len=45,
    dim_hidden=1024,
    dim_word=512,
    dim_vid=2048,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    n_layers=1,
    rnn_cell='lstm',
    rnn_dropout_p=0.3
)
model.load_state_dict(checkpoint['model_state_dict'])

video_path = '/content/drive/MyDrive/msvd_split/test/videos/-_hbPLsZvvo_19_25.avi'

caption = generate_caption_from_video(
    video_path=video_path,
    model=model,
    feature_extractor=feature_extractor,
    vocab_rev=vocab_rev,
    device=torch.device('cuda')
)

print("🎬 Generated Caption:", caption)



Downloading: "https://download.pytorch.org/models/resnet152-b121ed2d.pth" to /root/.cache/torch/hub/checkpoints/resnet152-b121ed2d.pth
100%|██████████| 230M/230M [00:04<00:00, 53.1MB/s]



ValueError: No frames extracted

In [None]:
import os
import json

import subprocess
import glob
import numpy as np
import torch
import pretrainedmodels
from pretrainedmodels import utils

# === Setup ===
video_path = '/content/01_inpainted.mp4'
frame_dir = 'frames_tmp'
feature_save_path = 'example_video.npy'

# === Extract frames ===
if os.path.exists(frame_dir):
    subprocess.call(['rm', '-rf', frame_dir])
os.makedirs(frame_dir, exist_ok=True)

cmd = [
    'ffmpeg', '-y', '-i', video_path,
    '-vf', 'scale=400:300',
    '-qscale:v', '2',
    f'{frame_dir}/%06d.jpg'
]
subprocess.call(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# === Load pretrained model (same as training) ===
model = pretrainedmodels.resnet152(pretrained='imagenet')
model.last_linear = utils.Identity()
model.eval().cuda()

load_image_fn = utils.LoadTransformImage(model)

# === Load 40 frames ===
image_list = sorted(glob.glob(f'{frame_dir}/*.jpg'))
samples = np.linspace(0, len(image_list) - 1, 40).astype(int)
image_list = [image_list[i] for i in samples]
C, H, W = 3, 224, 224

images = torch.zeros((len(image_list), C, H, W))
for i, img_path in enumerate(image_list):
    img = load_image_fn(img_path)
    images[i] = img

images = images.cuda()

# === Extract features ===
with torch.no_grad():
    feats = model(images).cpu().numpy()
np.save(feature_save_path, feats)
print(f"✅ Feature extracted and saved as {feature_save_path}")




✅ Feature extracted and saved as example_video.npy


In [None]:
# Install pretrainedmodels for ResNet152
!pip install pretrainedmodels

# Upload a video (e.g., example_video.avi)
from google.colab import files
uploaded = files.upload()  # upload your .avi file




Saving 01_inpainted.mp4 to 01_inpainted (1).mp4


In [None]:
import os, glob, subprocess
import numpy as np
import torch
import pretrainedmodels
from pretrainedmodels import utils

# === Paths
video_path = 'example_video.avi'  # or replace with uploaded file name
frame_dir = 'frames_tmp'
feature_save_path = 'example_video.npy'

# === Extract frames from video
if os.path.exists(frame_dir):
    subprocess.call(['rm', '-rf', frame_dir])
os.makedirs(frame_dir, exist_ok=True)

cmd = [
    'ffmpeg', '-y', '-i', video_path,
    '-vf', 'scale=400:300',
    '-qscale:v', '2',
    f'{frame_dir}/%06d.jpg'
]
subprocess.call(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# === Load pretrained ResNet152
model_cnn = pretrainedmodels.resnet152(pretrained='imagenet')
model_cnn.last_linear = utils.Identity()
model_cnn.eval().cuda()
load_image_fn = utils.LoadTransformImage(model_cnn)

# === Load 40 frames
image_list = sorted(glob.glob(f'{frame_dir}/*.jpg'))
samples = np.linspace(0, len(image_list) - 1, 40).astype(int)
image_list = [image_list[i] for i in samples]
images = torch.zeros((len(image_list), 3, 224, 224))

for i, img_path in enumerate(image_list):
    images[i] = load_image_fn(img_path)

# === Extract features
with torch.no_grad():
    feats = model_cnn(images.cuda()).cpu().numpy()
np.save(feature_save_path, feats)
print("✅ Features saved as:", feature_save_path)


In [None]:
!pip install pretrainedmodels




In [None]:
import os
import shutil
import subprocess

video_path = '/content/t1.mp4'
frame_dir = '/content/frames_t1mp4'

if os.path.exists(frame_dir):
    shutil.rmtree(frame_dir)
os.makedirs(frame_dir, exist_ok=True)

!ffmpeg -i {video_path} -vf "scale=400:300" -qscale:v 2 {frame_dir}/%06d.jpg


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [None]:
import pretrainedmodels
from pretrainedmodels import utils as pmutils

# Load pretrained ResNet-152 and remove classification layer
resnet = pretrainedmodels.resnet152(pretrained='imagenet')
resnet.last_linear = pmutils.Identity()
resnet = resnet.eval().cuda()

# Load image preprocessing function
load_image_fn = pmutils.LoadTransformImage(resnet)




In [None]:
import pretrainedmodels
from pretrainedmodels import utils as pmutils

resnet = pretrainedmodels.resnet152(pretrained='imagenet')
resnet.last_linear = pmutils.Identity()
resnet = resnet.eval().cuda()
load_image_fn = pmutils.LoadTransformImage(resnet)


In [None]:
import glob
import os
import numpy as np
import torch

frame_dir = '/content/frames_t1mp4'
frame_list = sorted(glob.glob(os.path.join(frame_dir, '*.jpg')))
selected_indices = np.linspace(0, len(frame_list) - 1, 40, dtype=int)
selected_frames = [frame_list[i] for i in selected_indices]

# Feature extraction
C, H, W = 3, 224, 224
features = torch.zeros((len(selected_frames), C, H, W)).cuda()
for i, frame_path in enumerate(selected_frames):
    img = load_image_fn(frame_path)
    features[i] = img

with torch.no_grad():
    video_feats = resnet(features).unsqueeze(0).cpu()  # [1, 40, 2048]


In [None]:
import os
import glob
import numpy as np
import torch

frame_dir = '/content/frames_t1mp4'  # 👈 ADD THIS LINE

# Select 40 frames
frame_list = sorted(glob.glob(os.path.join(frame_dir, '*.jpg')))
selected_indices = np.linspace(0, len(frame_list) - 1, 40, dtype=int)
selected_frames = [frame_list[i] for i in selected_indices]

# Extract features
C, H, W = 3, 224, 224
features = torch.zeros((len(selected_frames), C, H, W)).cuda()
for i, frame_path in enumerate(selected_frames):
    img = load_image_fn(frame_path)
    features[i] = img

with torch.no_grad():
    video_feats = resnet(features).unsqueeze(0).cpu()  # [1, 40, 2048]


In [None]:
import json
from drive.MyDrive.S2VTModel_Attention import S2VTModel

# Load vocab
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
with open(vocab_path, 'r') as f:
    vocab = json.load(f)
vocab_rev = {v: k for k, v in vocab.items()}

# Load model
model = S2VTModel(
    vocab_size=len(vocab),
    max_len=45,
    dim_hidden=1024,
    dim_word=512,
    dim_vid=2048,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    rnn_cell='lstm',
    rnn_dropout_p=0.3
)

checkpoint_path = '/content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt'
checkpoint = torch.load(checkpoint_path, map_location='cuda')
model.load_state_dict(checkpoint['model_state_dict'])
model = model.eval().cuda()




In [None]:
video_feats = video_feats.cuda()

with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

# Decode prediction
caption = []
for tok in predicted_ids[0]:
    word = vocab_rev.get(int(tok), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        caption.append(word)

print("🎬 Caption for 04.mp4:\n", ' '.join(caption))


🎬 Caption for 04.mp4:
 a woman is slicing a potato


In [None]:
#avi

# Step 1: Extract frames from AVI video
import os
import shutil
import subprocess

video_path = '/content/Cxxx6wJ1jNo_0_10.avi'  # 👈 Change filename if needed
frame_dir = '/content/frames_your_video'

if os.path.exists(frame_dir):
    shutil.rmtree(frame_dir)
os.makedirs(frame_dir, exist_ok=True)

!ffmpeg -i "$video_path" -vf "scale=400:300" -qscale:v 2 "$frame_dir/%06d.jpg"



ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [None]:
!pip install pretrainedmodels


Collecting pretrainedmodels
  Downloading pretrainedmodels-0.7.4.tar.gz (58 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/58.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting munch (from pretrainedmodels)
  Downloading munch-4.0.0-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->pretrainedmodels)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->pretrainedmodels)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->pretrainedmodels)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB

In [None]:
# Step 2: Load ResNet152 for feature extraction
import pretrainedmodels
from pretrainedmodels import utils as pmutils
import torch

resnet = pretrainedmodels.resnet152(pretrained='imagenet')
resnet.last_linear = pmutils.Identity()
resnet = resnet.eval().cuda()
load_image_fn = pmutils.LoadTransformImage(resnet)


Downloading: "https://download.pytorch.org/models/resnet152-b121ed2d.pth" to /root/.cache/torch/hub/checkpoints/resnet152-b121ed2d.pth
100%|██████████| 230M/230M [00:01<00:00, 123MB/s]


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
# Step 3: Extract features from 40 frames
import glob
import numpy as np

frame_list = sorted(glob.glob(os.path.join(frame_dir, '*.jpg')))
selected_indices = np.linspace(0, len(frame_list) - 1, 40, dtype=int)
selected_frames = [frame_list[i] for i in selected_indices]

C, H, W = 3, 224, 224
features = torch.zeros((len(selected_frames), C, H, W)).cuda()
for i, frame_path in enumerate(selected_frames):
    img = load_image_fn(frame_path)
    features[i] = img

with torch.no_grad():
    video_feats = resnet(features).unsqueeze(0).cpu()


In [None]:
# Step 4: Load trained S2VT model
import json
from drive.MyDrive.youtube_captioning_with_attention.S2VTModel_Attention import S2VTModel

vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
with open(vocab_path, 'r') as f:
    vocab = json.load(f)
vocab_rev = {v: k for k, v in vocab.items()}

model = S2VTModel(
    vocab_size=len(vocab),
    max_len=45,
    dim_hidden=1024,
    dim_word=512,
    dim_vid=2048,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    rnn_cell='lstm',
    rnn_dropout_p=0.3
)

checkpoint_path = '/content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_30.pt'
checkpoint = torch.load(checkpoint_path, map_location='cuda')
model.load_state_dict(checkpoint['model_state_dict'])
model = model.eval().cuda()


In [None]:
# Step 5: Generate caption
video_feats = video_feats.cuda()

with torch.no_grad():
    _, predicted_ids = model(video_feats, mode='inference')

caption = []
for tok in predicted_ids[0]:
    word = vocab_rev.get(int(tok), '<UNK>')
    if word == '<EOS>':
        break
    if word not in ['<SOS>', '<PAD>']:
        caption.append(word)

print("🎬 Caption for your AVI video:\n", ' '.join(caption))
