In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Install the required libraries
!pip install numpy torch torchvision nltk




In [None]:
# Split the videos and the caption JSON into train/val/test sets

import json
import os
import random
import shutil

def split_json_data(input_json_path, videos_dir, output_dir, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=None):
    """Split a caption JSON and its video files into train/val/test folders.

    For every split this creates ``<output_dir>/<split>/videos`` (copies of the
    ``.avi`` files) and ``<output_dir>/<split>/<split>_captions.json`` (the
    matching ``videos`` and ``sentences`` entries).

    Args:
        input_json_path (str): JSON file holding a ``videos`` list and a
            ``sentences`` list; entries of both carry a ``video_id``.
        videos_dir (str): Directory containing ``<video_id>.avi`` files.
        output_dir (str): Directory under which the split folders are created.
        train_ratio (float): Fraction of videos assigned to the train split.
        val_ratio (float): Fraction of videos assigned to the val split.
        test_ratio (float): Expected fraction of the test split. The test split
            always receives whatever remains after train and val, so this value
            is only used to warn when the three ratios do not add up to 1.
        seed (int | None): Seed for the shuffle. ``None`` (the default) shuffles
            with the global ``random`` state, exactly as before.
    """
    # Load JSON data
    with open(input_json_path, 'r') as f:
        data = json.load(f)

    if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-6:
        print("Warning: train/val/test ratios do not sum to 1; "
              "the test split receives the videos left after train and val.")

    # Shuffle the video data; a dedicated generator is used only when a seed is given
    video_entries = data['videos']
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(video_entries)

    # Calculate the split sizes (test takes the remainder)
    total_videos = len(video_entries)
    train_size = int(total_videos * train_ratio)
    val_size = int(total_videos * val_ratio)

    # Split the data
    train_videos = video_entries[:train_size]
    val_videos = video_entries[train_size:train_size + val_size]
    test_videos = video_entries[train_size + val_size:]

    # Group sentences by video_id. Captions pointing at a video that is not
    # listed under 'videos' are skipped instead of raising KeyError.
    video_sentences = {video['video_id']: [] for video in video_entries}
    orphan_captions = 0
    for sentence in data['sentences']:
        bucket = video_sentences.get(sentence['video_id'])
        if bucket is None:
            orphan_captions += 1
            continue
        bucket.append(sentence)
    if orphan_captions:
        print(f"Warning: skipped {orphan_captions} caption(s) whose video_id is not listed under 'videos'.")

    def create_split_data(split_videos, split_name):
        """Write one split: copy its video files and dump its caption JSON."""
        split_data = {
            "videos": split_videos,
            "sentences": []
        }
        split_videos_dir = os.path.join(output_dir, split_name, "videos")
        os.makedirs(split_videos_dir, exist_ok=True)

        # Add captions and copy video files
        for video in split_videos:
            video_id = video['video_id']
            split_data['sentences'].extend(video_sentences[video_id])

            # Copy video file to the split directory; a missing file is only reported
            video_filename = f"{video_id}.avi"
            src_video_path = os.path.join(videos_dir, video_filename)
            dst_video_path = os.path.join(split_videos_dir, video_filename)
            if os.path.exists(src_video_path):
                shutil.copy(src_video_path, dst_video_path)
            else:
                print(f"Warning: Video file {video_filename} not found in {videos_dir}.")

        # Save the JSON file for the split
        split_json_path = os.path.join(output_dir, split_name, f"{split_name}_captions.json")
        with open(split_json_path, 'w') as f:
            json.dump(split_data, f, indent=4)

    # Create each split
    create_split_data(train_videos, "train")
    create_split_data(val_videos, "val")
    create_split_data(test_videos, "test")

    print("Data split and saved successfully.")

# Define paths based on your directory structure
input_json_path = '/content/drive/MyDrive/msvd_captions.json'  # Path to the original JSON file
videos_dir = '/content/drive/MyDrive/YouTubeClips'  # Directory where video files are stored
output_dir = '/content/drive/MyDrive/msvd_split'  # Directory where you want to save the splits

# Run the split function
# NOTE: no seed is set before this call, so every run shuffles differently. Existing split
# folders are reused (not emptied), so re-running can leave videos from a previous split behind.
split_json_data(input_json_path, videos_dir, output_dir)


Data split and saved successfully.


In [None]:
#verification for split

import os
import json

def verify_split_integrity(split_name, base_dir):
    """
    Cross-check one split's video folder against its caption JSON and print the result.

    Four comparisons are reported, each in both directions between the ``.avi`` files
    on disk and the ``video_id`` values found in the JSON ``videos`` and ``sentences``
    sections.

    Args:
        split_name (str): Name of the split (e.g., 'train', 'val', 'test').
        base_dir (str): Base directory where splits are stored (e.g., '/content/drive/MyDrive/msvd_split/').
    """
    split_root = os.path.join(base_dir, split_name)

    def report(leftover, problem, all_good):
        # One line per check: the offending ids if any, otherwise the success message
        if leftover:
            print(f"[{split_name}] {problem}: {leftover}")
        else:
            print(f"[{split_name}] {all_good}")

    # Read the caption JSON first, then list the folder
    with open(os.path.join(split_root, f"{split_name}_captions.json"), 'r') as handle:
        json_data = json.load(handle)

    # File stems of every .avi in the split's video folder
    on_disk = set()
    for entry in os.listdir(os.path.join(split_root, 'videos')):
        if entry.endswith('.avi'):
            on_disk.add(os.path.splitext(entry)[0])

    # Folder <-> JSON "videos" section
    listed = {item['video_id'] for item in json_data['videos']}
    report(on_disk - listed,
           "Videos present in folder but missing in JSON 'videos' section",
           "All videos in folder have matching entries in JSON 'videos' section.")
    report(listed - on_disk,
           "Entries in JSON 'videos' section but missing video files",
           "All entries in JSON 'videos' section have matching video files in folder.")

    # Folder <-> JSON "sentences" section
    captioned = {item['video_id'] for item in json_data['sentences']}
    report(on_disk - captioned,
           "Videos present in folder but missing captions in JSON 'sentences' section",
           "All videos in folder have matching captions in JSON 'sentences' section.")
    report(captioned - on_disk,
           "Captions in JSON 'sentences' section but missing video files",
           "All captions in JSON 'sentences' section have matching video files in folder.")

# Run verification for each split
# Only prints a report per split; nothing on disk is modified by this cell.
base_dir = '/content/drive/MyDrive/msvd_split'  # Change to your base directory if different
for split in ['train', 'val', 'test']:
    verify_split_integrity(split, base_dir)


[train] All videos in folder have matching entries in JSON 'videos' section.
[train] All entries in JSON 'videos' section have matching video files in folder.
[train] All videos in folder have matching captions in JSON 'sentences' section.
[train] All captions in JSON 'sentences' section have matching video files in folder.
[val] All videos in folder have matching entries in JSON 'videos' section.
[val] All entries in JSON 'videos' section have matching video files in folder.
[val] All videos in folder have matching captions in JSON 'sentences' section.
[val] All captions in JSON 'sentences' section have matching video files in folder.
[test] All videos in folder have matching entries in JSON 'videos' section.
[test] All entries in JSON 'videos' section have matching video files in folder.
[test] All videos in folder have matching captions in JSON 'sentences' section.
[test] All captions in JSON 'sentences' section have matching video files in folder.


In [None]:
# Remove video entries and captions whose video file is not present in the split folder

import os
import json

def clean_json_for_existing_videos(json_path, video_dir):
    """Rewrite a caption JSON in place so it only references videos found on disk.

    Entries of the ``videos`` and ``sentences`` lists are kept only when a file
    named ``<video_id>.avi`` exists in ``video_dir``; all other keys of the JSON
    are written back unchanged.

    Args:
        json_path (str): Caption JSON to filter; it is overwritten with the result.
        video_dir (str): Folder whose ``.avi`` files define the surviving video ids.
    """
    with open(json_path, 'r') as handle:
        payload = json.load(handle)

    # Stems of the .avi files that actually exist
    present_ids = set()
    for entry in os.listdir(video_dir):
        if entry.endswith('.avi'):
            stem, _extension = os.path.splitext(entry)
            present_ids.add(stem)

    # Apply the same membership filter to both sections
    for section in ('videos', 'sentences'):
        payload[section] = [record for record in payload[section] if record['video_id'] in present_ids]

    with open(json_path, 'w') as handle:
        json.dump(payload, handle, indent=4)

    print(f"Cleaned JSON file saved to {json_path}")

# Caption JSON and video folder of every split, all derived from one base directory
base_dir = '/content/drive/MyDrive/msvd_split'
splits = {
    name: {
        'json_path': os.path.join(base_dir, name, f'{name}_captions.json'),
        'video_dir': os.path.join(base_dir, name, 'videos'),
    }
    for name in ('train', 'val', 'test')
}

# Filter each split's JSON down to the videos that exist in its folder
for split in splits:
    paths = splits[split]
    print(f"Cleaning {split} JSON file...")
    clean_json_for_existing_videos(paths['json_path'], paths['video_dir'])
    print(f"{split} JSON file cleaned.\n")


Cleaning train JSON file...
Cleaned JSON file saved to /content/drive/MyDrive/msvd_split/train/train_captions.json
train JSON file cleaned.

Cleaning val JSON file...
Cleaned JSON file saved to /content/drive/MyDrive/msvd_split/val/val_captions.json
val JSON file cleaned.

Cleaning test JSON file...
Cleaned JSON file saved to /content/drive/MyDrive/msvd_split/test/test_captions.json
test JSON file cleaned.



In [None]:
#vocabulary building

import json
from collections import Counter

def build_vocabulary(json_paths, min_freq=1, save_path='/content/drive/MyDrive/msvd_split/vocab.json'):
    """Build a word -> index vocabulary from caption JSON files and save it as JSON.

    Captions are lower-cased and split on whitespace. Indices 0-3 are reserved for
    '<PAD>', '<SOS>', '<EOS>' and '<UNK>'; every word seen at least ``min_freq``
    times then receives the next free index, in order of first appearance.

    Args:
        json_paths (list[str]): Caption JSON files, each holding a ``sentences``
            list whose items have a ``caption`` string.
        min_freq (int): Minimum number of occurrences for a word to be kept.
        save_path (str): Where the vocabulary JSON is written.
    """
    reserved = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
    frequencies = Counter()

    # Count whitespace-separated, lower-cased tokens over all files
    for path in json_paths:
        with open(path, 'r') as handle:
            records = json.load(handle)['sentences']
        for record in records:
            frequencies.update(record['caption'].lower().split())

    # Reserved tokens take the first indices, frequent words follow
    vocab = dict(zip(reserved, range(len(reserved))))
    for word in [w for w, count in frequencies.items() if count >= min_freq]:
        vocab[word] = len(vocab)

    with open(save_path, 'w') as handle:
        json.dump(vocab, handle, indent=4)

    print(f"Vocabulary built with {len(vocab)} words. Saved to {save_path}")

# Define paths to JSON files
# NOTE(review): the val and test captions are listed here too, so their words enter the
# vocabulary alongside the training words — confirm this is intended.
json_paths = [
    '/content/drive/MyDrive/msvd_split/train/train_captions.json',
    '/content/drive/MyDrive/msvd_split/val/val_captions.json',
    '/content/drive/MyDrive/msvd_split/test/test_captions.json'
]

# Build vocabulary (uses the default min_freq=1 and the default save_path)
build_vocabulary(json_paths)


Vocabulary built with 12596 words. Saved to /content/drive/MyDrive/msvd_split/vocab.json


In [None]:
!pip install torchvision
!pip install opencv-python




In [None]:
!pip install pretrainedmodels


Collecting pretrainedmodels
  Downloading pretrainedmodels-0.7.4.tar.gz (58 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/58.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting munch (from pretrainedmodels)
  Downloading munch-4.0.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading munch-4.0.0-py2.py3-none-any.whl (9.9 kB)
Building wheels for collected packages: pretrainedmodels
  Building wheel for pretrainedmodels (setup.py) ... [?25l[?25hdone
  Created wheel for pretrainedmodels: filename=pretrainedmodels-0.7.4-py3-none-any.whl size=60944 sha256=4de36ebbc08cfb7191dba756804afbf846121b3383f2f563f4753a7022ec828b
  Stored in directory: /root/.cache/pip/wheels/35/cb/a5/8f534c60142835bfc889f9a482e4a67e0b817032d9c6883b64
Successfully built pretrainedmodels
Installing collected packag

In [None]:
#feature extraction

import os
import shutil
import subprocess
import glob
import numpy as np
import json
from tqdm import tqdm
import torch
import pretrainedmodels
from pretrainedmodels import utils

# Constants for input dimensions
C, H, W = 3, 224, 224

# Function to extract frames from a video
def extract_frames(video_path, dst):
    """Dump the frames of a video as JPEG files into ``dst`` using ffmpeg.

    ``dst`` is deleted first if it already exists and then recreated, so it only
    ever holds frames from this call. Frames are scaled to 400x300 and written as
    ``000001.jpg``, ``000002.jpg``, ...; ffmpeg's own output is suppressed.

    Args:
        video_path (str): Path of the video file to decode.
        dst (str): Directory that receives the frames.

    Returns:
        int: ffmpeg's exit status (0 on success). A non-zero status is also
        reported with a printed warning instead of being silently discarded.
    """
    if os.path.exists(dst):
        shutil.rmtree(dst)
    os.makedirs(dst)
    video_to_frames_command = [
        "ffmpeg",
        '-y',
        '-i', video_path,
        '-vf', "scale=400:300",
        '-qscale:v', "2",
        f"{dst}/%06d.jpg"
    ]
    return_code = subprocess.call(video_to_frames_command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    if return_code != 0:
        print(f"Warning: ffmpeg exited with code {return_code} for {video_path}; frames may be missing.")
    return return_code

# Function to extract features from frames
def extract_feats(params, model, load_image_fn, split):
    """Extract per-frame CNN features for every video of one split and save them as .npy.

    For each ``.avi`` in ``<video_path>/<split>/videos`` that is also listed in the
    split's caption JSON, frames are dumped to a temporary folder, ``n_frame_steps``
    of them are sampled at evenly spaced positions, passed through ``model`` in a
    single batch, and the result is written to
    ``<output_dir>/<split>/features/<video_id>.npy``.

    Videos for which no frame could be decoded are skipped and reported at the end
    instead of aborting the whole run, and the temporary frame folder is removed
    even when processing a video raises.

    Args:
        params (dict): Needs 'output_dir', 'video_path', 'tmp_dir', 'n_frame_steps'
            and '<split>_json'.
        model: Feature extractor; it is put in eval mode and called on a float
            tensor of shape (n_frames, C, H, W), with C/H/W read from module globals.
        load_image_fn: Callable turning an image path into a tensor that can be
            stored in one (C, H, W) slot of the batch.
        split (str): Split name, e.g. 'train', 'val' or 'test'.
    """
    model.eval()
    dir_fc = os.path.join(params['output_dir'], split, 'features')  # Store features in respective split folder
    os.makedirs(dir_fc, exist_ok=True)

    # Load video list from JSON file for the split
    json_path = params[f'{split}_json']
    with open(json_path, 'r') as f:
        data = json.load(f)
    video_ids = {video['video_id'] for video in data['videos']}

    # Run on the device the model lives on; fall back to CUDA (the previously
    # hard-coded target) for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')

    # Process each video in the specified directory
    video_dir = os.path.join(params['video_path'], split, 'videos')  # Use respective split folder
    video_list = glob.glob(os.path.join(video_dir, '*.avi'))
    skipped = []
    for video in tqdm(video_list, desc=f"Processing {split} videos"):
        video_id = os.path.splitext(os.path.basename(video))[0]
        if video_id not in video_ids:
            continue

        dst = os.path.join(params['tmp_dir'], video_id)
        try:
            # Extract frames
            extract_frames(video, dst)

            image_list = sorted(glob.glob(os.path.join(dst, '*.jpg')))
            if not image_list:
                # Nothing decoded (e.g. corrupt file): indexing below would raise IndexError
                skipped.append(video_id)
                continue

            # Pick n_frame_steps evenly spaced frames (frames repeat if the video is shorter)
            samples = np.round(np.linspace(0, len(image_list) - 1, params['n_frame_steps'])).astype(int)
            image_list = [image_list[sample] for sample in samples]
            images = torch.zeros((len(image_list), C, H, W))

            for i, img_path in enumerate(image_list):
                images[i] = load_image_fn(img_path)

            # Move images to the model's device for feature extraction
            images = images.to(device)
            with torch.no_grad():
                fc_feats = model(images).cpu().squeeze()

            # Save features
            outfile = os.path.join(dir_fc, f"{video_id}.npy")
            np.save(outfile, fc_feats.numpy())
        finally:
            # Clean up the temporary frames even if this video failed
            shutil.rmtree(dst, ignore_errors=True)

    if skipped:
        print(f"Warning: no frames could be extracted for {len(skipped)} {split} video(s): {skipped}")
    print(f"Feature extraction for {split} set is complete.")

# Extraction settings (paths are hard-coded for this Drive layout)
params = {
    'output_dir': '/content/drive/MyDrive/msvd_split',
    'video_path': '/content/drive/MyDrive/msvd_split',
    'n_frame_steps': 40,
    'tmp_dir': '/content/tmp_frames',
    'train_json': '/content/drive/MyDrive/msvd_split/train/train_captions.json',
    'val_json': '/content/drive/MyDrive/msvd_split/val/val_captions.json',
    'test_json': '/content/drive/MyDrive/msvd_split/test/test_captions.json',
    'model': 'resnet152'  # Set your model choice here (resnet152, inception_v3, or inception_v4)
}

# Input size expected by each supported backbone
_input_sizes = {
    'inception_v3': (3, 299, 299),
    'resnet152': (3, 224, 224),
    'inception_v4': (3, 299, 299),
}
if params['model'] not in _input_sizes:
    raise ValueError(f"Model {params['model']} is not supported")

# Set the global frame dimensions first, then build the ImageNet-pretrained backbone
C, H, W = _input_sizes[params['model']]
if params['model'] == 'inception_v3':
    model = pretrainedmodels.inceptionv3(pretrained='imagenet')
elif params['model'] == 'resnet152':
    model = pretrainedmodels.resnet152(pretrained='imagenet')
else:
    model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
load_image_fn = utils.LoadTransformImage(model)

# Replace the final classification layer so the model outputs features, then move it to the GPU
model.last_linear = utils.Identity()
model = model.cuda()

# Extract features for each split
for split in ('train', 'val', 'test'):
    extract_feats(params, model, load_image_fn, split)


Downloading: "https://download.pytorch.org/models/resnet152-b121ed2d.pth" to /root/.cache/torch/hub/checkpoints/resnet152-b121ed2d.pth
100%|██████████| 230M/230M [00:01<00:00, 193MB/s]
Processing train videos: 100%|██████████| 1378/1378 [33:45<00:00,  1.47s/it]


Feature extraction for train set is complete.


Processing val videos: 100%|██████████| 295/295 [06:37<00:00,  1.35s/it]


Feature extraction for val set is complete.


Processing test videos: 100%|██████████| 297/297 [06:50<00:00,  1.38s/it]

Feature extraction for test set is complete.





In [None]:
# Verify that a feature file was extracted for every video

import os
import json

# Function to verify features extraction for each split by cross-checking both video files and JSON entries
def verify_features_extraction(split, params):
    """Report whether every video of a split has a saved feature file.

    Three checks are printed: ``.avi`` files without a ``.npy`` feature, JSON
    ``videos`` entries without a ``.npy`` feature, and feature files that match
    neither a video file nor a JSON entry.

    Args:
        split (str): Split name, e.g. 'train', 'val' or 'test'.
        params (dict): Needs 'output_dir', the folder holding the split directories.
    """
    split_root = os.path.join(params['output_dir'], split)
    features_dir = os.path.join(split_root, 'features')

    def lacks_feature(video_id):
        # True when no <video_id>.npy exists in the features folder
        return not os.path.exists(os.path.join(features_dir, f"{video_id}.npy"))

    def report(items, headline, listing_label, all_good):
        # Count plus id list when something is wrong, otherwise the success line
        if items:
            print(f"[{split}] {headline}: {len(items)}")
            print(listing_label, items)
        else:
            print(f"[{split}] {all_good}")

    # Ids listed in the split's caption JSON
    with open(os.path.join(split_root, f"{split}_captions.json"), 'r') as handle:
        video_ids_in_json = {entry['video_id'] for entry in json.load(handle)['videos']}

    # Ids of the .avi files on disk
    video_files = {os.path.splitext(name)[0]
                   for name in os.listdir(os.path.join(split_root, 'videos'))
                   if name.endswith('.avi')}

    # All three lists are computed before anything is printed
    missing_features_from_videos = [vid for vid in video_files if lacks_feature(vid)]
    missing_features_from_json = [vid for vid in video_ids_in_json if lacks_feature(vid)]
    feature_stems = [os.path.splitext(name)[0] for name in os.listdir(features_dir)]
    extra_features = [stem for stem in feature_stems
                      if not (stem in video_files or stem in video_ids_in_json)]

    report(missing_features_from_videos,
           "Missing features for videos in the video folder",
           "Missing video IDs from video files:",
           "All video files have corresponding features.")
    report(missing_features_from_json,
           "Missing features for videos in the JSON file",
           "Missing video IDs from JSON:",
           "All JSON entries have corresponding features.")
    report(extra_features,
           "Extra feature files found that do not match any video or JSON entry",
           "Extra feature file video IDs:",
           "No extra feature files found.")

# Define your parameters
# NOTE: this rebinds the global `params`, replacing the larger dict defined in the feature
# extraction cell; only 'output_dir' is needed by verify_features_extraction.
params = {
    'output_dir': '/content/drive/MyDrive/msvd_split',
}

# Verify for each split
for split in ['train', 'val', 'test']:
    verify_features_extraction(split, params)


[train] All video files have corresponding features.
[train] All JSON entries have corresponding features.
[train] No extra feature files found.
[val] All video files have corresponding features.
[val] All JSON entries have corresponding features.
[val] No extra feature files found.
[test] All video files have corresponding features.
[test] All JSON entries have corresponding features.
[test] No extra feature files found.


In [None]:
#data loader

import os
import json
import numpy as np
import torch
from torch.utils.data import Dataset

class VideoCaptionDataset(Dataset):
    """Dataset pairing pre-extracted video features with one tokenised caption.

    Each item is ``(features, caption)``: the float32 tensor loaded from
    ``<feature_dir>/<video_id>.npy`` and a long tensor of exactly
    ``max_caption_length`` vocabulary indices. The caption is drawn at random
    (``np.random.choice``) from the video's captions on every access, lower-cased,
    split on whitespace, mapped through ``vocab`` (unknown words become '<UNK>'),
    then truncated or padded with '<PAD>'.
    """

    def __init__(self, feature_dir, json_path, vocab, max_caption_length=15, verbose=False,
                 add_special_tokens=False):
        """
        Args:
            feature_dir (str): Directory where video feature files (.npy) are stored.
            json_path (str): Path to the JSON file containing captions.
            vocab (dict): Vocabulary dictionary mapping words to indices.
            max_caption_length (int): Maximum length for captions after padding/truncation.
            verbose (bool): If True, print additional information during loading.
            add_special_tokens (bool): If True, wrap every caption as
                '<SOS>' words... '<EOS>' before padding; the words are truncated so
                the end marker always fits. Defaults to False, which keeps the
                previous behaviour of emitting the bare word indices.
        """
        self.feature_dir = feature_dir
        self.vocab = vocab
        self.max_caption_length = max_caption_length
        self.verbose = verbose
        self.add_special_tokens = add_special_tokens

        # Load JSON data
        with open(json_path, 'r') as f:
            data = json.load(f)

        # Map video_id to its captions
        self.video_captions = {}
        for item in data['sentences']:
            self.video_captions.setdefault(item['video_id'], []).append(item['caption'])

        # Videos that have at least one caption. Feature files are NOT checked here;
        # a missing one raises FileNotFoundError when the item is accessed.
        self.video_ids = [vid['video_id'] for vid in data['videos'] if vid['video_id'] in self.video_captions]

        if self.verbose:
            print("Initialized VideoCaptionDataset")
            print(f"Total videos with captions: {len(self.video_ids)}")

    def __len__(self):
        """Number of videos that have at least one caption."""
        return len(self.video_ids)

    def __getitem__(self, idx):
        """Return ``(video_features_tensor, caption_tensor)`` for the idx-th video.

        Raises:
            FileNotFoundError: If the video's feature file does not exist.
        """
        video_id = self.video_ids[idx]

        # Load video features
        feature_path = os.path.join(self.feature_dir, f"{video_id}.npy")
        if not os.path.exists(feature_path):
            raise FileNotFoundError(f"Feature file for video_id {video_id} not found at {feature_path}")

        video_features = np.load(feature_path)
        video_features_tensor = torch.tensor(video_features, dtype=torch.float32)
        if self.verbose:
            print(f"Loaded features for video_id {video_id}, Shape: {video_features_tensor.shape}")

        # Choose a random caption for this video
        caption = np.random.choice(self.video_captions[video_id])
        if self.verbose:
            print(f"Original caption: '{caption}'")

        # Convert caption to word indices
        caption_indices = [self.vocab.get(word, self.vocab['<UNK>']) for word in caption.lower().split()]

        if self.add_special_tokens:
            # Reserve two slots so the start/end markers survive truncation
            word_budget = max(self.max_caption_length - 2, 0)
            caption_indices = ([self.vocab['<SOS>']]
                               + caption_indices[:word_budget]
                               + [self.vocab['<EOS>']])

        # Apply padding or truncation
        caption_indices = caption_indices[:self.max_caption_length]  # Truncate if too long
        caption_indices += [self.vocab['<PAD>']] * (self.max_caption_length - len(caption_indices))  # Pad if too short

        caption_tensor = torch.tensor(caption_indices, dtype=torch.long)
        if self.verbose:
            print(f"Processed caption indices (padded/truncated): {caption_indices}")
            print(f"Caption Tensor Shape: {caption_tensor.shape}")

        return video_features_tensor, caption_tensor

# Example usage: build the training dataset and inspect its first item
if __name__ == "__main__":
    # Define directories based on your structure
    feature_dir = '/content/drive/MyDrive/msvd_split/train/features'  # Path to train features directory
    json_path = '/content/drive/MyDrive/msvd_split/train/train_captions.json'  # Path to train captions JSON
    vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'  # Path to vocabulary JSON

    # Load vocabulary (word -> index mapping written by the vocabulary-building cell)
    with open(vocab_path, 'r') as f:
        vocab = json.load(f)

    # Initialize dataset (verbose=True prints loading details; max_caption_length keeps its default)
    dataset = VideoCaptionDataset(feature_dir, json_path, vocab, verbose=True)
    print(f"Dataset size: {len(dataset)}")

    # Access a sample item to verify; the caption is picked at random, so output varies between runs
    video_features, caption_tensor = dataset[0]
    print("Sample Video Features Shape:", video_features.shape)
    print("Sample Caption Tensor:", caption_tensor)


Initialized VideoCaptionDataset
Total videos with captions: 1378
Dataset size: 1378
Loaded features for video_id DKZg4kIEa0A_31_36, Shape: torch.Size([40, 2048])
Original caption: 'a boy is skateboarding'
Processed caption indices (padded/truncated): [4, 5, 6, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Caption Tensor Shape: torch.Size([15])
Sample Video Features Shape: torch.Size([40, 2048])
Sample Caption Tensor: tensor([ 4,  5,  6, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])


In [None]:
#s2vt model

import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

class S2VTModel(nn.Module):
    """Two-RNN sequence-to-sequence video captioning model.

    ``rnn1`` reads the frame features; ``rnn2`` reads ``rnn1``'s output concatenated
    with a word embedding. During the encoding pass the word slot is filled with
    zeros; during decoding the frame input is a single zero frame per step and the
    word slot carries the previous word (ground truth in 'train' mode, the model's
    own argmax otherwise).
    """

    def __init__(self, vocab_size, max_len=45, dim_hidden=512, dim_word=512, dim_vid=2048, sos_id=1, eos_id=0,
                 n_layers=1, rnn_cell='lstm', rnn_dropout_p=0.3):
        """
        Args:
            vocab_size (int): Number of words; size of the embedding and output layers.
            max_len (int): Caption length; decoding runs for ``max_len - 1`` steps.
            dim_hidden (int): Hidden size of both RNNs.
            dim_word (int): Word embedding size.
            dim_vid (int): Size of one frame feature vector.
            sos_id (int): Index fed as the first word in inference mode.
            eos_id (int): Stored on the module; not used by ``forward``.
            n_layers (int): Number of layers in each RNN.
            rnn_cell (str): 'lstm' or 'gru' (case-insensitive).
            rnn_dropout_p (float): Dropout between RNN layers; only applied when
                ``n_layers > 1`` because the RNN has no inter-layer connection otherwise.

        Raises:
            ValueError: If ``rnn_cell`` is neither 'lstm' nor 'gru'.
        """
        super(S2VTModel, self).__init__()

        # Set RNN cell type based on input
        cell_name = rnn_cell.lower()
        if cell_name == 'lstm':
            self.rnn_cell = nn.LSTM
        elif cell_name == 'gru':
            self.rnn_cell = nn.GRU
        else:
            raise ValueError(f"Unsupported rnn_cell '{rnn_cell}'; expected 'lstm' or 'gru'")

        # A non-zero dropout with a single layer has no effect and only makes PyTorch warn
        inter_layer_dropout = rnn_dropout_p if n_layers > 1 else 0

        # Define RNN layers
        self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers, batch_first=True, dropout=inter_layer_dropout)
        self.rnn2 = self.rnn_cell(dim_hidden + dim_word, dim_hidden, n_layers, batch_first=True,
                                  dropout=inter_layer_dropout)

        # Other configurations
        self.dim_vid = dim_vid
        self.dim_output = vocab_size
        self.dim_hidden = dim_hidden
        self.dim_word = dim_word
        self.max_length = max_len
        self.sos_id = sos_id
        self.eos_id = eos_id

        # Embedding and output layers
        self.embedding = nn.Embedding(self.dim_output, self.dim_word)
        self.out = nn.Linear(self.dim_hidden, self.dim_output)

    def forward(self, vid_feats, target_variable=None, mode='train'):
        """Encode the frames, then decode ``max_len - 1`` words.

        Args:
            vid_feats (Tensor): Frame features of shape (batch, n_frames, dim_vid).
            target_variable (Tensor | None): Word indices of shape (batch, >= max_len - 1);
                column ``i`` is the decoder input at step ``i``. Required in 'train' mode.
            mode (str): 'train' for teacher forcing; any other value runs greedy decoding.

        Returns:
            tuple: ``(seq_probs, seq_preds)``. ``seq_probs`` holds log-probabilities of
            shape (batch, max_len - 1, vocab_size). ``seq_preds`` is the (batch, max_len - 1)
            tensor of argmax word indices in inference mode and an empty list in 'train' mode.

        Raises:
            ValueError: If ``mode`` is 'train' and ``target_variable`` is None.
        """
        batch_size, n_frames, _ = vid_feats.shape
        # Zero fillers created with the same dtype and device as the input features
        padding_words = vid_feats.new_zeros(batch_size, n_frames, self.dim_word)
        padding_frames = vid_feats.new_zeros(batch_size, 1, self.dim_vid)
        state1 = None
        state2 = None

        # Encoding pass: frames go through rnn1, rnn2 sees rnn1's output plus zero "words"
        output1, state1 = self.rnn1(vid_feats, state1)
        input2 = torch.cat((output1, padding_words), dim=2)
        output2, state2 = self.rnn2(input2, state2)

        # Sequence generation
        seq_probs = []
        seq_preds = []
        if mode == 'train':
            if target_variable is None:
                raise ValueError("target_variable is required when mode='train'")
            for i in range(self.max_length - 1):
                current_words = self.embedding(target_variable[:, i])
                output1, state1 = self.rnn1(padding_frames, state1)
                input2 = torch.cat((output1, current_words.unsqueeze(1)), dim=2)
                output2, state2 = self.rnn2(input2, state2)
                logits = self.out(output2.squeeze(1))
                logits = F.log_softmax(logits, dim=1)
                seq_probs.append(logits.unsqueeze(1))
            seq_probs = torch.cat(seq_probs, 1)

        else:  # Inference mode
            # Start tokens live on the input's device, so this works on CPU and GPU alike
            sos_tokens = torch.full((batch_size,), self.sos_id, dtype=torch.long, device=vid_feats.device)
            current_words = self.embedding(sos_tokens)
            for i in range(self.max_length - 1):
                output1, state1 = self.rnn1(padding_frames, state1)
                input2 = torch.cat((output1, current_words.unsqueeze(1)), dim=2)
                output2, state2 = self.rnn2(input2, state2)
                logits = self.out(output2.squeeze(1))
                logits = F.log_softmax(logits, dim=1)
                seq_probs.append(logits.unsqueeze(1))
                # Greedy choice becomes the next step's input word
                _, preds = torch.max(logits, 1)
                current_words = self.embedding(preds)
                seq_preds.append(preds.unsqueeze(1))
            seq_probs = torch.cat(seq_probs, 1)
            seq_preds = torch.cat(seq_preds, 1)

        return seq_probs, seq_preds


In [None]:
#saving model to drive

# Source of the standalone S2VTModel module that the training cell imports from Drive.
# Compared with the earlier version of this file: decoding start tokens are created on the
# input's device (no hard-coded .cuda()), an unknown rnn_cell raises ValueError, RNN dropout
# is only passed when n_layers > 1, and train mode without targets raises ValueError.
model_code = '''import torch
from torch import nn
import torch.nn.functional as F


class S2VTModel(nn.Module):
    """Two-RNN video captioning model: rnn1 reads frames, rnn2 reads rnn1 output plus words."""

    def __init__(self, vocab_size, max_len=45, dim_hidden=512, dim_word=512, dim_vid=2048, sos_id=1, eos_id=0,
                 n_layers=1, rnn_cell='lstm', rnn_dropout_p=0.3):
        super(S2VTModel, self).__init__()

        # Set RNN cell type based on input
        cell_name = rnn_cell.lower()
        if cell_name == 'lstm':
            self.rnn_cell = nn.LSTM
        elif cell_name == 'gru':
            self.rnn_cell = nn.GRU
        else:
            raise ValueError(f"Unsupported rnn_cell '{rnn_cell}'; expected 'lstm' or 'gru'")

        # Dropout between RNN layers only exists when there is more than one layer
        inter_layer_dropout = rnn_dropout_p if n_layers > 1 else 0

        # Define RNN layers
        self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers, batch_first=True, dropout=inter_layer_dropout)
        self.rnn2 = self.rnn_cell(dim_hidden + dim_word, dim_hidden, n_layers, batch_first=True,
                                  dropout=inter_layer_dropout)

        # Other configurations
        self.dim_vid = dim_vid
        self.dim_output = vocab_size
        self.dim_hidden = dim_hidden
        self.dim_word = dim_word
        self.max_length = max_len
        self.sos_id = sos_id
        self.eos_id = eos_id

        # Embedding and output layers
        self.embedding = nn.Embedding(self.dim_output, self.dim_word)
        self.out = nn.Linear(self.dim_hidden, self.dim_output)

    def forward(self, vid_feats, target_variable=None, mode='train'):
        """Return (seq_probs, seq_preds); seq_preds is an empty list in 'train' mode."""
        batch_size, n_frames, _ = vid_feats.shape
        # Zero fillers created with the same dtype and device as the input features
        padding_words = vid_feats.new_zeros(batch_size, n_frames, self.dim_word)
        padding_frames = vid_feats.new_zeros(batch_size, 1, self.dim_vid)
        state1 = None
        state2 = None

        # Encoding pass
        output1, state1 = self.rnn1(vid_feats, state1)
        input2 = torch.cat((output1, padding_words), dim=2)
        output2, state2 = self.rnn2(input2, state2)

        # Sequence generation
        seq_probs = []
        seq_preds = []
        if mode == 'train':
            if target_variable is None:
                raise ValueError("target_variable is required when mode='train'")
            for i in range(self.max_length - 1):
                current_words = self.embedding(target_variable[:, i])
                output1, state1 = self.rnn1(padding_frames, state1)
                input2 = torch.cat((output1, current_words.unsqueeze(1)), dim=2)
                output2, state2 = self.rnn2(input2, state2)
                logits = self.out(output2.squeeze(1))
                logits = F.log_softmax(logits, dim=1)
                seq_probs.append(logits.unsqueeze(1))
            seq_probs = torch.cat(seq_probs, 1)

        else:  # Inference mode
            # Start tokens live on the input's device, so this works on CPU and GPU alike
            sos_tokens = torch.full((batch_size,), self.sos_id, dtype=torch.long, device=vid_feats.device)
            current_words = self.embedding(sos_tokens)
            for i in range(self.max_length - 1):
                output1, state1 = self.rnn1(padding_frames, state1)
                input2 = torch.cat((output1, current_words.unsqueeze(1)), dim=2)
                output2, state2 = self.rnn2(input2, state2)
                logits = self.out(output2.squeeze(1))
                logits = F.log_softmax(logits, dim=1)
                seq_probs.append(logits.unsqueeze(1))
                _, preds = torch.max(logits, 1)
                current_words = self.embedding(preds)
                seq_preds.append(preds.unsqueeze(1))
            seq_probs = torch.cat(seq_probs, 1)
            seq_preds = torch.cat(seq_preds, 1)

        return seq_probs, seq_preds
'''

# Save the model code as a Python file
with open('/content/drive/MyDrive/S2VTModel.py', 'w') as f:
    f.write(model_code)


In [None]:
from drive.MyDrive.S2VTModel import S2VTModel


In [None]:
#training

import json
import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from drive.MyDrive.S2VTModel import S2VTModel  # Import the model from Google Drive

# Define the VideoCaptionDataset class in the notebook itself
class VideoCaptionDataset(Dataset):
    """Pairs pre-extracted video features with one tokenised caption per video.

    Each item is ``(features, caption)`` where ``features`` is the content of
    ``<feature_dir>/<video_id>.npy`` as a float32 tensor and ``caption`` is a
    LongTensor of exactly ``max_caption_length`` vocabulary indices laid out as
    ``<SOS> w1 ... wn <EOS> <PAD> ...``.

    Args:
        feature_dir: Directory holding one ``<video_id>.npy`` file per video.
        json_path: JSON file with a ``videos`` list (dicts carrying ``video_id``)
            and a ``sentences`` list (dicts carrying ``video_id`` and ``caption``).
        vocab: Mapping from word to index; must contain ``<PAD>``, ``<SOS>``,
            ``<EOS>`` and ``<UNK>``.
        max_caption_length: Fixed length of the returned caption tensor,
            including the ``<SOS>`` and ``<EOS>`` markers.

    Raises:
        ValueError: If ``max_caption_length`` is too small to hold both markers.
    """

    def __init__(self, feature_dir, json_path, vocab, max_caption_length=45):
        if max_caption_length < 2:
            raise ValueError("max_caption_length must be at least 2 to hold <SOS> and <EOS>")

        self.feature_dir = feature_dir
        self.vocab = vocab
        self.max_caption_length = max_caption_length

        # Load JSON data
        with open(json_path, 'r') as f:
            data = json.load(f)

        # Map video_id to its captions
        self.video_captions = {}
        for item in data['sentences']:
            self.video_captions.setdefault(item['video_id'], []).append(item['caption'])

        # Keep only videos that have BOTH captions and a feature file on disk,
        # so a missing .npy is reported here instead of crashing mid-epoch.
        self.video_ids = []
        missing_features = 0
        for vid in data['videos']:
            video_id = vid['video_id']
            if video_id not in self.video_captions:
                continue
            if os.path.exists(self._feature_path(video_id)):
                self.video_ids.append(video_id)
            else:
                missing_features += 1

        print("Initialized VideoCaptionDataset")
        print(f"Total videos with captions: {len(self.video_ids)}")
        if missing_features:
            print(f"Skipped {missing_features} captioned videos without a feature file")

    def _feature_path(self, video_id):
        """Return the path of the .npy feature file for ``video_id``."""
        return os.path.join(self.feature_dir, f"{video_id}.npy")

    def __len__(self):
        return len(self.video_ids)

    def __getitem__(self, idx):
        video_id = self.video_ids[idx]

        # Load video features (array layout is whatever the extractor saved)
        video_features = np.load(self._feature_path(video_id))

        # Choose a random caption for this video
        caption = np.random.choice(self.video_captions[video_id])

        # Convert words to indices, leaving room for the two boundary markers
        word_indices = [self.vocab.get(word, self.vocab['<UNK>']) for word in caption.lower().split()]
        word_indices = word_indices[:self.max_caption_length - 2]

        # Frame the sentence with <SOS>/<EOS>: the decoder starts generation
        # from <SOS>, so it has to see that token (and a terminator) in training.
        caption_indices = [self.vocab['<SOS>']] + word_indices + [self.vocab['<EOS>']]
        caption_indices += [self.vocab['<PAD>']] * (self.max_caption_length - len(caption_indices))

        caption_tensor = torch.tensor(caption_indices, dtype=torch.long)

        return torch.tensor(video_features, dtype=torch.float32), caption_tensor

# Set paths for features, JSON files, and vocabulary based on your structure
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
train_json_path = '/content/drive/MyDrive/msvd_split/train/train_captions.json'
val_json_path = '/content/drive/MyDrive/msvd_split/val/val_captions.json'
train_feature_dir = '/content/drive/MyDrive/msvd_split/train/features'
val_feature_dir = '/content/drive/MyDrive/msvd_split/val/features'
checkpoint_dir = '/content/drive/MyDrive/msvd_split/checkpoints/'

# Load vocabulary and dataset
print("Loading vocabulary...")
with open(vocab_path, 'r') as f:
    vocab = json.load(f)
vocab_size = len(vocab)
print(f"Vocabulary loaded with size: {vocab_size}")

# Hyperparameters
max_len = 45            # caption length in tokens; also the model's max_len
dim_hidden = 1024
dim_word = 512
dim_vid = 2048
n_layers = 2
rnn_cell = 'lstm'
rnn_dropout_p = 0.2
batch_size = 8
learning_rate = 0.0001
num_epochs = 20
seed = 42

# Seed the generators used for weight initialisation, DataLoader shuffling
# (torch) and random caption sampling in the dataset (numpy) so that a
# Restart & Run All reproduces the same training run.
torch.manual_seed(seed)
np.random.seed(seed)

# Initialize model
print("Initializing model...")
model = S2VTModel(
    vocab_size=vocab_size,
    max_len=max_len,
    dim_hidden=dim_hidden,
    dim_word=dim_word,
    dim_vid=dim_vid,
    sos_id=vocab['<SOS>'],
    eos_id=vocab['<EOS>'],
    n_layers=n_layers,
    rnn_cell=rnn_cell,
    rnn_dropout_p=rnn_dropout_p
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

# Loss and optimizer
# The model's forward() already applies log_softmax to its outputs, so the
# matching criterion is NLLLoss. CrossEntropyLoss would apply log_softmax a
# second time (numerically redundant, and misleading to readers).
criterion = nn.NLLLoss(ignore_index=vocab['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Learning rate scheduler: halve the learning rate every 5 epochs
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

# Initialize datasets
print("Initializing datasets...")
train_dataset = VideoCaptionDataset(train_feature_dir, train_json_path, vocab, max_caption_length=max_len)
val_dataset = VideoCaptionDataset(val_feature_dir, val_json_path, vocab, max_caption_length=max_len)
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Training loop
def teacher_forced_loss(model, criterion, video_features, captions):
    """Compute the next-token loss for one batch in teacher-forcing mode.

    Args:
        model: Captioning model called as ``model(features, captions, mode='train')``
            and returning ``(per_step_scores, _)`` with scores of shape
            (batch, steps, vocab).
        criterion: Loss taking (N, vocab) scores and (N,) target indices.
        video_features: Batch of video feature tensors, already on the model's device.
        captions: LongTensor (batch, caption_length) of token indices.

    Returns:
        Scalar loss tensor.
    """
    scores, _ = model(video_features, captions, mode='train')

    # Decoder step t is fed captions[:, t], so its output must be scored
    # against the NEXT token captions[:, t + 1]. Slice per sample BEFORE
    # flattening: truncating the flattened tensors instead shifts the targets
    # across sample boundaries whenever the two sequence lengths differ.
    steps = min(scores.size(1), captions.size(1) - 1)
    scores = scores[:, :steps, :].reshape(-1, scores.size(-1))
    targets = captions[:, 1:steps + 1].reshape(-1)
    return criterion(scores, targets)


os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(num_epochs):
    print(f"\nStarting Epoch [{epoch + 1}/{num_epochs}]")
    model.train()
    total_loss = 0.0

    for i, (video_features, captions) in enumerate(train_loader):
        video_features, captions = video_features.to(device), captions.to(device)

        optimizer.zero_grad()
        loss = teacher_forced_loss(model, criterion, video_features, captions)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Print step details every 20 steps
        if (i + 1) % 20 == 0:
            print(f'[Epoch {epoch + 1}/{num_epochs}] Step [{i + 1}/{len(train_loader)}]: Loss = {loss.item():.4f}')

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError
    avg_train_loss = total_loss / max(len(train_loader), 1)

    # Adjust learning rate
    scheduler.step()

    # Save checkpoint. 'loss' is stored as a plain float (the epoch's average
    # training loss) rather than a live tensor of the last mini-batch.
    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch + 1}.pt')
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': avg_train_loss
    }, checkpoint_path)
    print(f'Checkpoint saved at {checkpoint_path}')

    # Validation (teacher-forced, same loss definition as training)
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for j, (video_features, captions) in enumerate(val_loader):
            video_features, captions = video_features.to(device), captions.to(device)
            loss = teacher_forced_loss(model, criterion, video_features, captions)
            val_loss += loss.item()

            # Print validation step loss every 20 steps
            if (j + 1) % 20 == 0:
                print(f'[Validation] Step [{j + 1}/{len(val_loader)}]: Loss = {loss.item():.4f}')

    avg_val_loss = val_loss / max(len(val_loader), 1)
    print(f'Epoch [{epoch + 1}/{num_epochs}] Complete: Avg Train Loss = {avg_train_loss:.4f}, Avg Validation Loss = {avg_val_loss:.4f}')

print("Training completed.")


Loading vocabulary...
Vocabulary loaded with size: 12596
Initializing model...
Using device: cuda
Initializing datasets...
Initialized VideoCaptionDataset
Total videos with captions: 1378
Initialized VideoCaptionDataset
Total videos with captions: 295
Training dataset size: 1378
Validation dataset size: 295

Starting Epoch [1/20]
[Epoch 1/20] Step [20/173]: Loss = 7.8490
[Epoch 1/20] Step [40/173]: Loss = 6.2657
[Epoch 1/20] Step [60/173]: Loss = 6.6559
[Epoch 1/20] Step [80/173]: Loss = 5.9623
[Epoch 1/20] Step [100/173]: Loss = 6.5990
[Epoch 1/20] Step [120/173]: Loss = 5.8543
[Epoch 1/20] Step [140/173]: Loss = 5.5405
[Epoch 1/20] Step [160/173]: Loss = 5.7641
Checkpoint saved at /content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_1.pt
[Validation] Step [20/37]: Loss = 5.7686
Epoch [1/20] Complete: Avg Train Loss = 6.1996, Avg Validation Loss = 5.6714

Starting Epoch [2/20]
[Epoch 2/20] Step [20/173]: Loss = 4.6611
[Epoch 2/20] Step [40/173]: Loss = 5.9823
[Epoch 2/20] St

In [None]:
# testing

import os
import torch
import json
import numpy as np
from torch.utils.data import Dataset, DataLoader
from nltk.translate.bleu_score import corpus_bleu
from drive.MyDrive.S2VTModel import S2VTModel

# Define the VideoCaptionDataset
class VideoCaptionDataset(Dataset):
    """Dataset yielding (video feature tensor, padded caption index tensor).

    Captions are grouped per ``video_id`` from the ``sentences`` list of the
    JSON file; only videos listed under ``videos`` that have at least one
    caption are indexed. Every item samples one caption at random.
    """

    def __init__(self, feature_dir, json_path, vocab, max_caption_length=45):
        self.feature_dir = feature_dir
        self.vocab = vocab
        self.max_caption_length = max_caption_length

        with open(json_path, 'r') as handle:
            annotations = json.load(handle)

        # Group caption strings by the video they describe
        captions_by_video = {}
        for entry in annotations['sentences']:
            vid, text = entry['video_id'], entry['caption']
            captions_by_video.setdefault(vid, []).append(text)
        self.video_captions = captions_by_video

        # Preserve the order of the 'videos' list, dropping uncaptioned entries
        self.video_ids = []
        for record in annotations['videos']:
            if record['video_id'] in captions_by_video:
                self.video_ids.append(record['video_id'])

    def __len__(self):
        return len(self.video_ids)

    def __getitem__(self, idx):
        vid = self.video_ids[idx]

        # Features are read from <feature_dir>/<video_id>.npy
        features = np.load(os.path.join(self.feature_dir, f"{vid}.npy"))

        # One randomly sampled caption per access
        sentence = np.random.choice(self.video_captions[vid])

        # Words -> indices (unknown words map to <UNK>), then cut/pad to fixed length
        token_ids = [self.vocab.get(token, self.vocab['<UNK>']) for token in sentence.lower().split()]
        token_ids = token_ids[:self.max_caption_length]
        n_pad = self.max_caption_length - len(token_ids)
        token_ids.extend([self.vocab['<PAD>']] * n_pad)

        caption_tensor = torch.tensor(token_ids, dtype=torch.long)
        feature_tensor = torch.tensor(features, dtype=torch.float32)
        return feature_tensor, caption_tensor

# Set paths and load vocabulary
vocab_path = '/content/drive/MyDrive/msvd_split/vocab.json'
test_json_path = '/content/drive/MyDrive/msvd_split/test/test_captions.json'
test_feature_dir = '/content/drive/MyDrive/msvd_split/test/features'

with open(vocab_path, 'r') as f:
    vocab = json.load(f)
vocab_size = len(vocab)
# Reverse mapping (index -> word) used to turn predicted indices back into text
vocab_rev = {v: k for k, v in vocab.items()}

# Load model and dataset
# The constructor arguments must match the ones used when the checkpoint was trained.
model_path = '/content/drive/MyDrive/msvd_split/checkpoints/checkpoint_epoch_20.pt'  # Adjust epoch number as needed
model = S2VTModel(vocab_size=vocab_size, max_len=45, dim_hidden=1024, dim_word=512, dim_vid=2048,
                  sos_id=vocab['<SOS>'], eos_id=vocab['<EOS>'], n_layers=2, rnn_cell='lstm', rnn_dropout_p=0.2)

# weights_only=True restricts unpickling to tensors and plain containers, which
# is all the checkpoint dict holds; this avoids executing arbitrary pickled code
# and the warning torch emits for unrestricted loads.
checkpoint = torch.load(model_path, map_location='cuda', weights_only=True)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
model = model.to('cuda')

test_dataset = VideoCaptionDataset(test_feature_dir, test_json_path, vocab)
# batch_size=1 and shuffle=False: videos are evaluated one at a time in file order
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Testing
num_samples_to_print = 5  # only this many examples are echoed; BLEU uses every test video

eos_id = vocab['<EOS>']
skipped_ids = {vocab['<PAD>'], vocab['<SOS>']}


def decode_caption(token_ids):
    """Turn a sequence of vocabulary indices into a list of words.

    Decoding stops at the first <EOS>; <PAD> and <SOS> are dropped, and indices
    without a vocabulary entry are ignored.
    """
    words = []
    for idx in token_ids:
        idx = int(idx)
        if idx == eos_id:
            break
        if idx in skipped_ids:
            continue
        word = vocab_rev.get(idx)
        if word is not None:
            words.append(word)
    return words


all_hypotheses = []
all_references = []

for video_features, captions in test_loader:
    video_features, captions = video_features.to('cuda'), captions.to('cuda')
    with torch.no_grad():
        # The model returns (per-step log-probabilities, predicted token ids).
        # Captions must be decoded from the SECOND output; decoding the
        # log-probabilities yields only out-of-vocabulary "indices".
        _, predictions = model(video_features, captions, mode='inference')

    # Decode generated caption (batch size is 1, so take the first row)
    pred_caption = decode_caption(predictions[0].cpu().tolist())
    all_hypotheses.append(pred_caption)

    # Decode the ground-truth caption that came with this batch
    references = [decode_caption(caption.cpu().tolist()) for caption in captions]
    all_references.append([references[0]])  # BLEU expects a list of lists for references

    # Display only a few samples
    if len(all_hypotheses) <= num_samples_to_print:
        pred_caption_str = ' '.join(pred_caption)
        print(f"\nGenerated Caption: {pred_caption_str}")
        print(f"Ground Truth Captions: {references}")

# Calculate BLEU score over the whole test set
bleu_score = corpus_bleu(all_references, all_hypotheses)
print(f"\nBLEU score: {bleu_score:.4f}")


  model.load_state_dict(torch.load(model_path, map_location='cuda')['model_state_dict'])



Generated Caption:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     