In [1]:
%pip install torch==2.0.0 torchvision==0.15.1

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install pycocoevalcap

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import time
import random
import argparse

import numpy as np
import pandas as pd
from PIL import Image
import nltk

import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.distributed import DistributedSampler

import torchvision.models as models
from torchvision import transforms
from torchvision.models.detection import maskrcnn_resnet50_fpn
from torchvision.models.detection.image_list import ImageList
import torchvision.transforms.functional as F

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score

# Ensure the NLTK resources used by this notebook are available.
nltk.download('punkt')    # Punkt models, needed by word_tokenize
nltk.download('wordnet')  # WordNet corpus, consulted by meteor_score for synonym matching

# Ensure reproducibility: seed every RNG the notebook draws from
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

[nltk_data] Downloading package punkt to /home/jed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
class EncoderCNN(nn.Module):
    """
    Image encoder that concatenates a global descriptor with pooled object features.

    A pre-trained Mask R-CNN is loaded, frozen and permanently kept in eval mode.
    The only trainable parameters are the two linear projections ``global_embed``
    and ``obj_embed``.

    NOTE(review): ``forward`` passes inputs through Mask R-CNN's own ``transform``.
    If the caller has already applied ImageNet normalization (as ``get_transform``
    in this notebook does), the images may be normalized twice -- confirm this is
    intended.
    """

    def __init__(self, embed_size=256, device='cuda'):
        """
        Args:
            embed_size: Output size of each of the two projections.
            device: Device the frozen Mask R-CNN is moved to at construction time.
                The projection layers are NOT moved here; call ``.to(device)`` on
                the encoder to place them.
        """
        super(EncoderCNN, self).__init__()
        self.device = device

        # Load the pre-trained Mask R-CNN model
        self.mask_rcnn = maskrcnn_resnet50_fpn(weights='DEFAULT')
        self.mask_rcnn.to(self.device)
        self.mask_rcnn.eval()

        # Freeze Mask R-CNN parameters
        for param in self.mask_rcnn.parameters():
            param.requires_grad = False

        # Shortcuts to the Mask R-CNN sub-networks used in forward()
        self.backbone = self.mask_rcnn.backbone
        self.rpn = self.mask_rcnn.rpn
        self.roi_heads = self.mask_rcnn.roi_heads
        self.transform = self.mask_rcnn.transform

        # Global feature embedding
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.global_embed = nn.Linear(self.backbone.out_channels, embed_size)

        # Object feature embedding
        # The output of the box head is 1024-dimensional
        self.obj_embed = nn.Linear(1024, embed_size)

        # Initialize weights of the linear layers
        self.init_weights()

    def init_weights(self):
        """Xavier-initialize the projection weights and zero their biases."""
        nn.init.xavier_uniform_(self.global_embed.weight)
        nn.init.zeros_(self.global_embed.bias)
        nn.init.xavier_uniform_(self.obj_embed.weight)
        nn.init.zeros_(self.obj_embed.bias)

    def train(self, mode=True):
        """
        Switch the trainable layers to ``mode`` while keeping Mask R-CNN in eval mode.

        Returns:
            self, matching the ``nn.Module.train`` contract so that calls such as
            ``encoder.eval()`` or ``encoder.train().to(device)`` can be chained.
        """
        super(EncoderCNN, self).train(mode)
        self.mask_rcnn.eval()  # Ensure Mask R-CNN stays in eval mode
        return self

    def forward(self, images):
        """
        Forward pass through the encoder.
        Args:
            images: Tensor of shape (batch_size, C, H, W)
        Returns:
            combined_features: Tensor of shape (batch_size, embed_size * 2)
        """
        # Convert the batch tensor to a list of individual image tensors
        images = list(images)

        # Use Mask R-CNN's transform to preprocess the images
        transformed_images, _ = self.transform(images)

        # Extract features using the (frozen) backbone
        with torch.no_grad():
            features = self.backbone(transformed_images.tensors)

        # Global feature extraction from the feature map stored under key '0'
        feature_map = features['0']  # Shape: (batch_size, C, H, W)
        global_features = self.global_pool(feature_map)  # Shape: (batch_size, C, 1, 1)
        global_features = global_features.view(global_features.size(0), -1)  # Shape: (batch_size, C)
        global_features = self.global_embed(global_features)  # Shape: (batch_size, embed_size)

        # Object detection and feature extraction
        # Get proposals from RPN
        with torch.no_grad():
            proposals, _ = self.rpn(transformed_images, features, None)

        # Get detections from ROI Heads
        detections, _ = self.roi_heads(features, proposals, transformed_images.image_sizes, None)

        # Extract object features using the box head
        object_features_list = []
        for i in range(len(detections)):
            boxes = detections[i]['boxes']  # Bounding boxes for detected objects

            if boxes.shape[0] > 0:
                # Perform RoI pooling on the detected boxes
                box_features = self.roi_heads.box_roi_pool(
                    features, [boxes], [transformed_images.image_sizes[i]]
                )
                # Pass through the box head to get object features
                box_features = self.roi_heads.box_head(box_features)  # Shape: (num_boxes, 1024)
                # Transform to embed_size
                object_features = self.obj_embed(box_features)  # Shape: (num_boxes, embed_size)
                # Aggregate object features by averaging
                aggregated_feats = object_features.mean(dim=0)  # Shape: (embed_size,)
            else:
                # No detections: fall back to a zero vector. It is created from
                # global_features so it always shares device and dtype with the
                # tensors it is stacked/concatenated with, even if the module was
                # moved after construction and self.device is stale.
                aggregated_feats = global_features.new_zeros(self.obj_embed.out_features)

            object_features_list.append(aggregated_feats)

        # Combine object features into a tensor
        object_features = torch.stack(object_features_list, dim=0)  # Shape: (batch_size, embed_size)

        # Concatenate global and object features
        combined_features = torch.cat([global_features, object_features], dim=1)  # Shape: (batch_size, embed_size * 2)
        return combined_features

class LSTM(nn.Module):
    """
    Hand-written single-step LSTM cell.

    For every gate ``g`` in (i, f, c, o) the cell owns an input matrix ``W_g``
    of shape (input_size, hidden_size), a recurrent matrix ``U_g`` of shape
    (hidden_size, hidden_size) and a bias ``b_g`` of shape (hidden_size,).
    """

    # Gate suffixes, in the order their parameters are registered.
    _GATES = ("i", "f", "c", "o")

    def __init__(self, input_size, hidden_size):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Register W_g, U_g, b_g per gate (input, forget, cell, output).
        for gate in self._GATES:
            setattr(self, "W_" + gate, nn.Parameter(torch.empty(input_size, hidden_size)))
            setattr(self, "U_" + gate, nn.Parameter(torch.empty(hidden_size, hidden_size)))
            setattr(self, "b_" + gate, nn.Parameter(torch.empty(hidden_size)))

        self.init_weights()

    def init_weights(self):
        """Xavier-uniform for every matrix, zeros for every bias vector."""
        for param in self.parameters():
            initializer = nn.init.xavier_uniform_ if param.data.ndimension() >= 2 else nn.init.zeros_
            initializer(param.data)

    def _pre_activation(self, gate, x, h_prev):
        """Affine part of one gate: x @ W_g + h_prev @ U_g + b_g."""
        w_in = getattr(self, "W_" + gate)
        w_rec = getattr(self, "U_" + gate)
        bias = getattr(self, "b_" + gate)
        return x @ w_in + h_prev @ w_rec + bias

    def forward(self, x, h_prev, c_prev):
        """
        Advance the cell by one time step.
        Args:
            x: Input at this step, multiplied by the (input_size, hidden_size) matrices.
            h_prev: Previous hidden state.
            c_prev: Previous cell state.
        Returns:
            (h_t, c_t): New hidden state and new cell state.
        """
        input_gate = torch.sigmoid(self._pre_activation("i", x, h_prev))
        forget_gate = torch.sigmoid(self._pre_activation("f", x, h_prev))
        candidate = torch.tanh(self._pre_activation("c", x, h_prev))
        output_gate = torch.sigmoid(self._pre_activation("o", x, h_prev))

        # Blend the old cell state with the new candidate, then expose it
        c_t = forget_gate * c_prev + input_gate * candidate
        h_t = output_gate * torch.tanh(c_t)
        return h_t, c_t
    
class DecoderRNN(nn.Module):
    """
    Caption decoder: projects image features to the embedding space, feeds them
    as the first input of a custom LSTM cell, then consumes word embeddings.
    """

    def __init__(self, input_size, embed_size, hidden_size, vocab_size, dropout=0.5):
        """
        Args:
            input_size: Size of the image feature vector produced by the encoder.
            embed_size: Size of word embeddings and of the projected image features.
            hidden_size: Size of the LSTM hidden and cell states.
            vocab_size: Number of words in the vocabulary (output dimension).
            dropout: Dropout probability applied to the decoder inputs.
        """
        super(DecoderRNN, self).__init__()
        # Embedding layer to convert word indices to embeddings
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Projects encoder features to the same size as the word embeddings
        self.feature_proj = nn.Linear(input_size, embed_size)
        # Custom LSTM cell
        self.lstm_cell = LSTM(embed_size, hidden_size)
        # Fully connected layer to project hidden state to vocabulary space
        self.fc = nn.Linear(hidden_size, vocab_size)
        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        # Initialize weights
        self.init_weights()

    def init_weights(self):
        """Uniform init for embedding/output layers, Xavier for the feature projection."""
        nn.init.uniform_(self.embedding.weight, -0.1, 0.1)
        nn.init.uniform_(self.fc.weight, -0.1, 0.1)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.xavier_uniform_(self.feature_proj.weight)
        nn.init.zeros_(self.feature_proj.bias)

    def forward(self, features, captions):
        """
        Teacher-forced forward pass through the decoder.
        Args:
            features: Combined image features from the encoder, shape (batch_size, input_size)
            captions: Caption sequences, shape (batch_size, max_seq_length)
        Returns:
            outputs: Unnormalized word scores, shape (batch_size, max_seq_length, vocab_size).
                Step 0 is produced from the image features; step t >= 1 is produced
                from caption token t - 1.
        """
        # Project the combined features to embed_size
        features = self.feature_proj(features)  # Shape: (batch_size, embed_size)

        # Embed the captions (exclude the last word for teacher forcing)
        embeddings = self.embedding(captions[:, :-1])  # Shape: (batch_size, seq_len - 1, embed_size)
        # Concatenate image features as the first input
        embeddings = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        embeddings = self.dropout(embeddings)

        batch_size, seq_len, _ = embeddings.size()

        # Zero initial states, allocated directly on the features' device
        h_t = features.new_zeros(batch_size, self.hidden_size)
        c_t = features.new_zeros(batch_size, self.hidden_size)

        # Unroll the LSTM; seq_len >= 1 because the image step is always present
        step_outputs = []
        for t in range(seq_len):
            h_t, c_t = self.lstm_cell(embeddings[:, t, :], h_t, c_t)
            step_outputs.append(self.fc(h_t))

        return torch.stack(step_outputs, dim=1)

    def sample(self, features, max_len=20, end_token_idx=None, start_token_idx=None):
        """
        Generate a caption for a single image using greedy search.
        Args:
            features: Combined image features from the encoder, shape (1, input_size)
            max_len: Maximum number of generated tokens
            end_token_idx: Index of the <end> token; generation stops once it is produced
            start_token_idx: Optional index of the <start> token. When given, the
                image-feature step only primes the LSTM state and <start> is fed as
                the next input, mirroring the input sequence used in ``forward``
                (image features, then <start>, then the caption words). When None
                (default), the legacy behavior is kept: the prediction made at the
                image-feature step is emitted and fed back as the next input.
        Returns:
            sampled_ids: List of predicted word indices (includes <end> if produced)
        """
        # Project the combined features to embed_size
        features = self.feature_proj(features)  # Shape: (1, embed_size)

        sampled_ids = []
        inputs = features  # Initial input is the image features
        h_t = features.new_zeros(1, self.hidden_size)
        c_t = features.new_zeros(1, self.hidden_size)

        if start_token_idx is not None:
            # Consume the image step without emitting a token, then feed <start>
            h_t, c_t = self.lstm_cell(inputs, h_t, c_t)
            start = torch.tensor([start_token_idx], dtype=torch.long, device=features.device)
            inputs = self.dropout(self.embedding(start))

        for _ in range(max_len):
            h_t, c_t = self.lstm_cell(inputs, h_t, c_t)
            outputs = self.fc(h_t)  # Word scores for this step
            predicted = outputs.argmax(1)  # Greedy choice
            token_id = predicted.item()
            sampled_ids.append(token_id)

            if token_id == end_token_idx:
                break  # Stop if <end> token is generated

            # Prepare input for next time step
            inputs = self.embedding(predicted)
            inputs = self.dropout(inputs)

        return sampled_ids

In [3]:
import os
from PIL import Image

import torch
from torch.utils.data import Dataset
from torchvision import transforms

class FlickrDataset(Dataset):
    """
    Custom Dataset for loading Flickr images and captions.

    In 'train' mode every (image, caption) pair is a separate sample.
    In 'test' mode each image appears once, paired with one randomly chosen caption.
    """

    def __init__(self, image_dir, image_ids, captions_seqs, transform=None, mode='train'):
        """
        Args:
            image_dir (str): Directory with all the images.
            image_ids (list): List of image filenames.
            captions_seqs (dict): Dictionary mapping image filenames to caption sequences.
            transform (callable, optional): Optional transform to be applied on an image.
            mode (str): Mode of the dataset, 'train' or 'test'.
        Raises:
            ValueError: If ``mode`` is neither 'train' nor 'test'.
        """
        if mode not in ('train', 'test'):
            raise ValueError("Mode should be either 'train' or 'test'.")

        self.image_dir = image_dir
        self.transform = transform
        self.mode = mode
        self.images = []      # image filename of each sample
        self.captions = []    # caption index sequence of each sample
        self.image_ids = []   # only populated in 'test' mode

        if self.mode == 'train':
            # One sample per (image, caption) pair
            for img_id in image_ids:
                for caption_seq in captions_seqs[img_id]:
                    self.images.append(img_id)
                    self.captions.append(caption_seq)
        else:
            # One sample per image: keep a single randomly chosen caption,
            # or an empty sequence when the image has no captions
            for img_id in image_ids:
                captions = captions_seqs[img_id]
                caption_seq = random.choice(captions) if captions else []
                self.images.append(img_id)
                self.captions.append(caption_seq)
                self.image_ids.append(img_id)

    def __len__(self):
        """Returns the total number of samples (image-caption pairs)."""
        return len(self.images)

    def __getitem__(self, idx):
        """
        Retrieves the image and caption at the specified index.
        Args:
            idx (int): Index
        Returns:
            'train' mode: (image, caption_seq)
            'test' mode: (image, caption_seq, image_id)
            where image is the RGB image after ``transform`` (if any), caption_seq is
            a 1-D int64 tensor of word indices and image_id is the image filename.
        """
        img_id = self.images[idx]
        img_path = os.path.join(self.image_dir, img_id)

        # Load inside a context manager so the file handle is released promptly;
        # convert() returns an independent, fully loaded image.
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        # Apply transformations if any
        if self.transform:
            image = self.transform(image)

        # Explicit dtype keeps empty captions as int64 instead of float32
        caption_seq = torch.tensor(self.captions[idx], dtype=torch.long)

        if self.mode == 'train':
            return image, caption_seq
        return image, caption_seq, img_id

def collate_fn(batch):
    """
    Collate samples whose captions have different lengths.

    Captions are right-padded with zeros up to the longest caption in the batch.
    Args:
        batch (list): Samples of the form (image, caption_seq) or
            (image, caption_seq, image_id).
    Returns:
        For 2-element samples: (images, targets, lengths), where lengths is the
            list of caption lengths before padding.
        For 3-element samples: (images, targets, image_ids), where image_ids is
            the tuple of image filenames.
        In both cases images is the stacked image batch and targets is an int64
        tensor of shape (batch_size, longest_caption).
    """
    has_ids = len(batch[0]) == 3
    if has_ids:
        image_seq, caption_seq, image_ids = zip(*batch)
    else:
        image_seq, caption_seq = zip(*batch)

    images = torch.stack(image_seq, 0)

    # Zero-filled target matrix, then copy each caption into its row
    lengths = [len(caption) for caption in caption_seq]
    targets = torch.zeros((len(caption_seq), max(lengths)), dtype=torch.long)
    for row, (caption, n_tokens) in enumerate(zip(caption_seq, lengths)):
        targets[row, :n_tokens] = caption[:n_tokens]

    return (images, targets, image_ids) if has_ids else (images, targets, lengths)

def get_transform(train=True):
    """
    Build the image preprocessing pipeline.
    Args:
        train (bool): True for the training pipeline (resize to 256x256, random
            224 crop, random horizontal flip); False for the evaluation pipeline
            (plain resize to 224x224).
    Returns:
        transform (callable): Composed transformations ending with tensor
            conversion and ImageNet mean/std normalization.
    """
    if train:
        geometric_steps = [
            transforms.Resize((256, 256)),
            transforms.RandomCrop(224),
            transforms.RandomHorizontalFlip(),
        ]
    else:
        geometric_steps = [transforms.Resize((224, 224))]

    # Tail shared by both pipelines
    tensor_steps = [
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],  # ImageNet mean
            std=[0.229, 0.224, 0.225],   # ImageNet std
        ),
    ]
    return transforms.Compose(geometric_steps + tensor_steps)

import re
from collections import Counter
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt')  # Punkt models are needed by nltk.tokenize.word_tokenize, which tokenize() below calls

def tokenize(text):
    """
    Lower-case a caption and split it into word tokens with NLTK.
    Args:
        text (str): Input caption text.
    Returns:
        tokens (list): List of word tokens.
    """
    return nltk.tokenize.word_tokenize(text.lower())

def build_vocabulary(caption_df, vocab_size=5000):
    """
    Build the vocabulary mappings from the caption table.
    Args:
        caption_df (DataFrame): Table with an "image" column (filename) and a
            "caption" column (caption text).
        vocab_size (int): Maximum size of the vocabulary, special tokens included.
    Returns:
        word2idx (dict): Mapping from words to indices.
        idx2word (dict): Mapping from indices to words.
        image_captions (dict): Mapping from image filenames to their list of captions.
    """
    # image filename -> list of raw caption strings
    image_captions = caption_df.groupby("image")["caption"].apply(list).to_dict()

    # Token frequencies accumulated caption by caption
    word_counts = Counter()
    for captions in image_captions.values():
        for caption in captions:
            word_counts.update(tokenize(caption))

    # Special tokens occupy indices 0..3; the most frequent words follow
    special_tokens = ["<pad>", "<start>", "<end>", "<unk>"]
    frequent_words = [
        word for word, _ in word_counts.most_common(vocab_size - len(special_tokens))
    ]
    vocabulary = special_tokens + frequent_words

    word2idx = {word: idx for idx, word in enumerate(vocabulary)}
    idx2word = dict(enumerate(vocabulary))

    return word2idx, idx2word, image_captions

def convert_captions_to_sequences(image_captions, word2idx):
    """
    Encode every caption as a list of word indices.
    Args:
        image_captions (dict): Mapping from image filenames to their captions.
        word2idx (dict): Mapping from words to indices.
    Returns:
        captions_seqs (dict): Mapping from image filenames to lists of index sequences,
            each wrapped in <start> ... <end>.
        max_length (int): Length of the longest encoded caption (0 if there are none).
    """
    def encode(caption):
        # Wrap with the boundary markers; unknown words map to <unk>
        words = ["<start>", *tokenize(caption), "<end>"]
        return [word2idx.get(word, word2idx["<unk>"]) for word in words]

    captions_seqs = {
        img_name: [encode(caption) for caption in captions]
        for img_name, captions in image_captions.items()
    }
    max_length = max(
        (len(seq) for seqs in captions_seqs.values() for seq in seqs),
        default=0,
    )
    return captions_seqs, max_length

def get_splits(image_names, test_size=0.2, holdout_test_size=0.1, random_state=42):
    """
    Splits the dataset into training, validation, and test sets.

    The split is done in two stages: first ``test_size`` of the images are held
    out from training, then that held-out pool is divided into validation and
    test images. With the defaults this yields roughly 80% train, 18% validation
    and 2% test.

    Args:
        image_names (list): List of image filenames.
        test_size (float): Proportion of the dataset held out from training
            (validation and test combined).
        holdout_test_size (float): Proportion of the held-out pool that becomes
            the test set; the remainder is the validation set.
        random_state (int): Seed used for both splits.
    Returns:
        train_images (list): List of training image filenames.
        val_images (list): List of validation image filenames.
        test_images (list): List of test image filenames.
    """
    # Stage 1: training set vs. held-out pool (validation + test)
    train_images, holdout_images = train_test_split(
        image_names, test_size=test_size, random_state=random_state
    )
    # Stage 2: divide the held-out pool into validation and test sets
    val_images, test_images = train_test_split(
        holdout_images, test_size=holdout_test_size, random_state=random_state
    )
    return train_images, val_images, test_images

def prepare_image2captions(image_ids, captions_seqs, idx2word):
    """
    Decode the index sequences of the given images back into word lists.
    Args:
        image_ids (list): List of image filenames.
        captions_seqs (dict): Mapping from image filenames to sequences of word indices.
        idx2word (dict): Mapping from indices to words.
    Returns:
        image2captions (dict): Mapping from image filenames to their captions as
            lists of lower-cased words, with <start>, <end> and <pad> removed.
    """
    markers = ("<start>", "<end>", "<pad>")

    def decode(seq):
        # Indices missing from idx2word decode to "<unk>", which is kept
        words = (idx2word.get(idx, "<unk>") for idx in seq)
        return [word.lower() for word in words if word not in markers]

    return {
        img_id: [decode(seq) for seq in captions_seqs[img_id]]
        for img_id in image_ids
    }


[nltk_data] Downloading package punkt to /home/jed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
import os

import torch
import torch.nn as nn
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import meteor_score
from PIL import Image

# Evaluate function: Computes validation loss on a given dataset
def evaluate(encoder, decoder, data_loader, criterion, device, vocab_size):
    """
    Compute the mean per-batch loss of the model over a data loader.

    Both models are switched to eval mode and are left in eval mode on return.
    Args:
        encoder: Encoder model.
        decoder: Decoder model.
        data_loader: Iterable yielding (images, captions, extra) triples; the
            third element is ignored.
        criterion: Loss function.
        device: Computation device (CPU or GPU).
        vocab_size: Size of the vocabulary.
    Returns:
        average_loss: Mean of the per-batch losses (each batch weighs the same,
            regardless of its size).
    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    encoder.eval()  # Set encoder to evaluation mode
    decoder.eval()  # Set decoder to evaluation mode
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():  # Disable gradient computation for evaluation
        for images, captions, _ in data_loader:
            # Move data to the computation device
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass through encoder and decoder
            features = encoder(images)
            outputs = decoder(features, captions)

            # Drop step 0 of the outputs (produced from the image features) and
            # the leading <start> token of the targets so both line up
            outputs = outputs[:, 1:, :]
            targets = captions[:, 1:]

            # Flatten to (batch * steps, vocab) and (batch * steps,)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            total_loss += criterion(outputs, targets).item()
            num_batches += 1

    if num_batches == 0:
        # Previously surfaced as an opaque ZeroDivisionError
        raise ValueError("data_loader yielded no batches; cannot compute an average loss.")

    return total_loss / num_batches

# Function to calculate BLEU score for generated captions
def calculate_bleu_score(
    encoder,
    decoder,
    image_dir,
    image_ids,
    image2captions,
    transform,
    idx2word,
    device,
    word2idx,
):
    """
    Corpus-level BLEU of greedily generated captions against their references.
    Args:
        encoder: Encoder model.
        decoder: Decoder model.
        image_dir: Directory containing images.
        image_ids: List of image IDs.
        image2captions: Dictionary mapping image IDs to reference captions (word lists).
        transform: Preprocessing transformation for images.
        idx2word: Mapping from word indices to words.
        device: Computation device (CPU or GPU).
        word2idx: Mapping from words to word indices.
    Returns:
        bleu_score: Corpus BLEU score (NLTK, smoothing method 4).
    """
    ignored_tokens = ["<start>", "<end>", "<pad>", "<unk>"]

    def clean(words):
        # Lower-case and drop the special tokens
        return [word.lower() for word in words if word not in ignored_tokens]

    encoder.eval()
    decoder.eval()
    all_references = []  # per image: list of reference token lists
    all_hypotheses = []  # per image: generated token list
    smoothie = SmoothingFunction().method4

    with torch.no_grad():
        for img_id in image_ids:
            # Load the image and turn it into a batch of one
            pil_image = Image.open(os.path.join(image_dir, img_id)).convert("RGB")
            batch = transform(pil_image).unsqueeze(0).to(device)

            # Greedy decoding
            sampled_ids = decoder.sample(encoder(batch), end_token_idx=word2idx["<end>"])
            generated_words = [idx2word.get(word_id, "<unk>") for word_id in sampled_ids]

            all_hypotheses.append(clean(generated_words))
            all_references.append([clean(ref) for ref in image2captions[img_id]])

    return corpus_bleu(all_references, all_hypotheses, smoothing_function=smoothie)

# Function to calculate METEOR score for generated captions
def calculate_meteor_score(
    encoder,
    decoder,
    image_dir,
    image_ids,
    image2captions,
    transform,
    idx2word,
    device,
    word2idx,
):
    """
    Calculate the mean METEOR score of greedily generated captions.
    Args:
        encoder: Encoder model.
        decoder: Decoder model.
        image_dir: Directory containing images.
        image_ids: List of image IDs.
        image2captions: Dictionary mapping image IDs to reference captions (word lists).
        transform: Preprocessing transformation for images.
        idx2word: Mapping from word indices to words.
        device: Computation device (CPU or GPU).
        word2idx: Mapping from words to word indices.
    Returns:
        average_meteor: Average per-image METEOR score, or 0.0 when ``image_ids``
            is empty.
    """
    ignored_tokens = ("<start>", "<end>", "<pad>", "<unk>")

    def clean(words):
        # Lower-case and drop the special tokens
        return [word.lower() for word in words if word not in ignored_tokens]

    encoder.eval()  # Set encoder to evaluation mode
    decoder.eval()  # Set decoder to evaluation mode
    end_token_idx = word2idx["<end>"]  # loop-invariant, looked up once
    meteor_scores = []

    with torch.no_grad():
        for img_id in image_ids:
            # Load and preprocess image; the file is closed as soon as it is decoded
            with Image.open(os.path.join(image_dir, img_id)) as img:
                image = img.convert("RGB")
            image = transform(image).unsqueeze(0).to(device)

            # Generate caption
            features = encoder(image)
            sampled_ids = decoder.sample(features, end_token_idx=end_token_idx)
            hypothesis = clean(idx2word.get(word_id, "<unk>") for word_id in sampled_ids)

            # References: list of token lists
            references = [clean(ref) for ref in image2captions[img_id]]

            # METEOR score for the current image
            meteor_scores.append(meteor_score(references, hypothesis))

    if not meteor_scores:
        # Nothing to score; avoid dividing by zero
        return 0.0
    return sum(meteor_scores) / len(meteor_scores)

# Function to calculate CIDEr score for generated captions
def calculate_cider_score(
    encoder,
    decoder,
    image_dir,
    image_ids,
    image2captions,
    transform,
    idx2word,
    device,
    word2idx,
):
    """
    CIDEr score of greedily generated captions, computed with pycocoevalcap.
    Args:
        encoder: Encoder model.
        decoder: Decoder model.
        image_dir: Directory containing images.
        image_ids: List of image IDs.
        image2captions: Dictionary mapping image IDs to reference captions (word lists).
        transform: Preprocessing transformation for images.
        idx2word: Mapping from word indices to words.
        device: Computation device (CPU or GPU).
        word2idx: Mapping from words to word indices.
    Returns:
        cider_score: CIDEr score for generated captions.
    """
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

    ignored_tokens = ["<start>", "<end>", "<pad>", "<unk>"]

    def to_sentence(words):
        # Lower-case, drop the special tokens and join into one string
        return " ".join([word.lower() for word in words if word not in ignored_tokens])

    encoder.eval()
    decoder.eval()
    ground_truths = {}  # image id -> list of {'caption': reference sentence}
    generated = {}      # image id -> [{'caption': generated sentence}]
    tokenizer = PTBTokenizer()

    with torch.no_grad():
        for img_id in image_ids:
            # Load the image and turn it into a batch of one
            pil_image = Image.open(os.path.join(image_dir, img_id)).convert("RGB")
            batch = transform(pil_image).unsqueeze(0).to(device)

            # Greedy decoding
            sampled_ids = decoder.sample(encoder(batch), end_token_idx=word2idx["<end>"])
            generated_words = [idx2word.get(word_id, "<unk>") for word_id in sampled_ids]
            sampled_caption_str = to_sentence(generated_words)

            reference_sentences = [to_sentence(ref) for ref in image2captions[img_id]]

            # Raw sentences in the record format handed to the PTB tokenizer below
            ground_truths[img_id] = [{'caption': ref} for ref in reference_sentences]
            generated[img_id] = [{'caption': sampled_caption_str}]

    # Tokenize both sides, then score
    ground_truths = tokenizer.tokenize(ground_truths)
    generated = tokenizer.tokenize(generated)

    cider_score, _ = Cider().compute_score(ground_truths, generated)
    return cider_score


In [5]:
def _save_epoch_plot(num_epochs, curves, ylabel, title, out_path):
    """Draw one line per curve against the epoch number and write the figure to disk.

    Args:
        num_epochs: Number of epochs; the x-axis runs from 1 to ``num_epochs``.
        curves: Iterable of ``(label, values)`` pairs, one per plotted line.
        ylabel: Text for the y-axis label.
        title: Figure title.
        out_path: File path handed to ``plt.savefig``.
    """
    epochs = range(1, num_epochs + 1)
    plt.figure()
    for label, values in curves:
        plt.plot(epochs, values, label=label)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.savefig(out_path)
    plt.close()  # release the figure so repeated calls do not accumulate open figures


def main():
    """Train the captioning encoder/decoder, evaluate it every epoch, and save the results.

    Reads the captions CSV, builds the vocabulary and caption sequences, splits the
    images into training and validation sets, and trains for ``num_epochs`` epochs with
    Adam. After each epoch it computes the validation loss plus BLEU, METEOR and CIDEr on
    the validation images and prints a one-line summary. When training finishes it writes
    the encoder and decoder state dicts and two PNG plots (losses, metrics) to
    ``output_dir``.
    """
    # Define dataset type
    dataset = "Flickr8k"  # Change to "Flickr30k" if needed

    # Paths
    dataset_dir = f"../../flickr_data/{dataset}_Dataset/Images"
    captions_file = f"../../flickr_data/{dataset}_Dataset/captions.txt"
    image_dir = dataset_dir
    output_dir = "models/model_2_image_segmentation_lstm"

    # Create the output directory up front so an unusable location fails
    # before the (long) training run instead of after it.
    os.makedirs(output_dir, exist_ok=True)

    # Per-epoch history, used for the plots at the end
    train_losses = []
    val_losses = []
    bleu_scores = []
    meteor_scores = []
    cider_scores = []

    # Load captions
    caption_df = pd.read_csv(captions_file).dropna().drop_duplicates()
    print(f"Total captions loaded: {len(caption_df)}")

    # Build vocabulary
    word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=5000)
    print(f"Vocabulary size: {len(word2idx)}")

    # Convert captions to sequences
    captions_seqs, max_length = convert_captions_to_sequences(image_captions, word2idx)
    print(f"Maximum caption length: {max_length}")

    # Get data transformations
    train_transform = get_transform(train=True)
    val_transform = get_transform(train=False)

    # Split data into training and validation sets (third split is not used here)
    image_names = list(image_captions.keys())
    train_images, val_images, _ = get_splits(image_names, test_size=0.2)
    print(f"Training samples: {len(train_images)}")
    print(f"Validation samples: {len(val_images)}")

    # Create datasets and data loaders
    train_dataset = FlickrDataset(
        image_dir, train_images, captions_seqs, transform=train_transform
    )
    val_dataset = FlickrDataset(
        image_dir, val_images, captions_seqs, transform=val_transform
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=2,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=32,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=2,
    )
    print(f"Number of training batches: {len(train_loader)}")
    print(f"Number of validation batches: {len(val_loader)}")

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Model hyperparameters
    embed_size = 256
    hidden_size = 512
    vocab_size = len(word2idx)
    input_size = embed_size * 2  # Combined feature size (global + object features)

    # Initialize models
    encoder = EncoderCNN(embed_size=embed_size, device=device).to(device)
    decoder = DecoderRNN(
        input_size=input_size,
        embed_size=embed_size,
        hidden_size=hidden_size,
        vocab_size=vocab_size
    ).to(device)

    # Loss and optimizer: padding positions are excluded from the loss, and only
    # encoder parameters with requires_grad=True are optimized alongside the decoder.
    criterion = nn.CrossEntropyLoss(ignore_index=word2idx["<pad>"])
    params = list(filter(lambda p: p.requires_grad, encoder.parameters())) + list(
        decoder.parameters()
    )
    optimizer = optim.Adam(params, lr=1e-4)
    # No LR scheduler is used; the learning rate stays fixed for the whole run.

    # Prepare image to captions mapping for evaluation
    val_image2captions = prepare_image2captions(val_images, captions_seqs, idx2word)

    # The three metric functions take the same positional arguments; build them once.
    metric_args = (
        encoder,
        decoder,
        image_dir,
        val_images,
        val_image2captions,
        val_transform,
        idx2word,
        device,
        word2idx,
    )

    # Training settings
    num_epochs = 10  # Adjust as needed
    total_step = len(train_loader)

    # Training loop
    for epoch in range(num_epochs):
        start_time = time.time()
        encoder.train()
        decoder.train()
        total_loss = 0

        for i, (images, captions, _) in enumerate(train_loader):
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass
            features = encoder(images)
            outputs = decoder(features, captions)

            # Exclude the first time step from outputs
            outputs = outputs[:, 1:, :]  # Shape: (batch_size, seq_len -1 , vocab_size)
            targets = captions[:, 1:]  # Exclude the first <start> token

            # Flatten to (N, vocab_size) / (N,) as CrossEntropyLoss expects
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            # Compute loss
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            # NOTE(review): only the decoder's gradients are clipped even though the
            # optimizer also updates the encoder's trainable parameters - confirm
            # this is intended before changing it (it would alter training results).
            nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=5)
            optimizer.step()

            total_loss += loss.item()

            if i % 300 == 0:
                print(
                    f"Epoch [{epoch+1}/{num_epochs}], Step [{i}/{total_step}], Loss: {loss.item():.4f}"
                )

        # Calculate average training loss for the epoch
        avg_train_loss = total_loss / total_step

        # Validation
        val_loss = evaluate(encoder, decoder, val_loader, criterion, device, vocab_size)

        # Calculate evaluation metrics
        bleu = calculate_bleu_score(*metric_args)
        meteor = calculate_meteor_score(*metric_args)
        cider = calculate_cider_score(*metric_args)

        # Print epoch summary (duration includes validation and metric computation)
        epoch_duration = time.time() - start_time
        print(
            f"Epoch [{epoch+1}/{num_epochs}], "
            f"Training Loss: {avg_train_loss:.4f}, "
            f"Validation Loss: {val_loss:.4f}, "
            f"BLEU: {bleu:.4f}, "
            f"METEOR: {meteor:.4f}, "
            f"CIDEr: {cider:.4f}, "
            f"Time: {epoch_duration:.2f}s"
        )

        # Record the per-epoch average (not the summed) training loss
        train_losses.append(avg_train_loss)
        val_losses.append(val_loss)
        bleu_scores.append(bleu)
        meteor_scores.append(meteor)
        cider_scores.append(cider)

    # Save the models
    torch.save(encoder.state_dict(), os.path.join(output_dir, "encoder.pth"))
    torch.save(decoder.state_dict(), os.path.join(output_dir, "decoder.pth"))
    print("Models saved successfully.")

    # Plot training and validation loss
    _save_epoch_plot(
        num_epochs,
        [('Training Loss', train_losses), ('Validation Loss', val_losses)],
        ylabel='Loss',
        title='Training vs Validation Loss',
        out_path=os.path.join(output_dir, "loss_plot.png"),
    )

    # Plot evaluation metrics
    _save_epoch_plot(
        num_epochs,
        [
            ('BLEU Score', bleu_scores),
            ('METEOR Score', meteor_scores),
            ('CIDEr Score', cider_scores),
        ],
        ylabel='Score',
        title='Evaluation Metrics over Epochs',
        out_path=os.path.join(output_dir, "metrics_plot.png"),
    )

In [6]:
# Configure CUDA through environment variables before main() touches the GPU:
# devices are enumerated in PCI bus order and only device "3" is made visible.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (synchronous
# kernel launches) - confirm it is still wanted for full training runs.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "3",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

# Start training only when this code runs as the top-level script/notebook.
if __name__ == "__main__":
    main()

Total captions loaded: 40445
Vocabulary size: 5000
Maximum caption length: 40
Training samples: 6472
Validation samples: 1457
Number of training batches: 1011
Number of validation batches: 228
Using device: cuda
Epoch [1/10], Step [0/1011], Loss: 8.5263
Epoch [1/10], Step [300/1011], Loss: 4.7293
Epoch [1/10], Step [600/1011], Loss: 4.2431
Epoch [1/10], Step [900/1011], Loss: 4.0574


In [None]:
# Configure CUDA through environment variables before main() touches the GPU:
# devices are enumerated in PCI bus order and devices 0-3 are made visible.
# NOTE(review): main() selects a single `cuda` device, so the additional visible
# GPUs appear to go unused - confirm whether multi-GPU training was intended.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (synchronous
# kernel launches) - confirm it is still wanted for full training runs.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0,1,2,3",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

# Start training only when this code runs as the top-level script/notebook.
if __name__ == "__main__":
    main()

Total captions loaded: 40445
Vocabulary size: 5000
Maximum caption length: 40
Training samples: 5663
Validation samples: 1214
Number of training batches: 443
Number of validation batches: 95
Using device: cuda
Epoch [1/10], Step [0/443], Loss: 8.5226
Epoch [1/10], Step [250/443], Loss: 4.8943


PTBTokenizer tokenized 76799 tokens at 604079.23 tokens per second.
PTBTokenizer tokenized 12137 tokens at 178160.94 tokens per second.


Epoch [1/10], Training Loss: 5.1644, Validation Loss: 4.5385, BLEU: 0.0628, METEOR: 0.2274, CIDEr: 0.0520, Time: 1652.05s
Epoch [2/10], Step [0/443], Loss: 4.5203
Epoch [2/10], Step [250/443], Loss: 4.2543


PTBTokenizer tokenized 76799 tokens at 600593.52 tokens per second.
PTBTokenizer tokenized 18105 tokens at 240000.19 tokens per second.


Epoch [2/10], Training Loss: 4.3273, Validation Loss: 4.1133, BLEU: 0.0548, METEOR: 0.2730, CIDEr: 0.0824, Time: 1648.78s
Epoch [3/10], Step [0/443], Loss: 4.2273
Epoch [3/10], Step [250/443], Loss: 3.7781


PTBTokenizer tokenized 76799 tokens at 602266.88 tokens per second.
PTBTokenizer tokenized 18371 tokens at 240267.32 tokens per second.


Epoch [3/10], Training Loss: 3.9480, Validation Loss: 3.8199, BLEU: 0.0737, METEOR: 0.3094, CIDEr: 0.1460, Time: 1650.13s
Epoch [4/10], Step [0/443], Loss: 4.0033
Epoch [4/10], Step [250/443], Loss: 3.6350


PTBTokenizer tokenized 76799 tokens at 602651.70 tokens per second.
PTBTokenizer tokenized 17944 tokens at 236690.10 tokens per second.


Epoch [4/10], Training Loss: 3.7100, Validation Loss: 3.6409, BLEU: 0.0754, METEOR: 0.2974, CIDEr: 0.1628, Time: 1647.24s
Epoch [5/10], Step [0/443], Loss: 3.5938
Epoch [5/10], Step [250/443], Loss: 3.5168


PTBTokenizer tokenized 76799 tokens at 599218.59 tokens per second.
PTBTokenizer tokenized 15235 tokens at 208185.08 tokens per second.


Epoch [5/10], Training Loss: 3.5450, Validation Loss: 3.5092, BLEU: 0.1002, METEOR: 0.3227, CIDEr: 0.1924, Time: 1641.45s
Epoch [6/10], Step [0/443], Loss: 3.3434
Epoch [6/10], Step [250/443], Loss: 3.5189


PTBTokenizer tokenized 76799 tokens at 600269.87 tokens per second.
PTBTokenizer tokenized 14860 tokens at 203758.95 tokens per second.


Epoch [6/10], Training Loss: 3.4182, Validation Loss: 3.4157, BLEU: 0.1119, METEOR: 0.3361, CIDEr: 0.2282, Time: 1646.17s
Epoch [7/10], Step [0/443], Loss: 3.3806
Epoch [7/10], Step [250/443], Loss: 3.4217


PTBTokenizer tokenized 76799 tokens at 597234.62 tokens per second.
PTBTokenizer tokenized 14921 tokens at 203617.09 tokens per second.


Epoch [7/10], Training Loss: 3.3160, Validation Loss: 3.3350, BLEU: 0.1253, METEOR: 0.3585, CIDEr: 0.2606, Time: 1643.65s
Epoch [8/10], Step [0/443], Loss: 3.4387
Epoch [8/10], Step [250/443], Loss: 3.0857


PTBTokenizer tokenized 76799 tokens at 613918.33 tokens per second.
PTBTokenizer tokenized 15646 tokens at 212247.02 tokens per second.


Epoch [8/10], Training Loss: 3.2297, Validation Loss: 3.2728, BLEU: 0.1195, METEOR: 0.3495, CIDEr: 0.2583, Time: 1643.00s
Epoch [9/10], Step [0/443], Loss: 2.9769
Epoch [9/10], Step [250/443], Loss: 3.2154


PTBTokenizer tokenized 76799 tokens at 600416.36 tokens per second.
PTBTokenizer tokenized 15075 tokens at 204673.55 tokens per second.


Epoch [9/10], Training Loss: 3.1562, Validation Loss: 3.2245, BLEU: 0.1328, METEOR: 0.3646, CIDEr: 0.2890, Time: 1642.47s
Epoch [10/10], Step [0/443], Loss: 3.0885
Epoch [10/10], Step [250/443], Loss: 3.2401


PTBTokenizer tokenized 76799 tokens at 598609.04 tokens per second.
PTBTokenizer tokenized 15604 tokens at 210714.61 tokens per second.


Epoch [10/10], Training Loss: 3.0911, Validation Loss: 3.1788, BLEU: 0.1383, METEOR: 0.3685, CIDEr: 0.3079, Time: 1640.26s
Models saved successfully.


# END