In [1]:
import os
import time
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from pathlib import Path

notebook_dir = Path(os.getcwd()).resolve()  # Get the current working directory
project_root = notebook_dir.parents[1]  # Adjust the number to go up to the project root
sys.path.append(str(project_root))

print(f"Project root: {project_root}")

Project root: /Users/jed/anaconda3/omscs/CS7643/image-captioning-project


In [2]:
from models.model_2_image_segmentation_lstm.model import *
from data.dataset import *
from data.preprocessing import *
from metrics import *

[nltk_data] Downloading package punkt to /Users/jed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def evaluate(encoder, decoder, data_loader, criterion, device, vocab_size):
    """
    Evaluate the model on the validation set.

    Args:
        encoder (EncoderCNN): The encoder model.
        decoder (DecoderRNN): The decoder model.
        data_loader (DataLoader): DataLoader for the validation set.
        criterion (nn.Module): Loss function.
        device (torch.device): Device to run the evaluation on.
        vocab_size (int): Size of the vocabulary.

    Returns:
        float: Average validation loss.
    """
    encoder.eval()
    decoder.eval()
    total_loss = 0
    total_batches = 0

    with torch.no_grad():
        for images, captions, _ in data_loader:
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass
            memory = encoder(images)  # memory: (batch_size, memory_seq_len, embed_size)
            captions_input = captions[:, :-1]
            targets = captions[:, 1:]

            outputs = decoder(captions_input, memory)  # outputs: (batch_size, seq_len -1, vocab_size)

            # Reshape outputs and targets for loss computation
            outputs = outputs.reshape(-1, vocab_size)  # Shape: (batch_size * (seq_len -1), vocab_size)
            targets = targets.reshape(-1)              # Shape: (batch_size * (seq_len -1))

            # Compute loss
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            total_batches += 1

    average_loss = total_loss / total_batches
    return average_loss

In [4]:
def main():
    # Define dataset type
    dataset = "Flickr8k"  # Change to "Flickr30k" if needed

    random.seed(7643)
    
    # Paths
    dataset_dir = f"../../../../flickr_data/{dataset}_Dataset/Images"
    captions_file = f"../../../../flickr_data/{dataset}_Dataset/captions.txt"
    image_dir = dataset_dir

    train_losses = []
    val_losses = []
    bleu_scores = []
    meteor_scores = []
    cider_scores = []
    
    # Load captions
    caption_df = pd.read_csv(captions_file).dropna().drop_duplicates()

    # Build vocabulary
    word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=8000)
    print(f"Vocabulary size: {len(word2idx)}")

    # Convert captions to sequences
    captions_seqs, max_length = convert_captions_to_sequences(image_captions, word2idx)
    print(f"Maximum caption length: {max_length}")

    # Get data transformations
    train_transform = get_transform(train=True)
    val_transform = get_transform(train=False)

    # Split data into training and validation sets
    image_names = list(image_captions.keys())
    random.shuffle(image_names)
    val_size = int(0.2 * len(image_names))  # 20% for validation
    train_images = image_names[val_size:]
    val_images = image_names[:val_size]
    print(f"Training samples: {len(train_images)}")
    print(f"Validation samples: {len(val_images)}")

    # Create datasets and data loaders
    # Note the mode='train' for val_dataset to ensure it behaves like the training dataset
    train_dataset = FlickrDataset(
        image_dir, train_images, captions_seqs, transform=train_transform, mode='train'
    )
    val_dataset = FlickrDataset(
        image_dir, val_images, captions_seqs, transform=val_transform, mode='train'
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=2,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=32,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=2,
    )

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize models
    embed_size = 256
    hidden_size = 512
    vocab_size = len(word2idx)
    encoder = Encoder(embed_size=embed_size).to(device)
    decoder = Decoder(
        embed_size=embed_size, hidden_size=hidden_size, vocab_size=vocab_size
    ).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=word2idx["<pad>"])
    params = list(filter(lambda p: p.requires_grad, encoder.parameters())) + list(
        decoder.parameters()
    )
    optimizer = optim.Adam(params, lr=3e-4)
    scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

    num_epochs = 10
    total_step = len(train_loader)

    val_image_ids = val_images
    image2captions = prepare_image2captions(val_image_ids, captions_seqs, idx2word)

    for epoch in range(num_epochs):
        start_time = time.time()
        encoder.train()
        decoder.train()
        total_loss = 0

        for i, (images, captions, lengths) in enumerate(train_loader):
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass
            features = encoder(images)
            outputs = decoder(features, captions)

            # Prepare targets
            targets = captions[:, 1:]  # Exclude the first <start> token

            # Exclude the first time step from outputs
            outputs = outputs[:, 1:, :]

            # Reshape for loss computation
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            # Compute loss
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=5)
            optimizer.step()

            total_loss += loss.item()

            if i % 300 == 0:
                print(
                    f"Epoch [{epoch+1}/{num_epochs}], Step [{i}/{total_step}], Loss: {loss.item():.4f}"
                )

        # Calculate average training loss
        avg_train_loss = total_loss / total_step
        train_losses.append(avg_train_loss)

        # Adjust learning rate
        scheduler.step()

        # Validation
        val_loss = evaluate(encoder, decoder, val_loader, criterion, device, vocab_size)
        print(
            f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}"
        )

        # Calculate evaluation metrics
        bleu = calculate_bleu_score(
            encoder=encoder,
            decoder=decoder,
            image_dir=image_dir,
            image_ids=val_image_ids,
            image2captions=image2captions,
            transform=val_transform,
            idx2word=idx2word,
            device=device,
            word2idx=word2idx,
        )

        meteor = calculate_meteor_score(
            encoder=encoder,
            decoder=decoder,
            image_dir=image_dir,
            image_ids=val_image_ids,
            image2captions=image2captions,
            transform=val_transform,
            idx2word=idx2word,
            device=device,
            word2idx=word2idx,
        )

        cider = calculate_cider_score(
            encoder=encoder,
            decoder=decoder,
            image_dir=image_dir,
            image_ids=val_image_ids,
            image2captions=image2captions,
            transform=val_transform,
            idx2word=idx2word,
            device=device,
            word2idx=word2idx,
        )

        end_time = time.time()
        epoch_time = end_time - start_time

        print(
            f"Epoch [{epoch+1}/{num_epochs}] completed in {epoch_time:.2f} seconds."
        )
        print(
            f"BLEU Score: {bleu:.4f}, METEOR Score: {meteor:.4f}, CIDEr Score: {cider:.4f}\n"
        )

        val_losses.append(val_loss)
        bleu_scores.append(bleu)
        meteor_scores.append(meteor)
        cider_scores.append(cider)

    # Save the models
    os.makedirs("models/model_2_image_segmentation_lstm", exist_ok=True)
    torch.save(encoder.state_dict(), "models/model_2_image_segmentation_lstm/encoder.pth")
    torch.save(decoder.state_dict(), "models/model_2_image_segmentation_lstm/decoder.pth")
    print("Models saved successfully.")
    
    # Plot training and validation loss
    plt.figure()
    plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss')
    plt.plot(range(1, num_epochs + 1), val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training vs Validation Loss')
    plt.legend()
    plt.savefig('models/model_2_image_segmentation_lstm/loss_plot.png')
    plt.close()

    # Plot evaluation metrics
    plt.figure()
    plt.plot(range(1, num_epochs + 1), bleu_scores, label='BLEU Score')
    plt.plot(range(1, num_epochs + 1), meteor_scores, label='METEOR Score')
    plt.plot(range(1, num_epochs + 1), cider_scores, label='CIDEr Score')
    plt.xlabel('Epoch')
    plt.ylabel('Score')
    plt.title('Evaluation Metrics over Epochs')
    plt.legend()
    plt.savefig('models/model_2_image_segmentation_lstm/metrics_plot.png')
    plt.close()

In [8]:
%%time

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

if __name__ == "__main__":
    main()

Vocabulary size: 8000
Maximum caption length: 40
Training samples: 6473
Validation samples: 1618
Using device: cuda
Epoch [1/10], Step [0/1012], Loss: 8.9864
Epoch [1/10], Step [300/1012], Loss: 4.3169
Epoch [1/10], Step [600/1012], Loss: 3.6412
Epoch [1/10], Step [900/1012], Loss: 3.5921
Epoch [1/10], Training Loss: 4.2353, Validation Loss: 3.6200


PTBTokenizer tokenized 103083 tokens at 731726.90 tokens per second.
PTBTokenizer tokenized 19633 tokens at 251619.97 tokens per second.


Epoch [1/10] completed in 338.07 seconds.
BLEU Score: 0.0968, METEOR Score: 0.3177, CIDEr Score: 0.1680

Epoch [2/10], Step [0/1012], Loss: 3.6844
Epoch [2/10], Step [300/1012], Loss: 3.3987
Epoch [2/10], Step [600/1012], Loss: 3.4978
Epoch [2/10], Step [900/1012], Loss: 3.4225
Epoch [2/10], Training Loss: 3.4205, Validation Loss: 3.3537


PTBTokenizer tokenized 103083 tokens at 731919.47 tokens per second.
PTBTokenizer tokenized 24556 tokens at 293989.62 tokens per second.


Epoch [2/10] completed in 315.61 seconds.
BLEU Score: 0.0822, METEOR Score: 0.3166, CIDEr Score: 0.1535

Epoch [3/10], Step [0/1012], Loss: 3.0825
Epoch [3/10], Step [300/1012], Loss: 3.2001
Epoch [3/10], Step [600/1012], Loss: 3.1571
Epoch [3/10], Step [900/1012], Loss: 2.9089
Epoch [3/10], Training Loss: 3.1857, Validation Loss: 3.2249


PTBTokenizer tokenized 103083 tokens at 727355.68 tokens per second.
PTBTokenizer tokenized 23698 tokens at 288130.64 tokens per second.


Epoch [3/10] completed in 340.15 seconds.
BLEU Score: 0.0968, METEOR Score: 0.3309, CIDEr Score: 0.1840

Epoch [4/10], Step [0/1012], Loss: 3.0111
Epoch [4/10], Step [300/1012], Loss: 2.9187
Epoch [4/10], Step [600/1012], Loss: 2.9521
Epoch [4/10], Step [900/1012], Loss: 3.5380
Epoch [4/10], Training Loss: 3.0294, Validation Loss: 3.1316


PTBTokenizer tokenized 103083 tokens at 662865.49 tokens per second.
PTBTokenizer tokenized 25863 tokens at 308172.51 tokens per second.


Epoch [4/10] completed in 400.92 seconds.
BLEU Score: 0.0940, METEOR Score: 0.3304, CIDEr Score: 0.1802

Epoch [5/10], Step [0/1012], Loss: 2.8132
Epoch [5/10], Step [300/1012], Loss: 2.9351
Epoch [5/10], Step [600/1012], Loss: 3.0704
Epoch [5/10], Step [900/1012], Loss: 2.7597
Epoch [5/10], Training Loss: 2.9112, Validation Loss: 3.0780


PTBTokenizer tokenized 103083 tokens at 722502.89 tokens per second.
PTBTokenizer tokenized 23971 tokens at 290312.47 tokens per second.


Epoch [5/10] completed in 385.19 seconds.
BLEU Score: 0.1017, METEOR Score: 0.3293, CIDEr Score: 0.1874

Epoch [6/10], Step [0/1012], Loss: 2.8013
Epoch [6/10], Step [300/1012], Loss: 2.7281
Epoch [6/10], Step [600/1012], Loss: 3.0820
Epoch [6/10], Step [900/1012], Loss: 2.7048
Epoch [6/10], Training Loss: 2.7905, Validation Loss: 3.0552


PTBTokenizer tokenized 103083 tokens at 729631.37 tokens per second.
PTBTokenizer tokenized 22786 tokens at 279105.05 tokens per second.


Epoch [6/10] completed in 362.72 seconds.
BLEU Score: 0.1010, METEOR Score: 0.3300, CIDEr Score: 0.2012

Epoch [7/10], Step [0/1012], Loss: 2.6420
Epoch [7/10], Step [300/1012], Loss: 2.6843
Epoch [7/10], Step [600/1012], Loss: 2.6753
Epoch [7/10], Step [900/1012], Loss: 2.8778
Epoch [7/10], Training Loss: 2.7721, Validation Loss: 3.0496


PTBTokenizer tokenized 103083 tokens at 723954.52 tokens per second.
PTBTokenizer tokenized 22698 tokens at 279797.00 tokens per second.


Epoch [7/10] completed in 327.76 seconds.
BLEU Score: 0.0995, METEOR Score: 0.3305, CIDEr Score: 0.2025

Epoch [8/10], Step [0/1012], Loss: 2.7047
Epoch [8/10], Step [300/1012], Loss: 2.9350
Epoch [8/10], Step [600/1012], Loss: 2.8687
Epoch [8/10], Step [900/1012], Loss: 2.9722
Epoch [8/10], Training Loss: 2.7609, Validation Loss: 3.0455


PTBTokenizer tokenized 103083 tokens at 729351.80 tokens per second.
PTBTokenizer tokenized 22654 tokens at 278315.47 tokens per second.


Epoch [8/10] completed in 313.63 seconds.
BLEU Score: 0.1028, METEOR Score: 0.3305, CIDEr Score: 0.2049

Epoch [9/10], Step [0/1012], Loss: 2.4623
Epoch [9/10], Step [300/1012], Loss: 2.6595
Epoch [9/10], Step [600/1012], Loss: 2.6937
Epoch [9/10], Step [900/1012], Loss: 2.7075
Epoch [9/10], Training Loss: 2.7480, Validation Loss: 3.0444


PTBTokenizer tokenized 103083 tokens at 720846.40 tokens per second.
PTBTokenizer tokenized 22747 tokens at 276437.58 tokens per second.


Epoch [9/10] completed in 360.59 seconds.
BLEU Score: 0.1010, METEOR Score: 0.3313, CIDEr Score: 0.2005

Epoch [10/10], Step [0/1012], Loss: 2.7413
Epoch [10/10], Step [300/1012], Loss: 2.8509
Epoch [10/10], Step [600/1012], Loss: 2.8172
Epoch [10/10], Step [900/1012], Loss: 2.9725
Epoch [10/10], Training Loss: 2.7384, Validation Loss: 3.0408


PTBTokenizer tokenized 103083 tokens at 725554.42 tokens per second.
PTBTokenizer tokenized 22407 tokens at 276139.12 tokens per second.


Epoch [10/10] completed in 403.17 seconds.
BLEU Score: 0.1068, METEOR Score: 0.3327, CIDEr Score: 0.2094

Models saved successfully.
CPU times: user 13h 40min 30s, sys: 1min 42s, total: 13h 42min 12s
Wall time: 59min 16s


# Testing

In [5]:
import random

dataset = "Flickr8k"

captions_file_path = f"{project_root}/flickr_data/{dataset}_Dataset/captions.txt"
image_dir = f"{project_root}/flickr_data/{dataset}_Dataset/Images"

# Load captions
caption_df = pd.read_csv(captions_file_path).dropna().drop_duplicates()

# Build vocabulary with vocab_size=5000
word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=8000)

# Convert captions to sequences
captions_seqs, max_length = convert_captions_to_sequences(image_captions, word2idx)

# Get data transformations
test_transform = get_transform(train=False)

# Split data into training and validation sets
image_names = list(image_captions.keys())
random.shuffle(image_names)
val_size = int(0.2 * len(image_names))  # 20% for validation
test_images = image_names[:val_size]

# Randomly select 6 images from the test_images
sampled_test_images = random.sample(test_images, 6)

# Prepare image to captions mapping for ground truth captions
test_image2captions = prepare_image2captions(sampled_test_images, captions_seqs, idx2word)

# Create test dataset and data loader for only those 6 randomly selected images
test_dataset = FlickrDataset(
    image_dir, sampled_test_images, captions_seqs, transform=test_transform, mode='test'
)
test_loader = DataLoader(
    test_dataset,
    batch_size=1,  # Process one image at a time
    shuffle=False,
    collate_fn=collate_fn, 
    num_workers=2,
)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize models
embed_size = 256
hidden_size = 512
vocab_size = len(word2idx)

encoder = Encoder(embed_size=embed_size).to(device)
decoder = Decoder(
    embed_size=embed_size,
    hidden_size=hidden_size,
    vocab_size=vocab_size
).to(device)

# Load trained models
encoder_path = os.path.join(project_root, "models/model_2_image_segmentation_lstm/encoder.pth")
decoder_path = os.path.join(project_root, "models/model_2_image_segmentation_lstm/decoder.pth")

encoder.load_state_dict(
    torch.load(encoder_path, map_location=device)
)
decoder.load_state_dict(
    torch.load(decoder_path, map_location=device)
)

encoder.eval()
decoder.eval()
end_token_idx = word2idx.get('<end>', None)

# Generate captions on the randomly selected test images
for i, (images, captions, image_ids) in enumerate(test_loader):
    images = images.to(device)
    with torch.no_grad():
        features = encoder(images)
        sampled_ids = decoder.sample(features, end_token_idx=end_token_idx)
    
    # Convert word IDs to words
    sampled_caption = [idx2word.get(word_id, '<unk>') for word_id in sampled_ids]
    
    # Remove words after (and including) the '<end>' token
    if '<end>' in sampled_caption:
        end_index = sampled_caption.index('<end>')
        sampled_caption = sampled_caption[:end_index]
    
    generated_caption = ' '.join(sampled_caption)

    # Get ground truth captions
    image_name = image_ids[0]
    gt_captions = test_image2captions.get(image_name, [])

    if not gt_captions:
        print(f'Image ID: {image_name}')
        print('Generated Caption:', generated_caption)
        print('Ground Truth Captions: None')
        print('------------------------------------')
        continue

    print(f'Image ID: {image_name}')
    print(f'Generated Caption: {generated_caption}')
    print('Ground Truth Captions:')
    for gt_caption in gt_captions:
        print(f'- {" ".join(gt_caption)}')
    print('------------------------------------')

Using device: cpu


  torch.load(encoder_path, map_location=device)
  torch.load(decoder_path, map_location=device)


Image ID: 2661294481_b86058b504.jpg
Generated Caption: a man in a blue shirt is walking down a street .
Ground Truth Captions:
- a crowd begins to march in a small downtown setting .
- a group of people holding signs and marching through the streets .
- a group of people with signs and banners walking down the street
- a protest march is crossing the intersection of two streets near to a building with red awnings .
- many people hold signs and march down the street .
------------------------------------
Image ID: 3334537556_a2cf4e9b9a.jpg
Generated Caption: a group of people are standing on a rocky beach .
Ground Truth Captions:
- a group of people on skis with two dogs .
- a man dressed in a horned hat poses for a picture on skis with three other people and a dog .
- four people holding each others shoulders with a brown dog in front of them , standing on snow .
- people dressed in costumes are on skis .
- people , some dressed in costumes , and dogs on a snowy mountain .
------------

In [5]:
image2captions = prepare_image2captions(test_images, captions_seqs, idx2word)

# Calculate BLEU, METEOR, and CIDEr scores on the sampled test images
bleu_score = calculate_bleu_score(
    encoder=encoder,
    decoder=decoder,
    image_dir=image_dir,
    image_ids=test_images,
    image2captions=image2captions,
    transform=test_transform,
    idx2word=idx2word,
    device=device,
    word2idx=word2idx
)

meteor = calculate_meteor_score(
    encoder=encoder,
    decoder=decoder,
    image_dir=image_dir,
    image_ids=test_images,
    image2captions=image2captions,
    transform=test_transform,
    idx2word=idx2word,
    device=device,
    word2idx=word2idx
)

cider = calculate_cider_score(
    encoder=encoder,
    decoder=decoder,
    image_dir=image_dir,
    image_ids=test_images,
    image2captions=image2captions,
    transform=test_transform,
    idx2word=idx2word,
    device=device,
    word2idx=word2idx
)

print("BLEU Score:", bleu_score)
print("METEOR Score:", meteor)
print("CIDEr Score:", cider)

PTBTokenizer tokenized 103060 tokens at 880286.97 tokens per second.
PTBTokenizer tokenized 22341 tokens at 317247.62 tokens per second.


BLEU Score: 0.1144375291394959
METEOR Score: 0.3384300144639662
CIDEr Score: 0.2268791450509689


# END