In [1]:
import os
import time
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from pathlib import Path

notebook_dir = Path(os.getcwd()).resolve()  # Get the current working directory
project_root = notebook_dir.parents[1]  # Adjust the number to go up to the project root
sys.path.append(str(project_root))

print(f"Project root: {project_root}")

Project root: /Users/jed/anaconda3/omscs/CS7643/image-captioning-project


In [2]:
from models.model_1_baseline_cnn_lstm.model import *
from data.dataset import *
from data.preprocessing import *
from metrics import *

[nltk_data] Downloading package punkt to /Users/jed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Evaluate function: Computes validation loss on a given dataset
def evaluate(encoder, decoder, data_loader, criterion, device, vocab_size):
    """
    Evaluate the model on the validation set.
    Args:
        encoder: Encoder model.
        decoder: Decoder model.
        data_loader: DataLoader for the validation set.
        criterion: Loss function.
        device: Computation device (CPU or GPU).
        vocab_size: Size of the vocabulary.
    Returns:
        average_loss: Average validation loss.
    """
    encoder.eval()  # Set encoder to evaluation mode
    decoder.eval()  # Set decoder to evaluation mode
    total_loss = 0
    total_samples = 0

    with torch.no_grad():  # Disable gradient computation for evaluation
        for images, captions, _ in data_loader:
            # Move data to the computation device
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass through encoder and decoder
            features = encoder(images)
            outputs = decoder(features, captions)

            # Exclude the first time step from outputs and targets
            outputs = outputs[:, 1:, :]  # Ensure outputs and targets have the same length
            targets = captions[:, 1:]  # Exclude the first <start> token from targets

            # Reshape outputs and targets for loss computation
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            # Compute loss
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            total_samples += 1

    # Calculate average loss
    average_loss = total_loss / total_samples
    return average_loss

# Training

In [5]:
def main():
    # Define dataset type
    dataset = "Flickr8k"  # or Flickr30k if needed
    
    random.seed(7643)

    # Paths
    dataset_dir = f"{project_root}/flickr_data/{dataset}_Dataset/Images"
    captions_file = f"{project_root}/flickr_data/{dataset}_Dataset/captions.txt"
    image_dir = dataset_dir

    train_losses = []
    val_losses = []
    bleu_scores = []
    meteor_scores = []
    cider_scores = []

    # Load captions
    caption_df = pd.read_csv(captions_file).dropna().drop_duplicates()

    # Build vocabulary
    word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=8000)
    print(f"Vocabulary size: {len(word2idx)}")

    # Convert captions to sequences
    captions_seqs, max_length = convert_captions_to_sequences(image_captions, word2idx)
    print(f"Maximum caption length: {max_length}")

    # Get data transformations
    train_transform = get_transform(train=True)
    val_transform = get_transform(train=False)

    # Split data into training and validation sets
    image_names = list(image_captions.keys())
    random.shuffle(image_names)
    val_size = int(0.2 * len(image_names))  # 20% for validation
    train_images = image_names[val_size:]
    val_images = image_names[:val_size]
    print(f"Training samples: {len(train_images)}")
    print(f"Validation samples: {len(val_images)}")

    # Create datasets and data loaders
    # Note the mode='train' for val_dataset to ensure it behaves like the training dataset
    train_dataset = FlickrDataset(
        image_dir, train_images, captions_seqs, transform=train_transform, mode='train'
    )
    val_dataset = FlickrDataset(
        image_dir, val_images, captions_seqs, transform=val_transform, mode='train'
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=64,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=2,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=64,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=2,
    )
    print(f"Number of training batches: {len(train_loader)}")
    print(f"Number of validation batches: {len(val_loader)}")

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize models
    embed_size = 256
    hidden_size = 512
    vocab_size = len(word2idx)
    encoder = Encoder(embed_size=embed_size).to(device)
    decoder = Decoder(
        embed_size=embed_size, hidden_size=hidden_size, vocab_size=vocab_size
    ).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=word2idx["<pad>"])
    params = list(filter(lambda p: p.requires_grad, encoder.parameters())) + list(
        decoder.parameters()
    )
    optimizer = optim.Adam(params, lr=3e-4)
    scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

    num_epochs = 10
    total_step = len(train_loader)
    end_token_idx = word2idx["<end>"]

    val_image_ids = val_images
    image2captions = prepare_image2captions(val_image_ids, captions_seqs, idx2word)

    for epoch in range(num_epochs):
        start_time = time.time()
        encoder.train()
        decoder.train()
        total_loss = 0

        for i, (images, captions, lengths) in enumerate(train_loader):
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass
            features = encoder(images)
            outputs = decoder(features, captions)

            # Prepare targets
            targets = captions[:, 1:]  # Exclude the first <start> token

            # Exclude the first time step from outputs
            outputs = outputs[:, 1:, :]

            # Reshape for loss computation
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            # Compute loss
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=5)
            optimizer.step()

            total_loss += loss.item()

            if i % 300 == 0:
                print(
                    f"Epoch [{epoch+1}/{num_epochs}], Step [{i}/{total_step}], Loss: {loss.item():.4f}"
                )

        # Calculate average training loss
        avg_train_loss = total_loss / total_step
        train_losses.append(avg_train_loss)

        # Adjust learning rate
        scheduler.step()

        # Validation
        val_loss = evaluate(encoder, decoder, val_loader, criterion, device, vocab_size)
        print(
            f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}"
        )

        # Calculate evaluation metrics
        bleu = calculate_bleu_score(
            encoder=encoder,
            decoder=decoder,
            image_dir=image_dir,
            image_ids=val_image_ids,
            image2captions=image2captions,
            transform=val_transform,
            idx2word=idx2word,
            device=device,
            word2idx=word2idx,
        )

        meteor = calculate_meteor_score(
            encoder=encoder,
            decoder=decoder,
            image_dir=image_dir,
            image_ids=val_image_ids,
            image2captions=image2captions,
            transform=val_transform,
            idx2word=idx2word,
            device=device,
            word2idx=word2idx,
        )

        cider = calculate_cider_score(
            encoder=encoder,
            decoder=decoder,
            image_dir=image_dir,
            image_ids=val_image_ids,
            image2captions=image2captions,
            transform=val_transform,
            idx2word=idx2word,
            device=device,
            word2idx=word2idx,
        )

        end_time = time.time()
        epoch_time = end_time - start_time

        print(
            f"Epoch [{epoch+1}/{num_epochs}] completed in {epoch_time:.2f} seconds."
        )
        print(
            f"BLEU Score: {bleu:.4f}, METEOR Score: {meteor:.4f}, CIDEr Score: {cider:.4f}\n"
        )

        val_losses.append(val_loss)
        bleu_scores.append(bleu)
        meteor_scores.append(meteor)
        cider_scores.append(cider)

    # Save the models
    os.makedirs("models/model_1_baseline_cnn_lstm", exist_ok=True)
    torch.save(encoder.state_dict(), "models/model_1_baseline_cnn_lstm/encoder.pth")
    torch.save(decoder.state_dict(), "models/model_1_baseline_cnn_lstm/decoder.pth")
    
    # Plot training and validation loss
    plt.figure()
    plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss')
    plt.plot(range(1, num_epochs + 1), val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training vs Validation Loss')
    plt.legend()
    plt.savefig('models/model_1_baseline_cnn_lstm/loss_plot.png')
    plt.close()

    # Plot evaluation metrics
    plt.figure()
    plt.plot(range(1, num_epochs + 1), bleu_scores, label='BLEU Score')
    plt.plot(range(1, num_epochs + 1), meteor_scores, label='METEOR Score')
    plt.plot(range(1, num_epochs + 1), cider_scores, label='CIDEr Score')
    plt.xlabel('Epoch')
    plt.ylabel('Score')
    plt.title('Evaluation Metrics over Epochs')
    plt.legend()
    plt.savefig('models/model_1_baseline_cnn_lstm/metrics_plot.png')
    plt.close()

In [6]:
%%time

if __name__ == "__main__":
    main()

Using device: cuda
Epoch [1/10], Step [0/506], Loss: 8.9884
Epoch [1/10], Step [300/506], Loss: 4.2047
Epoch [1/10], Training Loss: 4.5941, Validation Loss: 3.8460


PTBTokenizer tokenized 103464 tokens at 737051.85 tokens per second.
PTBTokenizer tokenized 25461 tokens at 305143.86 tokens per second.


Epoch [1/10] completed in 325.10 seconds.
BLEU Score: 0.0750, METEOR Score: 0.2961, CIDEr Score: 0.1440

Epoch [2/10], Step [0/506], Loss: 3.9051
Epoch [2/10], Step [300/506], Loss: 3.5489
Epoch [2/10], Training Loss: 3.6125, Validation Loss: 3.4748


PTBTokenizer tokenized 103464 tokens at 725576.81 tokens per second.
PTBTokenizer tokenized 22131 tokens at 274827.31 tokens per second.


Epoch [2/10] completed in 314.77 seconds.
BLEU Score: 0.0920, METEOR Score: 0.3070, CIDEr Score: 0.1711

Epoch [3/10], Step [0/506], Loss: 3.3565
Epoch [3/10], Step [300/506], Loss: 3.5784
Epoch [3/10], Training Loss: 3.3238, Validation Loss: 3.2837


PTBTokenizer tokenized 103464 tokens at 725878.16 tokens per second.
PTBTokenizer tokenized 23096 tokens at 283143.32 tokens per second.


Epoch [3/10] completed in 316.39 seconds.
BLEU Score: 0.1031, METEOR Score: 0.3239, CIDEr Score: 0.2145

Epoch [4/10], Step [0/506], Loss: 3.0823
Epoch [4/10], Step [300/506], Loss: 3.4129
Epoch [4/10], Training Loss: 3.1408, Validation Loss: 3.1623


PTBTokenizer tokenized 103464 tokens at 721784.90 tokens per second.
PTBTokenizer tokenized 25094 tokens at 298166.41 tokens per second.


Epoch [4/10] completed in 322.50 seconds.
BLEU Score: 0.0965, METEOR Score: 0.3154, CIDEr Score: 0.2121

Epoch [5/10], Step [0/506], Loss: 2.8671
Epoch [5/10], Step [300/506], Loss: 3.1885
Epoch [5/10], Training Loss: 3.0091, Validation Loss: 3.0762


PTBTokenizer tokenized 103464 tokens at 729525.75 tokens per second.
PTBTokenizer tokenized 22572 tokens at 277436.43 tokens per second.


Epoch [5/10] completed in 320.14 seconds.
BLEU Score: 0.1267, METEOR Score: 0.3569, CIDEr Score: 0.2776

Epoch [6/10], Step [0/506], Loss: 2.8784
Epoch [6/10], Step [300/506], Loss: 2.9786
Epoch [6/10], Training Loss: 2.8976, Validation Loss: 3.0542


PTBTokenizer tokenized 103464 tokens at 731172.43 tokens per second.
PTBTokenizer tokenized 23036 tokens at 279777.02 tokens per second.


Epoch [6/10] completed in 318.46 seconds.
BLEU Score: 0.1262, METEOR Score: 0.3557, CIDEr Score: 0.2839

Epoch [7/10], Step [0/506], Loss: 2.8493
Epoch [7/10], Step [300/506], Loss: 2.8290
Epoch [7/10], Training Loss: 2.8821, Validation Loss: 3.0466


PTBTokenizer tokenized 103464 tokens at 731738.16 tokens per second.
PTBTokenizer tokenized 22423 tokens at 268795.62 tokens per second.


Epoch [7/10] completed in 319.11 seconds.
BLEU Score: 0.1308, METEOR Score: 0.3605, CIDEr Score: 0.2875

Epoch [8/10], Step [0/506], Loss: 2.7277
Epoch [8/10], Step [300/506], Loss: 2.7741
Epoch [8/10], Training Loss: 2.8712, Validation Loss: 3.0405


PTBTokenizer tokenized 103464 tokens at 733707.25 tokens per second.
PTBTokenizer tokenized 22985 tokens at 281248.35 tokens per second.


Epoch [8/10] completed in 316.73 seconds.
BLEU Score: 0.1291, METEOR Score: 0.3563, CIDEr Score: 0.2861

Epoch [9/10], Step [0/506], Loss: 2.9517
Epoch [9/10], Step [300/506], Loss: 2.9790
Epoch [9/10], Training Loss: 2.8599, Validation Loss: 3.0340


PTBTokenizer tokenized 103464 tokens at 730037.37 tokens per second.
PTBTokenizer tokenized 22840 tokens at 275565.72 tokens per second.


Epoch [9/10] completed in 318.02 seconds.
BLEU Score: 0.1289, METEOR Score: 0.3581, CIDEr Score: 0.2900

Epoch [10/10], Step [0/506], Loss: 2.8555
Epoch [10/10], Step [300/506], Loss: 2.9985
Epoch [10/10], Training Loss: 2.8482, Validation Loss: 3.0280


PTBTokenizer tokenized 103464 tokens at 726141.14 tokens per second.
PTBTokenizer tokenized 22620 tokens at 279787.09 tokens per second.


Epoch [10/10] completed in 399.83 seconds.
BLEU Score: 0.1305, METEOR Score: 0.3579, CIDEr Score: 0.2937

CPU times: user 7h 26min 58s, sys: 12min 43s, total: 7h 39min 42s
Wall time: 54min 40s


# Testing

In [4]:
import random

dataset = "Flickr8k"

captions_file_path = f"{project_root}/flickr_data/{dataset}_Dataset/captions.txt"
image_dir = f"{project_root}/flickr_data/{dataset}_Dataset/Images"

# Load captions
caption_df = pd.read_csv(captions_file_path).dropna().drop_duplicates()

# Build vocabulary with vocab_size=5000
word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=8000)

# Convert captions to sequences
captions_seqs, max_length = convert_captions_to_sequences(image_captions, word2idx)

# Get data transformations
test_transform = get_transform(train=False)

# Split data into training and validation sets
image_names = list(image_captions.keys())
random.shuffle(image_names)
val_size = int(0.2 * len(image_names))  # 20% for validation
test_images = image_names[:val_size]

# Randomly select 6 images from the test_images
sampled_test_images = random.sample(test_images, 6)

# Prepare image to captions mapping for ground truth captions
test_image2captions = prepare_image2captions(sampled_test_images, captions_seqs, idx2word)

# Create test dataset and data loader for only those 6 randomly selected images
test_dataset = FlickrDataset(
    image_dir, sampled_test_images, captions_seqs, transform=test_transform, mode='test'
)
test_loader = DataLoader(
    test_dataset,
    batch_size=1,  # Process one image at a time
    shuffle=False,
    collate_fn=collate_fn, 
    num_workers=2,
)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize models
embed_size = 256
hidden_size = 512
vocab_size = len(word2idx)

encoder = Encoder(embed_size=embed_size).to(device)
decoder = Decoder(
    embed_size=embed_size,
    hidden_size=hidden_size,
    vocab_size=vocab_size
).to(device)

# Load trained models
encoder_path = os.path.join(project_root, "models/model_1_baseline_cnn_lstm/encoder.pth")
decoder_path = os.path.join(project_root, "models/model_1_baseline_cnn_lstm/decoder.pth")

encoder.load_state_dict(
    torch.load(encoder_path, map_location=device)
)
decoder.load_state_dict(
    torch.load(decoder_path, map_location=device)
)

encoder.eval()
decoder.eval()
end_token_idx = word2idx.get('<end>', None)

# Generate captions on the randomly selected test images
for i, (images, captions, image_ids) in enumerate(test_loader):
    images = images.to(device)
    with torch.no_grad():
        features = encoder(images)
        sampled_ids = decoder.sample(features, end_token_idx=end_token_idx)
    
    # Convert word IDs to words
    sampled_caption = [idx2word.get(word_id, '<unk>') for word_id in sampled_ids]
    
    # Remove words after (and including) the '<end>' token
    if '<end>' in sampled_caption:
        end_index = sampled_caption.index('<end>')
        sampled_caption = sampled_caption[:end_index]
    
    generated_caption = ' '.join(sampled_caption)

    # Get ground truth captions
    image_name = image_ids[0]
    gt_captions = test_image2captions.get(image_name, [])

    if not gt_captions:
        print(f'Image ID: {image_name}')
        print('Generated Caption:', generated_caption)
        print('Ground Truth Captions: None')
        print('------------------------------------')
        continue

    print(f'Image ID: {image_name}')
    print(f'Generated Caption: {generated_caption}')
    print('Ground Truth Captions:')
    for gt_caption in gt_captions:
        print(f'- {" ".join(gt_caption)}')
    print('------------------------------------')

Using device: cpu


  torch.load(encoder_path, map_location=device)
  torch.load(decoder_path, map_location=device)


Image ID: 2371809188_b805497cba.jpg
Generated Caption: a man in a red shirt is standing on a bench with a red and white dog .
Ground Truth Captions:
- a boy climbs an indoor rock climbing wall .
- a boy climbs a rock wall .
- a boy is climbing up a rock-climbing wall while an older boy stands on the ground
- boy rock climbing on a blue wall while an adult looks away .
- little boy climbing an indoor rock climbing wall .
------------------------------------
Image ID: 3430607596_7e4f74e3ff.jpg
Generated Caption: a boy in a red shirt is jumping into the water .
Ground Truth Captions:
- a boy in a red suit plays in the water .
- a boy in a red swimsuit jumps into the water to join two people .
- a boy takes a flying leap into the water .
- the boy in the red shorts jumps into the water to join other people .
- the boy wearing red shorts is jumping into the river as other children swim .
------------------------------------
Image ID: 3545779287_8f52e06909.jpg
Generated Caption: a black dog 

In [8]:
image2captions = prepare_image2captions(test_images, captions_seqs, idx2word)

# Calculate BLEU, METEOR, and CIDEr scores on the sampled test images
bleu_score = calculate_bleu_score(
    encoder=encoder,
    decoder=decoder,
    image_dir=image_dir,
    image_ids=test_images,
    image2captions=image2captions,
    transform=test_transform,
    idx2word=idx2word,
    device=device,
    word2idx=word2idx
)

meteor = calculate_meteor_score(
    encoder=encoder,
    decoder=decoder,
    image_dir=image_dir,
    image_ids=test_images,
    image2captions=image2captions,
    transform=test_transform,
    idx2word=idx2word,
    device=device,
    word2idx=word2idx
)

cider = calculate_cider_score(
    encoder=encoder,
    decoder=decoder,
    image_dir=image_dir,
    image_ids=test_images,
    image2captions=image2captions,
    transform=test_transform,
    idx2word=idx2word,
    device=device,
    word2idx=word2idx
)

print("BLEU Score:", bleu_score)
print("METEOR Score:", meteor)
print("CIDEr Score:", cider)

PTBTokenizer tokenized 102583 tokens at 819776.73 tokens per second.
PTBTokenizer tokenized 24623 tokens at 333123.07 tokens per second.


BLEU Score: 0.1271150728175305
METEOR Score: 0.36926633570867284
CIDEr Score: 0.323913207173524


# END