In [10]:
import os
import time
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from pathlib import Path

notebook_dir = Path(os.getcwd()).resolve()  # Get the current working directory
project_root = notebook_dir.parents[1]  # Adjust the number to go up to the project root
sys.path.append(str(project_root))

print(f"Project root: {project_root}")

Project root: /Users/jed/anaconda3/omscs/CS7643/image-captioning-project


In [11]:
from models.model_1_baseline_cnn_lstm.model import *
from data.dataset import *
from data.preprocessing import *
from metrics import *

[nltk_data] Downloading package punkt to /Users/jed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Evaluate function: Computes validation loss on a given dataset
def evaluate(encoder, decoder, data_loader, criterion, device, vocab_size):
    """
    Evaluate the model on the validation set.
    Args:
        encoder: Encoder model.
        decoder: Decoder model.
        data_loader: DataLoader for the validation set.
        criterion: Loss function.
        device: Computation device (CPU or GPU).
        vocab_size: Size of the vocabulary.
    Returns:
        average_loss: Average validation loss.
    """
    encoder.eval()  # Set encoder to evaluation mode
    decoder.eval()  # Set decoder to evaluation mode
    total_loss = 0
    total_samples = 0

    with torch.no_grad():  # Disable gradient computation for evaluation
        for images, captions, _ in data_loader:
            # Move data to the computation device
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass through encoder and decoder
            features = encoder(images)
            outputs = decoder(features, captions)

            # Exclude the first time step from outputs and targets
            outputs = outputs[:, 1:, :]  # Ensure outputs and targets have the same length
            targets = captions[:, 1:]  # Exclude the first <start> token from targets

            # Reshape outputs and targets for loss computation
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            # Compute loss
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            total_samples += 1

    # Calculate average loss
    average_loss = total_loss / total_samples
    return average_loss

def main():
    # Define dataset type
    dataset = "Flickr8k"  # Change to "Flickr30k" if needed

    # Paths
    captions_file_path = f"{project_root}/flickr_data/{dataset}_Dataset/captions.txt"
    image_dir = f"{project_root}/flickr_data/{dataset}_Dataset/Images"

    train_losses = []
    val_losses = []
    bleu_scores = []
    meteor_scores = []
    cider_scores = []

    # Load captions
    caption_df = pd.read_csv(captions_file_path).dropna().drop_duplicates()

    # Build vocabulary
    word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=5000)

    # Convert captions to sequences
    captions_seqs, max_length = convert_captions_to_sequences(
        image_captions, word2idx
    )

    # Get data transformations
    train_transform = get_transform(train=True)
    val_transform = get_transform(train=False)

    # Split data into training and validation sets
    image_names = list(image_captions.keys())
    train_images, val_images, _ = get_splits(image_names, test_size=0.2)

    # Create datasets and data loaders
    train_dataset = FlickrDataset(
        image_dir, train_images, captions_seqs, transform=train_transform
    )
    val_dataset = FlickrDataset(
        image_dir, val_images, captions_seqs, transform=val_transform
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=2,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=32,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=2,
    )

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize models
    embed_size = 256
    hidden_size = 512
    vocab_size = len(word2idx)
    encoder = EncoderCNN(embed_size=embed_size).to(device)
    decoder = DecoderRNN(
        embed_size=embed_size, hidden_size=hidden_size, vocab_size=vocab_size
    ).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=word2idx["<pad>"])
    params = list(filter(lambda p: p.requires_grad, encoder.parameters())) + list(
        decoder.parameters()
    )
    optimizer = optim.Adam(params, lr=3e-5)
    scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

    # Training settings
    num_epochs = 10
    total_step = len(train_loader)
    end_token_idx = word2idx["<end>"]

    # Prepare validation image IDs and references for metrics
    val_image_ids = val_images
    image2captions = prepare_image2captions(val_image_ids, captions_seqs, idx2word)

    for epoch in range(num_epochs):
        start_time = time.time()
        encoder.train()
        decoder.train()
        total_loss = 0

        for i, (images, captions, lengths) in enumerate(train_loader):
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass
            features = encoder(images)
            outputs = decoder(features, captions)

            # Prepare targets
            targets = captions[:, 1:]  # Exclude the first <start> token

            # Exclude the first time step from outputs
            outputs = outputs[:, 1:, :]  # Now outputs and targets have the same sequence length

            # Reshape for loss computation
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            # Compute loss
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=5)
            optimizer.step()

            total_loss += loss.item()

            if i % 300 == 0:
                print(
                    f"Epoch [{epoch+1}/{num_epochs}], Step [{i}/{total_step}], Loss: {loss.item():.4f}"
                )

        # Calculate average training loss
        avg_train_loss = total_loss / total_step
        train_losses.append(avg_train_loss)

        # Adjust learning rate
        scheduler.step()

        # Validation
        val_loss = evaluate(encoder, decoder, val_loader, criterion, device, vocab_size)
        print(
            f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}"
        )

        # Calculate evaluation metrics
        bleu = calculate_bleu_score(
            encoder=encoder,
            decoder=decoder,
            image_dir=image_dir,
            image_ids=val_image_ids,
            image2captions=image2captions,
            transform=val_transform,
            idx2word=idx2word,
            device=device,
            word2idx=word2idx,
        )

        meteor = calculate_meteor_score(
            encoder=encoder,
            decoder=decoder,
            image_dir=image_dir,
            image_ids=val_image_ids,
            image2captions=image2captions,
            transform=val_transform,
            idx2word=idx2word,
            device=device,
            word2idx=word2idx,
        )

        cider = calculate_cider_score(
            encoder=encoder,
            decoder=decoder,
            image_dir=image_dir,
            image_ids=val_image_ids,
            image2captions=image2captions,
            transform=val_transform,
            idx2word=idx2word,
            device=device,
            word2idx=word2idx,
        )

        end_time = time.time()
        epoch_time = end_time - start_time

        print(
            f"Epoch [{epoch+1}/{num_epochs}] completed in {epoch_time:.2f} seconds."
        )
        print(
            f"BLEU Score: {bleu:.4f}, METEOR Score: {meteor:.4f}, CIDEr Score: {cider:.4f}\n"
        )
        
        val_losses.append(val_loss)
        bleu_scores.append(bleu)
        meteor_scores.append(meteor)
        cider_scores.append(cider)

    # Save the models
    torch.save(encoder.state_dict(), f"{project_root}/models/model_1_baseline_cnn_lstm/encoder.pth")
    torch.save(decoder.state_dict(), f"{project_root}/models/model_1_baseline_cnn_lstm/decoder.pth")
    
    # Plot training and validation loss
    plt.figure()
    plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss')
    plt.plot(range(1, num_epochs + 1), val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training vs Validation Loss')
    plt.legend()
    plt.savefig(f'{project_root}/models/model_1_baseline_cnn_lstm/loss_plot.png')
    plt.close()

    # Plot evaluation metrics
    plt.figure()
    plt.plot(range(1, num_epochs + 1), bleu_scores, label='BLEU Score')
    plt.plot(range(1, num_epochs + 1), meteor_scores, label='METEOR Score')
    plt.plot(range(1, num_epochs + 1), cider_scores, label='CIDEr Score')
    plt.xlabel('Epoch')
    plt.ylabel('Score')
    plt.title('Evaluation Metrics over Epochs')
    plt.legend()
    plt.savefig(f'{project_root}/models/model_1_baseline_cnn_lstm/metrics_plot.png')
    plt.close()

In [25]:
%%time

if __name__ == "__main__":
    main()

Using device: cuda
Epoch [1/10], Step [0/1011], Loss: 8.5159
Epoch [1/10], Step [300/1011], Loss: 4.9667
Epoch [1/10], Step [600/1011], Loss: 4.8035
Epoch [1/10], Step [900/1011], Loss: 4.9348
Epoch [1/10], Training Loss: 5.3401, Validation Loss: 4.7018


PTBTokenizer tokenized 92084 tokens at 676314.06 tokens per second.
PTBTokenizer tokenized 15957 tokens at 221611.79 tokens per second.


Epoch [1/10] completed in 194.49 seconds.
BLEU Score: 0.0555, METEOR Score: 0.1973, CIDEr Score: 0.0523

Epoch [2/10], Step [0/1011], Loss: 4.6950
Epoch [2/10], Step [300/1011], Loss: 4.8153
Epoch [2/10], Step [600/1011], Loss: 4.5185
Epoch [2/10], Step [900/1011], Loss: 4.3503
Epoch [2/10], Training Loss: 4.5759, Validation Loss: 4.4510


PTBTokenizer tokenized 92084 tokens at 677939.03 tokens per second.
PTBTokenizer tokenized 16013 tokens at 222323.06 tokens per second.


Epoch [2/10] completed in 195.56 seconds.
BLEU Score: 0.0633, METEOR Score: 0.2389, CIDEr Score: 0.1170

Epoch [3/10], Step [0/1011], Loss: 4.4257
Epoch [3/10], Step [300/1011], Loss: 4.5446
Epoch [3/10], Step [600/1011], Loss: 4.3620
Epoch [3/10], Step [900/1011], Loss: 4.4452
Epoch [3/10], Training Loss: 4.3319, Validation Loss: 4.2152


PTBTokenizer tokenized 92084 tokens at 682342.21 tokens per second.
PTBTokenizer tokenized 22604 tokens at 281506.69 tokens per second.


Epoch [3/10] completed in 207.49 seconds.
BLEU Score: 0.0327, METEOR Score: 0.2220, CIDEr Score: 0.0651

Epoch [4/10], Step [0/1011], Loss: 4.5496
Epoch [4/10], Step [300/1011], Loss: 3.9361
Epoch [4/10], Step [600/1011], Loss: 3.9469
Epoch [4/10], Step [900/1011], Loss: 4.2073
Epoch [4/10], Training Loss: 4.1181, Validation Loss: 4.0288


PTBTokenizer tokenized 92084 tokens at 678154.89 tokens per second.
PTBTokenizer tokenized 24170 tokens at 290910.99 tokens per second.


Epoch [4/10] completed in 212.46 seconds.
BLEU Score: 0.0501, METEOR Score: 0.2550, CIDEr Score: 0.0953

Epoch [5/10], Step [0/1011], Loss: 3.8538
Epoch [5/10], Step [300/1011], Loss: 4.0162
Epoch [5/10], Step [600/1011], Loss: 4.0143
Epoch [5/10], Step [900/1011], Loss: 3.9329
Epoch [5/10], Training Loss: 3.9509, Validation Loss: 3.8874


PTBTokenizer tokenized 92084 tokens at 673874.23 tokens per second.
PTBTokenizer tokenized 23161 tokens at 285962.68 tokens per second.


Epoch [5/10] completed in 209.47 seconds.
BLEU Score: 0.0673, METEOR Score: 0.3086, CIDEr Score: 0.1281

Epoch [6/10], Step [0/1011], Loss: 4.0383
Epoch [6/10], Step [300/1011], Loss: 4.1280
Epoch [6/10], Step [600/1011], Loss: 3.7116
Epoch [6/10], Step [900/1011], Loss: 3.7947
Epoch [6/10], Training Loss: 3.8663, Validation Loss: 3.8708


PTBTokenizer tokenized 92084 tokens at 676357.75 tokens per second.
PTBTokenizer tokenized 22001 tokens at 276481.06 tokens per second.


Epoch [6/10] completed in 208.24 seconds.
BLEU Score: 0.0712, METEOR Score: 0.3188, CIDEr Score: 0.1382

Epoch [7/10], Step [0/1011], Loss: 3.9954
Epoch [7/10], Step [300/1011], Loss: 3.8535
Epoch [7/10], Step [600/1011], Loss: 3.9468
Epoch [7/10], Step [900/1011], Loss: 3.6664
Epoch [7/10], Training Loss: 3.8525, Validation Loss: 3.8585


PTBTokenizer tokenized 92084 tokens at 668102.87 tokens per second.
PTBTokenizer tokenized 22142 tokens at 271929.79 tokens per second.


Epoch [7/10] completed in 207.46 seconds.
BLEU Score: 0.0718, METEOR Score: 0.3209, CIDEr Score: 0.1440

Epoch [8/10], Step [0/1011], Loss: 3.8600
Epoch [8/10], Step [300/1011], Loss: 3.9928
Epoch [8/10], Step [600/1011], Loss: 3.8367
Epoch [8/10], Step [900/1011], Loss: 3.8964
Epoch [8/10], Training Loss: 3.8399, Validation Loss: 3.8471


PTBTokenizer tokenized 92084 tokens at 652659.83 tokens per second.
PTBTokenizer tokenized 22541 tokens at 274991.92 tokens per second.


Epoch [8/10] completed in 208.61 seconds.
BLEU Score: 0.0743, METEOR Score: 0.3215, CIDEr Score: 0.1481

Epoch [9/10], Step [0/1011], Loss: 3.5798
Epoch [9/10], Step [300/1011], Loss: 3.9434
Epoch [9/10], Step [600/1011], Loss: 3.8031
Epoch [9/10], Step [900/1011], Loss: 3.8459
Epoch [9/10], Training Loss: 3.8283, Validation Loss: 3.8356


PTBTokenizer tokenized 92084 tokens at 672525.38 tokens per second.
PTBTokenizer tokenized 22455 tokens at 278572.85 tokens per second.


Epoch [9/10] completed in 208.06 seconds.
BLEU Score: 0.0757, METEOR Score: 0.3252, CIDEr Score: 0.1536

Epoch [10/10], Step [0/1011], Loss: 4.0527
Epoch [10/10], Step [300/1011], Loss: 3.9571
Epoch [10/10], Step [600/1011], Loss: 3.8751
Epoch [10/10], Step [900/1011], Loss: 3.9422
Epoch [10/10], Training Loss: 3.8159, Validation Loss: 3.8251


PTBTokenizer tokenized 92084 tokens at 680738.39 tokens per second.
PTBTokenizer tokenized 22716 tokens at 279755.23 tokens per second.


Epoch [10/10] completed in 207.69 seconds.
BLEU Score: 0.0771, METEOR Score: 0.3266, CIDEr Score: 0.1555

CPU times: user 6h 19min 13s, sys: 1min 27s, total: 6h 20min 40s
Wall time: 34min 27s


In [20]:
dataset = "Flickr8k"

captions_file_path = f"{project_root}/flickr_data/{dataset}_Dataset/captions.txt"
image_dir = f"{project_root}/flickr_data/{dataset}_Dataset/Images"

# Load captions
caption_df = pd.read_csv(captions_file_path).dropna().drop_duplicates()

# Build vocabulary
word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=5000)

# Convert captions to sequences
captions_seqs, max_length = convert_captions_to_sequences(image_captions, word2idx)

# Get data transformations
test_transform = get_transform(train=False)

# Split data into training, validation, and test sets
image_names = list(image_captions.keys())
_, _, test_images = get_splits(image_names, test_size=0.2)

# Prepare image to captions mapping for ground truth captions
test_image2captions = prepare_image2captions(test_images, captions_seqs, idx2word)

# Create test dataset and data loader
test_dataset = FlickrDataset(
    image_dir, test_images, captions_seqs, transform=test_transform, mode='test'
)
test_loader = DataLoader(
    test_dataset,
    batch_size=1,  # Process one image at a time
    shuffle=False,
    collate_fn=collate_fn, 
    num_workers=2,
)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize models
embed_size = 256
hidden_size = 512
vocab_size = len(word2idx)

encoder = EncoderCNN(embed_size=embed_size).to(device)
decoder = DecoderRNN(
    embed_size=embed_size,
    hidden_size=hidden_size,
    vocab_size=vocab_size
).to(device)

# Load trained models
encoder_path = os.path.join(project_root, "models/model_1_baseline_cnn_lstm/encoder.pth")
decoder_path = os.path.join(project_root, "models/model_1_baseline_cnn_lstm/decoder.pth")

encoder.load_state_dict(
    torch.load(encoder_path, map_location=device, weights_only=True)
)
decoder.load_state_dict(
    torch.load(decoder_path, map_location=device, weights_only=True)
)

encoder.eval()
decoder.eval()
end_token_idx = word2idx.get('<end>', None)

if end_token_idx is None:
    raise ValueError("The '<end>' token was not found in the vocabulary.")

# Generate captions on test images
for i, (images, captions, image_ids) in enumerate(test_loader):
    if i >= 6:
        break  # Stop after processing 6 images

    images = images.to(device)
    with torch.no_grad():
        features = encoder(images)
        sampled_ids = decoder.sample(features, end_token_idx=end_token_idx)
    
    # Convert word IDs to words
    sampled_caption = [idx2word.get(word_id, '<unk>') for word_id in sampled_ids]
    
    # Remove words after (and including) the '<end>' token
    if '<end>' in sampled_caption:
        end_index = sampled_caption.index('<end>')
        sampled_caption = sampled_caption[:end_index]
    
    generated_caption = ' '.join(sampled_caption)

    # Get ground truth captions
    image_name = image_ids[0]
    gt_captions = test_image2captions.get(image_name, [])

    if not gt_captions:
        print(f'Image ID: {image_name}')
        print('Generated Caption:', generated_caption)
        print('Ground Truth Captions: None')
        print('------------------------------------')
        continue

    print(f'Image ID: {image_name}')
    print(f'Generated Caption: {generated_caption}')
    print('Ground Truth Captions:')
    for gt_caption in gt_captions:
        print(f'- {gt_caption}')
    print('------------------------------------')

Image ID: 2714703706_d21c5cb8df.jpg
Generated Caption: a two dog is running in the grass .
Ground Truth Captions:
- ['dogs', 'playing']
- ['a', 'brown', 'dog', 'is', 'biting', 'a', 'white', 'and', 'tan', 'dog', 'on', 'the', '<unk>', '.']
- ['the', 'brown', 'dog', 'has', 'a', 'hold', 'of', 'the', 'other', 'dogs', 'cheek', 'with', 'its', 'teeth', '.']
- ['two', 'dogs', 'are', 'nuzzling', 'each', 'other', 'nose', 'to', 'nose', '.']
- ['two', 'dogs', 'bite', 'at', 'each', 'other', 'on', 'the', 'carpet', '.']
------------------------------------
Image ID: 3532194771_07faf20d76.jpg
Generated Caption: a two dog is running in the water .
Ground Truth Captions:
- ['a', 'man', 'is', 'heading', 'out', 'to', 'see', 'with', 'his', 'surfboard', 'in', 'hand', '.']
- ['a', 'man', 'with', 'a', 'white', 'surfboard', 'is', 'walking', 'into', 'the', 'water', '.']
- ['a', 'person', 'walks', 'into', 'the', 'water', 'carrying', 'a', 'white', 'surfboard', '.']
- ['a', 'surfer', 'walking', 'into', 'the', 'ocea

# END