In [1]:
import os
import time
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from pathlib import Path

notebook_dir = Path(os.getcwd()).resolve()  # Get the current working directory
project_root = notebook_dir.parents[1]  # Adjust the number to go up to the project root
sys.path.append(str(project_root))

print(f"Project root: {project_root}")

Project root: /Users/jed/anaconda3/omscs/CS7643/image-captioning-project


In [2]:
from models.model_2_image_segmentation_lstm.model import *
from data.dataset import *
from data.preprocessing import *
from metrics import *

[nltk_data] Downloading package punkt to /Users/jed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def main():
    # Define dataset type
    dataset = "Flickr8k"  # Change to "Flickr30k" if needed

    # Paths
    dataset_dir = f"../../../../flickr_data/{dataset}_Dataset/Images"
    captions_file = f"../../../../flickr_data/{dataset}_Dataset/captions.txt"
    image_dir = dataset_dir

    train_losses = []
    val_losses = []
    bleu_scores = []
    meteor_scores = []
    cider_scores = []
    
    # Load captions
    caption_df = pd.read_csv(captions_file).dropna().drop_duplicates()
    print(f"Total captions loaded: {len(caption_df)}")

    # Build vocabulary
    word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=10000)
    print(f"Vocabulary size: {len(word2idx)}")

    # Convert captions to sequences
    captions_seqs, max_length = convert_captions_to_sequences(image_captions, word2idx)
    print(f"Maximum caption length: {max_length}")

    # Get data transformations
    train_transform = get_transform(train=True)
    val_transform = get_transform(train=False)

    # Split data into training and validation sets
    image_names = list(image_captions.keys())
    train_images, val_images, _ = get_splits(image_names, test_size=0.2)
    print(f"Training samples: {len(train_images)}")
    print(f"Validation samples: {len(val_images)}")

    # Create datasets and data loaders
    train_dataset = FlickrDataset(
        image_dir, train_images, captions_seqs, transform=train_transform
    )
    val_dataset = FlickrDataset(
        image_dir, val_images, captions_seqs, transform=val_transform
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=32, 
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=2,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=32,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=2,
    )
    print(f"Number of training batches: {len(train_loader)}")
    print(f"Number of validation batches: {len(val_loader)}")

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize models
    embed_size = 256
    hidden_size = 512
    vocab_size = len(word2idx)
    input_size = embed_size  # Must match EncoderCNN's embed_size
    top_k = 5  # Number of objects to consider

    # Initialize encoder and decoder
    encoder = EncoderCNN(embed_size=embed_size, device=device, top_k=top_k).to(device)
    decoder = DecoderRNN(
        input_size=input_size,
        embed_size=embed_size,
        hidden_size=hidden_size,
        vocab_size=vocab_size,
        dropout=0.4
    ).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=word2idx["<pad>"])
    params = list(encoder.parameters()) + list(decoder.parameters())
    optimizer = optim.Adam(params, lr=1e-3, weight_decay=5e-5)

    # Initialize the learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 
        mode='min',           # We want to minimize the validation loss
        factor=0.5,           # Factor by which the learning rate will be reduced
        patience=2,           # Number of epochs with no improvement after which learning rate will be reduced
        verbose=True          # Print a message when the learning rate is updated
    )
    
    # Prepare image to captions mapping for evaluation
    val_image2captions = prepare_image2captions(val_images, captions_seqs, idx2word)

    # Training settings
    num_epochs = 10
    total_step = len(train_loader)
    end_token_idx = word2idx["<end>"]

    # Training loop
    for epoch in range(num_epochs):
        start_time = time.time()
        encoder.train()
        decoder.train()
        total_loss = 0

        for i, (images, captions, _) in enumerate(train_loader):
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass
            global_features, object_features = encoder(images)
            outputs = decoder(global_features, object_features, captions)

            # Exclude the first time step from outputs and targets
            outputs = outputs[:, 1:, :]  # Shape: (batch_size, seq_len -1, vocab_size)
            targets = captions[:, 1:]     # Shape: (batch_size, seq_len -1)

            # Reshape outputs and targets for loss computation
            outputs = outputs.reshape(-1, vocab_size)  # Shape: (batch_size * (seq_len -1), vocab_size)
            targets = targets.reshape(-1)              # Shape: (batch_size * (seq_len -1))

            # Compute loss
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=5)
            optimizer.step()

            total_loss += loss.item()

            if i % 300 == 0:
                print(
                    f"Epoch [{epoch+1}/{num_epochs}], Step [{i}/{total_step}], Loss: {loss.item():.4f}"
                )

        # Calculate average training loss for the epoch
        avg_train_loss = total_loss / total_step

        # Validation
        val_loss = evaluate(encoder, decoder, val_loader, criterion, device, vocab_size)

        # Step the scheduler with the validation loss
        scheduler.step(val_loss)

        # Calculate evaluation metrics
        bleu = calculate_bleu_score(
            encoder,
            decoder,
            image_dir,
            val_images,
            val_image2captions,
            val_transform,
            idx2word,
            device,
            word2idx,
        )
        meteor = calculate_meteor_score(
            encoder,
            decoder,
            image_dir,
            val_images,
            val_image2captions,
            val_transform,
            idx2word,
            device,
            word2idx,
        )
        cider = calculate_cider_score(
            encoder,
            decoder,
            image_dir,
            val_images,
            val_image2captions,
            val_transform,
            idx2word,
            device,
            word2idx,
        )

        # Print epoch summary
        epoch_duration = time.time() - start_time
        print(
            f"Epoch [{epoch+1}/{num_epochs}], "
            f"Training Loss: {avg_train_loss:.4f}, "
            f"Validation Loss: {val_loss:.4f}, "
            f"BLEU: {bleu:.4f}, "
            f"METEOR: {meteor:.4f}, "
            f"CIDEr: {cider:.4f}, "
            f"Time: {epoch_duration:.2f}s"
        )

        # Save metrics
        train_losses.append(avg_train_loss)
        val_losses.append(val_loss)
        bleu_scores.append(bleu)
        meteor_scores.append(meteor)
        cider_scores.append(cider)

    # Save the models
    os.makedirs("models/model_2_image_segmentation_lstm", exist_ok=True)
    torch.save(encoder.state_dict(), "models/model_2_image_segmentation_lstm/encoder.pth")
    torch.save(decoder.state_dict(), "models/model_2_image_segmentation_lstm/decoder.pth")
    print("Models saved successfully.")
    
    # Plot training and validation loss
    plt.figure()
    plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss')
    plt.plot(range(1, num_epochs + 1), val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training vs Validation Loss')
    plt.legend()
    plt.savefig('models/model_2_image_segmentation_lstm/loss_plot.png')
    plt.close()

    # Plot evaluation metrics
    plt.figure()
    plt.plot(range(1, num_epochs + 1), bleu_scores, label='BLEU Score')
    plt.plot(range(1, num_epochs + 1), meteor_scores, label='METEOR Score')
    plt.plot(range(1, num_epochs + 1), cider_scores, label='CIDEr Score')
    plt.xlabel('Epoch')
    plt.ylabel('Score')
    plt.title('Evaluation Metrics over Epochs')
    plt.legend()
    plt.savefig('models/model_2_image_segmentation_lstm/metrics_plot.png')
    plt.close()

In [28]:
%%time

if __name__ == "__main__":
    main()

Total captions loaded: 40445
Vocabulary size: 8921
Maximum caption length: 40
Training samples: 6472
Validation samples: 1457
Number of training batches: 1011
Number of validation batches: 228
Using device: cuda
Epoch [1/10], Step [0/1011], Loss: 9.0604
Epoch [1/10], Step [300/1011], Loss: 3.8427
Epoch [1/10], Step [600/1011], Loss: 3.7357
Epoch [1/10], Step [900/1011], Loss: 3.4613


PTBTokenizer tokenized 92805 tokens at 670934.36 tokens per second.
PTBTokenizer tokenized 21329 tokens at 267976.51 tokens per second.


Epoch [1/10], Training Loss: 3.9267, Validation Loss: 3.4723, BLEU: 0.0611, METEOR: 0.2740, CIDEr: 0.1020, Time: 838.20s
Epoch [2/10], Step [0/1011], Loss: 3.3912
Epoch [2/10], Step [300/1011], Loss: 3.4182
Epoch [2/10], Step [600/1011], Loss: 3.3431
Epoch [2/10], Step [900/1011], Loss: 3.1535


PTBTokenizer tokenized 92805 tokens at 678450.08 tokens per second.
PTBTokenizer tokenized 20201 tokens at 254925.37 tokens per second.


Epoch [2/10], Training Loss: 3.3222, Validation Loss: 3.2644, BLEU: 0.0768, METEOR: 0.2802, CIDEr: 0.1337, Time: 835.28s
Epoch [3/10], Step [0/1011], Loss: 2.9873
Epoch [3/10], Step [300/1011], Loss: 2.9588
Epoch [3/10], Step [600/1011], Loss: 3.3234
Epoch [3/10], Step [900/1011], Loss: 3.0310


PTBTokenizer tokenized 92805 tokens at 443964.61 tokens per second.
PTBTokenizer tokenized 20335 tokens at 257877.16 tokens per second.


Epoch [3/10], Training Loss: 3.1378, Validation Loss: 3.1608, BLEU: 0.0792, METEOR: 0.2913, CIDEr: 0.1290, Time: 832.71s
Epoch [4/10], Step [0/1011], Loss: 2.7448
Epoch [4/10], Step [300/1011], Loss: 2.9936
Epoch [4/10], Step [600/1011], Loss: 3.0017
Epoch [4/10], Step [900/1011], Loss: 2.9421


PTBTokenizer tokenized 92805 tokens at 662622.00 tokens per second.
PTBTokenizer tokenized 20677 tokens at 259637.15 tokens per second.


Epoch [4/10], Training Loss: 3.0213, Validation Loss: 3.1039, BLEU: 0.0716, METEOR: 0.2829, CIDEr: 0.1196, Time: 833.56s
Epoch [5/10], Step [0/1011], Loss: 2.8381
Epoch [5/10], Step [300/1011], Loss: 3.1110
Epoch [5/10], Step [600/1011], Loss: 3.1529
Epoch [5/10], Step [900/1011], Loss: 3.0109


PTBTokenizer tokenized 92805 tokens at 625816.37 tokens per second.
PTBTokenizer tokenized 22167 tokens at 271565.39 tokens per second.


Epoch [5/10], Training Loss: 2.9450, Validation Loss: 3.0626, BLEU: 0.0753, METEOR: 0.2934, CIDEr: 0.1216, Time: 851.76s
Epoch [6/10], Step [0/1011], Loss: 2.9760
Epoch [6/10], Step [300/1011], Loss: 2.8621
Epoch [6/10], Step [600/1011], Loss: 3.1394
Epoch [6/10], Step [900/1011], Loss: 2.9161


PTBTokenizer tokenized 92805 tokens at 640975.94 tokens per second.
PTBTokenizer tokenized 19443 tokens at 184455.88 tokens per second.


Epoch [6/10], Training Loss: 2.8842, Validation Loss: 3.0446, BLEU: 0.0738, METEOR: 0.2839, CIDEr: 0.1296, Time: 845.96s
Epoch [7/10], Step [0/1011], Loss: 2.8646
Epoch [7/10], Step [300/1011], Loss: 2.8090
Epoch [7/10], Step [600/1011], Loss: 2.8453
Epoch [7/10], Step [900/1011], Loss: 2.8014


PTBTokenizer tokenized 92805 tokens at 642207.29 tokens per second.
PTBTokenizer tokenized 20378 tokens at 217364.22 tokens per second.


Epoch [7/10], Training Loss: 2.8369, Validation Loss: 3.0225, BLEU: 0.0877, METEOR: 0.2895, CIDEr: 0.1495, Time: 843.42s
Epoch [8/10], Step [0/1011], Loss: 2.8178
Epoch [8/10], Step [300/1011], Loss: 2.6607
Epoch [8/10], Step [600/1011], Loss: 2.9273
Epoch [8/10], Step [900/1011], Loss: 2.7484


PTBTokenizer tokenized 92805 tokens at 681898.92 tokens per second.
PTBTokenizer tokenized 22781 tokens at 280283.75 tokens per second.


Epoch [8/10], Training Loss: 2.7949, Validation Loss: 3.0084, BLEU: 0.0721, METEOR: 0.2771, CIDEr: 0.1039, Time: 843.74s
Epoch [9/10], Step [0/1011], Loss: 2.7224
Epoch [9/10], Step [300/1011], Loss: 2.5997
Epoch [9/10], Step [600/1011], Loss: 2.7574
Epoch [9/10], Step [900/1011], Loss: 2.5210


PTBTokenizer tokenized 92805 tokens at 589373.57 tokens per second.
PTBTokenizer tokenized 21954 tokens at 265437.45 tokens per second.


Epoch [9/10], Training Loss: 2.7575, Validation Loss: 2.9986, BLEU: 0.0772, METEOR: 0.2845, CIDEr: 0.1184, Time: 843.67s
Epoch [10/10], Step [0/1011], Loss: 2.6978
Epoch [10/10], Step [300/1011], Loss: 2.6263
Epoch [10/10], Step [600/1011], Loss: 2.6463
Epoch [10/10], Step [900/1011], Loss: 2.7768


PTBTokenizer tokenized 92805 tokens at 671204.43 tokens per second.
PTBTokenizer tokenized 23748 tokens at 286621.65 tokens per second.


Epoch [10/10], Training Loss: 2.7264, Validation Loss: 2.9893, BLEU: 0.0783, METEOR: 0.2909, CIDEr: 0.1302, Time: 842.33s
Models saved successfully.
CPU times: user 21h 44min 42s, sys: 1min 52s, total: 21h 46min 35s
Wall time: 2h 20min 19s


In [4]:
dataset = "Flickr8k"

captions_file_path = f"{project_root}/flickr_data/{dataset}_Dataset/captions.txt"
image_dir = f"{project_root}/flickr_data/{dataset}_Dataset/Images"

# Load captions
caption_df = pd.read_csv(captions_file_path).dropna().drop_duplicates()

# Build vocabulary
word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=5000)

# Convert captions to sequences
captions_seqs, max_length = convert_captions_to_sequences(image_captions, word2idx)

# Get data transformations
test_transform = get_transform(train=False)

# Split data into training, validation, and test sets
image_names = list(image_captions.keys())
_, _, test_images = get_splits(image_names, test_size=0.2)

# Prepare image to captions mapping for ground truth captions
test_image2captions = prepare_image2captions(test_images, captions_seqs, idx2word)

# Create test dataset and data loader
test_dataset = FlickrDataset(
    image_dir, test_images, captions_seqs, transform=test_transform, mode='test'
)
test_loader = DataLoader(
    test_dataset,
    batch_size=1,  # Process one image at a time
    shuffle=False,
    collate_fn=collate_fn, 
    num_workers=2,
)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize models
embed_size = 256
hidden_size = 512
vocab_size = len(word2idx)
input_size = embed_size  # As per EncoderCNN's output

# Initialize EncoderCNN with device
encoder = EncoderCNN(embed_size=embed_size, device=device, top_k=5).to(device)
decoder = DecoderRNN(
    input_size=input_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    vocab_size=vocab_size
).to(device)

# Load trained models
encoder_path = os.path.join(project_root, "models/model_2_image_segmentation_lstm/encoder.pth")
decoder_path = os.path.join(project_root, "models/model_2_image_segmentation_lstm/decoder.pth")

encoder.load_state_dict(
    torch.load(encoder_path, map_location=device)
)
decoder.load_state_dict(
    torch.load(decoder_path, map_location=device)
)

encoder.eval()
decoder.eval()

# Retrieve <end> and <start> token indices
end_token_idx = word2idx.get('<end>', None)
start_token_idx = word2idx.get('<start>', None)

if end_token_idx is None:
    raise ValueError("The '<end>' token was not found in the vocabulary.")
if start_token_idx is None:
    raise ValueError("The '<start>' token was not found in the vocabulary.")

# Generate captions on test images
for i, (images, captions, image_ids) in enumerate(test_loader):
    if i >= 6:
        break  # Stop after processing 10 images

    images = images.to(device)
    with torch.no_grad():
        # Forward pass through encoder
        global_features, object_features = encoder(images)

        # Forward pass through decoder's sample method with correct arguments
        sampled_ids = decoder.sample(
            global_features,
            object_features,
            start_token_idx=start_token_idx,
            end_token_idx=end_token_idx
        )

    # Convert word IDs to words
    sampled_caption = [idx2word.get(word_id, '<unk>') for word_id in sampled_ids]

    # Remove words after (and including) the '<end>' token
    if '<end>' in sampled_caption:
        end_index = sampled_caption.index('<end>')
        sampled_caption = sampled_caption[:end_index]

    generated_caption = ' '.join(sampled_caption)

    # Get ground truth captions
    image_name = image_ids[0]
    gt_captions = test_image2captions.get(image_name, [])

    if not gt_captions:
        print(f'Image ID: {image_name}')
        print('Generated Caption:', generated_caption)
        print('Ground Truth Captions: None')
        print('------------------------------------')
        continue

    print(f'Image ID: {image_name}')
    print(f'Generated Caption: {generated_caption}')
    print('Ground Truth Captions:')
    for gt_caption in gt_captions:
        print(f'- {gt_caption}')
    print('------------------------------------')

Using device: cpu


  torch.load(encoder_path, map_location=device)
  torch.load(decoder_path, map_location=device)


Image ID: 2714703706_d21c5cb8df.jpg
Generated Caption: a dog is running through the grass .
Ground Truth Captions:
- ['dogs', 'playing']
- ['a', 'brown', 'dog', 'is', 'biting', 'a', 'white', 'and', 'tan', 'dog', 'on', 'the', '<unk>', '.']
- ['the', 'brown', 'dog', 'has', 'a', 'hold', 'of', 'the', 'other', 'dogs', 'cheek', 'with', 'its', 'teeth', '.']
- ['two', 'dogs', 'are', 'nuzzling', 'each', 'other', 'nose', 'to', 'nose', '.']
- ['two', 'dogs', 'bite', 'at', 'each', 'other', 'on', 'the', 'carpet', '.']
------------------------------------
Image ID: 3532194771_07faf20d76.jpg
Generated Caption: a man is jumping on a beach .
Ground Truth Captions:
- ['a', 'man', 'is', 'heading', 'out', 'to', 'see', 'with', 'his', 'surfboard', 'in', 'hand', '.']
- ['a', 'man', 'with', 'a', 'white', 'surfboard', 'is', 'walking', 'into', 'the', 'water', '.']
- ['a', 'person', 'walks', 'into', 'the', 'water', 'carrying', 'a', 'white', 'surfboard', '.']
- ['a', 'surfer', 'walking', 'into', 'the', 'ocean']
-

# END