In [6]:
import os
import time
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from pathlib import Path

notebook_dir = Path(os.getcwd()).resolve()  # Get the current working directory
project_root = notebook_dir.parents[1]  # Adjust the number to go up to the project root
sys.path.append(str(project_root))

print(f"Project root: {project_root}")

Project root: /Users/jed/anaconda3/omscs/CS7643/image-captioning-project


In [7]:
from models.model_2_image_segmentation_lstm.model import *
from data.dataset import *
from data.preprocessing import *
from metrics import *

In [None]:
def main():
    # Define dataset type
    dataset = "Flickr8k"  # Change to "Flickr30k" if needed

    # Paths
    dataset_dir = f"../../../../flickr_data/{dataset}_Dataset/Images"
    captions_file = f"../../../../flickr_data/{dataset}_Dataset/captions.txt"
    image_dir = dataset_dir

    train_losses = []
    val_losses = []
    bleu_scores = []
    meteor_scores = []
    cider_scores = []
    
    # Load captions
    caption_df = pd.read_csv(captions_file).dropna().drop_duplicates()
    print(f"Total captions loaded: {len(caption_df)}")

    # Build vocabulary
    word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=10000)
    print(f"Vocabulary size: {len(word2idx)}")

    # Convert captions to sequences
    captions_seqs, max_length = convert_captions_to_sequences(image_captions, word2idx)
    print(f"Maximum caption length: {max_length}")

    # Get data transformations
    train_transform = get_transform(train=True)
    val_transform = get_transform(train=False)

    # Split data into training and validation sets
    image_names = list(image_captions.keys())
    train_images, val_images, _ = get_splits(image_names, test_size=0.2)
    print(f"Training samples: {len(train_images)}")
    print(f"Validation samples: {len(val_images)}")

    # Create datasets and data loaders
    train_dataset = FlickrDataset(
        image_dir, train_images, captions_seqs, transform=train_transform
    )
    val_dataset = FlickrDataset(
        image_dir, val_images, captions_seqs, transform=val_transform
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=32, 
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=2,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=32,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=2,
    )
    print(f"Number of training batches: {len(train_loader)}")
    print(f"Number of validation batches: {len(val_loader)}")

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize models
    embed_size = 256
    hidden_size = 512
    vocab_size = len(word2idx)
    top_k = 5  # Number of objects to consider

    # Initialize encoder and decoder
    encoder = EncoderCNN(embed_size=embed_size, device=device, top_k=top_k).to(device)
    decoder = DecoderRNN(
        input_size=embed_size,
        embed_size=embed_size,
        hidden_size=hidden_size,
        vocab_size=vocab_size,
        dropout=0.5
    ).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=word2idx["<pad>"])
    params = list(encoder.parameters()) + list(decoder.parameters())
    optimizer = optim.Adam(params, lr=5e-4, weight_decay=3e-4)

    # Initialize the learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 
        mode='min',           # We want to minimize the validation loss
        factor=0.5,           # Factor by which the learning rate will be reduced
        patience=2,           # Number of epochs with no improvement after which learning rate will be reduced
    )
    
    # Prepare image to captions mapping for evaluation
    val_image2captions = prepare_image2captions(val_images, captions_seqs, idx2word)

    # Training settings
    num_epochs = 10
    total_step = len(train_loader)
    end_token_idx = word2idx["<end>"]

    # Training loop
    for epoch in range(num_epochs):
        start_time = time.time()
        encoder.train()
        decoder.train()
        total_loss = 0

        for i, (images, captions, _) in enumerate(train_loader):
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass
            global_features, object_features = encoder(images)
            outputs = decoder(global_features, object_features, captions)  # No slicing

            targets = captions[:, 1:]     # Shape: (batch_size, seq_len -1)

            # Reshape outputs and targets for loss computation
            outputs = outputs.reshape(-1, vocab_size)  # Shape: (batch_size * (seq_len -1), vocab_size)
            targets = targets.reshape(-1)              # Shape: (batch_size * (seq_len -1))

            # Compute loss
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=5)
            optimizer.step()

            total_loss += loss.item()

            if i % 300 == 0:
                print(
                    f"Epoch [{epoch+1}/{num_epochs}], Step [{i}/{total_step}], Loss: {loss.item():.4f}"
                )

        # Calculate average training loss for the epoch
        avg_train_loss = total_loss / total_step

        # Validation
        val_loss = evaluate(encoder, decoder, val_loader, criterion, device, vocab_size)

        # Step the scheduler with the validation loss
        scheduler.step(val_loss)

        # Calculate evaluation metrics
        bleu = calculate_bleu_score(
            encoder,
            decoder,
            image_dir,
            val_images,
            val_image2captions,
            val_transform,
            idx2word,
            device,
            word2idx,
        )
        meteor = calculate_meteor_score(
            encoder,
            decoder,
            image_dir,
            val_images,
            val_image2captions,
            val_transform,
            idx2word,
            device,
            word2idx,
        )
        cider = calculate_cider_score(
            encoder,
            decoder,
            image_dir,
            val_images,
            val_image2captions,
            val_transform,
            idx2word,
            device,
            word2idx,
        )

        # Print epoch summary
        epoch_duration = time.time() - start_time
        print(
            f"Epoch [{epoch+1}/{num_epochs}], "
            f"Training Loss: {avg_train_loss:.4f}, "
            f"Validation Loss: {val_loss:.4f}, "
            f"BLEU: {bleu:.4f}, "
            f"METEOR: {meteor:.4f}, "
            f"CIDEr: {cider:.4f}, "
            f"Time: {epoch_duration:.2f}s"
        )

        # Save metrics
        train_losses.append(avg_train_loss)
        val_losses.append(val_loss)
        bleu_scores.append(bleu)
        meteor_scores.append(meteor)
        cider_scores.append(cider)

    # Save the models
    os.makedirs("models/model_2_image_segmentation_lstm", exist_ok=True)
    torch.save(encoder.state_dict(), "models/model_2_image_segmentation_lstm/encoder.pth")
    torch.save(decoder.state_dict(), "models/model_2_image_segmentation_lstm/decoder.pth")
    print("Models saved successfully.")
    
    # Plot training and validation loss
    plt.figure()
    plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss')
    plt.plot(range(1, num_epochs + 1), val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training vs Validation Loss')
    plt.legend()
    plt.savefig('models/model_2_image_segmentation_lstm/loss_plot.png')
    plt.close()

    # Plot evaluation metrics
    plt.figure()
    plt.plot(range(1, num_epochs + 1), bleu_scores, label='BLEU Score')
    plt.plot(range(1, num_epochs + 1), meteor_scores, label='METEOR Score')
    plt.plot(range(1, num_epochs + 1), cider_scores, label='CIDEr Score')
    plt.xlabel('Epoch')
    plt.ylabel('Score')
    plt.title('Evaluation Metrics over Epochs')
    plt.legend()
    plt.savefig('models/model_2_image_segmentation_lstm/metrics_plot.png')
    plt.close()

In [9]:
%%time

if __name__ == "__main__":
    main()

Total captions loaded: 40445
Vocabulary size: 8921
Maximum caption length: 40
Training samples: 6472
Validation samples: 1457
Number of training batches: 1011
Number of validation batches: 228
Using device: cuda
Epoch [1/10], Step [0/1011], Loss: 9.1193
Epoch [1/10], Step [300/1011], Loss: 4.3396
Epoch [1/10], Step [600/1011], Loss: 4.2018
Epoch [1/10], Step [900/1011], Loss: 3.4711


PTBTokenizer tokenized 92805 tokens at 419333.73 tokens per second.
PTBTokenizer tokenized 23211 tokens at 160156.42 tokens per second.


Epoch [1/10], Training Loss: 4.2250, Validation Loss: 3.7549, BLEU: 0.0720, METEOR: 0.3001, CIDEr: 0.1235, Time: 805.49s
Epoch [2/10], Step [0/1011], Loss: 3.6582
Epoch [2/10], Step [300/1011], Loss: 3.4671
Epoch [2/10], Step [600/1011], Loss: 3.5692
Epoch [2/10], Step [900/1011], Loss: 3.5124


PTBTokenizer tokenized 92805 tokens at 621947.09 tokens per second.
PTBTokenizer tokenized 18965 tokens at 131861.51 tokens per second.


Epoch [2/10], Training Loss: 3.6269, Validation Loss: 3.5632, BLEU: 0.0887, METEOR: 0.3111, CIDEr: 0.1398, Time: 784.51s
Epoch [3/10], Step [0/1011], Loss: 3.6404
Epoch [3/10], Step [300/1011], Loss: 3.5752
Epoch [3/10], Step [600/1011], Loss: 3.3426
Epoch [3/10], Step [900/1011], Loss: 3.6249


PTBTokenizer tokenized 92805 tokens at 440186.54 tokens per second.
PTBTokenizer tokenized 21672 tokens at 162940.51 tokens per second.


Epoch [3/10], Training Loss: 3.4779, Validation Loss: 3.4734, BLEU: 0.0785, METEOR: 0.2923, CIDEr: 0.1396, Time: 775.84s
Epoch [4/10], Step [0/1011], Loss: 3.6791
Epoch [4/10], Step [300/1011], Loss: 3.5684
Epoch [4/10], Step [600/1011], Loss: 3.3883
Epoch [4/10], Step [900/1011], Loss: 3.2462


PTBTokenizer tokenized 92805 tokens at 642159.65 tokens per second.
PTBTokenizer tokenized 22150 tokens at 271484.96 tokens per second.


Epoch [4/10], Training Loss: 3.3933, Validation Loss: 3.4015, BLEU: 0.0757, METEOR: 0.2819, CIDEr: 0.1410, Time: 774.47s
Epoch [5/10], Step [0/1011], Loss: 3.4310
Epoch [5/10], Step [300/1011], Loss: 3.3680
Epoch [5/10], Step [600/1011], Loss: 3.4036
Epoch [5/10], Step [900/1011], Loss: 3.3902


PTBTokenizer tokenized 92805 tokens at 679801.46 tokens per second.
PTBTokenizer tokenized 18672 tokens at 213155.49 tokens per second.


Epoch [5/10], Training Loss: 3.3354, Validation Loss: 3.3772, BLEU: 0.1034, METEOR: 0.3177, CIDEr: 0.1842, Time: 777.70s
Epoch [6/10], Step [0/1011], Loss: 3.1641
Epoch [6/10], Step [300/1011], Loss: 3.2723
Epoch [6/10], Step [600/1011], Loss: 3.3838
Epoch [6/10], Step [900/1011], Loss: 3.2807


PTBTokenizer tokenized 92805 tokens at 620864.84 tokens per second.
PTBTokenizer tokenized 18530 tokens at 187795.85 tokens per second.


Epoch [6/10], Training Loss: 3.2953, Validation Loss: 3.3460, BLEU: 0.0969, METEOR: 0.3128, CIDEr: 0.1763, Time: 754.44s
Epoch [7/10], Step [0/1011], Loss: 3.2362
Epoch [7/10], Step [300/1011], Loss: 3.0829
Epoch [7/10], Step [600/1011], Loss: 3.3943
Epoch [7/10], Step [900/1011], Loss: 3.0852


PTBTokenizer tokenized 92805 tokens at 684991.58 tokens per second.
PTBTokenizer tokenized 22178 tokens at 272545.39 tokens per second.


Epoch [7/10], Training Loss: 3.2650, Validation Loss: 3.3120, BLEU: 0.0888, METEOR: 0.3104, CIDEr: 0.1801, Time: 740.83s
Epoch [8/10], Step [0/1011], Loss: 3.3377
Epoch [8/10], Step [300/1011], Loss: 3.1427
Epoch [8/10], Step [600/1011], Loss: 3.1793
Epoch [8/10], Step [900/1011], Loss: 3.4772


PTBTokenizer tokenized 92805 tokens at 425800.80 tokens per second.
PTBTokenizer tokenized 20918 tokens at 237473.70 tokens per second.


Epoch [8/10], Training Loss: 3.2394, Validation Loss: 3.2999, BLEU: 0.0943, METEOR: 0.3164, CIDEr: 0.1833, Time: 731.18s
Epoch [9/10], Step [0/1011], Loss: 3.3060
Epoch [9/10], Step [300/1011], Loss: 3.3035
Epoch [9/10], Step [600/1011], Loss: 3.3795
Epoch [9/10], Step [900/1011], Loss: 2.9288


PTBTokenizer tokenized 92805 tokens at 646839.04 tokens per second.
PTBTokenizer tokenized 21534 tokens at 263678.44 tokens per second.


Epoch [9/10], Training Loss: 3.2212, Validation Loss: 3.2860, BLEU: 0.0846, METEOR: 0.3072, CIDEr: 0.1725, Time: 728.30s
Epoch [10/10], Step [0/1011], Loss: 3.0577
Epoch [10/10], Step [300/1011], Loss: 2.9297
Epoch [10/10], Step [600/1011], Loss: 3.1617
Epoch [10/10], Step [900/1011], Loss: 3.3832


PTBTokenizer tokenized 92805 tokens at 684904.09 tokens per second.
PTBTokenizer tokenized 22304 tokens at 263332.20 tokens per second.


Epoch [10/10], Training Loss: 3.2034, Validation Loss: 3.2720, BLEU: 0.0896, METEOR: 0.3106, CIDEr: 0.1598, Time: 727.12s
Models saved successfully.
CPU times: user 11h 26min 3s, sys: 1min 39s, total: 11h 27min 43s
Wall time: 2h 6min 52s


In [9]:
dataset = "Flickr8k"

captions_file_path = f"{project_root}/flickr_data/{dataset}_Dataset/captions.txt"
image_dir = f"{project_root}/flickr_data/{dataset}_Dataset/Images"

# Load captions
caption_df = pd.read_csv(captions_file_path).dropna().drop_duplicates()

# Build vocabulary
word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=8921)

# Convert captions to sequences
captions_seqs, max_length = convert_captions_to_sequences(image_captions, word2idx)

# Get data transformations
test_transform = get_transform(train=False)

# Split data into training, validation, and test sets
image_names = list(captions_seqs.keys())
_, _, test_images = get_splits(image_names, test_size=0.2)

# Prepare image to captions mapping for ground truth captions
test_image2captions = prepare_image2captions(test_images, captions_seqs, idx2word)

# Create test dataset and data loader
test_dataset = FlickrDataset(
    image_dir, test_images, captions_seqs, transform=test_transform, mode='test'
)
test_loader = DataLoader(
    test_dataset,
    batch_size=1,  # Process one image at a time
    shuffle=False,
    collate_fn=collate_fn, 
    num_workers=2,
)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize models
embed_size = 256
hidden_size = 512
vocab_size = len(word2idx)
input_size = embed_size  # As per EncoderCNN's output

# Initialize EncoderCNN with device
encoder = EncoderCNN(embed_size=embed_size, device=device, top_k=5).to(device)
decoder = DecoderRNN(
    input_size=input_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    vocab_size=vocab_size,
    dropout=0.5
).to(device)

# Load trained models
encoder_path = os.path.join(project_root, "models/model_2_image_segmentation_lstm/encoder.pth")
decoder_path = os.path.join(project_root, "models/model_2_image_segmentation_lstm/decoder.pth")

encoder.load_state_dict(
    torch.load(encoder_path, map_location=device)
)
decoder.load_state_dict(
    torch.load(decoder_path, map_location=device)
)

encoder.eval()
decoder.eval()

# Retrieve <end> and <start> token indices
end_token_idx = word2idx.get('<end>', None)
start_token_idx = word2idx.get('<start>', None)

if end_token_idx is None:
    raise ValueError("The '<end>' token was not found in the vocabulary.")
if start_token_idx is None:
    raise ValueError("The '<start>' token was not found in the vocabulary.")

# Generate captions on test images
for i, (images, captions, image_ids) in enumerate(test_loader):
    if i >= 6:
        break  # Stop after processing 10 images

    images = images.to(device)
    with torch.no_grad():
        # Forward pass through encoder
        global_features, object_features = encoder(images)

        # Forward pass through decoder's sample method with correct arguments
        sampled_ids = decoder.sample(
            global_features,
            object_features,
            start_token_idx=start_token_idx,
            end_token_idx=end_token_idx
        )

    # Convert word IDs to words
    sampled_caption = [idx2word.get(word_id, '<unk>') for word_id in sampled_ids]

    # Remove words after (and including) the '<end>' token
    if '<end>' in sampled_caption:
        end_index = sampled_caption.index('<end>')
        sampled_caption = sampled_caption[:end_index]

    generated_caption = ' '.join(sampled_caption)

    # Get ground truth captions
    image_name = image_ids[0]
    gt_captions = test_image2captions.get(image_name, [])

    if not gt_captions:
        print(f'Image ID: {image_name}')
        print('Generated Caption:', generated_caption)
        print('Ground Truth Captions: None')
        print('------------------------------------')
        continue

    print(f'Image ID: {image_name}')
    print(f'Generated Caption: {generated_caption}')
    print('Ground Truth Captions:')
    for gt_caption in gt_captions:
        print(f'- {gt_caption}')
    print('------------------------------------')

Using device: cpu


  torch.load(encoder_path, map_location=device)
  torch.load(decoder_path, map_location=device)


Image ID: 2714703706_d21c5cb8df.jpg
Generated Caption: a dog is running through the water .
Ground Truth Captions:
- ['dogs', 'playing']
- ['a', 'brown', 'dog', 'is', 'biting', 'a', 'white', 'and', 'tan', 'dog', 'on', 'the', 'snout', '.']
- ['the', 'brown', 'dog', 'has', 'a', 'hold', 'of', 'the', 'other', 'dogs', 'cheek', 'with', 'its', 'teeth', '.']
- ['two', 'dogs', 'are', 'nuzzling', 'each', 'other', 'nose', 'to', 'nose', '.']
- ['two', 'dogs', 'bite', 'at', 'each', 'other', 'on', 'the', 'carpet', '.']
------------------------------------
Image ID: 3532194771_07faf20d76.jpg
Generated Caption: a man is walking through the ocean .
Ground Truth Captions:
- ['a', 'man', 'is', 'heading', 'out', 'to', 'see', 'with', 'his', 'surfboard', 'in', 'hand', '.']
- ['a', 'man', 'with', 'a', 'white', 'surfboard', 'is', 'walking', 'into', 'the', 'water', '.']
- ['a', 'person', 'walks', 'into', 'the', 'water', 'carrying', 'a', 'white', 'surfboard', '.']
- ['a', 'surfer', 'walking', 'into', 'the', 'oc

# END