In [1]:
import os
import time
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from pathlib import Path

notebook_dir = Path(os.getcwd()).resolve()  # Get the current working directory
project_root = notebook_dir.parents[1]  # Adjust the number to go up to the project root
sys.path.append(str(project_root))

print(f"Project root: {project_root}")

Project root: /Users/jed/anaconda3/omscs/CS7643/image-captioning-project


In [2]:
from models.model_3_image_segmentation_attention_decoder.model import *
from data.dataset import *
from data.preprocessing import *
from metrics import *

[nltk_data] Downloading package punkt to /Users/jed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Evaluate function: Computes validation loss on a given dataset
def evaluate(encoder, decoder, data_loader, criterion, device, vocab_size):
    """
    Evaluate the model on the validation set.
    Args:
        encoder: Encoder model.
        decoder: Decoder model.
        data_loader: DataLoader for the validation set.
        criterion: Loss function.
        device: Computation device (CPU or GPU).
        vocab_size: Size of the vocabulary.
    Returns:
        average_loss: Average validation loss.
    """
    encoder.eval()  # Set encoder to evaluation mode
    decoder.eval()  # Set decoder to evaluation mode
    total_loss = 0
    total_samples = 0

    with torch.no_grad():  # Disable gradient computation for evaluation
        for images, captions, lengths in data_loader:
            # Move data to the computation device
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass through encoder and decoder
            features = encoder(images)
            outputs = decoder(features, captions)

            # Prepare targets
            targets = captions[:, 1:]

            # Reshape outputs and targets for loss computation
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            # Compute loss
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            total_samples += 1

    # Calculate average loss
    average_loss = total_loss / total_samples
    return average_loss

In [4]:
def main():
    # Define dataset type
    dataset = "Flickr8k"  # Change to "Flickr30k" if needed

    random.seed(7643)
    
    # Paths
    dataset_dir = f"../../flickr_data/{dataset}_Dataset/Images"
    captions_file = f"../../flickr_data/{dataset}_Dataset/captions.txt"
    image_dir = dataset_dir

    train_losses = []
    val_losses = []
    bleu_scores = []
    meteor_scores = []
    cider_scores = []
    
    # Load captions
    caption_df = pd.read_csv(captions_file).dropna().drop_duplicates()

    # Build vocabulary
    word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=8000)
    print(f"Vocabulary size: {len(word2idx)}")

    # Convert captions to sequences
    captions_seqs, max_length = convert_captions_to_sequences(image_captions, word2idx)
    print(f"Maximum caption length: {max_length}")

    # Get data transformations
    train_transform = get_transform(train=True)
    val_transform = get_transform(train=False)

    # Split data into training and validation sets
    image_names = list(image_captions.keys())
    random.shuffle(image_names)
    val_size = int(0.2 * len(image_names))  # 20% for validation
    train_images = image_names[val_size:]
    val_images = image_names[:val_size]
    print(f"Training samples: {len(train_images)}")
    print(f"Validation samples: {len(val_images)}")

    # Create datasets and data loaders
    # Note the mode='train' for val_dataset to ensure it behaves like the training dataset
    train_dataset = FlickrDataset(
        image_dir, train_images, captions_seqs, transform=train_transform, mode='train'
    )
    val_dataset = FlickrDataset(
        image_dir, val_images, captions_seqs, transform=val_transform, mode='train'
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=2,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=32,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=2,
    )

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize models
    embed_size = 256
    hidden_size = 512
    vocab_size = len(word2idx)
    encoder = Encoder(embed_size=embed_size).to(device)
    decoder = Decoder(
        embed_size=embed_size,
        hidden_size=hidden_size,
        vocab_size=vocab_size,
        num_heads=8,
        num_layers=2,
        dropout=0.5
    ).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=word2idx["<pad>"])
    params = list(filter(lambda p: p.requires_grad, encoder.parameters())) + list(
        decoder.parameters()
    )
    optimizer = optim.Adam(params, lr=3e-4)
    scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

    num_epochs = 10
    total_step = len(train_loader)

    val_image_ids = val_images
    image2captions = prepare_image2captions(val_image_ids, captions_seqs, idx2word)

    for epoch in range(num_epochs):
        start_time = time.time()
        encoder.train()
        decoder.train()
        total_loss = 0

        for i, (images, captions, lengths) in enumerate(train_loader):
            images = images.to(device)
            captions = captions.to(device)

            # Forward pass
            features = encoder(images)
            outputs = decoder(features, captions)

            # Prepare targets
            targets = captions[:, 1:]  # Exclude the first <start> token

            # Reshape for loss computation without excluding any time steps
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            # Compute loss
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=5)
            optimizer.step()

            total_loss += loss.item()

            if i % 300 == 0:
                print(
                    f"Epoch [{epoch+1}/{num_epochs}], Step [{i}/{total_step}], Loss: {loss.item():.4f}"
                )

        # Calculate average training loss
        avg_train_loss = total_loss / total_step
        train_losses.append(avg_train_loss)

        # Adjust learning rate
        scheduler.step()

        # Validation
        val_loss = evaluate(encoder, decoder, val_loader, criterion, device, vocab_size)
        print(
            f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}"
        )

        # Calculate evaluation metrics
        bleu = calculate_bleu_score(
            encoder=encoder,
            decoder=decoder,
            image_dir=image_dir,
            image_ids=val_image_ids,
            image2captions=image2captions,
            transform=val_transform,
            idx2word=idx2word,
            device=device,
            word2idx=word2idx,
        )

        meteor = calculate_meteor_score(
            encoder=encoder,
            decoder=decoder,
            image_dir=image_dir,
            image_ids=val_image_ids,
            image2captions=image2captions,
            transform=val_transform,
            idx2word=idx2word,
            device=device,
            word2idx=word2idx,
        )

        cider = calculate_cider_score(
            encoder=encoder,
            decoder=decoder,
            image_dir=image_dir,
            image_ids=val_image_ids,
            image2captions=image2captions,
            transform=val_transform,
            idx2word=idx2word,
            device=device,
            word2idx=word2idx,
        )

        end_time = time.time()
        epoch_time = end_time - start_time

        print(
            f"Epoch [{epoch+1}/{num_epochs}] completed in {epoch_time:.2f} seconds."
        )
        print(
            f"BLEU Score: {bleu:.4f}, METEOR Score: {meteor:.4f}, CIDEr Score: {cider:.4f}\n"
        )

        val_losses.append(val_loss)
        bleu_scores.append(bleu)
        meteor_scores.append(meteor)
        cider_scores.append(cider)

    # Save the models
    os.makedirs("models/transformer_model", exist_ok=True)
    torch.save(encoder.state_dict(), "models/transformer_model/encoder.pth")
    torch.save(decoder.state_dict(), "models/transformer_model/decoder.pth")
    print("Models saved successfully.")

    # Plot training and validation loss
    plt.figure()
    plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss')
    plt.plot(range(1, num_epochs + 1), val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training vs Validation Loss')
    plt.legend()
    plt.savefig('models/transformer_model/loss_plot.png')
    plt.close()

    # Plot evaluation metrics
    plt.figure()
    plt.plot(range(1, num_epochs + 1), bleu_scores, label='BLEU Score')
    plt.plot(range(1, num_epochs + 1), meteor_scores, label='METEOR Score')
    plt.plot(range(1, num_epochs + 1), cider_scores, label='CIDEr Score')
    plt.xlabel('Epoch')
    plt.ylabel('Score')
    plt.title('Evaluation Metrics over Epochs')
    plt.legend()
    plt.savefig('models/transformer_model/metrics_plot.png')
    plt.close()

In [41]:
%%time

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

if __name__ == "__main__":
    main()

Vocabulary size: 8000
Maximum caption length: 40
Training samples: 6473
Validation samples: 1618
Using device: cuda
Epoch [1/10], Step [0/1012], Loss: 9.7795
Epoch [1/10], Step [300/1012], Loss: 4.6037
Epoch [1/10], Step [600/1012], Loss: 3.4147
Epoch [1/10], Step [900/1012], Loss: 3.1340
Epoch [1/10], Training Loss: 4.0127, Validation Loss: 3.4985


PTBTokenizer tokenized 103083 tokens at 726853.61 tokens per second.
PTBTokenizer tokenized 20308 tokens at 248518.18 tokens per second.


Epoch [1/10] completed in 414.41 seconds.
BLEU Score: 0.1041, METEOR Score: 0.3254, CIDEr Score: 0.1986

Epoch [2/10], Step [0/1012], Loss: 3.3632
Epoch [2/10], Step [300/1012], Loss: 3.6082
Epoch [2/10], Step [600/1012], Loss: 3.2547
Epoch [2/10], Step [900/1012], Loss: 3.3890
Epoch [2/10], Training Loss: 3.3526, Validation Loss: 3.2753


PTBTokenizer tokenized 103083 tokens at 726412.30 tokens per second.
PTBTokenizer tokenized 22224 tokens at 276874.12 tokens per second.


Epoch [2/10] completed in 427.90 seconds.
BLEU Score: 0.1012, METEOR Score: 0.3319, CIDEr Score: 0.1889

Epoch [3/10], Step [0/1012], Loss: 3.2842
Epoch [3/10], Step [300/1012], Loss: 3.0268
Epoch [3/10], Step [600/1012], Loss: 3.0981
Epoch [3/10], Step [900/1012], Loss: 2.7776
Epoch [3/10], Training Loss: 3.1417, Validation Loss: 3.1903


PTBTokenizer tokenized 103083 tokens at 614643.46 tokens per second.
PTBTokenizer tokenized 27127 tokens at 246910.70 tokens per second.


Epoch [3/10] completed in 471.73 seconds.
BLEU Score: 0.0800, METEOR Score: 0.3222, CIDEr Score: 0.1995

Epoch [4/10], Step [0/1012], Loss: 2.9163
Epoch [4/10], Step [300/1012], Loss: 3.1673
Epoch [4/10], Step [600/1012], Loss: 2.9491
Epoch [4/10], Step [900/1012], Loss: 3.3108
Epoch [4/10], Training Loss: 3.0120, Validation Loss: 3.1264


PTBTokenizer tokenized 103083 tokens at 728899.53 tokens per second.
PTBTokenizer tokenized 20539 tokens at 259546.38 tokens per second.


Epoch [4/10] completed in 430.74 seconds.
BLEU Score: 0.1018, METEOR Score: 0.3294, CIDEr Score: 0.2038

Epoch [5/10], Step [0/1012], Loss: 2.9262
Epoch [5/10], Step [300/1012], Loss: 2.6092
Epoch [5/10], Step [600/1012], Loss: 2.6975
Epoch [5/10], Step [900/1012], Loss: 3.3131
Epoch [5/10], Training Loss: 2.9205, Validation Loss: 3.1248


PTBTokenizer tokenized 103083 tokens at 728198.39 tokens per second.
PTBTokenizer tokenized 20942 tokens at 263670.03 tokens per second.


Epoch [5/10] completed in 425.80 seconds.
BLEU Score: 0.1200, METEOR Score: 0.3405, CIDEr Score: 0.2443

Epoch [6/10], Step [0/1012], Loss: 2.9137
Epoch [6/10], Step [300/1012], Loss: 2.4160
Epoch [6/10], Step [600/1012], Loss: 2.7559
Epoch [6/10], Step [900/1012], Loss: 2.7258
Epoch [6/10], Training Loss: 2.7706, Validation Loss: 3.0596


PTBTokenizer tokenized 103083 tokens at 727765.79 tokens per second.
PTBTokenizer tokenized 22660 tokens at 279572.15 tokens per second.


Epoch [6/10] completed in 424.21 seconds.
BLEU Score: 0.1155, METEOR Score: 0.3484, CIDEr Score: 0.2418

Epoch [7/10], Step [0/1012], Loss: 2.8240
Epoch [7/10], Step [300/1012], Loss: 2.7229
Epoch [7/10], Step [600/1012], Loss: 3.0268
Epoch [7/10], Step [900/1012], Loss: 2.9078
Epoch [7/10], Training Loss: 2.7342, Validation Loss: 3.0475


PTBTokenizer tokenized 103083 tokens at 719047.73 tokens per second.
PTBTokenizer tokenized 22754 tokens at 280086.27 tokens per second.


Epoch [7/10] completed in 440.42 seconds.
BLEU Score: 0.1115, METEOR Score: 0.3462, CIDEr Score: 0.2409

Epoch [8/10], Step [0/1012], Loss: 2.5466
Epoch [8/10], Step [300/1012], Loss: 2.7575
Epoch [8/10], Step [600/1012], Loss: 2.4985
Epoch [8/10], Step [900/1012], Loss: 2.6328
Epoch [8/10], Training Loss: 2.7163, Validation Loss: 3.0446


PTBTokenizer tokenized 103083 tokens at 729663.45 tokens per second.
PTBTokenizer tokenized 22036 tokens at 273005.65 tokens per second.


Epoch [8/10] completed in 441.83 seconds.
BLEU Score: 0.1186, METEOR Score: 0.3485, CIDEr Score: 0.2488

Epoch [9/10], Step [0/1012], Loss: 2.5299
Epoch [9/10], Step [300/1012], Loss: 2.7497
Epoch [9/10], Step [600/1012], Loss: 2.8348
Epoch [9/10], Step [900/1012], Loss: 2.8622
Epoch [9/10], Training Loss: 2.6997, Validation Loss: 3.0398


PTBTokenizer tokenized 103083 tokens at 730151.73 tokens per second.
PTBTokenizer tokenized 22192 tokens at 272883.73 tokens per second.


Epoch [9/10] completed in 401.47 seconds.
BLEU Score: 0.1190, METEOR Score: 0.3483, CIDEr Score: 0.2452

Epoch [10/10], Step [0/1012], Loss: 2.6733
Epoch [10/10], Step [300/1012], Loss: 2.8143
Epoch [10/10], Step [600/1012], Loss: 2.5130
Epoch [10/10], Step [900/1012], Loss: 2.7729
Epoch [10/10], Training Loss: 2.6848, Validation Loss: 3.0325


PTBTokenizer tokenized 103083 tokens at 732243.43 tokens per second.
PTBTokenizer tokenized 21574 tokens at 267513.28 tokens per second.


Epoch [10/10] completed in 401.14 seconds.
BLEU Score: 0.1236, METEOR Score: 0.3535, CIDEr Score: 0.2596

Models saved successfully.
CPU times: user 1d 7h 3min 10s, sys: 1min 23s, total: 1d 7h 4min 33s
Wall time: 1h 11min 29s


# Testing

In [5]:
dataset = "Flickr8k"

captions_file_path = f"{project_root}/flickr_data/{dataset}_Dataset/captions.txt"
image_dir = f"{project_root}/flickr_data/{dataset}_Dataset/Images"

# Load captions
caption_df = pd.read_csv(captions_file_path).dropna().drop_duplicates()

# Build vocabulary
word2idx, idx2word, image_captions = build_vocabulary(caption_df, vocab_size=8000)

# Convert captions to sequences
captions_seqs, max_length = convert_captions_to_sequences(image_captions, word2idx)

# Get data transformations
test_transform = get_transform(train=False)

# Split data into training and validation sets
image_names = list(image_captions.keys())
random.shuffle(image_names)
val_size = int(0.2 * len(image_names))  # 20% for validation
test_images = image_names[:val_size]

# Randomly select 6 images from the test_images
sampled_test_images = random.sample(test_images, 6)

# Prepare image to captions mapping for ground truth captions
test_image2captions = prepare_image2captions(sampled_test_images, captions_seqs, idx2word)

# Create test dataset and data loader for only those 6 randomly selected images
test_dataset = FlickrDataset(
    image_dir, sampled_test_images, captions_seqs, transform=test_transform, mode='test'
)
test_loader = DataLoader(
    test_dataset,
    batch_size=1,  # Process one image at a time
    shuffle=False,
    collate_fn=collate_fn, 
    num_workers=2,
)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define hyperparameters (ensure they match training)
embed_size = 256
num_layers = 2
num_heads = 8
dropout = 0.5
vocab_size = len(word2idx)

# Initialize EncoderCNN with device
encoder = Encoder(embed_size=embed_size).to(device)
decoder = Decoder(
    embed_size=embed_size,
    vocab_size=vocab_size,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=dropout
).to(device)

# Load trained models
encoder_path = os.path.join(project_root, "models/model_3_image_segmentation_attention_decoder/encoder.pth")
decoder_path = os.path.join(project_root, "models/model_3_image_segmentation_attention_decoder/decoder.pth")

if not os.path.exists(encoder_path):
    raise FileNotFoundError(f"Encoder model not found at {encoder_path}")
if not os.path.exists(decoder_path):
    raise FileNotFoundError(f"Decoder model not found at {decoder_path}")

encoder.load_state_dict(
    torch.load(encoder_path, map_location=device, weights_only=True)
)
decoder.load_state_dict(
    torch.load(decoder_path, map_location=device, weights_only=True)
)

encoder.eval()
decoder.eval()

# Retrieve <end> and <start> token indices
end_token_idx = word2idx.get('<end>', None)
start_token_idx = word2idx.get('<start>', None)

if end_token_idx is None:
    raise ValueError("The '<end>' token was not found in the vocabulary.")
if start_token_idx is None:
    raise ValueError("The '<start>' token was not found in the vocabulary.")

# Generate captions on test images
for i, (images, captions, image_ids) in enumerate(test_loader):
    if i >= 6:  # Process only the first 6 images
        break

    images = images.to(device)
    with torch.no_grad():
        # Forward pass through encoder
        memory = encoder(images)  # memory: (batch_size, memory_seq_len, embed_size)

        # Forward pass through decoder's sample method with correct arguments
        # Assuming TransformerDecoder's sample method signature:
        # def sample(self, memory, start_token_idx, end_token_idx, max_len=20)
        sampled_ids = decoder.sample(
            memory,
            end_token_idx=end_token_idx,
            max_len=50
        )

    # Convert word IDs to words
    sampled_caption = [idx2word.get(word_id, '<unk>') for word_id in sampled_ids]

    # Remove words after (and including) the '<end>' token
    if '<end>' in sampled_caption:
        end_index = sampled_caption.index('<end>')
        sampled_caption = sampled_caption[:end_index]

    # Remove words before (and including) the '<start>' token
    if '<start>' in sampled_caption:
        start_index = sampled_caption.index('<start>')
        sampled_caption = sampled_caption[start_index+1:]
        
    generated_caption = ' '.join(sampled_caption)

    # Get ground truth captions
    image_name = image_ids[0]
    gt_captions = test_image2captions.get(image_name, [])

    if not gt_captions:
        print(f'Image ID: {image_name}')
        print('Generated Caption:', generated_caption)
        print('Ground Truth Captions: None')
        print('------------------------------------')
        continue

    print(f'Image ID: {image_name}')
    print(f'Generated Caption: {generated_caption}')
    print('Ground Truth Captions:')
    for gt_caption in gt_captions:
        print(f'- {" ".join(gt_caption)}')  # Join words for ground truth captions
    print('-------------------------------------')

Using device: cpu
Image ID: 3228960484_9aab98b91a.jpg
Generated Caption: a man in a black shirt is standing on a sidewalk .
Ground Truth Captions:
- children walking on a sidewalk with yellow backpacks .
- two children are walking along a street wearing yellow backpacks .
- two children are walking on a sidewalk wearing yellow and red backpacks .
- two kids walk up the sidewalk with their backpacks .
- two young children with yellow backpacks walking down a sidewalk
-------------------------------------
Image ID: 2656749876_e32495bd8c.jpg
Generated Caption: a man in a black shirt and a woman walking down a sidewalk .
Ground Truth Captions:
- a man talks on his cellphone while he walks down the street .
- a member of the clergy carries a blue bag in his hand and talks on the phone as he walks .
- a priest carrying a small blue bag walking down the street talking on a cellphone .
- priest in black walking on sidewalk carrying a blue bag and talking on cellphone .
- priest walking with bl

In [5]:
image2captions = prepare_image2captions(test_images, captions_seqs, idx2word)

# Calculate BLEU, METEOR, and CIDEr scores on the sampled test images
bleu_score = calculate_bleu_score(
    encoder=encoder,
    decoder=decoder,
    image_dir=image_dir,
    image_ids=test_images,
    image2captions=image2captions,
    transform=test_transform,
    idx2word=idx2word,
    device=device,
    word2idx=word2idx
)

meteor = calculate_meteor_score(
    encoder=encoder,
    decoder=decoder,
    image_dir=image_dir,
    image_ids=test_images,
    image2captions=image2captions,
    transform=test_transform,
    idx2word=idx2word,
    device=device,
    word2idx=word2idx
)

cider = calculate_cider_score(
    encoder=encoder,
    decoder=decoder,
    image_dir=image_dir,
    image_ids=test_images,
    image2captions=image2captions,
    transform=test_transform,
    idx2word=idx2word,
    device=device,
    word2idx=word2idx
)

print("BLEU Score:", bleu_score)
print("METEOR Score:", meteor)
print("CIDEr Score:", cider)

PTBTokenizer tokenized 103345 tokens at 953155.03 tokens per second.
PTBTokenizer tokenized 21347 tokens at 347864.11 tokens per second.


BLEU Score: 0.12747498835024573
METEOR Score: 0.3561023538912437
CIDEr Score: 0.2857489304258189


# END