# Setup

In [1]:
# ! pip install pycocoevalcap -q
# ! pip install evaluate -q

import os
import random
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from torchvision import transforms
from PIL import Image

from transformers import (
    VisionEncoderDecoderModel,
    AutoImageProcessor,
    GPT2TokenizerFast,
)
import evaluate

import nltk
from nltk.tokenize import word_tokenize
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer

from tqdm import tqdm
from pathlib import Path
import sys

import os
from tqdm import tqdm
import evaluate
import wandb

os.environ["TOKENIZERS_PARALLELISM"] = "false"  
notebook_dir = Path(os.getcwd()).resolve()  # Get the current working directory
project_root = notebook_dir.parents[1]  # Adjust the number to go up to the project root
sys.path.append(str(project_root))

print(f"Project root: {project_root}")

from data.dataset import FlickrDataset, collate_fn
from data.preprocessing import *
from metrics import calculate_metrics

# Download necessary NLTK data
nltk.download("punkt")
nltk.download("wordnet")

_, _ = evaluate.load("bleu"), evaluate.load("meteor")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Project root: /Users/ivankoh/Personal/image-captioning-project


[nltk_data] Downloading package punkt to /Users/ivankoh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ivankoh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ivankoh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Using the latest cached version of the module from /Users/ivankoh/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--bleu/9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Thu Dec  5 17:04:47 2024) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from /Users/ivankoh/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--meteor/e7ed321a1b44c34fa4679192809db2cee7e3bd4bba0fe8b76061d807706c2374 (last modified on Thu Dec  5 17:04:48 2024) since it couldn't be found locally at 

In [2]:
dataset = "Flickr8k"
encoder_name = "google/vit-base-patch16-224-in21k" # Vision encoder model
decoder_name = "openai-community/gpt2"  # Text decoder model
model_name = "vit_gpt2"  # Vision encoder-decoder model

# Hyperparameters
config = {
    "encoder_name": encoder_name,  # Vision encoder model
    "decoder_name": decoder_name,  # Text decoder model
    "image_dir": f"../../flickr_data/{dataset}_Dataset/Images",  # Path to images
    "captions_file": f"../../flickr_data/{dataset}_Dataset/captions.txt",  # Path to captions
    "model_save_dir": f"{model_name}",  # Directory to save models
    "vocab_size": 5000,  # Maximum vocabulary size
    "embed_size": 256,  # Embedding size (optional if not using separate embeddings)
    "hidden_size": 512,  # Hidden size for decoder (not directly used with Hugging Face models)
    "batch_size": 32,  # Batch size
    "num_epochs": 10,  # Number of training epochs
    "learning_rate": 5e-5,  # Learning rate
    "weight_decay": 1e-4,  # Weight decay for optimizer
    "num_beams": 1,  # Number of beams for beam search, 1 means greedy search
    "seed": 42,  # Random seed for reproducibility
}


# Set random seeds for reproducibility
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    # Some additional settings for full reproducibility (optional)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(config["seed"])

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [None]:
# wandb login
! wandb login

In [None]:
wandb.init(project="image-captioning", config=config)

# Data

Captions

In [10]:
# Load captions
caption_df = pd.read_csv(config["captions_file"]).dropna().drop_duplicates()
print(f"Total captions loaded: {len(caption_df)}")

# Build vocabulary
tokenizer = GPT2TokenizerFast.from_pretrained(config["decoder_name"])
# https://github.com/huggingface/transformers/issues/2630
tokenizer.pad_token = (
    tokenizer.eos_token
)  # gpt2 does not have a pad token so we use eos token
tokenizer.padding_side = "left"

word2idx, idx2word, image_captions = build_vocabulary(
    caption_df,
    vocab_size=10000,
    tokenizing_fn=lambda x: tokenizer.tokenize(x, add_special_tokens=False),
)
word2idx_nltk, idx2word_nltk, image_captions_nltk = build_vocabulary(
    caption_df,
    vocab_size=10000,
    tokenizing_fn=word_tokenize,
)
print(f"Vocabulary size: {len(word2idx)}")
# set as vocab_size in config
config["vocab_size"] = len(word2idx)

# postprocess word2idx and idx2word: replace special tokens with gpt2 special tokens
word2idx[tokenizer.bos_token] = tokenizer.bos_token_id
word2idx[tokenizer.eos_token] = tokenizer.eos_token_id
word2idx[tokenizer.pad_token] = tokenizer.pad_token_id
word2idx[tokenizer.unk_token] = tokenizer.unk_token_id

idx2word[tokenizer.bos_token_id] = tokenizer.bos_token
idx2word[tokenizer.eos_token_id] = tokenizer.eos_token
idx2word[tokenizer.pad_token_id] = tokenizer.pad_token
idx2word[tokenizer.unk_token_id] = tokenizer.unk_token

special_token_mapping = {
    "start": tokenizer.bos_token,
    "end": tokenizer.eos_token,
    "pad": tokenizer.eos_token,
    "unk": tokenizer.unk_token,
}

print("Special token mapping", special_token_mapping)
# Convert captions to sequences
captions_seqs, max_length = convert_captions_to_sequences(
    image_captions,
    word2idx,
    special_token_mapping=special_token_mapping,
    tokenizing_fn=lambda x: tokenizer.tokenize(x, add_special_tokens=False),
)
captions_seqs_nltk, _ = convert_captions_to_sequences(
    image_captions_nltk,
    word2idx_nltk,
    tokenizing_fn=word_tokenize,
)
print(f"Maximum caption length: {max_length}")
# set as max_length in config
config["max_length"] = max_length

print(
    f"""Summary of special tokens replaced:
      <start>: {tokenizer.bos_token}
      <end>: {tokenizer.eos_token}
      <pad>: {tokenizer.pad_token}
      <unk>: {tokenizer.unk_token}
"""
)

Total captions loaded: 40445
Vocabulary size: 8521
Special token mapping {'start': '<|endoftext|>', 'end': '<|endoftext|>', 'pad': '<|endoftext|>', 'unk': '<|endoftext|>'}
Maximum caption length: 43
Summary of special tokens replaced:
      <start>: <|endoftext|>
      <end>: <|endoftext|>
      <pad>: <|endoftext|>
      <unk>: <|endoftext|>



Images

In [4]:
# Get data transformations - slightly modified for ViT
processor = AutoImageProcessor.from_pretrained(config["encoder_name"])
# we'll keep processor for inference


def get_transform(train=True):
    """
    Returns the image transformations for training or evaluation.
    Args:
        train (bool): Flag indicating whether transformations are for training or evaluation.
    Returns:
        transform (callable): Composed transformations.
    """
    if train:
        transform = transforms.Compose(
            [
                transforms.Resize((256, 256)),
                transforms.RandomCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=processor.image_mean, std=processor.image_std
                ),
            ]
        )
    else:
        transform = transforms.Compose(
            [
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=processor.image_mean, std=processor.image_std
                ),
            ]
        )
    return transform

train_transform = get_transform(train=True)
val_transform = get_transform(train=False)
# Split data into training and validation sets
image_names = list(image_captions.keys())
train_images, val_images, _ = get_splits(image_names, test_size=0.2)
print(f"Training samples: {len(train_images)}")
print(f"Validation samples: {len(val_images)}")

Training samples: 6472
Validation samples: 1457


DataLoader

In [5]:
# Create datasets and data loaders
train_dataset = FlickrDataset(
    config["image_dir"],
    train_images,
    captions_seqs,
    transform=train_transform,
)
val_dataset = FlickrDataset(
    config["image_dir"], val_images, captions_seqs, transform=val_transform
)
train_loader = DataLoader(
    train_dataset,
    batch_size=config["batch_size"],
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=4,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=config["batch_size"],
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=4,
    pin_memory=True,
)
print(f"Number of training batches: {len(train_loader)}")
print(f"Number of validation batches: {len(val_loader)}")

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Number of training batches: 1011
Number of validation batches: 228
Using device: cpu


In [12]:
prepare_image2captions(
    val_images,
    captions_seqs_nltk,
    idx2word_nltk,
    # special_token_mapping=special_token_mapping,
)

{'2967549094_d32422eb01.jpg': [['a',
   'man',
   'inside',
   'of',
   'a',
   'white',
   'subway',
   'train'],
  ['a', 'man', 'sits', 'by', 'a', 'window', 'on', 'a', 'train', '.'],
  ['a', 'man', 'sits', 'by', 'the', 'window', 'in', 'a', 'train', '.'],
  ['a', 'man', 'sits', 'in', 'the', 'window', 'of', 'a', 'train', '.'],
  ['the',
   'side',
   'of',
   'a',
   'subway',
   'cart',
   ',',
   'with',
   'one',
   'man',
   'in',
   'the',
   'window',
   '.']],
 '2929405404_1dff5ab847.jpg': [['a',
   'man',
   'rides',
   'a',
   'motorcycle',
   'on',
   'a',
   'track',
   '.'],
  ['a',
   'motorcycle',
   'racer',
   'is',
   'producing',
   'sparks',
   'by',
   'leaning',
   'during',
   'a',
   'turn',
   '.'],
  ['a',
   'motorcyclist',
   'is',
   'turning',
   'a',
   'sharp',
   'corner',
   'on',
   'his',
   'red',
   'motorbike',
   'and',
   'is',
   'scraping',
   'it',
   'on',
   'the',
   'road',
   '.'],
  ['a', 'motorcyclist', 'making', 'a', 'sharp', 'turn', '

# Model

In [42]:
print(f"loading encoder: {encoder_name} and decoder: {decoder_name}...")
# Load the model
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_name,
    decoder_name,
)

# slight modification for gpt2 tokenizer
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.vocab_size = config["vocab_size"]
model.config.beam_size = config["num_beams"]
model.config.max_length = config["max_length"]

model.to(device)

loading encoder: google/vit-base-patch16-224-in21k and decoder: openai-community/gpt2...


Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.10.ln_cross_attn.bias', 'h.10.ln_cross_attn.weight', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.weigh

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(i

# Train

In [43]:
% load_ext autoreload   
% autoreload 2
# Initialize metrics storage
train_losses = []
val_losses = []
bleu_scores = []
meteor_scores = []
cider_scores = []

# Define optimizer
optimizer = optim.Adam(
    model.parameters(),
    lr=config["learning_rate"],
    weight_decay=config["weight_decay"],
)

# Total training steps
num_epochs = config["num_epochs"]
total_steps = num_epochs * len(train_loader)

# Initialize the learning rate scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=2,
    verbose=True,
)

# Prepare image to captions mapping for evaluation
val_image2captions = prepare_image2captions(val_images, captions_seqs, idx2word, special_token_mapping=special_token_mapping)

# Training loop
for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    total_train_loss = 0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for images, captions, _ in pbar:
        images = images.to(device)
        captions = captions.to(device)

        # Forward pass
        outputs = model(pixel_values=images, labels=captions)
        loss = outputs.loss
        total_train_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update parameters
        optimizer.step()

        pbar.set_postfix({"loss": loss.item()})

    # Calculate average training loss for the epoch
    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        pbar_val = tqdm(val_loader, desc="Validation")
        for images, captions, _ in pbar_val:
            images = images.to(device)
            captions = captions.to(device)

            outputs = model(pixel_values=images, labels=captions)
            loss = outputs.loss
            total_val_loss += loss.item()
            pbar_val.set_postfix({"val_loss": loss.item()})

    val_loss = total_val_loss / len(val_loader)  # average validation loss
    val_losses.append(val_loss)

    # Calculate evaluation metrics
    metrics = calculate_metrics(
        model=model,
        image_dir=config["image_dir"],
        image_ids=val_images,
        image2captions=val_image2captions,
        transform=val_transform,
        tokenizer=tokenizer,
        device=device,
        max_length=config["max_length"],
        verbose=True,
        
    )
    bleu_scores.append(metrics["bleu"])
    meteor_scores.append(metrics["meteor"])
    cider_scores.append(metrics["cider"])

    # Update learning rate scheduler
    scheduler.step(val_loss)

    # Print epoch summary including new metrics
    epoch_duration = time.time() - start_time
    print(
        f"\nEpoch [{epoch+1}/{num_epochs}] completed in {epoch_duration:.2f}s"
        f"\nTraining Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}"
        f"\nBLEU Score: {metrics['bleu']:.4f}, METEOR Score: {metrics['meteor']:.4f}, CIDEr Score: {metrics['cider']:.4f}\n"
    )

Epoch 1/10:   4%|▍         | 44/1011 [04:31<1:42:30,  6.36s/it, loss=2.78]

In [None]:
# simply save the model
model.save_pretrained(config['model_save_dir'])

In [1]:
! pip list

Package                   Version
------------------------- ----------
accelerate                1.0.1
aiohappyeyeballs          2.4.3
aiohttp                   3.10.5
aiosignal                 1.3.1
altair                    5.4.1
annotated-types           0.7.0
anyio                     4.4.0
appnope                   0.1.4
asttokens                 2.4.1
async-timeout             4.0.3
attrs                     24.2.0
backcall                  0.2.0
backports.zoneinfo        0.2.1
blinker                   1.8.2
blis                      0.7.10
Brotli                    1.0.9
cachetools                5.5.0
catalogue                 2.0.10
certifi                   2024.8.30
cffi                      1.14.6
cfgv                      3.3.1
charset-normalizer        3.4.0
click                     8.1.7
cloudpathlib              0.20.0
colorama                  0.4.6
comm                      0.2.2
confection                0.1.4
contourpy                 1.0.5
cycler                 