<a href="https://colab.research.google.com/github/inuwamobarak/image-capturing-pre-trained/blob/main/Image_Caption_Generation_Using_Generative_Artificial_Intelligence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing requirements

In [None]:
# Install required packages for the code to run
!pip install transformers rouge_score evaluate datasets

## Libraries and Packages

In [None]:
# Importing necessary libraries and packages

import requests  # Library for making HTTP requests
import torch  # PyTorch for deep learning
from PIL import Image  # Library for image processing
from transformers import *  # Transformers library for NLP tasks
from tqdm import tqdm  # Library for displaying progress bars
import numpy as np # Library for numerical manipulation

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## Loading Encoders and Decoders

In [None]:
# Image encoder checkpoint (turns the image into features).
# Alternatives that can be swapped in:
# encoder_model = "WinKawaks/vit-small-patch16-224"
# encoder_model = "google/vit-base-patch16-224"
# encoder_model = "google/vit-base-patch16-224-in21k"
encoder_model = "microsoft/swin-base-patch4-window7-224-in22k"

# Text decoder checkpoint (turns image features into a caption).
# Alternatives that can be swapped in:
# decoder_model = "bert-base-uncased"
# decoder_model = "prajjwal1/bert-tiny"
decoder_model = "gpt2"

# Combine the two pre-trained checkpoints into one encoder-decoder model,
# then place it on the selected device
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=encoder_model,
    decoder_pretrained_model_name_or_path=decoder_model,
)
model = model.to(device)

## Loading Tokenizers and Image Processors

In [None]:
## Tokenizer

# Converts caption text into the token ids consumed by the decoder.
# Pick the class that matches `decoder_model`:
# tokenizer = AutoTokenizer.from_pretrained(decoder_model)
tokenizer = GPT2TokenizerFast.from_pretrained(
    pretrained_model_name_or_path=decoder_model
)
# tokenizer = BertTokenizerFast.from_pretrained(decoder_model)

## Image processor

# Converts raw images into the pixel tensors consumed by the encoder.
# ViTImageProcessor is the one used here for the "google/vit-base-patch16-224"
# and "microsoft/swin-base-patch4-window7-224-in22k" encoder checkpoints.
image_processor = ViTImageProcessor.from_pretrained(
    pretrained_model_name_or_path=encoder_model
)

## Configuring the Model and Tokenizer for the Decoder

In [None]:
# Tell the model which special token ids to use, depending on the decoder family
if "gpt2" not in decoder_model:
    # Non-GPT-2 decoders: take the pad token from the tokenizer and start
    # decoding from its CLS token
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.decoder_start_token_id = tokenizer.cls_token_id
else:
    # GPT-2: reuse the end-of-sequence token as the padding token. This must
    # happen before `tokenizer.pad_token_id` is read below.
    tokenizer.pad_token = tokenizer.eos_token
    model.config.decoder_start_token_id = tokenizer.bos_token_id  # start decoding from BOS
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

# Importing and Loading the Dataset

In [None]:
from datasets import load_dataset

max_length = 32          # caption length limit, in tokens
coco_dataset_ratio = 50  # percentage of the train/validation splits to load

# Load the requested share of the train and validation splits, and the whole test split
train_ds = load_dataset(
    path="HuggingFaceM4/COCO", split=f"train[:{coco_dataset_ratio}%]"
)
valid_ds = load_dataset(
    path="HuggingFaceM4/COCO", split=f"validation[:{coco_dataset_ratio}%]"
)
test_ds = load_dataset(path="HuggingFaceM4/COCO", split="test")

# Count the examples in every split
train_len, valid_len, test_len = map(len, (train_ds, valid_ds, test_ds))

train_len, valid_len, test_len  # notebook display of the three sizes

# Removing Images with Fewer than 3 Dimensions

In [None]:
def _has_3_or_4_dims(item):
    """Return True when the example's image, as a NumPy array, has 3 or 4 dimensions."""
    return np.array(item["image"]).ndim in [3, 4]


# Keep only examples whose image array has 3 or 4 dimensions; images with
# fewer dimensions (possibly grayscale) are dropped from every split
train_ds = train_ds.filter(_has_3_or_4_dims, num_proc=2)
valid_ds = valid_ds.filter(_has_3_or_4_dims, num_proc=2)
test_ds = test_ds.filter(_has_3_or_4_dims, num_proc=2)

# Dataset Preprocessing

In [None]:
def preprocess(items):
    """Turn images and their captions into model-ready tensors.

    Args:
        items: Mapping with an "image" entry (passed to `image_processor`) and a
            "sentences" entry that is iterated over; each element must provide
            its caption text under "raw".
            NOTE(review): iterating "sentences" implies a batch of examples
            rather than a single example - confirm against the caller.

    Returns:
        dict with "pixel_values" (the processed images) and "labels" (caption
        token ids padded/truncated to `max_length`), both moved to `device`.
    """
    # Images -> pixel tensors on the target device
    image_features = image_processor(items["image"], return_tensors="pt")
    pixel_values = image_features.pixel_values.to(device)

    # Captions -> fixed-length token id tensors on the target device
    captions = [sentence["raw"] for sentence in items["sentences"]]
    targets = tokenizer(
        captions,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    targets = targets.to(device)

    return {'pixel_values': pixel_values, 'labels': targets["input_ids"]}

# Attach `preprocess` as an on-the-fly transform, so it runs on batches only when
# examples are accessed during training/evaluation.
# `with_transform` is used instead of `map` because `preprocess` iterates over
# `items["sentences"]` and therefore needs batched input, and because the
# downstream code (`collate_fn`, the shape check) expects torch tensors rather
# than values cached as lists in the Arrow table.
train_dataset = train_ds.with_transform(preprocess)
valid_dataset = valid_ds.with_transform(preprocess)
test_dataset = test_ds.with_transform(preprocess)

# Batch Collation Function

In [None]:
def collate_fn(batch):
    """Merge a list of examples into one batch.

    Args:
        batch: Sequence of dicts, each holding a 'pixel_values' tensor and a
            'labels' tensor.

    Returns:
        dict with the same two keys, where each value is the per-example
        tensors stacked along a new leading dimension.
    """
    images = [example['pixel_values'] for example in batch]
    stacked_images = torch.stack(images)

    captions = [example['labels'] for example in batch]
    stacked_captions = torch.stack(captions)

    return {'pixel_values': stacked_images, 'labels': stacked_captions}

# This function takes a batch of preprocessed examples and stacks the pixel values and labels into tensors. It will be used by the data loader to collate the samples into batches.

# Metrics Computation

In [None]:
import evaluate

# Load the two text-overlap metrics used to score captions
rouge, bleu = (evaluate.load(metric_name) for metric_name in ("rouge", "bleu"))

def compute_metrics(eval_pred):
    """Score generated captions against the reference captions.

    Args:
        eval_pred: Evaluation output exposing `predictions` (generated token
            ids) and `label_ids` (reference token ids).

    Returns:
        dict containing every ROUGE score and "bleu", each scaled by 100 and
        rounded to 4 decimals, plus "gen_len": the BLEU metric's
        `translation_length` divided by the number of predictions.
    """
    # `predictions` are the model outputs and `label_ids` the references;
    # swapping them would score the references against the generations.
    preds = eval_pred.predictions
    labels = eval_pred.label_ids
    if isinstance(preds, tuple):
        # Some models return extra outputs alongside the generated ids
        preds = preds[0]

    # -100 is used as a padding/ignore index and cannot be decoded, so swap it
    # for the tokenizer's pad id (dropped again by skip_special_tokens)
    preds = np.where(np.asarray(preds) != -100, preds, tokenizer.pad_token_id)
    labels = np.where(np.asarray(labels) != -100, labels, tokenizer.pad_token_id)

    # Turn token ids back into text
    pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE, reported on a 0-100 scale
    rouge_result = rouge.compute(predictions=pred_str, references=labels_str)
    rouge_result = {k: round(v * 100, 4) for k, v in rouge_result.items()}

    # BLEU; its result also carries the total length of the predictions
    bleu_result = bleu.compute(predictions=pred_str, references=labels_str)

    return {
        **rouge_result,
        "bleu": round(bleu_result["bleu"] * 100, 4),
        "gen_len": bleu_result["translation_length"] / len(preds)
    }

# This function takes the evaluation predictions (including label ids and predicted ids) and computes the ROUGE and BLEU scores for the generated captions. It also calculates the average generation length.

# Training Parameters

In [None]:
# Training schedule: (number of epochs, examples per batch)
num_epochs, batch_size = 2, 16

# Set the number of training epochs and the batch size. Adjust these values according to your specific requirements.

# Dataset Example Shapes

In [None]:
# Peek at the first training example only and report the shape of each tensor it carries.
for item in train_dataset:
    for field in ("labels", "pixel_values"):
        print(item[field].shape)
    break

# Training Arguments

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration handed to the trainer
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written
    output_dir="vit-swin-base-224-gpt2-image-captioning",
    # Schedule and batch sizes
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate, log and save on the same 2000-step cadence
    evaluation_strategy="steps",
    eval_steps=2000,
    logging_steps=2000,
    save_steps=2000,
    # Produce captions with generate() at evaluation time so that text metrics can be computed
    predict_with_generate=True,
    # push_to_hub=True # Whether you want to push the model to the hub
    # Check this guide for more details: https://huggingface.co/transformers/model_sharing.html
)

# DataLoader Functions

In [None]:
from torch.utils.data import DataLoader

def get_eval_loader(eval_dataset=None):
    """Build the validation DataLoader.

    Args:
        eval_dataset: Accepted so the signature matches what the trainer
            calls, but not used; the loader always reads `valid_dataset`.

    Returns:
        DataLoader over `valid_dataset`, batched with `batch_size` and
        collated with `collate_fn`.
    """
    loader = DataLoader(
        dataset=valid_dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
    )
    return loader

def get_test_loader(eval_dataset=None):
    """Build the test DataLoader.

    Args:
        eval_dataset: Accepted so the signature matches what the trainer
            calls, but not used; the loader always reads `test_dataset`.

    Returns:
        DataLoader over `test_dataset`, batched with `batch_size` and
        collated with `collate_fn`.
    """
    loader = DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
    )
    return loader

# Replace the trainer's `get_train_dataloader`, `get_eval_dataloader`, and
# `get_test_dataloader` methods with loaders built from our own datasets and
# `collate_fn`, so that batches are assembled the way the model expects.
# NOTE(review): `trainer` is not created in the cells above - confirm it is
# instantiated before this cell runs.

def _get_train_loader():
    """Build the training DataLoader over `train_dataset`."""
    return DataLoader(train_dataset, collate_fn=collate_fn, batch_size=batch_size)


trainer.get_train_dataloader = _get_train_loader
trainer.get_eval_dataloader = get_eval_loader
trainer.get_test_dataloader = get_test_loader

# These functions define the data loaders for training, evaluation, and testing. We override the default methods in the trainer to use our custom data loaders that properly collate the batches using the `collate_fn` function.