In [None]:
import kagglehub

# Fetch the SentiCap dataset from Kaggle (latest published version).
# dataset_download() returns the local directory holding the files; it is
# kept in `path` because the training cell builds its CSV/image paths from it.
path = kagglehub.dataset_download("prathamsaraf1389/senticap")

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/prathamsaraf1389/senticap?dataset_version_number=1...


100%|██████████| 347M/347M [00:07<00:00, 48.1MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/prathamsaraf1389/senticap/versions/1


In [None]:
%pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from transformers import (
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    AutoTokenizer,
    ViTModel,
    GPT2LMHeadModel,
    VisionEncoderDecoderConfig,
    GPT2Config,
    get_linear_schedule_with_warmup
)
from transformers import AdamW
from tqdm.auto import tqdm
import torch.cuda.amp as amp
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.multiprocessing as mp

class ImageCaptioningDataset(Dataset):
    """Image/caption pairs read from a CSV file, for encoder-decoder training.

    The CSV must contain a ``filename`` column (image file name, joined onto
    ``img_dir``) and a ``raw`` column (caption text). Rows whose image file
    does not exist on disk are dropped once, at construction time.

    Each item is a dict with:
        pixel_values: the ``processor`` output for the RGB image, with the
            leading batch dimension removed.
        labels: caption token ids padded/truncated to ``max_length``. One EOS
            token is appended to the caption and every padding position is
            replaced by ``IGNORE_INDEX`` so the loss only covers real tokens.

    An item that cannot be loaded or tokenized is reported on stdout and
    returned as ``None``; the collate function is expected to drop it.

    Args:
        csv_file: path to the annotation CSV.
        img_dir: directory containing the image files.
        processor: image processor called as ``processor(image, return_tensors="pt")``.
        tokenizer: tokenizer exposing ``eos_token`` and returning ``input_ids``
            and ``attention_mask``.
        max_length: fixed label length after padding/truncation.
    """

    # Label value skipped by the cross-entropy loss used by the decoder.
    IGNORE_INDEX = -100

    def __init__(self, csv_file, img_dir, processor, tokenizer, max_length=128):
        self.img_dir = img_dir
        self.processor = processor
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Pre-filter invalid images to avoid checking during training
        frame = pd.read_csv(csv_file)
        frame['image_path'] = frame['filename'].apply(lambda name: os.path.join(img_dir, name))
        # astype(bool) keeps the mask boolean even when the CSV has no rows.
        on_disk = frame['image_path'].apply(os.path.exists).astype(bool)
        self.df = frame[on_disk].reset_index(drop=True)
        print(f"Found {len(self.df)} valid image-text pairs")

    def __len__(self):
        return len(self.df)

    def _encode_caption(self, text):
        """Tokenize one caption into a 1-D label tensor of length ``max_length``.

        The tokenizer pads with the EOS id (pad_token == eos_token), so padding
        cannot be told apart from a real EOS by id alone. An explicit EOS is
        therefore appended to the text, and padding is located through the
        attention mask and replaced by ``IGNORE_INDEX``.
        """
        eos_token = self.tokenizer.eos_token or ""
        encoded = self.tokenizer(
            text + eos_token,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )
        # squeeze(0) removes only the batch dimension, whatever max_length is.
        token_ids = encoded.input_ids.squeeze(0)
        attention_mask = encoded.attention_mask.squeeze(0)
        return token_ids.masked_fill(attention_mask == 0, self.IGNORE_INDEX)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = row['image_path']
        text = row['raw']

        try:
            # The context manager closes the file handle right after decoding;
            # convert() returns an independent in-memory copy.
            with Image.open(image_path) as image_file:
                image = image_file.convert('RGB')
            pixel_values = self.processor(image, return_tensors="pt").pixel_values.squeeze(0)

            labels = self._encode_caption(text)

            return {
                "pixel_values": pixel_values,
                "labels": labels
            }
        except Exception as e:
            # Deliberate best-effort: one unreadable sample must not abort an epoch.
            print(f"Error processing {image_path}: {str(e)}")
            return None


def collate_fn(batch):
    """Stack the usable samples of a batch, skipping failed ones.

    Samples equal to ``None`` (items the dataset could not produce) are
    discarded. Returns ``None`` when nothing is left, otherwise a dict whose
    "pixel_values" and "labels" entries are the stacked per-sample tensors.
    """
    usable = list(filter(lambda sample: sample is not None, batch))
    if len(usable) == 0:
        return None

    return {
        key: torch.stack([sample[key] for sample in usable])
        for key in ("pixel_values", "labels")
    }

def _build_vit_gpt2_captioner(tokenizer):
    """Assemble a ViT encoder + GPT-2 decoder VisionEncoderDecoderModel."""
    encoder = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

    # GPT-2 ships without cross-attention; enable it so the decoder can attend
    # to the image features (those weights are newly initialized).
    decoder_config = GPT2Config.from_pretrained(
        "gpt2",
        add_cross_attention=True,
        is_decoder=True
    )
    decoder = GPT2LMHeadModel.from_pretrained("gpt2", config=decoder_config)

    config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(
        encoder.config,
        decoder_config
    )
    model = VisionEncoderDecoderModel(config=config, encoder=encoder, decoder=decoder)

    # Token ids the model needs to build decoder inputs from the labels.
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.decoder_start_token_id = tokenizer.bos_token_id
    model.config.eos_token_id = tokenizer.eos_token_id
    return model


def _save_training_artifacts(model, processor, tokenizer, target_dir):
    """Write model (unwrapped from DDP if needed), processor and tokenizer to target_dir."""
    os.makedirs(target_dir, exist_ok=True)
    unwrapped = model.module if isinstance(model, DDP) else model
    unwrapped.save_pretrained(target_dir)
    processor.save_pretrained(target_dir)
    tokenizer.save_pretrained(target_dir)


def train_model(
    csv_path,
    img_dir,
    output_dir,
    num_epochs=10,
    batch_size=32,
    learning_rate=2e-4,
    max_length=128,
    num_workers=4,
    gradient_accumulation_steps=4,
    warmup_steps=1000,
    fp16=True,
    device='cuda' if torch.cuda.is_available() else 'cpu'
):
    """Fine-tune a ViT + GPT-2 image-captioning model on a CSV dataset.

    Args:
        csv_path: annotation CSV consumed by ImageCaptioningDataset.
        img_dir: directory containing the images referenced by the CSV.
        output_dir: where the final model, processor and tokenizer are saved;
            per-epoch checkpoints go to ``checkpoint-epoch-N`` subdirectories.
        num_epochs: number of passes over the dataset.
        batch_size: samples per micro-batch (per process).
        learning_rate: peak AdamW learning rate.
        max_length: caption length after padding/truncation.
        num_workers: DataLoader worker processes (0 loads in the main process).
        gradient_accumulation_steps: micro-batches accumulated per optimizer update.
        warmup_steps: linear warmup length, counted in optimizer updates.
        fp16: use CUDA mixed precision; ignored when not training on CUDA.
        device: training device when not running distributed.

    Raises:
        ValueError: if ``gradient_accumulation_steps`` is smaller than 1.

    Distributed training (DDP) is used only when more than one GPU is visible
    AND the process was started by a launcher that sets ``LOCAL_RANK`` (e.g.
    torchrun). In a plain notebook kernel training runs on a single device.
    """
    if gradient_accumulation_steps < 1:
        raise ValueError("gradient_accumulation_steps must be >= 1")

    # Enable cuDNN autotuner
    torch.backends.cudnn.benchmark = True

    # init_process_group() needs the launcher's environment (RANK, WORLD_SIZE,
    # MASTER_ADDR, ...); merely seeing several GPUs is not enough.
    distributed = (
        torch.cuda.device_count() > 1
        and dist.is_available()
        and "LOCAL_RANK" in os.environ
    )
    if distributed:
        dist.init_process_group(backend='nccl')
        rank = dist.get_rank()
        # The GPU index on this node is LOCAL_RANK, not the global rank.
        local_rank = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
    else:
        rank = 0
        device = torch.device(device)

    is_main_process = rank == 0
    on_cuda = device.type == "cuda"
    use_amp = bool(fp16) and on_cuda

    try:
        # Initialize processor and tokenizer
        processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        # GPT-2 has no pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token

        model = _build_vit_gpt2_captioner(tokenizer)

        dataset = ImageCaptioningDataset(
            csv_file=csv_path,
            img_dir=img_dir,
            processor=processor,
            tokenizer=tokenizer,
            max_length=max_length
        )

        sampler = torch.utils.data.DistributedSampler(dataset) if distributed else None

        loader_kwargs = dict(
            batch_size=batch_size,
            shuffle=(sampler is None),
            sampler=sampler,
            num_workers=num_workers,
            pin_memory=on_cuda,  # pinned host memory only helps CUDA transfers
            collate_fn=collate_fn,
        )
        if num_workers > 0:
            # DataLoader rejects prefetch_factor when loading in the main process.
            loader_kwargs["prefetch_factor"] = 2
        dataloader = DataLoader(dataset, **loader_kwargs)

        model = model.to(device)
        if distributed:
            model = DDP(model, device_ids=[device.index])

        # torch.optim.AdamW replaces the deprecated transformers.AdamW that the
        # notebook's import cell still pulls in.
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=learning_rate,
            weight_decay=0.01
        )

        # One optimizer update per full accumulation window, plus one for the
        # partial window at the end of an epoch (ceil division).
        updates_per_epoch = (
            len(dataloader) + gradient_accumulation_steps - 1
        ) // gradient_accumulation_steps
        num_training_steps = updates_per_epoch * num_epochs
        if is_main_process and 0 < num_training_steps <= warmup_steps:
            print(
                f"Warning: warmup_steps={warmup_steps} is not smaller than the "
                f"{num_training_steps} total optimizer steps; the learning rate "
                "will never leave warmup."
            )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_training_steps
        )

        # With enabled=False the scaler and autocast are pass-throughs, so one
        # code path serves both mixed- and full-precision training.
        scaler = torch.amp.GradScaler("cuda", enabled=use_amp)
        autocast_device = "cuda" if on_cuda else "cpu"

        def apply_accumulated_gradients():
            """Clip, step the optimizer, then advance the LR schedule."""
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            # A reduced scale means inf/nan gradients were found and the
            # optimizer step was skipped; keep the schedule in sync with it.
            if scaler.get_scale() >= scale_before:
                scheduler.step()
            optimizer.zero_grad()

        model.train()
        for epoch in range(num_epochs):
            if sampler is not None:
                sampler.set_epoch(epoch)

            total_loss = 0.0
            batches_seen = 0
            pending = 0  # micro-batches accumulated since the last update
            optimizer.zero_grad()
            progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")

            for batch in progress_bar:
                if batch is None:
                    # Every sample in this batch failed to load.
                    continue

                pixel_values = batch["pixel_values"].to(device)
                labels = batch["labels"].to(device)

                with torch.autocast(device_type=autocast_device, enabled=use_amp):
                    outputs = model(pixel_values=pixel_values, labels=labels)
                    loss = outputs.loss / gradient_accumulation_steps

                scaler.scale(loss).backward()
                pending += 1

                if pending == gradient_accumulation_steps:
                    apply_accumulated_gradients()
                    pending = 0

                batch_loss = loss.item() * gradient_accumulation_steps
                total_loss += batch_loss
                batches_seen += 1

                progress_bar.set_postfix({
                    "loss": batch_loss,
                    "lr": scheduler.get_last_lr()[0]
                })

            if pending:
                # Apply the trailing partial window instead of discarding it.
                # Its gradient is still divided by the full window size, so
                # this last update is proportionally smaller.
                apply_accumulated_gradients()

            # Average over batches actually trained on, not skipped ones.
            avg_loss = total_loss / max(batches_seen, 1)
            print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

            # Save a checkpoint after every epoch (main process only under DDP)
            if is_main_process:
                checkpoint_dir = os.path.join(output_dir, f"checkpoint-epoch-{epoch + 1}")
                _save_training_artifacts(model, processor, tokenizer, checkpoint_dir)

        # Save final model (main process only under DDP)
        if is_main_process:
            _save_training_artifacts(model, processor, tokenizer, output_dir)
            print(f"Model saved to {output_dir}")
    finally:
        # Tear the process group down even if training fails part-way.
        if distributed:
            dist.destroy_process_group()



# Usage example. `path` is the dataset directory returned by
# kagglehub.dataset_download() in the first cell, so that cell must run first.
if __name__ == "__main__":
    train_model(
        csv_path=f"{path}/senticap.csv",
        img_dir=f"{path}/senticap_images",
        output_dir="./model",
        num_epochs=3,
        batch_size=32,
        learning_rate=5e-5,
        # 39,109 pairs at batch_size=32 give 1,223 batches per epoch; with the
        # default accumulation of 4 that is roughly 900 optimizer updates over
        # 3 epochs. The default warmup_steps=1000 would exceed the whole run,
        # so warm up for about 10% of it instead.
        warmup_steps=90
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.10.ln_cross_attn.bias', 'h.10.ln_cross_attn.weight', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.weight', 'h.11.crossat

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "is_decoder": true,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.48.3",


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Found 39109 valid image-text pairs


  scaler = amp.GradScaler() if fp16 else None


Epoch 1/3:   0%|          | 0/1223 [00:00<?, ?it/s]

  with amp.autocast():


Epoch 1/3, Average Loss: 1.9757




Epoch 2/3:   0%|          | 0/1223 [00:00<?, ?it/s]

  with amp.autocast():


Epoch 2/3, Average Loss: 0.3308




Epoch 3/3:   0%|          | 0/1223 [00:00<?, ?it/s]

  with amp.autocast():


Epoch 3/3, Average Loss: 0.2711
Model saved to ./model


In [None]:
import torch
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image

# Hugging Face Hub repository of the captioning model fine-tuned on SentiCap
# (replace with your own repository name to evaluate a different checkpoint)
model_name = "hammadali1805/vit-gpt2-finetuned-senticap-image-captioning"

# Load the model, processor, and tokenizer from the Hub
model = VisionEncoderDecoderModel.from_pretrained(model_name)
processor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Move the model to the appropriate device and make inference mode explicit
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

def generate_caption(image_path, **generate_kwargs):
    """Return the caption the loaded model generates for one image file.

    Args:
        image_path: path of the image to caption.
        **generate_kwargs: optional overrides forwarded to ``model.generate``
            (e.g. ``max_new_tokens``, ``num_beams``).

    Returns:
        The decoded caption with special tokens removed and surrounding
        whitespace stripped.
    """
    # Close the file handle as soon as the pixels are decoded.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

    # generate() falls back to the EOS id for padding and warns about it when
    # no pad id is configured; pass the same id explicitly instead.
    generation_config = getattr(model, "generation_config", None)
    if getattr(generation_config, "pad_token_id", None) is None:
        generate_kwargs.setdefault("pad_token_id", tokenizer.eos_token_id)

    # Generate output ids from the model
    output_ids = model.generate(pixel_values, **generate_kwargs)

    # Decode the output ids to text
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption.strip()

# Example usage:
image_path = "/content/image.jpg"
caption = generate_caption(image_path)
print("Caption:", caption)


config.json:   0%|          | 0.00/4.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/957M [00:00<?, ?B/s]

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "is_decoder": true,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.48.3",


generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/351 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Caption: A group of stupid people standing around a table with a beer.


In [None]:
import torch
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image

# Public ViT + GPT-2 captioning model from the Hugging Face Hub, loaded here
# to caption the same image as the fine-tuned model for comparison.
# NOTE: this cell rebinds `model`, `processor`, `tokenizer` and
# `generate_caption`; re-run the previous cell to get the fine-tuned model back.
model_name = "nlpconnect/vit-gpt2-image-captioning"

# Load the model, processor, and tokenizer from the Hub
model = VisionEncoderDecoderModel.from_pretrained(model_name)
processor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Move the model to the appropriate device and make inference mode explicit
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

def generate_caption(image_path, **generate_kwargs):
    """Return the caption the loaded model generates for one image file.

    Args:
        image_path: path of the image to caption.
        **generate_kwargs: optional overrides forwarded to ``model.generate``
            (e.g. ``max_new_tokens``, ``num_beams``).

    Returns:
        The decoded caption with special tokens removed and surrounding
        whitespace stripped.
    """
    # Close the file handle as soon as the pixels are decoded.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

    # Only supply a pad id when the model's generation config has none, so a
    # value shipped with the checkpoint is never overridden.
    generation_config = getattr(model, "generation_config", None)
    if getattr(generation_config, "pad_token_id", None) is None:
        generate_kwargs.setdefault("pad_token_id", tokenizer.eos_token_id)

    # Generate output ids from the model
    output_ids = model.generate(pixel_values, **generate_kwargs)

    # Decode the output ids to text
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption.strip()

# Example usage:
image_path = "/content/image.jpg"
caption = generate_caption(image_path)
print("Caption:", caption)

config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/982M [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.48.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Caption: a man in a suit and tie standing next to a man in a white shirt 
