In [1]:
pip install torch torchvision transformers datasets evaluate nltk pillow


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia

In [2]:
!pip install --upgrade huggingface_hub transformers
from huggingface_hub import login

Collecting huggingface_hub
  Downloading huggingface_hub-0.31.1-py3-none-any.whl.metadata (13 kB)
Collecting hf-xet<2.0.0,>=1.1.0 (from huggingface_hub)
  Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading huggingface_hub-0.31.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.3/484.3 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf-xet, huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.30.2
    Uninstalling huggingface-hub-0.30.2:
      Successfully uninstalled huggingface-hub-0.30.2
Successfully installed hf-xet-1.1.0 huggingface_hub-0.31.1


In [3]:
import nltk
# Fetch the WordNet corpus and the Open Multilingual WordNet 1.4 package into
# the local nltk data directory.
# NOTE(review): presumably required by the METEOR metric computed later in the
# notebook — confirm before removing either download.
nltk.download('wordnet')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import os
import random
import pandas as pd
import torch
from torch.optim import AdamW
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import evaluate

# ── 1) Paths & Device ─────────────────────────────────────────────────────────
# Dataset locations under /content/drive; tensors go to the GPU when PyTorch
# can see one and to the CPU otherwise.
images_dir   = "/content/drive/MyDrive/flickr30k_images/flickr30k_images"
captions_csv = "/content/drive/MyDrive/flickr30k_images/results.csv"
device       = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ── 2) Load & group the reference captions per image ─────────────────────────
# header=0 discards the file's own header row in favour of the names below.
df = pd.read_csv(
    captions_csv,
    sep=r"\|",
    engine="python",
    names=["image_name", "comment_number", "comment"],
    header=0
)
# The regex separator keeps any whitespace that surrounds "|" inside the
# fields, and a row with fewer than three fields is parsed with NaN in the
# trailing columns. A NaN or blank caption is not valid text for the processor
# or the metrics, so normalise the fields and drop unusable rows up front.
df = (
    df.dropna(subset=["image_name", "comment"])
      .assign(
          image_name=lambda d: d["image_name"].astype(str).str.strip(),
          comment=lambda d: d["comment"].astype(str).str.strip(),
      )
      .loc[lambda d: d["comment"] != ""]
)
# refs maps each image file name to the list of its reference captions.
refs = df.groupby("image_name")["comment"].apply(list).to_dict()

# ── 3) Sample 10% of image IDs for evaluation ─────────────────────────────────
# Seeding right before sampling makes the chosen subset reproducible.
all_ids    = list(refs.keys())
random.seed(42)
sample_ids = random.sample(all_ids, int(len(all_ids) * 0.1))

# ── 4) Load BLIP v1 model & processor ────────────────────────────────────────
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base", use_fast=True)
model     = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# ── 5) Optimizer & switch to train mode ──────────────────────────────────────
optimizer = AdamW(model.parameters(), lr=5e-5)
model.train()

# ── 6) Robust image loader ───────────────────────────────────────────────────
def open_image(name, directory=None):
    """Load an image as RGB, trying several file extensions for its base name.

    The extension of ``name`` is ignored: the base name is combined with
    ``.jpg``, ``.jpeg`` and ``.png`` in that order and the first existing file
    is used.

    Args:
        name: Image file name (with or without an extension).
        directory: Folder to search. Defaults to the notebook-level
            ``images_dir`` when omitted.

    Returns:
        A ``PIL.Image.Image`` in RGB mode, fully loaded into memory.

    Raises:
        FileNotFoundError: If no candidate file exists in ``directory``.
    """
    if directory is None:
        directory = images_dir
    base, _ = os.path.splitext(name)
    for ext in (".jpg", ".jpeg", ".png"):
        path = os.path.join(directory, base + ext)
        if os.path.exists(path):
            # Image.open is lazy and keeps the file open; convert() returns a
            # new image, so the source handle can be closed as soon as the
            # conversion is done instead of waiting for garbage collection.
            with Image.open(path) as source:
                return source.convert("RGB")
    raise FileNotFoundError(f"No file found for base name '{name}' "
                            f"(looked for {base}.[jpg|jpeg|png])")

# ── 7) Fine-tune for 3 epochs over the (image, caption) pairs that are NOT in
#       the evaluation sample ────────────────────────────────────────────────
num_epochs = 3

# Images in sample_ids are scored after training. Training on them as well
# would leak the evaluation data into the model and inflate the metrics, so
# they are excluded here. A set gives O(1) membership checks.
held_out_ids = set(sample_ids)

for epoch in range(num_epochs):
    total_loss = 0.0
    count = 0
    for img_name, captions in refs.items():
        if img_name in held_out_ids:
            continue

        try:
            img = open_image(img_name)
        except FileNotFoundError:
            # skip if image missing
            continue

        # One optimisation step per caption (batch size 1).
        for caption in captions:
            encoding = processor(
                images=img,
                text=caption,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=32
            ).to(device)

            # Captions are padded to 32 tokens. Without masking, the loss is
            # averaged over the padding positions too, so the model is mostly
            # trained to emit [PAD]. -100 is the index cross-entropy ignores.
            labels = encoding["input_ids"].masked_fill(
                encoding["attention_mask"] == 0, -100
            )

            outputs = model(**encoding, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            count += 1

    # NaN signals that no pair was trained on (e.g. every image was missing).
    avg_loss = total_loss / count if count > 0 else float("nan")
    print(f"Epoch {epoch+1}/{num_epochs} — avg loss: {avg_loss:.4f}")

# ── 8) Inference from here on: put the model in evaluation mode ──────────────
model.eval()

# ── 9) Caption every image of the 10% sample ─────────────────────────────────
# predictions[i] is the generated caption and references_list[i] the list of
# reference captions for the same image; images that cannot be found are
# reported and left out of both lists.
predictions, references_list = [], []

for img_name in sample_ids:
    try:
        img = open_image(img_name)
    except FileNotFoundError as e:
        print(e)
    else:
        pixel_inputs = processor(images=img, return_tensors="pt").to(device)
        generated    = model.generate(**pixel_inputs, max_new_tokens=32)
        decoded      = processor.decode(generated[0], skip_special_tokens=True)

        predictions.append(decoded.strip())
        references_list.append(refs[img_name])

# ── 10) Compute BLEU-1, BLEU-2 & METEOR ───────────────────────────────────────
bleu   = evaluate.load("bleu")
meteor = evaluate.load("meteor")

# Same metric object, evaluated once per n-gram order (1 first, then 2).
bleu1, bleu2 = (
    bleu.compute(
        predictions=predictions,
        references=references_list,
        max_order=order,
    )["bleu"]
    for order in (1, 2)
)
meteor_result = meteor.compute(predictions=predictions,
                               references=references_list)
met = meteor_result["meteor"]

print(f"\nPost-fine-tuning scores:")
print(f"  BLEU-1: {bleu1:.4f}")
print(f"  BLEU-2: {bleu2:.4f}")
print(f"  METEOR: {met:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]