In [1]:
# Runtime dependencies for the captioning evaluation in the cells that follow.
# `%pip` is spelled out explicitly: a bare `pip` line depends on IPython's automagic,
# which is only applied to single-line cells and can be switched off.
%pip install torch torchvision transformers datasets evaluate nltk pillow


Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia

In [2]:
# Fetch the NLTK data packages 'wordnet' and 'omw-1.4' ahead of the evaluation.
# NOTE(review): presumably these are for the METEOR metric loaded in a later cell,
# whose log output mentions the same packages — confirm.
import nltk
nltk.download('wordnet')
# Last expression of the cell, so its return value is what the cell displays.
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
# `%pip` (rather than `!pip`) installs into the environment of the running kernel;
# a `!` shell call uses whichever `pip` happens to be first on PATH.
%pip install --upgrade huggingface_hub transformers
# NOTE(review): `login` is imported but never called in the visible cells — presumably
# meant for authenticating against the Hugging Face Hub; confirm or drop.
from huggingface_hub import login






In [6]:
import os
import random
import pandas as pd
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import evaluate

# 1) Paths
images_dir   = "/content/drive/MyDrive/flickr30k_images/flickr30k_images"
captions_csv = "/content/drive/MyDrive/flickr30k_images/results.csv"

# Share of the images to evaluate, and the seed that makes the draw repeatable.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED     = 42

# 2) Load & group your 5 refs per image
# The file is pipe-delimited; header=0 discards its own header row in favour of `names`.
df = pd.read_csv(
    captions_csv,
    sep=r"\|",
    engine="python",
    names=["image_name","comment_number","comment"],
    header=0
)
# A row that parses with too few fields leaves `comment` as NaN; dropping those rows
# keeps floats out of the reference lists handed to the metrics. Stripping also trims
# any padding left around the delimiter.
refs = (
    df.dropna(subset=["comment"])
      .assign(comment=lambda rows: rows["comment"].astype(str).str.strip())
      .groupby("image_name")["comment"]
      .apply(list)
      .to_dict()
)

# 3) Sample 10%
all_ids    = list(refs.keys())
random.seed(SAMPLE_SEED)
sample_ids = random.sample(all_ids, int(len(all_ids) * SAMPLE_FRACTION))

# 4) Load BLIP v1
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# The processor and the captioning model come from the same checkpoint.
blip_checkpoint = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(blip_checkpoint, use_fast=True)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)
model = model.to(device)
model.eval()

# Helpers to find and open the image
def open_image(name):
    """Load the image stored under ``images_dir`` for ``name`` as an RGB image.

    ``name`` is tried verbatim first and then with ".jpg", ".jpeg" and ".png"
    appended; the first candidate that is a regular file is used.

    Args:
        name: File name, or extension-less base name, relative to ``images_dir``.

    Returns:
        The image converted to RGB mode.

    Raises:
        FileNotFoundError: If none of the candidate paths is a file.
    """
    for suffix in ("", ".jpg", ".jpeg", ".png"):
        path = os.path.join(images_dir, name + suffix)
        # isfile rather than exists: a directory with a matching name must not
        # be handed to Image.open.
        if os.path.isfile(path):
            # convert() returns a new image, so the source file can be closed
            # as soon as the with-block exits.
            with Image.open(path) as source:
                return source.convert("RGB")
    raise FileNotFoundError(f"No file found for base name '{name}'")

# 5) Generate & collect
# predictions[i] is the generated caption for an image and references[i] the list of
# reference captions for that same image; the two lists grow in lockstep.
predictions, references = [], []

for img_name in sample_ids:
    try:
        image = open_image(img_name)
    except FileNotFoundError as missing:
        # Report the missing file and skip it; nothing is appended for this image.
        print(missing)
    else:
        inputs  = processor(images=image, return_tensors="pt").to(device)
        out_ids = model.generate(**inputs, max_new_tokens=32)
        decoded = processor.decode(out_ids[0], skip_special_tokens=True)
        pred    = decoded.strip()

        predictions.append(pred)
        references.append(refs[img_name])

# 6) Score
bleu, meteor = evaluate.load("bleu"), evaluate.load("meteor")

# Every metric call receives the same prediction/reference pairing.
scored = {"predictions": predictions, "references": references}

bleu1 = bleu.compute(max_order=1, **scored)["bleu"]
bleu2 = bleu.compute(max_order=2, **scored)["bleu"]
met   = meteor.compute(**scored)["meteor"]

print(f"BLEU-1: {bleu1:.4f}")
print(f"BLEU-2: {bleu2:.4f}")
print(f"METEOR: {met:.4f}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


BLEU-1: 0.5174
BLEU-2: 0.3589
METEOR: 0.3251
