### Install and Import Packages

In [1]:
# Install/upgrade the third-party metric packages imported below
# (nltk for BLEU/METEOR, rouge-score for ROUGE-L) into the kernel's environment.
%pip install -U nltk rouge-score

^C
Note: you may need to restart the kernel to use updated packages.


Defaulting to user installation because normal site-packages is not writeable


In [None]:
import os
import random

import nltk
import numpy as np
import pandas as pd
import torch
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from PIL import Image
from rouge_score import rouge_scorer
from tqdm import tqdm
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers.image_utils import load_image

In [None]:
# Download the NLTK data packages up front; quiet=True suppresses progress output.
# "wordnet" is presumably what meteor_score relies on for synonym matching, and
# "punkt" holds tokenizer models (the code visible here tokenises with str.split
# and does not call an NLTK tokenizer directly) -- confirm before removing either.
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)

### Environment Setup: Random Seed and Device Selection

In [None]:
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator
    and PyTorch (the default generator plus all CUDA devices), then sets the
    cuDNN ``deterministic`` flag and clears its ``benchmark`` flag.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # The two cuDNN flags are independent of each other and of the seeding above.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Seed all generators with the default seed before any model work happens.
set_seed()

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### Evaluation Metrics: BLEU, ROUGE-L, and METEOR

In [None]:
def calculate_metrics(predictions, ground_truths):
    """Score generated captions against references with BLEU, ROUGE-L and METEOR.

    BLEU and METEOR operate on whitespace tokens (``str.split``); ROUGE-L
    receives the raw strings and is computed with stemming enabled.

    Args:
        predictions: Iterable of generated caption strings.
        ground_truths: Iterable of reference caption strings, aligned by
            position with ``predictions`` (one reference per prediction).

    Returns:
        dict with keys ``"BLEU"`` (corpus-level score from ``corpus_bleu``),
        ``"ROUGE-L"`` (mean per-sample F-measure) and ``"METEOR"`` (mean
        per-sample score). All three are ``0.0`` when there are no samples.
    """
    # Materialise once so generators can be measured and iterated repeatedly.
    predictions = list(predictions)
    ground_truths = list(ground_truths)

    # Nothing to score (e.g. every image failed upstream): report zeros
    # instead of handing an empty corpus to corpus_bleu.
    if not predictions and not ground_truths:
        return {"BLEU": 0.0, "ROUGE-L": 0.0, "METEOR": 0.0}

    # Tokenise once; both BLEU and METEOR consume the same token lists.
    tokenized_refs = [ref.split() for ref in ground_truths]
    tokenized_preds = [pred.split() for pred in predictions]

    # corpus_bleu takes a list of references per hypothesis; each sample has one.
    bleu = corpus_bleu([[ref_tokens] for ref_tokens in tokenized_refs], tokenized_preds)

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    rouge_scores = [
        scorer.score(ref, pred)["rougeL"].fmeasure
        for pred, ref in zip(predictions, ground_truths)
    ]
    avg_rouge = sum(rouge_scores) / len(rouge_scores) if rouge_scores else 0.0

    meteor_scores = [
        meteor_score([ref_tokens], pred_tokens)
        for pred_tokens, ref_tokens in zip(tokenized_preds, tokenized_refs)
    ]
    avg_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0.0

    return {"BLEU": bleu, "ROUGE-L": avg_rouge, "METEOR": avg_meteor}

### Zero-Shot Image Captioning and Evaluation Pipeline

In [None]:
def zero_shot_captioning(
    image_path, model, processor, model_name="HuggingFaceTB/SmolVLM-Instruct"
):
    """Caption a single image using a fixed "describe the image" chat prompt.

    Args:
        image_path: Location of the image; handed to ``load_image``.
        model: Model object exposing ``generate``.
        processor: Processor object providing ``apply_chat_template``,
            ``__call__`` and ``batch_decode``.
        model_name: Accepted for interface compatibility; not used in the body.

    Returns:
        The decoded caption string, or ``""`` if any step raised an exception
        (the error is printed rather than propagated).
    """
    try:
        picture = load_image(image_path)

        # Single user turn carrying the image followed by the text instruction.
        user_content = [
            {"type": "image", "image": picture},
            {"type": "text", "text": "Please Describe the Image"},
        ]
        conversation = [{"role": "user", "content": user_content}]
        chat_prompt = processor.apply_chat_template(
            conversation, add_generation_prompt=True
        )

        model_inputs = processor(
            images=picture, text=chat_prompt, return_tensors="pt", padding=True
        )
        model_inputs = model_inputs.to(device)
        prompt_length = model_inputs["input_ids"].size(1)

        with torch.no_grad():
            output_ids = model.generate(**model_inputs, max_new_tokens=128)
            # Keep only the positions after the prompt so the prompt text is
            # not decoded into the caption.
            output_ids = output_ids[:, prompt_length:]

        decoded = processor.batch_decode(output_ids, skip_special_tokens=True)
        caption = decoded[0]
        print(f"Generated Caption: {caption}")
        return caption
    except Exception as e:
        # Best-effort: report the failure and signal it with an empty caption.
        print(f"Error processing {image_path}: {e}")
        return ""


def evaluate_zero_shot(
    model_name="HuggingFaceTB/SmolVLM-Instruct",
    test_csv_path="./custom_captions_dataset/test.csv",
    image_dir="./custom_captions_dataset/test/",
):
    """Caption every image listed in a CSV and score the captions.

    Loads the processor and model for ``model_name`` (bfloat16 weights, eager
    attention) onto the notebook's ``device``, captions each row's image with
    ``zero_shot_captioning``, and scores the results with
    ``calculate_metrics``. Rows whose caption comes back empty are left out of
    the scoring. The metrics are printed with four decimals.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            processor and the model.
        test_csv_path: CSV file with at least ``filename`` and ``caption``
            columns.
        image_dir: Directory that the ``filename`` values are joined onto.

    Returns:
        The metrics dict produced by ``calculate_metrics``.
    """
    processor = AutoProcessor.from_pretrained(model_name)
    model = AutoModelForVision2Seq.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        _attn_implementation="eager",
    )
    model = model.to(device)

    samples = pd.read_csv(test_csv_path)
    on_cuda = device == "cuda"

    predictions = []
    ground_truths = []

    row_iter = tqdm(samples.iterrows(), total=len(samples), desc="Processing Images")
    for _, row in row_iter:
        image_path = os.path.join(image_dir, row["filename"])
        caption = zero_shot_captioning(image_path, model, processor, model_name)

        # An empty caption marks a failed sample; keep both lists aligned by
        # recording the reference only when a prediction exists.
        if caption:
            predictions.append(caption)
            ground_truths.append(row["caption"])

        # Release cached GPU memory after each image.
        if on_cuda:
            torch.cuda.empty_cache()

    results = calculate_metrics(predictions, ground_truths)

    print("Evaluation Results:")
    for metric_name, score in results.items():
        print(f"{metric_name} Score: {score:.4f}")

    return results


### Running Zero-Shot Evaluation on Custom Dataset


In [None]:
# Kaggle input locations of the test split: the CSV read by evaluate_zero_shot
# and the directory holding the images that CSV refers to.
test_csv_path = "/kaggle/input/dl-assignment-2/custom_captions_dataset/test.csv"
image_dir = "/kaggle/input/dl-assignment-2/custom_captions_dataset/test/"

# Run the zero-shot evaluation with SmolVLM-Instruct; the returned metrics dict
# is kept in results_smol for the cell that stores it.
results_smol = evaluate_zero_shot(
    model_name="HuggingFaceTB/SmolVLM-Instruct",
    test_csv_path=test_csv_path,
    image_dir=image_dir,
)

### Storing Results

In [None]:
# Wrap the metrics dict in a list so it becomes a single-row DataFrame
# (one column per metric), then write it without the index column to the
# Kaggle working directory.
results_df = pd.DataFrame([results_smol])
results_df.to_csv("/kaggle/working/smol_results.csv", index=False)