In [1]:
import nltk

# Download essential NLTK resources every time the script runs.
# nltk.download() normally reports a failed download through its boolean return
# value instead of raising, so each resource is fetched on its own and the
# result is checked; one failure no longer hides or skips the others.
nltk_resources = (
    'wordnet',
    'punkt',
    'omw-1.4',
    'stopwords',                    # Add this if you also need stopwords
    'punkt_tab',
    'averaged_perceptron_tagger',   # Optional, for POS tagging if needed
)
failed_nltk_resources = []
for resource_name in nltk_resources:
    try:
        if not nltk.download(resource_name, quiet=True):
            failed_nltk_resources.append(resource_name)
    except Exception as e:
        print(f"Error downloading NLTK resources: {e}")
        failed_nltk_resources.append(resource_name)

if failed_nltk_resources:
    # Surface the problem here rather than as a confusing LookupError later on.
    print(f"Error downloading NLTK resources: {', '.join(failed_nltk_resources)}")



In [5]:
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import pandas as pd
import torch
import os
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer  # Correct import
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

# Constants: run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
CSV_PATH = "/content/drive/MyDrive/assignment 2 deep learning/custom_captions_dataset/test.csv"
IMAGE_DIR = "/content/drive/MyDrive/assignment 2 deep learning/custom_captions_dataset/test"

# Load processor and model.
# NOTE(review): the later evaluation cell assigns `model` and `processor` again,
# so the objects created here are replaced once that cell runs.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    _attn_implementation="eager",  # Force eager implementation for compatibility
    torch_dtype=torch.bfloat16,
)
model = model.to(DEVICE)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/429 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/92.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/7.45k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.49G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

In [7]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch

from transformers import AutoModelForVision2Seq, AutoProcessor
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer

# --- CONFIGURATION ---
MODEL_NAME = "HuggingFaceTB/SmolVLM-Instruct"
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
IMAGE_DIR = "/content/drive/MyDrive/assignment 2 deep learning/custom_captions_dataset/test"
CSV_PATH = "/content/drive/MyDrive/assignment 2 deep learning/custom_captions_dataset/test.csv"
CAPTION_COL = 'caption'    # CSV column with the reference caption; adjust if different
FILENAME_COL = 'filename'  # CSV column with the image file name; adjust if different

# --- LOAD MODEL & PROCESSOR ---
print("Loading SmolVLM model...")
# Half precision is requested only on CUDA; the CPU path uses float32.
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32 if DEVICE != "cuda" else torch.float16,
)
model = model.to(DEVICE)
processor = AutoProcessor.from_pretrained(MODEL_NAME)

# --- ZERO-SHOT CAPTION FUNCTION ---
def zero_shot_caption(image_path, model, processor, device):
    """Generate a brief zero-shot caption for a single image.

    Args:
        image_path: Path of the image file to caption.
        model: Generation model; must provide ``generate`` and ``dtype``.
        processor: Matching processor; must provide ``apply_chat_template``,
            be callable on ``text``/``images``, and provide ``batch_decode``.
        device: Device the processed inputs are moved to before generation.

    Returns:
        The decoded text that follows the first ``"Assistant:"`` marker, or the
        whole decoded text when the marker is absent, stripped of surrounding
        whitespace. Returns ``""`` if any step raises; the error is printed.
    """
    try:
        # Open inside a context manager so the file handle is released as soon
        # as the RGB copy exists, instead of lingering until garbage collection
        # while the caller loops over the whole dataset.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "Describe this image briefly."}
                ]
            }
        ]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device, model.dtype)

        # Greedy decoding (do_sample=False) without gradient tracking.
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)

        raw_output = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

        # The decoded sequence still contains the prompt; keep only the text
        # after the first assistant marker when it is present.
        marker = "Assistant:"
        if marker in raw_output:
            return raw_output.split(marker, 1)[-1].strip()

        return raw_output.strip()

    except Exception as e:
        # Best effort: report the failure and let the caller treat "" as "no caption".
        print(f"Error generating caption for {image_path}: {e}")
        return ""

# --- EVALUATION METRICS ---
# One score per evaluated image is appended to each list, so the three lists
# always have the same length.
df = pd.read_csv(CSV_PATH)
bleu_scores = []
meteor_scores = []
rouge_l_scores = []
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
# Loop-invariant: build the BLEU smoothing function once instead of per row.
smoothie = SmoothingFunction().method4

print("Evaluating captions...")
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating"):
    filename = row[FILENAME_COL]
    gt_caption = row[CAPTION_COL]

    # Rows without a usable reference caption are skipped and not scored.
    if not isinstance(gt_caption, str) or not gt_caption.strip():
        continue

    # Rows whose image file is missing are reported, skipped and not scored.
    image_path = os.path.join(IMAGE_DIR, filename)
    if not os.path.exists(image_path):
        print(f"Missing: {image_path}")
        continue

    pred_caption = zero_shot_caption(image_path, model, processor, DEVICE)

    # An empty prediction (generation failed) counts as zero on every metric.
    if not pred_caption:
        bleu_scores.append(0.0)
        meteor_scores.append(0.0)
        rouge_l_scores.append(0.0)
        continue

    ref_tokens = word_tokenize(gt_caption.lower())
    pred_tokens = word_tokenize(pred_caption.lower())

    bleu = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothie)
    bleu_scores.append(bleu)

    # A metric failure still scores 0.0 so the lists stay aligned, but it is
    # reported rather than silently swallowed, and KeyboardInterrupt is no
    # longer caught (the original used a bare `except:`).
    try:
        meteor = meteor_score([ref_tokens], pred_tokens)
    except Exception as e:
        print(f"Error computing METEOR for {filename}: {e}")
        meteor = 0.0
    meteor_scores.append(meteor)

    try:
        rouge_l = rouge.score(gt_caption, pred_caption)["rougeL"].fmeasure
    except Exception as e:
        print(f"Error computing ROUGE-L for {filename}: {e}")
        rouge_l = 0.0
    rouge_l_scores.append(rouge_l)

# --- RESULTS ---
if bleu_scores:
    print("\n--- Results ---")
    print(f"Avg BLEU-4 Score:     {sum(bleu_scores)/len(bleu_scores):.4f}")
    print(f"Avg METEOR Score:     {sum(meteor_scores)/len(meteor_scores):.4f}")
    print(f"Avg ROUGE-L F1 Score: {sum(rouge_l_scores)/len(rouge_l_scores):.4f}")
else:
    print("No scores to report.")


Loading SmolVLM model...
Evaluating captions...


Evaluating: 100%|██████████| 928/928 [1:00:28<00:00,  3.91s/it]


--- Results ---
Avg BLEU-4 Score:     0.0597
Avg METEOR Score:     0.2765
Avg ROUGE-L F1 Score: 0.2564



