In [None]:
from huggingface_hub import login

import os

# Read the Hugging Face access token from the environment instead of pasting
# it into the notebook, where it would be saved (and possibly shared or
# committed) together with the file.
your_hf_token = os.environ.get("HF_TOKEN", "")

# An empty string is not a usable token. Pass None in that case so that
# `login` falls back to asking for the token interactively.
login(token=your_hf_token or None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from huggingface_hub import login
import os
import json
import torch
from PIL import Image
from tqdm import tqdm
from transformers import AutoProcessor, LlavaForConditionalGeneration
import re

In [None]:
# Run configuration for the captioning job.

# Hub identifier passed to `from_pretrained` for both the model and the processor.
MODEL_PATH = "llava-hf/llava-1.5-7b-hf"
# Directory whose image files are listed and captioned. The path is relative,
# so it is resolved against the kernel's current working directory.
IMAGE_DIR = "Thesis/coco2014/test2014"  # ← test2014, not val2014
# JSON file that receives the list of {"image_id", "caption"} records.
OUTPUT_FILE = "llava_v1.5_7b_coco_test_results.json"
# Number of images sent through the model per `generate` call.
BATCH_SIZE = 4  # LLaVA is memory-heavy; keep batch small
# Used as `device_map` when loading the model and as the target of the
# processor outputs; falls back to CPU when CUDA is not available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# Generation settings
# Upper bound on the number of tokens generated per caption (passed to
# `model.generate` as `max_new_tokens`).
# NOTE(review): the original remark here said LLaVA often needs more tokens for
# full captions, yet the cap is 20 - captions may be cut off; confirm the
# intended value.
MAX_NEW_TOKENS = 20  # LLaVA often needs more tokens for full captions
# Instruction text sent together with every image.
PROMPT = "Describe this image."

In [5]:
# The notebook explicitly supports a CPU fallback (see DEVICE), but
# `torch.cuda.mem_get_info()` raises when no CUDA device is present, so the
# GPU housekeeping is only done when CUDA is actually available.
if torch.cuda.is_available():
    # Hand cached, currently unused allocator blocks back to the driver so the
    # number printed below reflects what is really free.
    torch.cuda.empty_cache()

    # mem_get_info() returns (free_bytes, total_bytes); report free memory in GiB.
    print(f"Free GPU memory: {torch.cuda.mem_get_info()[0] / 1024**3:.2f} GB")
else:
    print("CUDA is not available; skipping GPU memory report.")

Free GPU memory: 23.30 GB


In [6]:
# === Helper: Extract image_id from filename ===
def extract_image_id(filename):
    """
    Return the numeric COCO image ID encoded in an image filename.

    The ID is the run of exactly 12 digits that sits directly in front of a
    .jpg / .jpeg / .png extension (extension matched case-insensitively).
    Example: 'COCO_test2014_000000123456.jpg' → 123456

    Raises:
        ValueError: if the filename does not end in such a pattern.
    """
    found = re.search(r'(\d{12})\.(jpg|jpeg|png)$', filename, re.IGNORECASE)
    # Guard clause: reject anything that does not carry a 12-digit ID.
    if found is None:
        raise ValueError(f"Could not extract image ID from: {filename}")

    # int() drops the zero padding.
    digits = found.group(1)
    return int(digits)

# === Load Model & Processor ===
print(f"Loading model: {MODEL_PATH}")
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,  # half-precision weights
    device_map=DEVICE,
    low_cpu_mem_usage=True,
)
# Put the module in evaluation mode; eval() returns the module itself.
model = model.eval()

# Optional: compile for speed (only when this PyTorch build provides torch.compile)
if hasattr(torch, 'compile'):
    model = torch.compile(model)

# The processor is loaded from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(MODEL_PATH)
print(f"✓ Model loaded on {DEVICE}")

# === Get all test images ===
all_files = os.listdir(IMAGE_DIR)
# Keep only image files (by extension, case-insensitive) and sort them by name
# so that every run visits the images in the same order.
image_files = sorted(
    name for name in all_files
    if name.lower().endswith(('.jpg', '.jpeg', '.png'))
)
print(f"\nFound {len(image_files)} images in {IMAGE_DIR}")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading model: llava-hf/llava-1.5-7b-hf


Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.94it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


✓ Model loaded on cuda

Found 40775 images in /home/kezouke/Thesis/coco2014/test2014


In [7]:
def generate_captions_batch(image_paths, model, processor, prompt="Describe this image."):
    """
    Generate one caption per image for a batch of image files.

    Images that cannot be opened are reported and skipped, so the returned
    lists can be shorter than `image_paths`.

    Args:
        image_paths: Iterable of image file paths. The basename of every
            readable image must be parseable by `extract_image_id`.
        model: Model object exposing `generate(...)`.
        processor: Processor matching the model; provides the chat template,
            the tokenizer and the image preprocessing.
        prompt: Instruction text sent together with every image.

    Returns:
        A tuple `(image_ids, captions)` of two parallel lists. Both lists are
        empty when no image in the batch could be loaded.

    Raises:
        ValueError: propagated from `extract_image_id` when a readable image
            has a filename without a 12-digit ID. The IDs are parsed before
            generation so that this fails before the expensive model call.
    """
    images = []
    image_ids = []
    for path in image_paths:
        try:
            # The context manager closes the underlying file handle; convert()
            # returns an independent in-memory image that outlives it.
            with Image.open(path) as raw_image:
                img = raw_image.convert("RGB")
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the run.
            print(f"⚠️ Skipping {path}: {e}")
            continue
        images.append(img)
        image_ids.append(extract_image_id(os.path.basename(path)))

    if not images:
        return [], []

    # One single-turn conversation (image + instruction) per image.
    conversations = [
        [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
        for _ in images
    ]

    prompt_texts = [
        processor.apply_chat_template(conv, add_generation_prompt=True)
        for conv in conversations
    ]

    inputs = processor(
        images=images,
        text=prompt_texts,
        return_tensors="pt",
        padding=True
    ).to(DEVICE)

    # Match the torch.float16 dtype the model is loaded with in this notebook.
    if "pixel_values" in inputs:
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,  # greedy decoding
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
        )

    # `generate` returns the prompt tokens followed by the newly generated
    # tokens. Decode only the new part instead of decoding everything and
    # splitting on role markers such as "ASSISTANT:", which would cut the
    # caption in the wrong place whenever the model itself emits such a marker.
    prompt_length = inputs["input_ids"].shape[1]
    decoded = processor.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

    clean_captions = [text.strip() for text in decoded]

    return image_ids, clean_captions

In [8]:
# === Process all images in batches ===
results = []

# Walk over the sorted file list in steps of BATCH_SIZE; the last batch may be shorter.
batch_offsets = range(0, len(image_files), BATCH_SIZE)
for batch_start in tqdm(batch_offsets, desc="Processing batches"):
    batch_files = image_files[batch_start:batch_start + BATCH_SIZE]
    batch_paths = [os.path.join(IMAGE_DIR, name) for name in batch_files]

    image_ids, captions = generate_captions_batch(
        batch_paths, model, processor, prompt=PROMPT
    )

    # One record per image that was actually captioned.
    results.extend(
        [{"image_id": image_id, "caption": caption}
         for image_id, caption in zip(image_ids, captions)]
    )

# === Save results ===
print(f"\nSaving {len(results)} captions to {OUTPUT_FILE}")
with open(OUTPUT_FILE, 'w') as f:
    json.dump(results, f, indent=2)

print("✅ Done!")

Processing batches: 100%|██████████| 10194/10194 [2:54:45<00:00,  1.03s/it] 


Saving 40775 captions to llava_v1.5_7b_coco_test_results.json
✅ Done!





In [9]:
import json
import unicodedata

# === 1. Load your existing result file ===
# Source file with the raw captions and the destination for the ASCII-only copy.
input_file = "llava_v1.5_7b_coco_test_results.json"
output_file = "llava_v1.5_7b_coco_test_results_ascii.json"  # cleaned version

# Text mode for reading is open()'s default, so only the encoding is spelled out.
with open(input_file, encoding='utf-8') as f:
    data = json.load(f)

# === 2. Clean captions: remove non-ASCII characters ===
def clean_caption(caption):
    """
    Convert a caption to an ASCII-only, whitespace-normalized string.

    Steps:
    - Non-string input is converted with str().
    - Typographic quotes and dashes are replaced by their ASCII look-alikes.
      These characters have no Unicode decomposition, so without this step
      they would simply be deleted ("dog—cat" → "dogcat", "it’s" → "its").
    - Unicode is normalized with NFKD (e.g., café → cafe).
    - Any remaining non-ASCII characters are removed.
    - Runs of whitespace are collapsed to single spaces and the ends trimmed.

    Returns:
        The cleaned string; it may be empty if nothing ASCII remains.
    """
    if not isinstance(caption, str):
        caption = str(caption)

    # Code point → ASCII replacement for punctuation that NFKD cannot reduce.
    ascii_punctuation = {
        0x2018: "'", 0x2019: "'", 0x201A: "'", 0x201B: "'",   # single quotes
        0x201C: '"', 0x201D: '"', 0x201E: '"', 0x201F: '"',   # double quotes
        0x2010: "-", 0x2011: "-", 0x2012: "-", 0x2013: "-",   # hyphens / en dash
        0x2014: "-", 0x2015: "-", 0x2212: "-",                # em dash, bar, minus
    }
    caption = caption.translate(ascii_punctuation)

    # Normalize unicode (e.g., é → e + combining mark)
    nfkd = unicodedata.normalize('NFKD', caption)
    # Keep only ASCII characters (combining marks and other leftovers are dropped)
    ascii_str = nfkd.encode('ASCII', 'ignore').decode('ASCII')
    # Clean extra whitespace
    return ' '.join(ascii_str.split())

# Apply cleaning to every caption
cleaned_data = []
for record in data:
    # Fallback in case caption becomes empty: an empty string is falsy,
    # so `or` substitutes the placeholder text.
    caption_text = clean_caption(record["caption"]).strip() or "a photo"
    cleaned_data.append({"image_id": record["image_id"], "caption": caption_text})

# === 3. Save the cleaned JSON (ASCII-safe) ===
with open(output_file, 'w', encoding='ascii') as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=True)

print(f"✅ Cleaned {len(cleaned_data)} captions and saved to {output_file}")

✅ Cleaned 40775 captions and saved to llava_v1.5_7b_coco_test_results_ascii.json


In [10]:
import json
import unicodedata

# === 1. Load your existing result file ===
input_file = "llava_v1.5_7b_coco_results.json"
output_file = "llava_v1.5_7b_coco_results_ascii.json"  # cleaned version

with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# === 2. Clean captions: remove non-ASCII characters ===
# `clean_caption` is already defined in the previous cleaning cell. It is reused
# here instead of being defined a second time, so the two result files are
# always cleaned by the same code.
cleaned_data = []
for item in data:
    cleaned_caption = clean_caption(item["caption"])
    # Fallback in case caption becomes empty
    if not cleaned_caption.strip():
        cleaned_caption = "a photo"
    cleaned_data.append({
        "image_id": item["image_id"],
        "caption": cleaned_caption.strip()
    })

# === 3. Save the cleaned JSON (ASCII-safe) ===
with open(output_file, 'w', encoding='ascii') as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=True)

print(f"✅ Cleaned {len(cleaned_data)} captions and saved to {output_file}")

✅ Cleaned 40504 captions and saved to llava_v1.5_7b_coco_results_ascii.json
