In [1]:
from huggingface_hub import login
import os
import json
import torch
from PIL import Image
from tqdm import tqdm
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import re

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Login to Hugging Face
# The access token is read from the HF_TOKEN environment variable so that it is
# never stored in the notebook. When the variable is unset or empty, None is
# passed instead, which lets `login` ask for the token interactively.
your_hf_token = os.environ.get("HF_TOKEN") or None
login(token=your_hf_token)

In [None]:
# --- Model, input data, and output locations ---
MODEL_PATH = "Salesforce/blip2-opt-2.7b"
IMAGE_DIR = "Thesis/coco2014/test2014"
OUTPUT_FILE = "blip2_opt_2.7b_coco_results_test.json"

# --- Runtime settings ---
BATCH_SIZE = 8
# Prefer the GPU when one is visible to PyTorch; otherwise stay on the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# --- Caption generation settings ---
# do_sample=False together with num_beams=1 selects plain greedy decoding.
GENERATION_PARAMS = dict(
    max_new_tokens=10,
    do_sample=False,
    num_beams=1,
)

In [4]:
# Release cached allocator blocks and report how much GPU memory is free.
# The query is guarded because it needs a CUDA device, while the rest of the
# notebook is written to fall back to the CPU when none is available.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    print(f"Free GPU memory: {free_bytes / 1024**3:.2f} GB")
else:
    print("CUDA not available; skipping GPU memory report.")

Free GPU memory: 23.30 GB


In [5]:
def load_model_and_processor(model_path, device):
    """Instantiate a BLIP-2 checkpoint together with its matching processor.

    Args:
        model_path: Checkpoint identifier or path, forwarded to both
            ``from_pretrained`` calls.
        device: Value forwarded as ``device_map`` when loading the model and
            echoed in the status message.

    Returns:
        A ``(model, processor)`` tuple. The model is requested in float16 and
        is the object returned by its ``eval()`` call.
    """
    print(f"Loading model: {model_path}")

    load_kwargs = {"torch_dtype": torch.float16, "device_map": device}
    blip2 = Blip2ForConditionalGeneration.from_pretrained(model_path, **load_kwargs)
    blip2 = blip2.eval()

    blip2_processor = AutoProcessor.from_pretrained(model_path)
    print(f"✓ Model loaded successfully on {device}")
    return blip2, blip2_processor

def generate_captions_batch(images, model, processor):
    """
    Generate captions for a batch of images using BLIP-2.

    Args:
        images: Sequence of images accepted by ``processor``. An empty
            sequence yields an empty list without calling the model.
        model: Model exposing ``device``, ``generate`` and (optionally) ``dtype``.
        processor: Processor exposing ``__call__``, ``batch_decode`` and a
            ``tokenizer`` with ``pad_token_id`` / ``eos_token_id``.

    Returns:
        List of caption strings, one per generated sequence, with surrounding
        whitespace stripped.
    """
    # Nothing to caption: skip the processor and the model entirely.
    if not images:
        return []

    inputs = processor(images=images, return_tensors="pt", padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    if "pixel_values" in inputs:
        # Cast to whatever precision the model was loaded in rather than
        # assuming float16; fall back to float16 if the model has no `dtype`.
        target_dtype = getattr(model, "dtype", torch.float16)
        inputs["pixel_values"] = inputs["pixel_values"].to(target_dtype)

    # Forward every entry of GENERATION_PARAMS so that keys added to the config
    # take effect. Tokenizer ids act as defaults that the config may override.
    generation_kwargs = {
        "pad_token_id": processor.tokenizer.pad_token_id,
        "eos_token_id": processor.tokenizer.eos_token_id,
        **GENERATION_PARAMS,
    }

    with torch.inference_mode():
        generated_ids = model.generate(**inputs, **generation_kwargs)

    captions = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return [caption.strip() for caption in captions]

def extract_image_id(filename):
    """
    Extract image ID from COCO-style filename.
    Example: 'COCO_test2014_000000123456.jpg' -> 123456

    The extension is matched case-insensitively, so names ending in '.JPG',
    '.Jpeg' or '.PNG' are handled the same way as their lowercase forms.

    Args:
        filename: File name (or path) ending in 12 digits followed by a
            jpg/jpeg/png extension.

    Returns:
        The 12-digit number as an int (leading zeros dropped).

    Raises:
        ValueError: If the name does not end in 12 digits plus a supported extension.
    """
    match = re.search(r'(\d{12})\.(jpg|jpeg|png)$', filename, flags=re.IGNORECASE)
    if match is None:
        raise ValueError(f"Could not extract image ID from filename: {filename}")
    return int(match.group(1))

In [6]:
# Instantiate BLIP-2 and its processor, then report how much GPU memory is left.
model, processor = load_model_and_processor(MODEL_PATH, DEVICE)
if torch.cuda.is_available():
    free_bytes = torch.cuda.mem_get_info()[0]
    free_gib = free_bytes / 1024**3
    print(f"GPU Memory: {free_gib:.2f} GB available")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading model: Salesforce/blip2-opt-2.7b


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.20it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


✓ Model loaded successfully on cuda
GPU Memory: 16.13 GB available


In [7]:
# Get all image files in the test directory (extension match is case-insensitive)
image_files = [f for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
print(f"\nFound {len(image_files)} images in {IMAGE_DIR}")

# Sort so the processing order, and therefore the order of `results`, is reproducible
image_files.sort()

results = []
skipped_files = []  # names of files that could not be read or had no parsable image ID

# Process images in batches
for batch_idx in tqdm(range(0, len(image_files), BATCH_SIZE), desc="Processing batches"):
    batch_files = image_files[batch_idx:batch_idx + BATCH_SIZE]
    images_batch = []
    image_ids_batch = []

    for filename in batch_files:
        image_path = os.path.join(IMAGE_DIR, filename)
        try:
            # Validate the filename first so a badly named file is rejected
            # before any time is spent decoding it.
            img_id = extract_image_id(filename)
            # The context manager closes the underlying file handle promptly;
            # convert() returns a separate image that is kept for the batch.
            with Image.open(image_path) as raw_image:
                img = raw_image.convert("RGB")
        except Exception as e:
            # Best effort: report the problem, remember the file, and move on
            # so one bad image does not abort the whole run.
            print(f"Error processing {filename}: {e}")
            skipped_files.append(filename)
            continue
        images_batch.append(img)
        image_ids_batch.append(img_id)

    # Every file in this batch failed to load; nothing to caption.
    if not images_batch:
        continue

    # Generate captions
    captions = generate_captions_batch(images_batch, model, processor)

    # Append to results, pairing each caption with the ID of its source image
    results.extend(
        {"image_id": img_id, "caption": caption}
        for img_id, caption in zip(image_ids_batch, captions)
    )

if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) that could not be processed.")


Found 40775 images in /home/kezouke/Thesis/coco2014/test2014


Processing batches: 100%|██████████| 5097/5097 [23:18<00:00,  3.64it/s]


In [8]:
# Serialize the collected captions as pretty-printed JSON and write them to disk.
print(f"\nSaving results to {OUTPUT_FILE}")
serialized_results = json.dumps(results, indent=2)
with open(OUTPUT_FILE, 'w') as out_file:
    out_file.write(serialized_results)
print("✓ Results saved in required format.")


Saving results to blip2_opt_2.7b_coco_results_test.json
✓ Results saved in required format.
