In [1]:
from huggingface_hub import login
import os
import json
import torch
from PIL import Image
from tqdm import tqdm
from transformers import AutoProcessor, Blip2ForConditionalGeneration
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Login to Hugging Face
your_hf_token = ""
login(token=your_hf_token)

In [None]:
# Configuration
MODEL_PATH = "Salesforce/blip2-opt-2.7b"
COCO_DIR = "Thesis/coco2014"
IMAGE_DIR = os.path.join(COCO_DIR, "2014")
ANNOTATION_FILE = os.path.join(COCO_DIR, "annotations", "captions_val2014.json")
OUTPUT_FILE = "blip2_opt_2.7b_coco_results.json"
BATCH_SIZE = 8
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


In [4]:
# Generation parameters
GENERATION_PARAMS = {
    "max_new_tokens": 10,
    "do_sample": False,  # Use greedy decoding
    "num_beams": 1,
}

In [5]:
torch.cuda.empty_cache()
print(f"Free GPU memory: {torch.cuda.mem_get_info()[0] / 1024**3:.2f} GB")

Free GPU memory: 23.26 GB


In [6]:
def load_model_and_processor(model_path, device):
    """Load BLIP-2 model and processor."""
    print(f"Loading model: {model_path}")
    model = Blip2ForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map=device
    ).eval()
    
    processor = AutoProcessor.from_pretrained(model_path)
    print(f"✓ Model loaded successfully on {device}")
    return model, processor

In [7]:
def generate_captions_batch(images, model, processor, batch_size=8):
    """
    Generate captions for a batch of images using BLIP-2.
    
    Args:
        images: List of PIL Images
        model: BLIP-2 model
        processor: BLIP-2 processor
        batch_size: Batch size for processing
        
    Returns:
        List of generated captions
    """
    # Process images - BLIP-2 doesn't require text prompt for captioning
    inputs = processor(
        images=images,
        return_tensors="pt",
        padding=True
    )
    
    # Move to GPU
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    if "pixel_values" in inputs:
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)
    
    # Generate captions
    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=GENERATION_PARAMS["max_new_tokens"],
            do_sample=GENERATION_PARAMS["do_sample"],
            num_beams=GENERATION_PARAMS["num_beams"],
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
        )
    
    # Decode captions
    captions = processor.batch_decode(generated_ids, skip_special_tokens=True)
    
    # Clean captions (BLIP-2 usually generates clean output)
    clean_captions = [caption.strip() for caption in captions]
    
    return clean_captions

In [8]:
def evaluate_coco(annotation_file, results_file):
    """Evaluate results using pycocoevalcap."""
    print("\n" + "="*60)
    print("COCO Evaluation Results")
    print("="*60)
    
    # Load COCO API
    coco = COCO(annotation_file)
    coco_results = coco.loadRes(results_file)
    
    # Create evaluator
    coco_eval = COCOEvalCap(coco, coco_results)
    
    # Run evaluation
    coco_eval.evaluate()
    
    # Print scores
    print("\nMetrics:")
    for metric, score in coco_eval.eval.items():
        print(f"  {metric:10s}: {score:.3f}")
    
    return coco_eval.eval

In [9]:
# Load model and processor
model, processor = load_model_and_processor(MODEL_PATH, DEVICE)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading model: Salesforce/blip2-opt-2.7b


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.14it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


✓ Model loaded successfully on cuda


In [10]:
if torch.cuda.is_available():
    print(f"GPU Memory: {torch.cuda.mem_get_info()[0] / 1024**3:.2f} GB available")


GPU Memory: 16.09 GB available


In [11]:
# Load COCO annotations
print(f"\nLoading COCO annotations from {ANNOTATION_FILE}")
coco = COCO(ANNOTATION_FILE)
img_ids = coco.getImgIds()
print(f"Total images: {len(img_ids)}")


Loading COCO annotations from /home/kezouke/Thesis/coco2014/annotations/captions_val2014.json
loading annotations into memory...
Done (t=0.13s)
creating index...
index created!
Total images: 40504


In [12]:
print(f"\nGenerating captions for {len(img_ids)} images...")
print(f"Batch size: {BATCH_SIZE}")

results = []
image_data_list = []

# Prepare image data
for img_id in img_ids:
    img_info = coco.loadImgs(img_id)[0]
    image_path = os.path.join(IMAGE_DIR, img_info['file_name'])
    image_data_list.append((img_id, image_path))


Generating captions for 40504 images...
Batch size: 8


In [13]:
# Test on first batch
print("\nTesting on first batch...")
test_images = []
for i in range(min(BATCH_SIZE, len(image_data_list))):
    img_path = image_data_list[i][1]
    try:
        img = Image.open(img_path).convert("RGB")
        test_images.append(img)
    except Exception as e:
        print(f"Error loading {img_path}: {e}")


Testing on first batch...


In [14]:
test_captions = generate_captions_batch(test_images, model, processor, batch_size=BATCH_SIZE)
print("Sample captions:")
for i, caption in enumerate(test_captions[:3]):
    print(f"  {i+1}. {caption}")

Sample captions:
  1. a man riding a motorcycle on a dirt road
  2. a woman cutting a cake in a kitchen
  3. a boy holding an umbrella
holding a cow


In [15]:
# Process all images in batches with progress bar
for batch_idx in tqdm(range(0, len(image_data_list), BATCH_SIZE), desc="Processing batches"):
    batch_data = image_data_list[batch_idx:batch_idx+BATCH_SIZE]
    img_ids_batch = [x[0] for x in batch_data]
    paths_batch = [x[1] for x in batch_data]
    
    # Load images
    images_batch = []
    valid_img_ids = []
    for img_id, path in zip(img_ids_batch, paths_batch):
        try:
            img = Image.open(path).convert("RGB")
            images_batch.append(img)
            valid_img_ids.append(img_id)
        except Exception as e:
            print(f"Error loading {path}: {e}")
            continue
    
    if not images_batch:
        continue
    
    # Generate captions for batch
    captions = generate_captions_batch(
        images_batch, 
        model, 
        processor, 
        batch_size=len(images_batch)
    )
    
    # Add to results
    for img_id, caption in zip(valid_img_ids, captions):
        results.append({
            "image_id": img_id,
            "caption": caption
        })

Processing batches: 100%|██████████| 5063/5063 [33:03<00:00,  2.55it/s] 


In [16]:
# Save results
print(f"\nSaving results to {OUTPUT_FILE}")
with open(OUTPUT_FILE, 'w') as f:
    json.dump(results, f)
print("✓ Results saved")


Saving results to blip2_opt_2.7b_coco_results.json
✓ Results saved


In [17]:
# Evaluate with COCO metrics
print("\n" + "="*60)
print("COCO Evaluation Results")
print("="*60)


COCO Evaluation Results


In [19]:
coco_results = coco.loadRes(OUTPUT_FILE)
coco_eval = COCOEvalCap(coco, coco_results)
coco_eval.evaluate()

print("\nMetrics:")
for metric, score in coco_eval.eval.items():
    print(f"  {metric:10s}: {score:.4f}")

Loading and preparing results...
DONE (t=0.13s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 2492309 tokens at 5452156.56 tokens per second.
PTBTokenizer tokenized 395193 tokens at 3193228.28 tokens per second.


setting up scorers...
clipscore is using cuda
computing Bleu score...
{'testlen': 352327, 'reflen': 361222, 'guess': [352327, 311823, 271319, 230816], 'correct': [290822, 168818, 84981, 40563]}
ratio: 0.9753752539989232
Bleu_1: 0.805
Bleu_2: 0.652
Bleu_3: 0.506
Bleu_4: 0.386
computing METEOR score...
METEOR: 0.281
computing Rouge score...
ROUGE_L: 0.583
computing CIDEr score...
CIDEr: 1.222
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 12.75 s
SPICE: 0.213
computing CLIPScore score...


100%|██████████| 159/159 [00:08<00:00, 19.75it/s]


due to a numerical instability, new numpy normalization is slightly different than paper results. to exactly replicate paper results, please use numpy version less than 1.21, e.g., 1.20.3.


100%|██████████| 792/792 [00:38<00:00, 20.51it/s]


due to a numerical instability, new numpy normalization is slightly different than paper results. to exactly replicate paper results, please use numpy version less than 1.21, e.g., 1.20.3.


40504it [00:00, 122963.47it/s]


CLIPScore: 0.766
RefCLIPScore: 0.825

Metrics:
  Bleu_1    : 0.8049
  Bleu_2    : 0.6518
  Bleu_3    : 0.5063
  Bleu_4    : 0.3862
  METEOR    : 0.2809
  ROUGE_L   : 0.5833
  CIDEr     : 1.2221
  SPICE     : 0.2127
  CLIPScore : 0.7661
  RefCLIPScore: 0.8247


In [20]:
import numpy as np

def to_serializable(obj):
    if isinstance(obj, np.generic):
        return obj.item()  # converts np.float16, np.int32, etc. to Python scalar
    raise TypeError(f"Type {type(obj)} not serializable")

scores_file = OUTPUT_FILE.replace('.json', '_scores.json')
with open(scores_file, 'w') as f:
    json.dump(coco_eval.eval, f, indent=2, default=to_serializable)

print(f"\n✓ Scores saved to {scores_file}")


✓ Scores saved to blip2_opt_2.7b_coco_results_scores.json
