In [None]:
from huggingface_hub import login

your_hf_token = ""

login(token=your_hf_token)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import json
import torch
from PIL import Image
from tqdm import tqdm
from transformers import AutoProcessor, LlavaForConditionalGeneration
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
import requests
from io import BytesIO

In [None]:
# Configuration
MODEL_PATH = "llava-hf/llava-1.5-7b-hf"
COCO_DIR = "Thesis/coco2014"
IMAGE_DIR = os.path.join(COCO_DIR, "val2014")
ANNOTATION_FILE = os.path.join(COCO_DIR, "annotations", "captions_val2014.json")
OUTPUT_FILE = "llava_v1.5_7b_coco_results.json"
BATCH_SIZE = 8
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# Generation parameters
GENERATION_PARAMS = {
    "max_new_tokens": 4,
    "do_sample": False,  # Use greedy decoding
    "temperature": 1.0,
    "top_p": None,
}

In [5]:
torch.cuda.empty_cache()

print(f"Free GPU memory: {torch.cuda.mem_get_info()[0] / 1024**3:.2f} GB")

Free GPU memory: 23.26 GB


In [6]:
def load_model_and_processor(model_path, device):
    """Load LLaVA model and processor."""
    print(f"Loading model: {model_path}")
    model = LlavaForConditionalGeneration.from_pretrained(
        model_path,
        dtype=torch.float16,
        device_map=device
    ).eval()
    model = torch.compile(model)
    processor = AutoProcessor.from_pretrained(model_path)
    print(f"✓ Model loaded successfully on {device}")
    return model, processor

In [7]:
def prepare_conversation(prompt="Describe this image"):
    """Prepare conversation template for LLaVA."""
    return [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        },
    ]

In [8]:
def generate_captions_batch(image_paths, model, processor, batch_size=4, prompt="Describe this image."):
    """Generate captions for a batch of images using TRUE batching."""
    # Step 1: Load all images
    images = []
    for path in image_paths:
        try:
            img = Image.open(path).convert("RGB")
            images.append(img)
        except Exception as e:
            print(f"Error loading {path}: {e}")
            # Use a dummy image or skip? For simplicity, append a blank (not ideal)
            # Better: handle missing images upstream
            images.append(Image.new("RGB", (224, 224)))

    # Step 2: Prepare conversation template for each image (same prompt)
    conversations = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        for _ in images
    ]

    # Step 3: Apply chat template to get text prompts
    prompt_texts = [
        processor.apply_chat_template(conv, add_generation_prompt=True)
        for conv in conversations
    ]

    # Step 4: Process ALL images + texts together
    inputs = processor(
        images=images,
        text=prompt_texts,
        return_tensors="pt",
        padding=True  # Important for variable-length text
    )

    # Step 5: Move to GPU
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    if "pixel_values" in inputs:
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)

    # Step 6: Generate captions in one go
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=17,
            do_sample=False,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
        )

    # Step 7: Decode all outputs
    captions = processor.batch_decode(outputs, skip_special_tokens=True)

    # Step 8: Post-process each caption
    clean_captions = []
    for caption in captions:
        if "Assistant:" in caption:
            caption = caption.split("Assistant:")[-1].strip()
        elif "<|assistant|>" in caption:
            caption = caption.split("<|assistant|>")[-1].strip()
        clean_captions.append(caption)

    return clean_captions

In [9]:
def evaluate_coco(annotation_file, results_file):
    """Evaluate results using pycocoevalcap."""
    print("\n" + "="*60)
    print("COCO Evaluation Results")
    print("="*60)
    
    # Load COCO API
    coco = COCO(annotation_file)
    coco_results = coco.loadRes(results_file)
    
    # Create evaluator
    coco_eval = COCOEvalCap(coco, coco_results)
    
    # Run evaluation
    coco_eval.evaluate()
    
    # Print scores
    print("\nMetrics:")
    for metric, score in coco_eval.eval.items():
        print(f"  {metric:10s}: {score:.3f}")
    
    return coco_eval.eval

In [10]:
model, processor = load_model_and_processor(MODEL_PATH, DEVICE)

Loading model: llava-hf/llava-1.5-7b-hf


Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.73it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


✓ Model loaded successfully on cuda


In [11]:
if torch.cuda.is_available():
    print(f"GPU Memory: {torch.cuda.mem_get_info()[0] / 1024**3:.2f} GB available")

GPU Memory: 10.09 GB available


In [12]:
print(f"\nLoading COCO annotations from {ANNOTATION_FILE}")
coco = COCO(ANNOTATION_FILE)
img_ids = coco.getImgIds()
print(f"Total images: {len(img_ids)}")


Loading COCO annotations from /home/kezouke/Thesis/coco2014/annotations/captions_val2014.json
loading annotations into memory...
Done (t=0.12s)
creating index...
index created!
Total images: 40504


In [13]:
print(f"\nGenerating captions for {len(img_ids)} images...")
print(f"Batch size: {BATCH_SIZE}")


Generating captions for 40504 images...
Batch size: 8


In [14]:
results = []
image_paths_list = []

# Prepare image paths
for img_id in img_ids:
    img_info = coco.loadImgs(img_id)[0]
    image_path = os.path.join(IMAGE_DIR, img_info['file_name'])
    image_paths_list.append((img_id, image_path))

In [15]:
test_paths = [image_paths_list[i][1] for i in range(8)]
captions = generate_captions_batch(test_paths, model, processor, batch_size=8)
print(captions)

['USER:  \nDescribe this image. ASSISTANT: The image features a man wearing a red helmet, sitting on a motorcycle', 'USER:  \nDescribe this image. ASSISTANT: The image features a woman wearing a red shirt, standing in a kitchen and', 'USER:  \nDescribe this image. ASSISTANT: The image features a young boy standing in a field, holding an umbrella to', 'USER:  \nDescribe this image. ASSISTANT: The image features a young boy wearing headphones and sitting at a computer des', 'USER:  \nDescribe this image. ASSISTANT: The image features a group of people, including a young boy, sitting in front of', 'USER:  \nDescribe this image. ASSISTANT: The image depicts a man standing in a large, well-equipped kitchen', 'USER:  \nDescribe this image. ASSISTANT: The image features a woman standing in a kitchen, holding a cat in her arms.', 'USER:  \nDescribe this image. ASSISTANT: The image features a young girl sitting at a dining table, enjoying a del']


In [16]:
# Process in batches with progress bar
for batch_idx in tqdm(range(0, len(image_paths_list), BATCH_SIZE), 
                        desc="Processing batches"):
    batch_data = image_paths_list[batch_idx:batch_idx+BATCH_SIZE]
    img_ids_batch = [x[0] for x in batch_data]
    paths_batch = [x[1] for x in batch_data]
    
    # Generate captions for batch
    captions = generate_captions_batch(
        paths_batch, 
        model, 
        processor, 
        batch_size=BATCH_SIZE,
        prompt="Describe this image."
    )
    
    # Add to results
    for img_id, caption in zip(img_ids_batch, captions):
        results.append({
            "image_id": img_id,
            "caption": caption
        })

Processing batches: 100%|██████████| 5063/5063 [2:28:45<00:00,  1.76s/it]  


In [17]:
print(f"Saving results to {OUTPUT_FILE}")
with open(OUTPUT_FILE, 'w') as f:
    json.dump(results, f)
print("✓ Results saved")

Saving results to llava_v1.5_7b_coco_results.json
✓ Results saved


In [18]:
input_file = "llava_v1.5_7b_coco_results.json"
output_file = "llava_v1.5_7b_coco_results_cleaned.json"

In [19]:
import json

with open(input_file, 'r') as f:
    data = json.load(f)

cleaned_data = []
for item in data:
    caption = item.get("caption", "")
    if "ASSISTANT:" in caption:
        caption = caption.split("ASSISTANT:")[-1].strip()
    elif "Assistant:" in caption:
        caption = caption.split("Assistant:")[-1].strip()
    elif "<|assistant|>" in caption:
        caption = caption.split("<|assistant|>")[-1].strip()
    caption = caption.strip()
    cleaned_data.append({
        "image_id": item["image_id"],
        "caption": caption
    })

with open(output_file, 'w') as f:
    json.dump(cleaned_data, f, indent=2)

print(f"Cleaned captions saved to {output_file}")

Cleaned captions saved to llava_v1.5_7b_coco_results_cleaned.json


In [20]:
# Evaluate with COCO metrics
print("\n" + "="*60)
print("COCO Evaluation Results")
print("="*60)


COCO Evaluation Results


In [21]:
coco_results = coco.loadRes(output_file)
coco_eval = COCOEvalCap(coco, coco_results)
coco_eval.evaluate()

Loading and preparing results...
DONE (t=0.03s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 2492309 tokens at 2852716.61 tokens per second.
PTBTokenizer tokenized 637477 tokens at 2538436.23 tokens per second.


setting up scorers...
clipscore is using cuda
computing Bleu score...
{'testlen': 564095, 'reflen': 494937, 'guess': [564095, 523591, 483087, 442583], 'correct': [327108, 147333, 65341, 27919]}
ratio: 1.139730915247797
Bleu_1: 0.580
Bleu_2: 0.404
Bleu_3: 0.281
Bleu_4: 0.193
computing METEOR score...
METEOR: 0.250
computing Rouge score...
ROUGE_L: 0.452
computing CIDEr score...
CIDEr: 0.551
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [2.9 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.9

SPICE evaluation took: 4.398 min
SPICE: 0.182
computing CLIPScore score...


  0%|          | 0/159 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling 

due to a numerical instability, new numpy normalization is slightly different than paper results. to exactly replicate paper results, please use numpy version less than 1.21, e.g., 1.20.3.


  0%|          | 0/792 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling 

due to a numerical instability, new numpy normalization is slightly different than paper results. to exactly replicate paper results, please use numpy version less than 1.21, e.g., 1.20.3.


40504it [00:00, 122859.08it/s]


CLIPScore: 0.761
RefCLIPScore: 0.806


In [22]:
print("\nMetrics:")
for metric, score in coco_eval.eval.items():
    print(f"  {metric:10s}: {score:.4f}")


Metrics:
  Bleu_1    : 0.5799
  Bleu_2    : 0.4039
  Bleu_3    : 0.2805
  Bleu_4    : 0.1932
  METEOR    : 0.2503
  ROUGE_L   : 0.4515
  CIDEr     : 0.5507
  SPICE     : 0.1819
  CLIPScore : 0.7612
  RefCLIPScore: 0.8057


In [23]:
import numpy as np

def to_serializable(obj):
    if isinstance(obj, np.generic):
        return obj.item()  # converts np.float16, np.int32, etc. to Python scalar
    raise TypeError(f"Type {type(obj)} not serializable")

scores_file = OUTPUT_FILE.replace('.json', '_scores.json')
with open(scores_file, 'w') as f:
    json.dump(coco_eval.eval, f, indent=2, default=to_serializable)

print(f"\n✓ Scores saved to {scores_file}")


✓ Scores saved to llava_v1.5_7b_coco_results_scores.json
