In [1]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from huggingface_hub import login
import torch
import os
import json
from PIL import Image
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# --- Authentication -------------------------------------------------------
# Read the Hugging Face token from the environment instead of hardcoding it,
# so the secret is never saved inside the notebook file.
your_hf_token = os.environ.get("HF_TOKEN", "")
if your_hf_token:
    login(token=your_hf_token)
else:
    # No token supplied: skip the explicit login and rely on any credentials
    # already cached on this machine.
    print("HF_TOKEN is not set; skipping login and using cached Hugging Face credentials (if any).")

# --- Paths and run configuration -------------------------------------------
MODEL_PATH = "llava-hf/llama3-llava-next-8b-hf"
COCO_DIR = "Thesis/coco2014"
IMAGE_DIR = os.path.join(COCO_DIR, "val2014")
ANNOTATION_FILE = os.path.join(COCO_DIR, "annotations", "captions_val2014.json")
OUTPUT_FILE = "llama3_llava_next_8b_coco_results.json"
BATCH_SIZE = 8  # number of images grouped per outer-loop iteration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Keyword arguments forwarded to `model.generate`; sampling is disabled.
GENERATION_PARAMS = {
    "max_new_tokens": 70,
    "do_sample": False,
    "use_cache": True,
}

In [3]:
# Release cached allocator blocks, then report how much GPU memory is free.
torch.cuda.empty_cache()
if torch.cuda.is_available():
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    print(f"Free GPU memory: {free_bytes / 1024**3:.2f} GB")

Free GPU memory: 23.30 GB


In [4]:
def load_model_and_processor(model_path, device):
    """Load the LLaVA-NeXT processor and model for inference.

    Args:
        model_path: Identifier or path handed to ``from_pretrained`` for both
            the processor and the model.
        device: Accepted for interface compatibility but not used in this
            function; placement is delegated to ``device_map="auto"``.

    Returns:
        A ``(model, processor)`` tuple, with the model switched to eval mode.
    """
    print(f"Loading model: {model_path}")

    load_options = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": True,
        "device_map": "auto",
        "trust_remote_code": False,  # Not needed for HF-native models
    }

    processor = LlavaNextProcessor.from_pretrained(model_path)
    model = LlavaNextForConditionalGeneration.from_pretrained(model_path, **load_options)
    model = model.eval()

    print("✓ Model and processor loaded successfully")
    return model, processor


def generate_caption_single(image_path, model, processor):
    """Generate a one-sentence caption for a single image.

    Args:
        image_path: Path of the image file to caption.
        model: Loaded model exposing ``generate`` and ``device``.
        processor: Matching processor, used to build the chat prompt, prepare
            the model inputs and decode the generated tokens.

    Returns:
        The stripped caption text. ``"No caption generated"`` is returned when
        the decoded text is empty, and ``"Error generating caption"`` when any
        exception occurs (the error is printed rather than re-raised, so a
        single bad image does not abort a long run).
    """
    try:
        # Close the file handle deterministically; `convert` returns a new,
        # fully loaded image that remains usable after the file is closed.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Generate a detailed caption for this image in one sentence."},
                    {"type": "image"},
                ],
            },
        ]

        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

        # Forward the whole config dict so new entries in GENERATION_PARAMS are
        # honoured without having to edit this function.
        gen_kwargs = dict(GENERATION_PARAMS)

        # When the model has no pad token configured, `generate` falls back to
        # the EOS token and logs a warning on every call. Making the same
        # choice explicitly keeps the output identical and the log quiet.
        gen_config = getattr(model, "generation_config", None)
        if "pad_token_id" not in gen_kwargs and getattr(gen_config, "pad_token_id", None) is None:
            eos_token_id = getattr(gen_config, "eos_token_id", None)
            if isinstance(eos_token_id, (list, tuple)):
                eos_token_id = eos_token_id[0] if eos_token_id else None
            if eos_token_id is not None:
                gen_kwargs["pad_token_id"] = eos_token_id

        with torch.no_grad():
            output = model.generate(**inputs, **gen_kwargs)

        # Decode only the newly generated tokens. This avoids depending on
        # chat-template marker strings and never leaks the prompt (or special
        # tokens) into the caption.
        prompt_length = inputs["input_ids"].shape[1]
        caption = processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

        return caption if caption else "No caption generated"

    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return "Error generating caption"


def generate_captions_batch(image_paths, model, processor, batch_size=None):
    """Caption every image in ``image_paths``, one image at a time.

    Despite the name, no batched forward pass is performed: each path is
    passed to ``generate_caption_single`` in order. ``batch_size`` is accepted
    but not used.

    Returns:
        A list of caption strings in the same order as ``image_paths``.
    """
    progress = tqdm(image_paths, desc="Generating captions", leave=False)
    return [generate_caption_single(path, model, processor) for path in progress]


def evaluate_coco(annotation_file, results_file):
    """Score a captions results file against COCO reference annotations.

    Args:
        annotation_file: Path to the reference captions annotation JSON.
        results_file: Path to the generated-captions results JSON.

    Returns:
        The ``eval`` mapping of metric name to score produced by
        ``COCOEvalCap`` after ``evaluate()`` has run.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("COCO Evaluation Results")
    print(banner)

    references = COCO(annotation_file)
    predictions = references.loadRes(results_file)
    scorer = COCOEvalCap(references, predictions)
    scorer.evaluate()

    print("\nMetrics:")
    for name in scorer.eval:
        print(f"  {name:10s}: {scorer.eval[name]:.4f}")
    return scorer.eval


def to_serializable(obj):
    """``json.dump`` ``default=`` hook that converts NumPy values to Python.

    Args:
        obj: The object ``json`` could not serialise on its own.

    Returns:
        The equivalent built-in Python scalar for NumPy scalars, or a
        (possibly nested) list for NumPy arrays.

    Raises:
        TypeError: If ``obj`` is neither a NumPy scalar nor a NumPy array.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        # Arrays can appear alongside scalar scores; tolist() yields plain
        # Python scalars at every nesting level.
        return obj.tolist()
    raise TypeError(f"Type {type(obj)} not serializable")

In [5]:
# Load the model and processor once; every later cell reuses these two globals.
model, processor = load_model_and_processor(model_path=MODEL_PATH, device=DEVICE)

Loading model: llava-hf/llama3-llava-next-8b-hf


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.28s/it]


✓ Model and processor loaded successfully


In [6]:
# Load the reference captions and collect every image id they cover.
print("\nLoading COCO annotations from", ANNOTATION_FILE)
coco = COCO(ANNOTATION_FILE)
img_ids = coco.getImgIds()
print("Total images:", len(img_ids))


Loading COCO annotations from /home/kezouke/Thesis/coco2014/annotations/captions_val2014.json
loading annotations into memory...
Done (t=0.11s)
creating index...
index created!
Total images: 40504


In [7]:
# Pair every image id with the on-disk path of its file.
image_data_list = [
    (image_id, os.path.join(IMAGE_DIR, record['file_name']))
    for image_id, record in zip(img_ids, coco.loadImgs(img_ids))
]

In [8]:
# Smoke test: caption the first (up to) three images before the full run.
print("Testing on first batch...")
test_paths = [path for _, path in image_data_list[:3]]
test_captions = generate_captions_batch(test_paths, model, processor)
for cap in test_captions:
    print(cap)

Testing on first batch...


Generating captions:   0%|          | 0/3 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Generating captions:  33%|███▎      | 1/3 [00:01<00:02,  1.09s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Generating captions:  67%|██████▋   | 2/3 [00:02<00:01,  1.02s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
                                                                  

A man riding a motorcycle down a dirt road.
A woman in a hairnet is cutting a large cake with a knife.
A young boy holding an umbrella standing next to a cow.




In [9]:
# Full inference: walk the dataset in BATCH_SIZE-sized chunks and collect one
# {"image_id", "caption"} record per image, in dataset order.
results = []
chunk_starts = range(0, len(image_data_list), BATCH_SIZE)
for start in tqdm(chunk_starts, desc="Processing batches"):
    chunk = image_data_list[start:start + BATCH_SIZE]
    chunk_paths = [path for _, path in chunk]

    chunk_captions = generate_captions_batch(chunk_paths, model, processor)

    results.extend(
        {"image_id": image_id, "caption": text}
        for (image_id, _), text in zip(chunk, chunk_captions)
    )

Processing batches:   0%|          | 0/5063 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Processing batches:   0%|          | 1/5063 [00:07<9:54:18,  7.04s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end genera

In [10]:
# Persist the generated captions so the evaluator can read them from disk.
print(f"\nSaving results to {OUTPUT_FILE}")
with open(OUTPUT_FILE, 'w') as f:
    json.dump(results, f)
print("✓ Results saved")

# Evaluate
eval_scores = evaluate_coco(ANNOTATION_FILE, OUTPUT_FILE)

# Build the scores path from the trailing ".json" only. A plain
# str.replace would rewrite every ".json" occurrence in the path and, when
# OUTPUT_FILE has no ".json" at all, would return it unchanged, so the scores
# would overwrite the results file.
json_suffix = ".json"
if OUTPUT_FILE.endswith(json_suffix):
    scores_stem = OUTPUT_FILE[:-len(json_suffix)]
else:
    scores_stem = OUTPUT_FILE
scores_file = f"{scores_stem}_scores.json"

with open(scores_file, 'w') as f:
    # `to_serializable` converts NumPy values that json cannot encode itself.
    json.dump(eval_scores, f, indent=2, default=to_serializable)
print(f"✓ Scores saved to {scores_file}")


Saving results to llama3_llava_next_8b_coco_results.json
✓ Results saved

COCO Evaluation Results
loading annotations into memory...
Done (t=0.12s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.03s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 2492309 tokens at 2178166.93 tokens per second.
PTBTokenizer tokenized 552240 tokens at 3498480.44 tokens per second.


setting up scorers...
clipscore is using cuda
computing Bleu score...
{'testlen': 465103, 'reflen': 437467, 'guess': [465103, 424599, 384095, 343591], 'correct': [354577, 206306, 106969, 53836]}
ratio: 1.063172765031417
Bleu_1: 0.762
Bleu_2: 0.609
Bleu_3: 0.469
Bleu_4: 0.357
computing METEOR score...
METEOR: 0.313
computing Rouge score...
ROUGE_L: 0.589
computing CIDEr score...
CIDEr: 1.237
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [1.1 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.2 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [1.7

SPICE evaluation took: 4.126 min
SPICE: 0.251
computing CLIPScore score...


  0%|          | 0/159 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
TOKENIZERS_PARALLELISMTOKENIZERS_PARALLELISM=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
=(true | false)
	- Avoid 

due to a numerical instability, new numpy normalization is slightly different than paper results. to exactly replicate paper results, please use numpy version less than 1.21, e.g., 1.20.3.


  0%|          | 0/792 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling 

due to a numerical instability, new numpy normalization is slightly different than paper results. to exactly replicate paper results, please use numpy version less than 1.21, e.g., 1.20.3.


40504it [00:00, 122861.74it/s]


CLIPScore: 0.795
RefCLIPScore: 0.838

Metrics:
  Bleu_1    : 0.7624
  Bleu_2    : 0.6086
  Bleu_3    : 0.4690
  Bleu_4    : 0.3566
  METEOR    : 0.3130
  ROUGE_L   : 0.5893
  CIDEr     : 1.2369
  SPICE     : 0.2512
  CLIPScore : 0.7949
  RefCLIPScore: 0.8379
✓ Scores saved to llama3_llava_next_8b_coco_results_scores.json


In [11]:
# Shell escape: print the current GPU status table (utilisation, memory in use).
!nvidia-smi

Sat Nov  8 23:53:58 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off |   00000000:01:00.0 Off |                  N/A |
| 72%   47C    P2            128W /  420W |   17575MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
