In [1]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from huggingface_hub import login
import torch
import os
import json
from PIL import Image
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read the Hugging Face token from the HF_TOKEN environment variable instead of
# pasting it into the notebook, so the secret is never saved with the file.
# When HF_TOKEN is unset or empty, None is passed and `login` asks for the token.
your_hf_token = os.environ.get("HF_TOKEN") or None
login(token=your_hf_token)

In [None]:
# --- Paths and runtime settings ---
MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
COCO_DIR = "Thesis/coco2014"  # dataset root; image and annotation paths derive from it
IMAGE_DIR = os.path.join(COCO_DIR, "val2014")
ANNOTATION_FILE = os.path.join(COCO_DIR, "annotations", "captions_val2014.json")
OUTPUT_FILE = "qwen2vl_7b_coco_results.json"
BATCH_SIZE = 4  # images per batch; kept small for memory efficiency
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

# --- Generation settings ---
GENERATION_PARAMS = dict(
    max_new_tokens=50,  # upper bound on generated tokens per caption
    do_sample=False,    # greedy decoding for consistency
)

In [4]:
# Drop cached allocator blocks first, then report the free GPU memory (if any GPU)
torch.cuda.empty_cache()
if torch.cuda.is_available():
    free_bytes = torch.cuda.mem_get_info()[0]
    bytes_per_gib = 1024**3
    print(f"Free GPU memory: {free_bytes / bytes_per_gib:.2f} GB")

Free GPU memory: 23.13 GB


In [5]:
def load_model_and_processor(model_path, device):
    """Load the Qwen2-VL model and its processor from `model_path`.

    Args:
        model_path: Model identifier or path handed to `from_pretrained`.
        device: Accepted for interface compatibility but not used here;
            placement is delegated to `device_map="auto"`.

    Returns:
        Tuple `(model, processor)`; `eval()` has already been called on the model.
    """
    print(f"Loading model: {model_path}")

    # Weights are requested in bfloat16; device_map="auto" decides placement
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        device_map="auto",
    )
    model = model.eval()

    # The processor is loaded with its default settings
    processor = AutoProcessor.from_pretrained(model_path)

    print("✓ Model loaded successfully")
    return (model, processor)

In [6]:
def generate_captions_batch(image_paths, model, processor, batch_size=4):
    """
    Generate captions for a batch of images using Qwen2-VL.

    Every image in `image_paths` that can be loaded is sent through the model
    in a single `generate` call, using the module-level GENERATION_PARAMS.

    Args:
        image_paths: List of image file paths
        model: Qwen2-VL model
        processor: Qwen2-VL processor
        batch_size: Unused; kept for backward compatibility. The effective
            batch size is the number of images that load successfully.

    Returns:
        List of generated captions, one per image that could be loaded, in
        input order. Images that fail to load are reported and skipped, so the
        list can be shorter than `image_paths` (empty if none loaded); callers
        pairing captions with ids must account for the skipped entries.
    """
    # Load images; unreadable files are reported and skipped (best effort)
    valid_images = []
    for img_path in image_paths:
        try:
            # The context manager closes the file handle right away;
            # convert() hands back a separate RGB copy that stays usable.
            with Image.open(img_path) as img_file:
                valid_images.append(img_file.convert("RGB"))
        except Exception as e:
            print(f"Error loading {img_path}: {e}")

    if not valid_images:
        return []

    # One single-turn conversation per image, all with the same caption prompt
    messages = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": "Generate a detailed caption for this image in one sentence."}
                ]
            }
        ]
        for img in valid_images
    ]

    # Render each conversation to prompt text ending with the generation prompt
    texts = [
        processor.apply_chat_template(
            msg,
            tokenize=False,
            add_generation_prompt=True
        )
        for msg in messages
    ]

    # Extract vision info from messages
    image_inputs, video_inputs = process_vision_info(messages)

    # Build padded model inputs and move them to the model's device
    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    )
    inputs = inputs.to(model.device)

    # Pass the whole GENERATION_PARAMS dict so that any option added to the
    # configuration takes effect here instead of being silently ignored
    with torch.inference_mode():
        generated_ids = model.generate(**inputs, **GENERATION_PARAMS)

    # generate() returns prompt + continuation; strip the prompt-length prefix
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    # Decode captions
    captions = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    # Trim surrounding whitespace
    return [caption.strip() for caption in captions]


def evaluate_coco(annotation_file, results_file):
    """Score a caption results file against COCO annotations with pycocoevalcap.

    Args:
        annotation_file: Path to the COCO caption annotation JSON.
        results_file: Path to the generated-captions results JSON.

    Returns:
        The evaluator's `eval` mapping of metric name to score.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("COCO Evaluation Results")
    print(divider)

    # Reference annotations, with the predictions loaded on top of them
    ground_truth = COCO(annotation_file)
    predictions = ground_truth.loadRes(results_file)

    # Build the evaluator and run it
    evaluator = COCOEvalCap(ground_truth, predictions)
    evaluator.evaluate()

    # Report each metric, then hand the mapping back to the caller
    metrics = evaluator.eval
    print("\nMetrics:")
    for name in metrics:
        print(f"  {name:10s}: {metrics[name]:.4f}")

    return metrics


def to_serializable(obj):
    """Convert numpy values to Python natives for JSON serialization.

    Intended as the `default=` hook of `json.dump` / `json.dumps`.

    Args:
        obj: Object the JSON encoder could not serialize on its own.

    Returns:
        The Python scalar for a numpy scalar, or a (nested) list for a
        numpy array.

    Raises:
        TypeError: If `obj` is neither a numpy scalar nor a numpy array.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        # Arrays are not np.generic instances; tolist() yields native Python values
        return obj.tolist()
    raise TypeError(f"Type {type(obj)} not serializable")

In [7]:
# Build the model/processor pair once; the cells below reuse both objects
model, processor = load_model_and_processor(model_path=MODEL_PATH, device=DEVICE)

Loading model: Qwen/Qwen2-VL-7B-Instruct


Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00,  1.12it/s]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


✓ Model loaded successfully


In [8]:
# Index the caption annotations and fetch the image ids to caption
print(f"\nLoading COCO annotations from {ANNOTATION_FILE}")
coco = COCO(ANNOTATION_FILE)
img_ids = coco.getImgIds()
num_images = len(img_ids)
print(f"Total images: {num_images}")
print(f"Generating captions for {num_images} images...")
print(f"Batch size: {BATCH_SIZE}\n")


Loading COCO annotations from /home/kezouke/Thesis/coco2014/annotations/captions_val2014.json
loading annotations into memory...
Done (t=0.12s)
creating index...
index created!
Total images: 40504
Generating captions for 40504 images...
Batch size: 4



In [9]:
# Caption records are accumulated here by the generation loop
results = []

# Pair every COCO image id with the path of its file under IMAGE_DIR
image_data_list = [
    (img_id, os.path.join(IMAGE_DIR, coco.loadImgs(img_id)[0]['file_name']))
    for img_id in img_ids
]

In [10]:
# Smoke test: caption the first batch before committing to the full run
print("Testing on first batch...")
test_batch_size = min(BATCH_SIZE, len(image_data_list))
test_paths = [path for _img_id, path in image_data_list[:test_batch_size]]
test_captions = generate_captions_batch(test_paths, model, processor, batch_size=BATCH_SIZE)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Testing on first batch...


In [11]:
# Show up to three captions from the smoke test, numbered from 1
print("Sample captions from first batch:")
for position, caption in enumerate(test_captions[:3], start=1):
    print(f"  {position}. {caption}")

Sample captions from first batch:
  1. A person wearing a red shirt and helmet is riding a motorcycle on a dirt path with a mountain in the background.
  2. A person wearing a red shirt and a hairnet is holding a knife and appears to be cutting a large block of butter or cheese. The setting looks like a kitchen or food processing area, with stainless steel appliances and a door in the background.
  3. A young boy is holding a colorful umbrella while standing near a group of cows in a field.


In [12]:
# Process all images in batches.
# Reset the accumulator here so that re-running this cell cannot append a
# second entry for image ids that were already captioned.
results = []


def _loads_as_rgb(path):
    """Return True if `path` can be opened and converted to RGB."""
    try:
        with Image.open(path) as img_file:
            img_file.convert("RGB")
    except Exception:
        # Any open/decode failure counts as "not loadable". Unlike a bare
        # `except:`, this does not swallow KeyboardInterrupt during the long run.
        return False
    return True


for batch_idx in tqdm(range(0, len(image_data_list), BATCH_SIZE), desc="Processing batches"):
    batch_data = image_data_list[batch_idx:batch_idx+BATCH_SIZE]
    img_ids_batch = [x[0] for x in batch_data]
    paths_batch = [x[1] for x in batch_data]

    # Generate captions for batch
    captions = generate_captions_batch(
        paths_batch,
        model,
        processor,
        batch_size=len(paths_batch)
    )

    # Fewer captions than ids means some images failed to load inside
    # generate_captions_batch; keep only the ids whose image loads so that
    # ids and captions line up again.
    if len(captions) < len(img_ids_batch):
        img_ids_batch = [
            img_id
            for img_id, path in zip(img_ids_batch, paths_batch)
            if _loads_as_rgb(path)
        ]

    # zip() would silently truncate or mis-pair on a length mismatch, which
    # would attach captions to the wrong image ids; skip the batch instead.
    if len(captions) != len(img_ids_batch):
        print(
            f"Skipping batch at index {batch_idx}: "
            f"{len(captions)} captions for {len(img_ids_batch)} images"
        )
        continue

    # Add to results
    results.extend(
        {"image_id": img_id, "caption": caption}
        for img_id, caption in zip(img_ids_batch, captions)
    )

Processing batches: 100%|██████████| 10126/10126 [5:26:47<00:00,  1.94s/it] 


In [13]:
# Persist the generated captions as a JSON list
print(f"\nSaving results to {OUTPUT_FILE}")
with open(OUTPUT_FILE, mode='w') as results_fh:
    json.dump(results, results_fh)
print("✓ Results saved")


Saving results to qwen2vl_7b_coco_results.json
✓ Results saved


In [14]:
# Score the saved captions file against the reference annotations
eval_scores = evaluate_coco(annotation_file=ANNOTATION_FILE, results_file=OUTPUT_FILE)


COCO Evaluation Results
loading annotations into memory...
Done (t=0.12s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.03s)
creating index...
index created!
tokenization...


PTBTokenizer tokenized 2492309 tokens at 4078004.09 tokens per second.
PTBTokenizer tokenized 1187032 tokens at 4044021.70 tokens per second.


setting up scorers...
clipscore is using cuda
computing Bleu score...
{'testlen': 1024735, 'reflen': 528770, 'guess': [1024735, 984231, 943727, 903223], 'correct': [458290, 207028, 83711, 33105]}
ratio: 1.937959793482985
Bleu_1: 0.447
Bleu_2: 0.307
Bleu_3: 0.203
Bleu_4: 0.132
computing METEOR score...
METEOR: 0.266
computing Rouge score...
ROUGE_L: 0.418
computing CIDEr score...
CIDEr: 0.242
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [0.4 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.2

SPICE evaluation took: 16.07 min
SPICE: 0.223
computing CLIPScore score...


  0%|          | 0/159 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling 

due to a numerical instability, new numpy normalization is slightly different than paper results. to exactly replicate paper results, please use numpy version less than 1.21, e.g., 1.20.3.


  0%|          | 0/792 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling 

due to a numerical instability, new numpy normalization is slightly different than paper results. to exactly replicate paper results, please use numpy version less than 1.21, e.g., 1.20.3.


40504it [00:00, 122970.06it/s]


CLIPScore: 0.826
RefCLIPScore: 0.805

Metrics:
  Bleu_1    : 0.4472
  Bleu_2    : 0.3067
  Bleu_3    : 0.2028
  Bleu_4    : 0.1322
  METEOR    : 0.2661
  ROUGE_L   : 0.4181
  CIDEr     : 0.2416
  SPICE     : 0.2227
  CLIPScore : 0.8257
  RefCLIPScore: 0.8047


In [15]:
# Save scores next to the results file.
# splitext swaps only the final extension. A plain str.replace('.json', ...)
# returns the name unchanged when OUTPUT_FILE has no ".json" (so the scores
# would overwrite the captions file) and rewrites every ".json" in the path.
scores_root, _scores_ext = os.path.splitext(OUTPUT_FILE)
scores_file = f"{scores_root}_scores.json"
with open(scores_file, 'w') as f:
    # to_serializable converts numpy values the JSON encoder rejects
    json.dump(eval_scores, f, indent=2, default=to_serializable)

print(f"✓ Scores saved to {scores_file}")

✓ Scores saved to qwen2vl_7b_coco_results_scores.json
