In [1]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from huggingface_hub import login
import torch
import os
import json
import re
from PIL import Image
from tqdm import tqdm
import unicodedata

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read the Hugging Face access token from the environment so the secret is
# never stored in the notebook (or in its saved outputs / version control).
# When HF_TOKEN is unset this is None and `login` falls back to prompting.
your_hf_token = os.environ.get("HF_TOKEN")
login(token=your_hf_token)

In [None]:
# --- Run configuration ---
# Hugging Face identifier of the captioning model to load.
MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
# Directory that is scanned for input images.
IMAGE_DIR = "Thesis/coco2014/test2014"
# JSON file the generated captions are written to.
OUTPUT_FILE = "qwen2vl_7b_coco_test_results.json"
# Number of images sent through the model per generation call.
BATCH_SIZE = 4

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [4]:
# Decoding settings used for caption generation: sampling is disabled and the
# output is capped at 50 newly generated tokens.
GENERATION_PARAMS = dict(max_new_tokens=50, do_sample=False)

# Release cached GPU allocations, then report how much device memory is free.
torch.cuda.empty_cache()
if torch.cuda.is_available():
    free_bytes = torch.cuda.mem_get_info()[0]
    print(f"Free GPU memory: {free_bytes / 1024**3:.2f} GB")


Free GPU memory: 23.30 GB


In [5]:
def extract_image_id(filename):
    """Return the integer image id encoded at the end of an image file name.

    The id is taken from the twelve digits that sit immediately before a
    ``.jpg``, ``.jpeg`` or ``.png`` extension (matched case-insensitively) at
    the end of ``filename``. Leading zeros disappear in the ``int`` conversion.

    Raises:
        ValueError: if ``filename`` does not end with such a pattern.
    """
    id_match = re.search(r'(\d{12})\.(jpg|jpeg|png)$', filename, re.IGNORECASE)
    # Guard clause: bail out early when the name carries no recognisable id.
    if id_match is None:
        raise ValueError(f"Could not extract image_id from: {filename}")
    return int(id_match[1])

# === Helper: Clean caption for COCO server (ASCII-only) ===
# Typographic punctuation that has no ASCII decomposition under NFKD. Without
# this mapping the ASCII encode step would delete these characters outright,
# turning e.g. "man’s" into "mans" and "dog—cat" into "dogcat".
_ASCII_PUNCTUATION = str.maketrans({
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / apostrophe
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\u2010": "-",   # hyphen (also what NFKD yields for U+2011)
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
    "\u2212": "-",   # minus sign
})


def clean_caption_for_coco(caption):
    """Return an ASCII-only, whitespace-normalised version of ``caption``.

    Steps:
      1. Non-string input is converted with ``str()``.
      2. NFKD normalisation splits accented letters into base letter plus
         combining mark, so the base letter survives the ASCII encode.
      3. Common typographic quotes and dashes are mapped to ASCII equivalents.
      4. Any remaining non-ASCII characters are dropped.
      5. Runs of whitespace collapse to single spaces; ends are trimmed.

    Returns:
        The cleaned caption, or ``"a photo"`` when nothing is left after
        cleaning (so a caption is never empty).
    """
    if not isinstance(caption, str):
        caption = str(caption)
    nfkd = unicodedata.normalize('NFKD', caption)
    # Translate after normalising so compatibility variants are covered too.
    nfkd = nfkd.translate(_ASCII_PUNCTUATION)
    ascii_str = nfkd.encode('ASCII', 'ignore').decode('ASCII')
    return ' '.join(ascii_str.split()) or "a photo"

In [None]:
print(f"Loading model: {MODEL_PATH}")
# Load the weights in bfloat16 and let `device_map="auto"` place them on the
# available device(s); `.eval()` switches the model to inference mode.
# NOTE(review): the recorded output shows a deprecation warning asking for
# `dtype` instead of `torch_dtype`. `torch_dtype` is kept here because older
# transformers releases do not accept `dtype` — switch once the minimum
# version is pinned.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto"
).eval()

processor = AutoProcessor.from_pretrained(MODEL_PATH)
# Captions are generated in padded batches. A decoder-only model continues
# from the last position of each row, so padding must sit on the left;
# right padding would make shorter prompts continue from pad tokens.
processor.tokenizer.padding_side = "left"
print(f"✓ Model loaded on {next(model.parameters()).device}")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading model: Qwen/Qwen2-VL-7B-Instruct


Loading checkpoint shards:  20%|██        | 1/5 [00:36<02:27, 36.88s/it]

In [None]:
# List IMAGE_DIR and keep only entries with an image extension (compared
# case-insensitively), ordered by file name.
all_files = os.listdir(IMAGE_DIR)
image_files = sorted(
    name
    for name in all_files
    if name.lower().endswith(('.jpg', '.jpeg', '.png'))
)
print(f"\nFound {len(image_files)} images in {IMAGE_DIR}")


Found 40775 images in /home/kezouke/Thesis/coco2014/test2014


In [None]:
def _load_rgb_images(image_paths):
    """Open each path as an RGB image, reporting and skipping unreadable files.

    Returns:
        ``(images, valid_paths)`` — two lists of equal length holding the
        successfully loaded images and their paths, in input order.
    """
    images = []
    valid_paths = []
    for path in image_paths:
        try:
            # The context manager closes the file handle right away;
            # convert() yields an independent in-memory image.
            with Image.open(path) as raw:
                img = raw.convert("RGB")
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the batch.
            print(f"⚠️ Skipping {path}: {e}")
            continue
        images.append(img)
        valid_paths.append(path)
    return images, valid_paths


def generate_captions_batch(image_paths, model, processor):
    """Generate one caption per readable image in ``image_paths``.

    Args:
        image_paths: paths of the image files to caption. Files that cannot
            be opened are reported and skipped.
        model: generation model; must provide ``generate`` and ``device``.
        processor: matching processor; must be callable and provide
            ``apply_chat_template`` and ``batch_decode``.

    Returns:
        ``(image_ids, captions)`` — two lists of equal length, aligned with
        the readable images in input order. Both are empty when no image
        could be loaded.

    Raises:
        ValueError: propagated from ``extract_image_id`` when the name of a
            readable file does not contain an image id.
    """
    images, valid_paths = _load_rgb_images(image_paths)
    if not images:
        return [], []

    # Resolve ids before the expensive generate() call so that a badly named
    # file fails fast instead of after the batch has been captioned.
    image_ids = [extract_image_id(os.path.basename(p)) for p in valid_paths]

    # One single-turn conversation per image: the image plus the instruction.
    messages = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": "Generate a detailed caption for this image in one sentence."}
                ]
            }
        ]
        for img in images
    ]

    # Render each conversation to a prompt string ending in the assistant
    # generation prompt.
    texts = [
        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
        for msg in messages
    ]

    # Extract the vision inputs and build the padded model inputs.
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    # Forward the whole GENERATION_PARAMS dict so that every configured
    # setting takes effect, not just a hand-copied subset.
    with torch.inference_mode():
        generated_ids = model.generate(**inputs, **GENERATION_PARAMS)

    # generate() returns prompt + continuation; keep only the continuation.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    captions = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    clean_captions = [cap.strip() for cap in captions]
    return image_ids, clean_captions

In [None]:
# Caption the whole image list in chunks of BATCH_SIZE and collect one
# {"image_id", "caption"} record per successfully captioned image.
results = []

batch_offsets = range(0, len(image_files), BATCH_SIZE)
for offset in tqdm(batch_offsets, desc="Processing batches"):
    chunk_paths = [
        os.path.join(IMAGE_DIR, name)
        for name in image_files[offset:offset + BATCH_SIZE]
    ]

    chunk_ids, chunk_captions = generate_captions_batch(chunk_paths, model, processor)

    # Captions are reduced to clean ASCII before being stored.
    results.extend(
        {"image_id": image_id, "caption": clean_caption_for_coco(raw_caption)}
        for image_id, raw_caption in zip(chunk_ids, chunk_captions)
    )


Processing batches:   0%|          | 0/10194 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Processing batches: 100%|██████████| 10194/10194 [5:33:07<00:00,  1.96s/it] 


In [None]:
# Write the collected caption records to OUTPUT_FILE as indented, ASCII-only JSON.
print(f"\nSaving {len(results)} captions to {OUTPUT_FILE}")
with open(OUTPUT_FILE, 'w', encoding='ascii') as out_file:
    out_file.write(json.dumps(results, indent=2, ensure_ascii=True))

print("✅ Done! Ready for COCO test server submission.")


Saving 40775 captions to qwen2vl_7b_coco_test_results.json
✅ Done! Ready for COCO test server submission.
