# Example for Qwen3-VL-Reranker using vLLM

## 1. Import Packages and Prepare Utility Functions

In [1]:
import os
from pathlib import Path
from typing import Dict, Any
from jinja2 import Template
from vllm import LLM
from vllm.multimodal.utils import fetch_image

def parse_input_dict(input_dict: Dict[str, Any]):
    """
    Parse an input dictionary into prompt content and multimodal data.

    Reads the optional 'image' and 'text' entries of ``input_dict``.

    Args:
        input_dict: Mapping that may contain:
            - 'image': an ``http(s)://`` URL string, a local file path
              string, or any non-string object, which is passed through
              unchanged as the image.
            - 'text': a string appended to the content after the image
              placeholder (if any).

    Returns:
        A ``(content, mm_data)`` tuple. ``content`` is the vision placeholder
        (only when an image was actually obtained) followed by the text.
        ``mm_data`` is ``{'image': [...]}`` holding zero or one image object.

    Notes:
        A failed download or a missing local file is reported with a printed
        warning rather than raised. In that case no vision placeholder is
        emitted, so the number of placeholders in ``content`` always matches
        the number of images in ``mm_data``.
    """
    image = input_dict.get('image')
    text = input_dict.get('text')

    mm_data = {
        'image': []
    }
    content = ''
    if image:
        image_obj = None
        if isinstance(image, str):
            if image.startswith(('http://', 'https://')):
                try:
                    image_obj = fetch_image(image)
                except Exception as e:
                    # Best-effort: keep going with the text-only content.
                    print(f"Warning: Failed to fetch image {image}: {e}")
            else:
                abs_image_path = os.path.abspath(image)
                if os.path.exists(abs_image_path):
                    from PIL import Image
                    image_obj = Image.open(abs_image_path)
                else:
                    print(f"Warning: Image file not found: {abs_image_path}")
        else:
            # Non-string values are assumed to already be image objects.
            image_obj = image

        # Emit the placeholder only together with a real image; a placeholder
        # without matching image data would desynchronise prompt and inputs.
        if image_obj is not None:
            content += '<|vision_start|><|image_pad|><|vision_end|>'
            mm_data['image'].append(image_obj)

    if text:
        content += text

    return content, mm_data

def format_vllm_input(
    query_dict: Dict[str, Any],
    doc_dict: Dict[str, Any],
    chat_template: str
):
    """
    Build one vLLM request for a (query, document) pair.

    Both dictionaries are parsed with ``parse_input_dict``. Their content
    strings are substituted into ``chat_template`` as ``query_content`` and
    ``doc_content``, and their images are merged into a single list.

    Returns:
        A dict with the rendered ``'prompt'`` string and a
        ``'multi_modal_data'`` dict of the form ``{'image': [...]}``.
    """
    query_content, query_mm_data = parse_input_dict(query_dict)
    doc_content, doc_mm_data = parse_input_dict(doc_dict)

    # Query images go first, then document images.
    combined_images = [*query_mm_data['image'], *doc_mm_data['image']]

    rendered_prompt = Template(chat_template).render(
        query_content=query_content,
        doc_content=doc_content,
    )

    return {
        'prompt': rendered_prompt,
        'multi_modal_data': {'image': combined_images},
    }

def get_rank_scores(
    llm,
    inputs: Dict[str, Any],
    default_instruction: str = "Given a search query, retrieve relevant candidates that answer the query.",
    template_path: str = "reranker_template.jinja"
):
    """
    Generate relevance scores for documents given a query.

    Args:
        llm: Object exposing ``classify(prompts=...)``; in this notebook it is
            the ``LLM`` instance created with ``runner='pooling'``.
        inputs: Mapping with a required ``'query'`` dict, a required
            ``'documents'`` list of dicts, and an optional ``'instruction'``
            string. A missing or empty instruction falls back to
            ``default_instruction``.
        default_instruction: Instruction used when ``inputs`` provides none.
        template_path: Path to the Jinja chat template file. A relative path
            is resolved against the current working directory.

    Returns:
        A list with one score per document, in the order of
        ``inputs['documents']``. Each score is the first entry of that
        output's ``outputs.probs``. An empty document list yields ``[]``
        without reading the template or calling the model.

    Raises:
        KeyError: If ``'query'`` or ``'documents'`` is missing from ``inputs``.
    """
    query_dict = inputs['query']
    doc_dicts = inputs['documents']

    # Nothing to rank: skip template I/O and the model call entirely.
    if not doc_dicts:
        return []

    instruction = inputs.get('instruction') or default_instruction

    # Decode explicitly as UTF-8 so the template reads the same on any locale.
    chat_template = Template(Path(template_path).read_text(encoding='utf-8'))
    # First rendering pass fills in the instruction; format_vllm_input renders
    # the result a second time with the query and document content.
    chat_template = chat_template.render(instruction=instruction)

    prompts = [
        format_vllm_input(query_dict, doc_dict, chat_template)
        for doc_dict in doc_dicts
    ]

    outputs = llm.classify(
        prompts=prompts
    )
    return [output.outputs.probs[0] for output in outputs]

  from .autonotebook import tqdm as notebook_tqdm


## 2. Initialize Model

In [2]:
# Initialize the Qwen3-VL-Reranker model.
# The checkpoint location is taken from the QWEN3_VL_RERANKER_MODEL environment
# variable (a local directory or a Hugging Face Hub id), so the notebook does
# not depend on a path that exists on only one machine.
MODEL_PATH = os.environ.get('QWEN3_VL_RERANKER_MODEL', 'Qwen/Qwen3-VL-Reranker-2B')

llm = LLM(
    model=MODEL_PATH,
    runner='pooling',
    dtype='bfloat16',
    trust_remote_code=True,
    # Load the checkpoint as a sequence classifier whose score is derived
    # from the "no"/"yes" tokens.
    hf_overrides={
        "architectures": ["Qwen3VLForSequenceClassification"],
        "classifier_from_token": ["no", "yes"],
        "is_original_qwen3_reranker": True,
    },
)

print("Model initialized successfully!")

INFO 01-16 16:15:13 [utils.py:267] non-default args: {'runner': 'pooling', 'trust_remote_code': True, 'dtype': 'bfloat16', 'disable_log_stats': True, 'hf_overrides': {'architectures': ['Qwen3VLForSequenceClassification'], 'classifier_from_token': ['no', 'yes'], 'is_original_qwen3_reranker': True}, 'model': '/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 01-16 16:15:13 [model.py:859] Resolved `--convert auto` to `--convert classify`. Pass the value explicitly to silence this message.


INFO 01-16 16:15:13 [model.py:530] Resolved architecture: Qwen3VLForSequenceClassification


INFO 01-16 16:15:13 [model.py:1547] Using max model len 262144


2026-01-16 16:15:14,132	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 01-16 16:15:14 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=16384.


INFO 01-16 16:15:14 [vllm.py:618] Asynchronous scheduling is enabled.


INFO 01-16 16:15:14 [vllm.py:625] Disabling NCCL for DP synchronization when using async scheduling.




[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:17 [core.py:96] Initializing a V1 LLM engine (v0.14.0rc2.dev90+gbcf2333cd) with config: model='/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B', speculative_config=None, tokenizer='/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=262144, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_c

[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:18 [parallel_state.py:1212] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.78.15.105:46275 backend=nccl


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:18 [parallel_state.py:1423] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:27 [gpu_model_runner.py:3803] Starting to load model /cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B...


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:28 [mm_encoder_attention.py:86] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention.


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:28 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.39it/s]


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.39it/s]


[0;36m(EngineCore_DP0 pid=200152)[0;0m 




[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:29 [default_loader.py:291] Loading weights took 1.24 seconds


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:30 [gpu_model_runner.py:3900] Model loading took 4.4 GiB memory and 1.838998 seconds


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:30 [gpu_model_runner.py:4711] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 1 video items of the maximum feature size.


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:48 [backends.py:644] Using cache directory: /cpfs01/user/linqi.lmx/.cache/vllm/torch_compile_cache/c988518be9/rank_0_0/backbone for vLLM's torch.compile


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:48 [backends.py:704] Dynamo bytecode transform time: 7.87 s


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:52 [backends.py:226] Directly load the compiled graph(s) for compile range (1, 16384) from the cache, took 0.866 s


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:52 [monitor.py:34] torch.compile takes 8.74 s in total


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:53 [gpu_worker.py:355] Available KV cache memory: 65.22 GiB


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:53 [kv_cache_utils.py:1307] GPU KV cache size: 610,560 tokens


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:53 [kv_cache_utils.py:1312] Maximum concurrency for 262,144 tokens per request: 2.33x


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

2026-01-16 16:15:53,469 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

2026-01-16 16:15:53,485 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|                                                        | 0/51 [00:00<?, ?it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   2%|▉                                               | 1/51 [00:00<00:11,  4.43it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   6%|██▊                                             | 3/51 [00:00<00:04,  9.89it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  12%|█████▋                                          | 6/51 [00:00<00:02, 15.99it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  16%|███████▌                                        | 8/51 [00:00<00:02, 16.79it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  24%|███████████                                    | 12/51 [00:00<00:01, 22.01it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  29%|█████████████▊                                 | 15/51 [00:00<00:01, 22.91it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  35%|████████████████▌                              | 18/51 [00:00<00:01, 21.15it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  41%|███████████████████▎                           | 21/51 [00:01<00:01, 21.96it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  47%|██████████████████████                         | 24/51 [00:01<00:01, 21.56it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  53%|████████████████████████▉                      | 27/51 [00:01<00:01, 23.44it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  59%|███████████████████████████▋                   | 30/51 [00:01<00:00, 25.06it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  65%|██████████████████████████████▍                | 33/51 [00:01<00:00, 25.86it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  71%|█████████████████████████████████▏             | 36/51 [00:01<00:00, 25.53it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  76%|███████████████████████████████████▉           | 39/51 [00:01<00:00, 25.63it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  82%|██████████████████████████████████████▋        | 42/51 [00:01<00:00, 26.49it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  88%|█████████████████████████████████████████▍     | 45/51 [00:02<00:00, 26.77it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  96%|█████████████████████████████████████████████▏ | 49/51 [00:02<00:00, 28.63it/s]

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|███████████████████████████████████████████████| 51/51 [00:02<00:00, 23.35it/s]




[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:56 [gpu_model_runner.py:4852] Graph capturing finished in 3 secs, took -0.76 GiB


[0;36m(EngineCore_DP0 pid=200152)[0;0m 

INFO 01-16 16:15:56 [core.py:272] init engine (profile, create kv cache, warmup model) took 25.93 seconds


INFO 01-16 16:16:03 [llm.py:347] Supported tasks: ['classify', 'score', 'token_classify']


Model initialized successfully!


## 3. Prepare Input Data

In [3]:
# Query and candidate documents to rerank. The candidates cover the three
# supported shapes: text only, image only, and text plus image.
QUERY_TEXT = "A woman playing with her dog on a beach at sunset."
DOC_CAPTION = "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust."
DOC_IMAGE_URL = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"

inputs = dict(
    instruction="Retrieve images or text relevant to the user's query.",
    query={"text": QUERY_TEXT},
    documents=[
        {"text": DOC_CAPTION},
        {"image": DOC_IMAGE_URL},
        {"text": DOC_CAPTION, "image": DOC_IMAGE_URL},
    ],
)

print(f"Prepared query with {len(inputs['documents'])} candidate documents")

Prepared query with 3 candidate documents


## 4. Generate Relevance Scores

In [4]:
# Score every candidate document against the query.
scores = get_rank_scores(llm, inputs)

# Report the scores with 1-based document numbering, in input order.
print("Relevance Scores:")
for doc_number, relevance in enumerate(scores, start=1):
    print(f"Document {doc_number}: {relevance:.4f}")

Adding requests:   0%|                                                                                                 | 0/3 [00:00<?, ?it/s]

Adding requests:  33%|█████████████████████████████▋                                                           | 1/3 [00:02<00:04,  2.40s/it]

Adding requests: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.51it/s]

Adding requests: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.20it/s]




Processed prompts:   0%|                                           | 0/3 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|███████████████████████████████████| 3/3 [00:00<00:00, 48.83it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|███████████████████████████████████| 3/3 [00:00<00:00, 48.48it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Relevance Scores:
Document 1: 0.8600
Document 2: 0.7232
Document 3: 0.8189



