# Example for Qwen3-VL-Embedding using vLLM

## 1. Import Packages and Prepare Utility Functions

In [1]:
import numpy as np
import os
from typing import List, Dict, Any
from vllm import LLM
from vllm.multimodal.utils import fetch_image
from PIL import Image

def format_input_to_conversation(
    input_dict: Dict[str, Any],
    default_instruction: str = "Represent the user's input."
) -> List[Dict]:
    """Build a system + user chat conversation from one input sample.

    Args:
        input_dict: Sample with the optional keys ``'instruction'``,
            ``'text'`` and ``'image'``. ``'image'`` may be a string (an
            http(s) URL, a ``file://`` / ``data:`` URI, or a local path) or
            any non-string object, which is passed through unchanged.
        default_instruction: System prompt used when the sample has no
            ``'instruction'`` or an empty one.

    Returns:
        A two-element list: a system message carrying the instruction and a
        user message whose ``content`` lists the image entry (if any)
        followed by the text entry (if any). When the sample has neither,
        the user content is a single empty text entry.
    """
    # Strings that already name a scheme must not be treated as local paths:
    # prefixing them with 'file://' + cwd would yield a meaningless location.
    passthrough_prefixes = ('http://', 'https://', 'file://', 'data:')

    instruction = input_dict.get('instruction') or default_instruction
    text = input_dict.get('text')
    image = input_dict.get('image')

    content = []

    if image:
        if isinstance(image, str) and not image.startswith(passthrough_prefixes):
            # Bare local path: make it absolute and mark it as a file URI.
            image = 'file://' + os.path.abspath(image)
        content.append({
            'type': 'image',
            'image': image,
        })

    if text:
        content.append({'type': 'text', 'text': text})

    if not content:
        # Keep the user turn non-empty so the chat template always has
        # something to render.
        content.append({'type': 'text', 'text': ""})

    conversation = [
        {"role": "system", "content": [{"type": "text", "text": instruction}]},
        {"role": "user", "content": content}
    ]

    return conversation

def _load_image_for_vllm(image: Any) -> Any:
    """Resolve an ``'image'`` entry to an image object, or ``None`` on failure."""
    if not isinstance(image, str):
        # Already an in-memory object: hand it over untouched.
        return image

    if image.startswith(('http://', 'https://')):
        try:
            return fetch_image(image)
        except Exception as e:
            # Best effort: report and continue without image data.
            print(f"Warning: Failed to fetch image {image}: {e}")
            return None

    # Accept 'file://<path>' as well as a bare path.
    local_path = image[len('file://'):] if image.startswith('file://') else image
    abs_image_path = os.path.abspath(local_path)
    if not os.path.exists(abs_image_path):
        print(f"Warning: Image file not found: {abs_image_path}")
        return None

    # Image.open is lazy: it keeps the file open and defers decoding. Load
    # the pixels now and close the handle so nothing leaks and a corrupt
    # file fails here instead of later.
    with Image.open(abs_image_path) as image_obj:
        image_obj.load()
    return image_obj


def prepare_vllm_inputs(
    input_dict: Dict[str, Any],
    llm,
) -> Dict[str, Any]:
    """Turn one input sample into a vLLM prompt dict.

    Args:
        input_dict: Sample with the optional keys ``'instruction'``,
            ``'text'`` and ``'image'`` (URL, local path, ``file://`` path, or
            an in-memory image object).
        llm: Object exposing ``llm.llm_engine.tokenizer.apply_chat_template``;
            used only to render the conversation into prompt text.

    Returns:
        ``{"prompt": <rendered chat text>, "multi_modal_data": <data>}`` where
        ``<data>`` is ``{"image": <image object>}``, or ``None`` when the
        sample has no image or the image could not be loaded.

    Note:
        The prompt is rendered from the full conversation, image entry
        included, even when loading the image fails and a warning is printed.
    """
    conversation = format_input_to_conversation(input_dict)

    prompt_text = llm.llm_engine.tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

    multi_modal_data = None
    image = input_dict.get('image')
    if image:
        image_obj = _load_image_for_vllm(image)
        if image_obj is not None:
            multi_modal_data = {"image": image_obj}

    result = {
        "prompt": prompt_text,
        "multi_modal_data": multi_modal_data
    }
    return result

  from .autonotebook import tqdm as notebook_tqdm


## 2. Initialize the Model and Convert Inputs

In [None]:
# Load the embedding model with vLLM's pooling runner.
llm = LLM(
    model="Qwen/Qwen3-VL-Embedding-2B",
    runner="pooling",
    dtype="bfloat16",
    trust_remote_code=True,
)

# Four samples: an instructed text query, a text caption, an image,
# and the same caption paired with the same image.
inputs = [
    dict(
        text="A woman playing with her dog on a beach at sunset.",
        instruction="Retrieve images or text relevant to the user's query.",
    ),
    dict(
        text="A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.",
    ),
    dict(
        image="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
    ),
    dict(
        text="A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.",
        image="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
    ),
]

print(f"Prepared {len(inputs)} input samples")

# Render each sample into the prompt dict that vLLM consumes.
vllm_inputs = []
for sample in inputs:
    vllm_inputs.append(prepare_vllm_inputs(sample, llm))

print("Input conversion completed!")
print("\nPreview of the first input prompt:")
first_prompt = vllm_inputs[0]["prompt"]
print(first_prompt[:200] + "...")

INFO 01-16 07:47:29 [utils.py:267] non-default args: {'runner': 'pooling', 'trust_remote_code': True, 'dtype': 'bfloat16', 'disable_log_stats': True, 'model': '/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 01-16 07:47:29 [model.py:859] Resolved `--convert auto` to `--convert embed`. Pass the value explicitly to silence this message.


INFO 01-16 07:47:29 [model.py:530] Resolved architecture: Qwen3VLForConditionalGeneration


INFO 01-16 07:47:29 [model.py:1547] Using max model len 262144


2026-01-16 07:47:30,017	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 01-16 07:47:30 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=16384.


INFO 01-16 07:47:30 [vllm.py:618] Asynchronous scheduling is enabled.


INFO 01-16 07:47:30 [vllm.py:625] Disabling NCCL for DP synchronization when using async scheduling.




The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:47:33 [core.py:96] Initializing a V1 LLM engine (v0.14.0rc2.dev90+gbcf2333cd) with config: model='/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B', speculative_config=None, tokenizer='/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=262144, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability

[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:47:33 [parallel_state.py:1212] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.78.15.105:38423 backend=nccl


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:47:33 [parallel_state.py:1423] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:47:43 [gpu_model_runner.py:3803] Starting to load model /cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B...


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:47:44 [mm_encoder_attention.py:86] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention.


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:47:44 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')


[0;36m(EngineCore_DP0 pid=82232)[0;0m 


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=82232)[0;0m 


Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.09it/s]


[0;36m(EngineCore_DP0 pid=82232)[0;0m 


Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.08it/s]


[0;36m(EngineCore_DP0 pid=82232)[0;0m 




[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:47:46 [default_loader.py:291] Loading weights took 1.06 seconds


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:47:46 [gpu_model_runner.py:3900] Model loading took 4.4 GiB memory and 2.499499 seconds


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:47:46 [gpu_model_runner.py:4711] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 1 video items of the maximum feature size.


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:48:06 [backends.py:644] Using cache directory: /cpfs01/user/linqi.lmx/.cache/vllm/torch_compile_cache/86f6b21f85/rank_0_0/backbone for vLLM's torch.compile


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:48:07 [backends.py:704] Dynamo bytecode transform time: 9.40 s


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:48:10 [backends.py:226] Directly load the compiled graph(s) for compile range (1, 16384) from the cache, took 0.889 s


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:48:10 [monitor.py:34] torch.compile takes 10.29 s in total


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:48:11 [gpu_worker.py:355] Available KV cache memory: 65.22 GiB


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:48:11 [kv_cache_utils.py:1307] GPU KV cache size: 610,560 tokens


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:48:11 [kv_cache_utils.py:1312] Maximum concurrency for 262,144 tokens per request: 2.33x


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

2026-01-16 07:48:11,729 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

2026-01-16 07:48:11,757 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends


[0;36m(EngineCore_DP0 pid=82232)[0;0m 


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|                                                                                                           | 0/51 [00:00<?, ?it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   4%|███▉                                                                                               | 2/51 [00:00<00:03, 13.39it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   8%|███████▊                                                                                           | 4/51 [00:00<00:04, 11.55it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  14%|█████████████▌                                                                                     | 7/51 [00:00<00:02, 16.33it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  22%|█████████████████████▏                                                                            | 11/51 [00:00<00:01, 21.82it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  29%|████████████████████████████▊                                                                     | 15/51 [00:00<00:01, 24.88it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  35%|██████████████████████████████████▌                                                               | 18/51 [00:00<00:01, 24.55it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  41%|████████████████████████████████████████▎                                                         | 21/51 [00:00<00:01, 25.34it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  49%|████████████████████████████████████████████████                                                  | 25/51 [00:01<00:00, 26.92it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  55%|█████████████████████████████████████████████████████▊                                            | 28/51 [00:01<00:00, 26.75it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  61%|███████████████████████████████████████████████████████████▌                                      | 31/51 [00:01<00:00, 24.39it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  67%|█████████████████████████████████████████████████████████████████▎                                | 34/51 [00:01<00:00, 24.80it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  73%|███████████████████████████████████████████████████████████████████████                           | 37/51 [00:01<00:00, 24.48it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  78%|████████████████████████████████████████████████████████████████████████████▊                     | 40/51 [00:01<00:00, 25.07it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  84%|██████████████████████████████████████████████████████████████████████████████████▋               | 43/51 [00:01<00:00, 25.66it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  90%|████████████████████████████████████████████████████████████████████████████████████████▍         | 46/51 [00:01<00:00, 26.76it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  98%|████████████████████████████████████████████████████████████████████████████████████████████████  | 50/51 [00:02<00:00, 28.94it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [00:02<00:00, 24.62it/s]




[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:48:14 [gpu_model_runner.py:4852] Graph capturing finished in 3 secs, took -0.76 GiB


[0;36m(EngineCore_DP0 pid=82232)[0;0m 

INFO 01-16 07:48:14 [core.py:272] init engine (profile, create kv cache, warmup model) took 27.63 seconds


INFO 01-16 07:48:21 [llm.py:347] Supported tasks: ['embed', 'token_embed']


Prepared 4 input samples


Input conversion completed!

Preview of the first input prompt:
<|im_start|>system
Retrieve images or text relevant to the user's query.<|im_end|>
<|im_start|>user
A woman playing with her dog on a beach at sunset.<|im_end|>
<|im_start|>assistant
...


## 3. Get Embeddings

In [3]:
# Run the model over every prepared request.
outputs = llm.embed(vllm_inputs)

# Collect the embedding vector of each request, in request order.
embeddings_list = [request_output.outputs.embedding for request_output in outputs]

# Stack the per-request vectors into a single array.
embeddings = np.array(embeddings_list)
embeddings_shape = embeddings.shape
print(f"Embeddings shape: {embeddings_shape}")
print(f"Embedding dimension per sample: {embeddings_shape[1]}")


Adding requests:   0%|                                                                                                                                                    | 0/4 [00:00<?, ?it/s]


Adding requests:  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                   | 3/4 [00:02<00:00,  1.14it/s]


Adding requests: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.50it/s]





Processed prompts:   0%|                                                                                              | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 53.65it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 53.33it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Embeddings shape: (4, 2048)
Embedding dimension per sample: 2048





## 4. Display Similarity Scores

In [4]:
# Pairwise dot products between all embeddings: entry (i, j) compares
# sample i with sample j.
# NOTE(review): this equals cosine similarity only if the embeddings are
# unit-normalized; the ~1.0 diagonal in the recorded output suggests so.
similarity_scores = np.matmul(embeddings, embeddings.T)

print("Similarity Score Matrix:")
print(similarity_scores)

Similarity Score Matrix:
[[1.00000007 0.78962161 0.71557983 0.72364763]
 [0.78962161 0.9999999  0.76502916 0.84728952]
 [0.71557983 0.76502916 0.99999998 0.82388284]
 [0.72364763 0.84728952 0.82388284 1.00000003]]
