In [2]:
import os

from vllm import LLM
from vllm.config import PoolerConfig
from vllm.entrypoints.chat_utils import (
    apply_hf_chat_template,
    parse_chat_messages,
    resolve_chat_template_content_format
)

from config import settings




In [1]:
import vllm 
print(vllm.__version__)

0.7.3


In [26]:
model_dir = os.path.join(
    settings.model_weight_dir, "embedding/gme-Qwen2-VL-2B-Instruct"
)
pooler_config = PoolerConfig(
    pooling_type="LAST",
    normalize=True,
    softmax=False
)

llm = LLM(
    model=model_dir,
    trust_remote_code=False,
    device="cpu",
    max_model_len=32000,
    task="embed",
    dtype="half",
    override_pooler_config=pooler_config,
    enforce_eager=True
)

INFO 02-26 00:13:21 config.py:2444] Downcasting torch.float32 to torch.float16.
INFO 02-26 00:13:21 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/Volumes/T7_YR/models/embedding/gme-Qwen2-VL-2B-Instruct', speculative_config=None, tokenizer='/Volumes/T7_YR/models/embedding/gme-Qwen2-VL-2B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32000, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cpu, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/Volumes/T7_YR/models/embedding/gme-Qwen2-VL-2B-Instruct

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
llm.llm_engine.model_config.task

'embed'

In [28]:
# Prepare chat formatter
tokenizer = llm.get_tokenizer()
model_config = llm.llm_engine.get_model_config()
resolved_content_format = resolve_chat_template_content_format(
    None, "auto", tokenizer
)
print(resolved_content_format)

INFO 02-26 00:13:37 chat_utils.py:332] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this.
openai


In [29]:
def prepare_inputs(messages):
    conversation, mm_data = parse_chat_messages(
        messages,
        model_config,
        tokenizer,
        content_format=resolved_content_format
    )
    prompt_data = apply_hf_chat_template(
        tokenizer,
        conversation=conversation,
        chat_template=None,
        add_generation_prompt=False,
        continue_final_message=False,
        tools=None
    )
    return prompt_data, mm_data

def embed(messages):
    prompt_data, mm_data = prepare_inputs(messages)
    outputs = llm.embed(
        {
            "prompt": prompt_data,
            "multi_modal_data": mm_data
        }
    )
    print(prompt_data,)
    print(outputs[0].outputs.embedding)
    embedding = outputs[0].outputs.embedding
    return embedding

In [30]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What is this car?"
            }
        ]
    }
]

embedding = embed(messages)
print(len(embedding))

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.26it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What is this car?<|im_end|>

[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 




In [31]:
embedding[:5]

[nan, nan, nan, nan, nan]