# 使用 vLLM 提取嵌入

我们可以在 GitHub 上获取 vllm 代码：https://github.com/vllm-project/vllm ；官方网站：https://vllm.ai/

从大语言模型中提取嵌入时，vLLM 库相比传统的 HuggingFace 方法具有几个显著优势。以下是详细对比：

1. **显著更快的推理速度**

   vLLM 针对高吞吐量推理进行了优化。它利用先进的调度、连续批处理和更高效的内存管理来最大化 GPU 利用率，从而实现更快的嵌入提取，特别是在处理大批量或大量文本时。

2. **更好的 GPU 内存效率**

   vLLM 使用 PagedAttention 和内存分页技术来最小化冗余内存分配，使其能够处理更多请求和更大的批次，而不会像 HuggingFace 的默认管道那样快速遇到内存不足的问题。这使得更大的模型（如 10B+ 参数）能够在可用硬件上更流畅地运行。

3. **卓越的多 GPU 支持**

   通过内置的张量并行支持和高效的多 GPU 调度，vLLM 可以自动在多个 GPU 之间分割模型和工作负载，最大限度地减少瓶颈和手动配置。HuggingFace Accelerate 需要更多手动设置，并且在高吞吐量嵌入提取方面通常扩展性较差。

4. **长上下文的更高吞吐量**

   vLLM 专门设计用于高效处理长输入序列，使其在从长文档或段落中提取嵌入时更加适合，而 HuggingFace 方法在这些场景中可能会受到内存限制或变得缓慢。

5. **易于部署**

   vLLM 提供开箱即用的 API 和 HTTP 服务器支持，用于生产推理，允许快速大规模部署。将 HuggingFace 模型集成到生产服务管道中通常需要额外的工程工作才能实现类似的吞吐量和稳定性。


In [None]:
import random
import numpy as np
import torch
import os


def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA),
    exports ``PYTHONHASHSEED``, and puts cuDNN into deterministic mode
    with benchmarking switched off.

    Args:
        seed: Value handed to each generator's seeding function.
    """
    # Python-level and NumPy generators.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)

    # PyTorch generators, in the same order as before: CPU first, then CUDA.
    for seed_fn in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # Favor reproducible cuDNN kernels over auto-tuned ones.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Seed all generators with a fixed value before the rest of the notebook runs.
set_seed(42)


In [None]:
import torch
import subprocess
import vllm

def get_gpu_info():
    """Print name, total memory and free memory for each GPU listed by ``nvidia-smi``.

    This is a best-effort report: if ``nvidia-smi`` cannot be started, exits
    with an error, or produces a line that cannot be parsed, a failure
    message is printed and no exception propagates.

    Returns:
        None. Everything is written to stdout.
    """
    # Pass an argument list and skip the shell: nothing here needs shell parsing.
    cmd = [
        "nvidia-smi",
        "--query-gpu=name,memory.total,memory.free",
        "--format=csv,noheader",
    ]
    try:
        raw = subprocess.check_output(cmd, encoding='utf-8')
        gpus = []
        for line in raw.splitlines():
            if not line.strip():
                continue
            # Split from the right so a comma inside the GPU name cannot
            # break the three-way unpacking.
            name, total_mem, free_mem = [s.strip() for s in line.rsplit(',', 2)]
            gpus.append((name, total_mem, free_mem))
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: binary missing / not executable; SubprocessError: non-zero
        # exit; ValueError: malformed row or undecodable output.
        print("获取 GPU 信息失败：", e)
        return

    # Parse first, print afterwards, so a failure never leaves a partial report.
    print("GPU 信息：")
    for idx, (name, total_mem, free_mem) in enumerate(gpus):
        print(f"  GPU {idx}: {name}, 总内存: {total_mem}, 空闲内存: {free_mem}")
    print(f"检测到 {len(gpus)} 个 GPU")

def print_env_info():
    """Print the PyTorch and vLLM versions, plus CUDA/cuDNN versions when CUDA is available."""
    print(f"PyTorch 版本: {torch.__version__}")
    print(f"vllm 版本: {vllm.__version__}")

    has_cuda = torch.cuda.is_available()
    print(f"CUDA 可用: {has_cuda}")
    if not has_cuda:
        # Nothing more to report without a usable CUDA runtime.
        return

    print(f"CUDA 版本: {torch.version.cuda}")
    print(f"cuDNN 版本: {torch.backends.cudnn.version()}")
# Report the software versions first, then the per-GPU details, separated by a blank line.
print('测试环境：')
print_env_info()
print()
get_gpu_info()


测试环境：
PyTorch 版本: 2.7.1+cu128
vllm 版本: 0.10.1.dev1+gbcc0a3cbe
CUDA 可用: True
CUDA 版本: 12.8
cuDNN 版本: 90701

GPU 信息：
  GPU 0: NVIDIA A40, 总内存: 46068 MiB, 空闲内存: 45403 MiB
  GPU 1: NVIDIA A40, 总内存: 46068 MiB, 空闲内存: 45403 MiB
检测到 2 个 GPU


# 传统嵌入提取方法


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import time

# Baseline: extract a mean-pooled embedding with HuggingFace Transformers.
# The model is dispatched with device_map="auto" only when at least two GPUs
# are visible; otherwise no device map is passed.

model_path = "/path/to/Genos-10B"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(
    model_path,
    device_map="auto" if torch.cuda.device_count() >= 2 else None
)

text = "ATCG"
inputs = tokenizer(text, return_tensors="pt")
# Follow the model's own device instead of forcing .cuda(): when no device map
# is used the model is not placed on a GPU, and CUDA inputs would then fail
# with a device mismatch (or .cuda() itself would fail without CUDA).
inputs = {k: v.to(model.device) for k, v in inputs.items()}

start_time = time.time()
with torch.no_grad():
    outputs = model(**inputs)
    last_hidden_state = outputs.last_hidden_state
    print(f"最后隐藏层维度: {last_hidden_state.shape}")

    # Mean pooling over the sequence dimension.
    if "attention_mask" in inputs:
        # Zero out masked positions, then divide by the number of kept tokens.
        mask = inputs["attention_mask"].unsqueeze(-1)  # [batch, seq, 1]
        masked_hidden = last_hidden_state * mask
        sum_hidden = masked_hidden.sum(dim=1)
        lengths = mask.sum(dim=1)  # [batch, 1]
        mean_pooled = sum_hidden / lengths
    else:
        # Without an attention mask, average over every token.
        mean_pooled = last_hidden_state.mean(dim=1)

    print(f"平均池化嵌入维度: {mean_pooled.shape}")
    print(f"平均池化嵌入: {mean_pooled.cpu().numpy()}")

# The timed region covers the forward pass, the pooling and the prints above.
end_time = time.time()
elapsed = end_time - start_time
print(f"嵌入提取时间: {elapsed:.4f} 秒")


Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

最后隐藏层维度: torch.Size([1, 4, 4096])
平均池化嵌入维度: torch.Size([1, 4096])
平均池化嵌入: [[-0.7643822  -0.12757319  0.5734206  ... -0.12371235 -0.00170616
  -0.33227795]]
嵌入提取时间: 0.7419 秒


# vLLM 嵌入提取


In [None]:
from vllm import LLM, SamplingParams
from vllm import TokensPrompt
import torch
from transformers import AutoTokenizer, AutoModel
import os
from vllm.config import PoolerConfig
from vllm.pooling_params import PoolingParams


# Build the vLLM engine used for embedding extraction.
model_path = "/path/to/Genos-10B"
seq_length = 128 * 1024  # 131072; used below as both the max model length and the batched-token budget
gpu_num = 2  # tensor-parallel degree

# NOTE(review): the engine log recorded for this run reports that only "last"
# pooling supports chunked prefill and prefix caching and that both were
# disabled, so enable_prefix_caching=True had no effect with MEAN pooling here.
llm = LLM(
    model=model_path,
    trust_remote_code=True,
    tensor_parallel_size=gpu_num,
    block_size=32,
    enable_prefix_caching=True,
    enforce_eager=True,
    gpu_memory_utilization=0.85,  # raise the GPU memory utilization
    dtype=torch.bfloat16,
    max_model_len=seq_length,
    max_num_batched_tokens=seq_length,
    # Pooling settings: mean pooling without normalization. Per the original
    # author's note, omitting this override yields the full hidden states instead.
    override_pooler_config=PoolerConfig(pooling_type="MEAN", normalize=False),
    # NOTE(review): presumably 'reward' is chosen so the pooled hidden states
    # are returned as-is — confirm against the vLLM pooling-task docs.
    task='reward',
    enable_chunked_prefill=False
)


INFO 12-17 01:29:32 [__init__.py:235] Automatically detected platform cuda.
INFO 12-17 01:29:44 [config.py:3440] Downcasting torch.float32 to torch.bfloat16.
INFO 12-17 01:29:44 [config.py:1604] Using max model len 131072
INFO 12-17 01:29:48 [config.py:4628] Only "last" pooling supports chunked prefill and prefix caching; disabling both.
INFO 12-17 01:29:48 [core.py:572] Waiting for init message from front-end.
INFO 12-17 01:29:48 [core.py:71] Initializing a V1 LLM engine (v0.10.1.dev1+gbcc0a3cbe) with config: model='/path/to/Genos-10B', speculative_config=None, tokenizer='/path/to/Genos-10B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decodi

Loading safetensors checkpoint shards:   0% Completed | 0/9 [00:00<?, ?it/s]


[1;36m(VllmWorker rank=1 pid=6387)[0;0m INFO 12-17 01:30:07 [default_loader.py:262] Loading weights took 13.83 seconds
[1;36m(VllmWorker rank=0 pid=6386)[0;0m INFO 12-17 01:30:07 [default_loader.py:262] Loading weights took 13.87 seconds
[1;36m(VllmWorker rank=1 pid=6387)[0;0m INFO 12-17 01:30:08 [gpu_model_runner.py:1892] Model loading took 10.0644 GiB and 14.017120 seconds
[1;36m(VllmWorker rank=0 pid=6386)[0;0m INFO 12-17 01:30:08 [gpu_model_runner.py:1892] Model loading took 10.0644 GiB and 14.020643 seconds
[1;36m(VllmWorker rank=1 pid=6387)[0;0m INFO 12-17 01:30:22 [gpu_worker.py:255] Available KV cache memory: 22.02 GiB
[1;36m(VllmWorker rank=0 pid=6386)[0;0m INFO 12-17 01:30:22 [gpu_worker.py:255] Available KV cache memory: 22.02 GiB
INFO 12-17 01:30:23 [kv_cache_utils.py:833] GPU KV cache size: 481,120 tokens
INFO 12-17 01:30:23 [kv_cache_utils.py:837] Maximum concurrency for 131,072 tokens per request: 3.67x
INFO 12-17 01:30:23 [kv_cache_utils.py:833] GPU KV cache

In [None]:
import time
# Extract the pooled embedding for a short sequence through vLLM.
# Use the tokenizer held by the vLLM engine.
tokenizer = llm.get_tokenizer()
seqs = ['ATCG']

# Tokenize without special tokens, then wrap each ID list as a TokensPrompt:
# pre-tokenized inputs go through `prompts`, because the `prompt_token_ids=`
# keyword of LLM.encode is deprecated.
token_ids = tokenizer(seqs, add_special_tokens=False)["input_ids"]
prompts = [TokensPrompt(prompt_token_ids=ids) for ids in token_ids]

start_time = time.time()
outputs = llm.encode(prompts)
# One pooled tensor per input sequence, in input order.
pooleds = [output.outputs.data for output in outputs]
end_time = time.time()
elapsed = end_time - start_time
print(f"嵌入提取时间: {elapsed:.4f} 秒")
print(pooleds[0])


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

嵌入提取时间: 0.1067 秒


In [None]:
vllm_mean_pool = pooleds[0]
# Compare the vLLM embedding with the HuggingFace one: L1 distance,
# L2 distance and Pearson correlation coefficient.

# Bring both embeddings to CPU float32 and flatten them once, so the element-wise
# difference is well defined regardless of a leading batch dimension and the
# NumPy conversion below also works for reduced-precision tensors.
vllm_vec = vllm_mean_pool.cpu().float().reshape(-1)
hf_vec = mean_pooled.cpu().float().reshape(-1)
diff = vllm_vec - hf_vec

# torch.norm is deprecated; torch.linalg.vector_norm is its replacement.
l1_distance = torch.linalg.vector_norm(diff, ord=1).item()
l2_distance = torch.linalg.vector_norm(diff, ord=2).item()

# Flattened NumPy views for the correlation computation.
vllm_mean_flat = vllm_vec.numpy()
mean_pooled_flat = hf_vec.numpy()

# Pearson correlation is undefined when either vector is constant.
if vllm_mean_flat.std() == 0 or mean_pooled_flat.std() == 0:
    pearson_corr = float('nan')
else:
    pearson_corr = np.corrcoef(vllm_mean_flat, mean_pooled_flat)[0, 1]

print(f"vllm_mean_pool 和 mean_pooled 之间的 L1 距离: {l1_distance:.6f}")
print(f"vllm_mean_pool 和 mean_pooled 之间的 L2 距离: {l2_distance:.6f}")
print(f"vllm_mean_pool 和 mean_pooled 之间的 Pearson 相关系数: {pearson_corr:.6f}")


vllm_mean_pool 和 mean_pooled 之间的 L1 距离: 7.521327
vllm_mean_pool 和 mean_pooled 之间的 L2 距离: 0.195607
vllm_mean_pool 和 mean_pooled 之间的 Pearson 相关系数: 0.999995


使用 vLLM 进行嵌入提取相比传统方法实现了 **7 倍加速**，同时产生的嵌入与原始输出高度匹配，显著提高了处理效率。


# 传统方法无法提取超长序列

In [None]:
# Same HuggingFace mean-pooling path as before, now on a very long input.
text = "ATCG" * 32 * 1024  # 4 * 32 * 1024 = 128K characters
inputs = tokenizer(text, return_tensors="pt")
# Follow the model's own device instead of forcing .cuda(), so the inputs
# always land where the model expects them.
inputs = {k: v.to(model.device) for k, v in inputs.items()}

start_time = time.time()
with torch.no_grad():
    outputs = model(**inputs)
    last_hidden_state = outputs.last_hidden_state
    print(f"最后隐藏层形状: {last_hidden_state.shape}")

    # Mean pooling over the sequence dimension.
    if "attention_mask" in inputs:
        # Zero out masked positions, then divide by the number of kept tokens.
        mask = inputs["attention_mask"].unsqueeze(-1)  # [batch, seq, 1]
        masked_hidden = last_hidden_state * mask
        sum_hidden = masked_hidden.sum(dim=1)
        lengths = mask.sum(dim=1)  # [batch, 1]
        mean_pooled = sum_hidden / lengths
    else:
        # Without an attention mask, average over every token.
        mean_pooled = last_hidden_state.mean(dim=1)

    print(f"平均池化嵌入形状: {mean_pooled.shape}")
    print(f"平均池化嵌入: {mean_pooled.cpu().numpy()}")

# The timed region covers the forward pass, the pooling and the prints above.
end_time = time.time()
elapsed = end_time - start_time
print(f"嵌入提取时间: {elapsed:.4f} 秒")


Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 GiB. GPU 0 has a total capacity of 44.34 GiB of which 10.62 GiB is free. Process 4005586 has 33.71 GiB memory in use. Of the allocated memory 33.39 GiB is allocated by PyTorch, and 12.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# vLLM 提取超长序列

In [None]:
import time
# Extract the pooled embedding for a very long sequence through vLLM.
# Use the tokenizer held by the vLLM engine.
tokenizer = llm.get_tokenizer()
seqs = ['ATCG' * 32 * 1024]  # 4 * 32 * 1024 = 128K characters

# Tokenize without special tokens, then wrap each ID list as a TokensPrompt:
# pre-tokenized inputs go through `prompts`, because the `prompt_token_ids=`
# keyword of LLM.encode is deprecated.
token_ids = tokenizer(seqs, add_special_tokens=False)["input_ids"]
prompts = [TokensPrompt(prompt_token_ids=ids) for ids in token_ids]

start_time = time.time()
outputs = llm.encode(prompts)
# One pooled tensor per input sequence, in input order.
pooleds = [output.outputs.data for output in outputs]
end_time = time.time()
elapsed = end_time - start_time
print(f"嵌入提取时间: {elapsed:.4f} 秒")
print(pooleds[0])


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

嵌入提取时间:  21.7921 秒
tensor([-0.0255,  0.1597, -0.1451,  ..., -0.0175,  0.0131, -0.1934])


# 总结

基于上述实验，我们可以得出以下结论：

1. vLLM 相比传统的 HuggingFace 方法实现了显著更快的嵌入提取——大约快 7 倍。

2. vLLM 产生的嵌入与常规方法生成的嵌入高度一致，表明 vLLM 可以可靠地用于提高效率。

3. vLLM 能够在有限的计算资源内从更长的序列中提取嵌入，使其非常适合需要处理超长输入的下游研究任务。

4. 您可以参考上述演示来使用 vLLM 提取嵌入。
