<a href="https://colab.research.google.com/github/hypro2/hands-on-LLM-from-colab/blob/main/vllm_embedding_model_serving.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install vllm

Collecting vllm
  Downloading vllm-0.6.6.post1-cp38-abi3-manylinux1_x86_64.whl.metadata (11 kB)
Collecting blake3 (from vllm)
  Downloading blake3-1.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting fastapi!=0.113.*,!=0.114.0,>=0.107.0 (from vllm)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn[standard] (from vllm)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.0.2-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken>=0.6.0 (from vllm)
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting lm-format-enforcer<0.11,>=0.10.9 (from vllm)
  Downloading lm_format_enforcer-0.10.9-py3-none-any.whl.metadata (17 kB)
Collecting outlines==0.1.11 (from vllm)
  Downloading outlines-0.1.11-py3-none-any.whl.metadata (17 kB)
Collecting

In [8]:
from vllm import LLM

# Example inputs to embed.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Build the engine. Embedding models need task="embed".
model = LLM(model="BAAI/bge-base-en-v1.5", task="embed", enforce_eager=True)

# Embed every prompt in one call; one result object comes back per prompt.
outputs = model.embed(prompts)

# Show each prompt next to a preview of its vector: long vectors are cut to
# their first 16 values followed by ", ...]", short ones are shown in full.
for prompt, output in zip(prompts, outputs):
    embedding = output.outputs.embedding
    if len(embedding) > 16:
        preview = f"{str(embedding[:16])[:-1]}, ...]"
    else:
        preview = embedding
    print(f"Prompt: {prompt!r} | Embeddings: {preview} (size={len(embedding)})")

INFO 01-17 02:26:01 config.py:2272] Downcasting torch.float32 to torch.float16.
INFO 01-17 02:26:01 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='BAAI/bge-base-en-v1.5', speculative_config=None, tokenizer='BAAI/bge-base-en-v1.5', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=512, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=BAAI/bge-base-en-v1.5, num_scheduler_steps=1, multi_step_stream_outputs=True, enabl

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 01-17 02:26:03 model_runner.py:1099] Loading model weights took 0.2097 GB


Processed prompts: 100%|██████████| 4/4 [00:00<00:00, 110.25it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Prompt: 'Hello, my name is' | Embeddings: [-0.022430419921875, 0.0301666259765625, -0.0017852783203125, -0.040557861328125, 0.0133819580078125, 0.05303955078125, -0.0065155029296875, 0.0190887451171875, 0.003986358642578125, -0.0239715576171875, -0.0517578125, -0.01105499267578125, -0.00843048095703125, 0.023956298828125, -0.0004241466522216797, 0.03216552734375, ...] (size=768)
Prompt: 'The president of the United States is' | Embeddings: [-0.0009279251098632812, 0.00719451904296875, -0.004467010498046875, 0.01306915283203125, 0.0675048828125, -0.04296875, 0.025848388671875, 0.0079345703125, -0.0053863525390625, -0.054443359375, -0.04571533203125, -0.01541900634765625, -0.06646728515625, 0.04302978515625, 0.0146942138671875, 0.05609130859375, ...] (size=768)
Prompt: 'The capital of France is' | Embeddings: [-0.06182861328125, -0.047515869140625, 0.017822265625, -0.0157623291015625, -0.0394287109375, -0.00260162353515625, 0.01263427734375, 0.0560302734375, -0.03466796875, -0.0501708984




In [11]:
# Start `vllm serve` for BAAI/bge-base-en-v1.5 as a background job (`&`) under
# nohup, with its output redirected to nohup.out. The value given to
# `--api-key` is the key the client cell passes as `api_key`.
# NOTE(review): the command returns immediately; the server is presumably not
# ready to accept requests until model loading finishes - check nohup.out.
!nohup vllm serve BAAI/bge-base-en-v1.5 --dtype auto --api-key token-abc123 > nohup.out &

nohup: redirecting stderr to stdout


In [13]:
import time

from openai import APIConnectionError, OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
# The key must match the `--api-key` value passed to `vllm serve`.
openai_api_key = "token-abc123"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# The server was launched in the background and may still be starting up, so
# the first requests can be refused. Retry the model listing until the server
# answers, giving up (and re-raising the connection error) after the deadline.
startup_timeout_s = 300
retry_interval_s = 5
deadline = time.monotonic() + startup_timeout_s
while True:
    try:
        models = client.models.list()
        break
    except APIConnectionError:
        if time.monotonic() >= deadline:
            raise
        time.sleep(retry_interval_s)

# Use the first (only) model the server reports as the embedding model id.
model = models.data[0].id

# Embed the same prompts defined in the earlier cell through the HTTP API.
responses = client.embeddings.create(
    input=prompts,
    model=model,
)

for data in responses.data:
    # One list of floats per prompt; its length is the model's embedding size
    # (768 for BAAI/bge-base-en-v1.5, as reported by the offline cell's output).
    print(data.embedding)

[-0.022430419921875, 0.0301666259765625, -0.0017852783203125, -0.040557861328125, 0.0133819580078125, 0.05303955078125, -0.0065155029296875, 0.0190887451171875, 0.003986358642578125, -0.0239715576171875, -0.0517578125, -0.01105499267578125, -0.00843048095703125, 0.023956298828125, -0.0004241466522216797, 0.03216552734375, 0.052001953125, 0.0174102783203125, -0.016937255859375, 0.0081939697265625, -0.0345458984375, 0.040283203125, 0.07891845703125, 0.020782470703125, -0.0014333724975585938, 0.0246734619140625, -0.01861572265625, 0.01134490966796875, -0.10760498046875, -0.0003674030303955078, 0.0250701904296875, -0.0155792236328125, 0.0264129638671875, -0.0003123283386230469, 0.022247314453125, -0.04388427734375, 0.002376556396484375, -0.0181732177734375, -0.0232086181640625, -0.00855255126953125, 0.0020503997802734375, -0.04156494140625, -0.05731201171875, -0.028106689453125, -0.023529052734375, -0.04229736328125, -0.031829833984375, -0.010009765625, -0.033447265625, -0.0430908203125, -