In [1]:
from vllm import AsyncEngineArgs, AsyncLLMEngine

# Engine configuration: which model to load, the download directory (left as
# None), and the GPU memory utilization setting passed to vLLM.
engine_args = AsyncEngineArgs(
    model="Open-Orca/OpenOrcaxOpenChat-Preview2-13B",
    download_dir=None,
    gpu_memory_utilization=0.80,
)

# Build the async engine once; later cells use this module-level `engine`.
engine = AsyncLLMEngine.from_engine_args(engine_args)

INFO 10-06 17:25:37 llm_engine.py:72] Initializing an LLM engine with config: model='Open-Orca/OpenOrcaxOpenChat-Preview2-13B', tokenizer='Open-Orca/OpenOrcaxOpenChat-Preview2-13B', tokenizer_mode=auto, revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 10-06 17:26:06 llm_engine.py:205] # GPU blocks: 1081, # CPU blocks: 327


In [2]:
import json
from typing import AsyncGenerator

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid

async def generate(request: Request) -> Response:
    """Generate a completion for ``request`` using the module-level ``engine``.

    The request body is obtained with ``dict(request)``, so ``request`` must be
    convertible to a dict of fields (the notebook passes a pydantic model).
    Expected fields:

    - prompt: the prompt to use for the generation (required).
    - stream: whether to stream the results or not (optional; any falsy
      value, including ``None``, selects the non-streaming path).
    - other fields: forwarded as keyword arguments to ``SamplingParams``.

    Returns:
        ``StreamingResponse`` when ``stream`` is truthy. Each chunk is a
        UTF-8 encoded JSON object ``{"text": [...]}`` followed by a NUL
        character as delimiter.
        ``JSONResponse`` otherwise, carrying ``{"text": [...]}`` built from
        the last output the engine yields.
        In both cases every entry of ``text`` is the prompt followed by the
        generated text of one output.

    Raises:
        KeyError: if the request has no ``prompt`` field.
        AssertionError: if the engine yields nothing in the non-streaming case.
    """
    request_dict = dict(request)
    prompt = request_dict.pop("prompt")
    stream = request_dict.pop("stream", False)
    # Whatever is left after removing prompt/stream is treated as sampling options.
    sampling_params = SamplingParams(**request_dict)
    request_id = random_uuid()

    results_generator = engine.generate(prompt, sampling_params, request_id)

    def _to_payload(request_output) -> dict:
        # Shared by both paths: prefix each generated text with the prompt
        # reported by the engine for this output.
        output_prompt = request_output.prompt
        return {
            "text": [output_prompt + output.text for output in request_output.outputs]
        }

    # Streaming case
    async def stream_results() -> AsyncGenerator[bytes, None]:
        async for request_output in results_generator:
            # `json` must be imported at module level; the original cell used
            # json.dumps without importing it, which raised NameError here.
            yield (json.dumps(_to_payload(request_output)) + "\0").encode("utf-8")

    if stream:
        return StreamingResponse(stream_results())

    # Non-streaming case: drain the generator and keep only the last output.
    final_output = None
    async for request_output in results_generator:
        # if await request.is_disconnected():
        #     # Abort the request if the client disconnects.
        #     await engine.abort(request_id)
        #     return Response(status_code=499)
        final_output = request_output

    assert final_output is not None
    return JSONResponse(_to_payload(final_output))

In [3]:
from pydantic import BaseModel

class TestRequest(BaseModel):
    """Minimal request payload for calling ``generate`` directly from the notebook."""

    # Prompt text; ``generate`` pops this field and passes it to the engine.
    prompt: str
    # Streaming flag; defaults to None, which ``generate`` treats as
    # non-streaming because it only checks truthiness.
    stream: bool | None = None

In [4]:
# Sample non-streaming request used to exercise generate() in a later cell.
test_request = TestRequest(
    prompt="Finish learning how to",
    stream=False,
)

In [6]:
# Call the handler directly with the notebook's top-level await. Because
# test_request has stream=False, generate() takes its non-streaming path and
# `t` is the JSONResponse it returns.
t = await generate(test_request)

INFO 10-03 01:55:21 async_llm_engine.py:371] Received request 3b30d8a0889f4d76a431c93cd012af99: prompt: 'Finish learning how to', sampling params: SamplingParams(n=1, best_of=1, presence_penalty=0.0, frequency_penalty=0.0, temperature=1.0, top_p=1.0, top_k=-1, use_beam_search=False, length_penalty=1.0, early_stopping=False, stop=[], ignore_eos=False, max_tokens=16, logprobs=None, skip_special_tokens=True), prompt token ids: None.
INFO 10-03 01:55:21 llm_engine.py:616] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 1 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.1%, CPU KV cache usage: 0.0%
INFO 10-03 01:55:22 async_llm_engine.py:111] Finished request 3b30d8a0889f4d76a431c93cd012af99.
