In [1]:
from vllm import LLM, SamplingParams

# Define input prompts & SamplingParams

In [2]:
sampling_params = SamplingParams(temperature=0.2, top_p=0.95)

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("model_16bit")

messages_list = [
    [
        {"from": "human", "value": "Next number of the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
    ],
    [
        {"from": "human", "value": "台灣最高的建築物是什麼？"},
    ],
]

prompts = [
    tokenizer.apply_chat_template(
        messages,
        tokenize = False,
        add_generation_prompt = True,
    ) for messages in messages_list
]
prompts

['<|im_start|>user\nNext number of the fibonnaci sequence: 1, 1, 2, 3, 5, 8,<|im_end|>\n<|im_start|>assistant\n',
 '<|im_start|>user\n台灣最高的建築物是什麼？<|im_end|>\n<|im_start|>assistant\n']

# Init vLLM engine

In [4]:
llm = LLM(model="model_16bit", gpu_memory_utilization=0.65, max_model_len=2000)

INFO 03-04 14:07:37 llm_engine.py:79] Initializing an LLM engine with config: model='model_16bit', tokenizer='model_16bit', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2000, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 03-04 14:08:05 llm_engine.py:337] # GPU blocks: 662, # CPU blocks: 2048
INFO 03-04 14:08:06 model_runner.py:676] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-04 14:08:06 model_runner.py:680] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed

# Generate

In [5]:
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

Processed prompts: 100%|██████████| 2/2 [00:00<00:00,  4.21it/s]

Prompt: '<|im_start|>user\nNext number of the fibonnaci sequence: 1, 1, 2, 3, 5, 8,<|im_end|>\n<|im_start|>assistant\n', Generated text: 'The next number in the Fibonacci sequence is 13.'
Prompt: '<|im_start|>user\n台灣最高的建築物是什麼？<|im_end|>\n<|im_start|>assistant\n', Generated text: '台灣最高的建築物是台北101'





In [6]:
outputs

[RequestOutput(request_id=0, prompt='<|im_start|>user\nNext number of the fibonnaci sequence: 1, 1, 2, 3, 5, 8,<|im_end|>\n<|im_start|>assistant\n', prompt_token_ids=[1, 523, 28766, 321, 28730, 2521, 28766, 28767, 1838, 13, 6693, 1474, 302, 272, 16182, 266, 28711, 13114, 7768, 28747, 28705, 28740, 28725, 28705, 28740, 28725, 28705, 28750, 28725, 28705, 28770, 28725, 28705, 28782, 28725, 28705, 28783, 28725, 2, 28705, 13, 28789, 28766, 321, 28730, 2521, 28766, 28767, 489, 11143, 13], prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='The next number in the Fibonacci sequence is 13.', token_ids=[1014, 1679, 1474, 297, 272, 401, 593, 266, 28127, 7768, 349, 28705, 28740, 28770, 28723, 2], cumulative_logprob=0.0, logprobs=None, finish_reason=length)], finished=True, metrics=RequestMetrics(arrival_time=347032.245817534, last_token_time=347032.245817534, first_scheduled_time=1709532497.8421493, first_token_time=1709532498.0150914, time_in_queue=1709185465.5963318, finished_time=17

# OpenAI Completions API

In [8]:
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:7788/v1"
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
completion = client.completions.create(model="model_16bit",
                                      prompt="台灣最高的建築物是",
                                      temperature=0.1) # max_tokens 和 temperature 參數在此設定
                                      
print("Completion result:", completion)

Completion result: Completion(id='cmpl-d23138fc33de4d9a8632e6203e28e8b9', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text='台北101，高度達到508公尺，')], created=347455, model='model_16bit', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=16, prompt_tokens=13, total_tokens=29))


# OpenAI Chat API

In [2]:
from openai import OpenAI
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:7788/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

chat_response = client.chat.completions.create(
    model="model_16bit",
    messages=[
        {"from": "system", "value": "You are a helpful assistant."},
        {"from": "human", "value": "Tell me a joke."},
    ]
)
print("Chat response:", chat_response)

Chat response: ChatCompletion(id='cmpl-9e636679178c4d0f981611b05864ab6f', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Q: There's a theory which states that if ever anyone discovers exactly what the Universe is for and why it is here, it will instantly disappear and be replaced by something even more bizarre and inexplicable. There's another theory which states that this has already happened.", role='assistant', function_call=None, tool_calls=None))], created=347176, model='model_16bit', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=60, prompt_tokens=46, total_tokens=106))
