# tutorials

* https://zhuanlan.zhihu.com/p/676635585

# supported models

* https://docs.vllm.ai/en/latest/models/supported_models.html

# use modelscope model

* export VLLM_USE_MODELSCOPE=True

```python
from vllm import LLM

# Fill in the two `...` placeholders before running.
llm = LLM(
    model=...,  # Name or path of your model
    revision=...,
    trust_remote_code=True,
)

# Generate a continuation for a single prompt and show the raw result.
output = llm.generate("Hello, my name is")
print(output)
```
* example:
```python
# Import both names used below so this example runs on its own.
from vllm import LLM, SamplingParams

llm = LLM(model="shakechen/Llama-2-7b-hf", revision="master", trust_remote_code=True)

# Two alternative prompt sets; only the one bound to `prompts` is sent to the model.
english_prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
chinese_prompts = ["帮我写一篇介绍董晓蕊的文章", "曹操为什么赤壁之战中会失败？"]
prompts = chinese_prompts

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(prompts, sampling_params)

# Print each prompt next to the text of its first generated output.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

# serving with openai and modelscope

## start serving
* VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.openai.api_server --model="qwen/Qwen-14B-Chat" --revision="v1.0.1" --trust-remote-code

## get models 

* curl http://localhost:8000/v1/models

## use openai api - chat example

In [None]:
from openai import OpenAI

from openai import ChatCompletion
# API key and base URL that make the OpenAI client talk to vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://59.68.29.90:17900/v1"

client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

# Use the id of the first model the server lists, rather than a fixed
# name such as "qwen/Qwen-14B-Chat".
models = client.models.list()
model = models.data[0].id
print('current model:', model)

# One system message followed by one user question.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who won the world series in 2020?"},
]
chat_response = client.chat.completions.create(model=model, messages=messages)
print("Chat response:", chat_response)

## completion example

In [3]:
# Look up the id of the first model the server lists.
models = client.models.list()
model = models.data[0].id

# Completion API: request n=2 completions of one prompt, without echoing it.
stream = False
completion = client.completions.create(
    model=model,
    prompt="A robot may not injure a human being",
    echo=False,
    n=2,
    stream=stream,
    logprobs=3,
)

print("Completion results:")
if not stream:
    # Non-streaming: the whole result is printed at once.
    print(completion)
else:
    # Streaming: the result is iterated and each item printed as it arrives.
    for chunk in completion:
        print(chunk)

Completion results:
Completion(id='cmpl-c3fca6584ef943f2b831df0b45e9aa45', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=Logprobs(text_offset=[0, 3, 4, 12, 15, 21, 22, 28, 30, 36, 42, 45, 50, 53, 58, 60], token_logprobs=[-0.14477035403251648, -0.013734362088143826, -0.0006811682251282036, -0.013870514929294586, -0.0010487301042303443, -0.001557805109769106, -0.014080584980547428, -0.000674616196192801, -0.0003554189461283386, -0.003102491609752178, -0.001405324088409543, -0.00017832119192462415, -0.0006668727728538215, -0.0008133916999213398, -0.5331632494926453, -0.5901650190353394], tokens=[' or', ',', ' through', ' in', 'action', ',', ' allow', ' a', ' human', ' being', ' to', ' come', ' to', ' harm', '.\n', 'A'], top_logprobs=[{' or': -0.14477035403251648, ',': -2.144770383834839, '.\n': -5.51977014541626}, {',': -0.013734362088143826, ' through': -4.826234340667725, ' allow': -6.388734340667725}, {' through': -0.0006811682251282036, ' Through': -8.12568092346