#### Inference with Vllm Basics

In [1]:
from vllm import LLM, SamplingParams
import transformers
import torch , os
import timeit

#### Load Model 

In [2]:
# initialize model 
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
model_cache_dir = '/root/data/hf_cache/llama-3-8B-Instruct'
llm = LLM(model=model_cache_dir,dtype=torch.float16,tensor_parallel_size=1)
vllm_tokenizer = llm.get_tokenizer()
print(vllm_tokenizer.name_or_path)

INFO 06-16 19:13:07 llm_engine.py:161] Initializing an LLM engine (v0.5.0) with config: model='/root/data/hf_cache/llama-3-8B-Instruct', speculative_config=None, tokenizer='/root/data/hf_cache/llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=/root/data/hf_cache/llama-3-8B-Instruct)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-16 19:13:12 model_runner.py:159] Loading model weights took 14.9595 GB
INFO 06-16 19:13:13 gpu_executor.py:83] # GPU blocks: 27889, # CPU blocks: 2048
INFO 06-16 19:13:17 model_runner.py:878] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-16 19:13:17 model_runner.py:882] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-16 19:13:24 model_runner.py:954] Graph capturing finished in 7 secs.
/root/data/hf_cache/llama-3-8B-Instruct


#### Prepare prompt for corresponding LLM

In [3]:
def create_prompt(message,tokenizer,system_prompt=None,chat_history=[],fromat=True):
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})
    
    if fromat:
        conversation = tokenizer.apply_chat_template(conversation,tokenize=False) 
    
    return conversation
    

In [4]:
prompts = [
    "What is your name?",
    "The president of the United States is who ?",
    "What are the pros/cons of ChatGPT vs Open Source LLMs?",
    "Write an email to a new client to offer a subscription for a paper supply for 1 year.",
    "I have $10,000 USD for investment. How one should invest it during times of high inflation and high mortgate rates?",
    "Write a function in python that calculates the square of a sum of two numbers.",
]
system_prompt = "You name is Llama 3 bot. You are a friendly chatbot who always provide very very short answer."
formated_prompts = [create_prompt(text, vllm_tokenizer,system_prompt) for text in prompts]
print(formated_prompts[0])


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You name is Llama 3 bot. You are a friendly chatbot who always provide very very short answer.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is your name?<|eot_id|>


#### Run examples 

In [5]:
# perform the inference
sampling_params = SamplingParams( temperature=0.6,
                                    top_p=0.9,
                                    max_tokens=128,
                                    stop_token_ids=[vllm_tokenizer.eos_token_id, 
                                                    vllm_tokenizer.convert_tokens_to_ids("<|eot_id|>")],
                                    skip_special_tokens=True )
outputs = llm.generate(formated_prompts, sampling_params)

# print outputs
for output in outputs[:1]:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text!r}")
    print(f"performance metrics: {output.metrics}")

Processed prompts: 100%|██████████| 6/6 [00:01<00:00,  4.34it/s, Generation Speed: 140.51 toks/s]

Generated text: "<|start_header_id|>assistant<|end_header_id|>\n\nI'm Llama 3!"
performance metrics: RequestMetrics(arrival_time=1718565243.5628803, last_token_time=1718565243.5628803, first_scheduled_time=1718565243.5764158, first_token_time=1718565243.6157765, time_in_queue=0.013535499572753906, finished_time=1718565243.7696753)





#### Time it 

In [6]:
def vllm_generate_benchmark():
    outputs = llm.generate(formated_prompts, sampling_params,use_tqdm=False)

In [7]:
# Running the timeit 10 times
execution_time = timeit.repeat(stmt=vllm_generate_benchmark, repeat=10, number=1)
print("Time used: {:.2f}".format(sum(execution_time)))

Time used: 14.21


#### Compare it is transformer pipeline

In [8]:
## load model 
#os.environ["CUDA_VISIBLE_DEVICES"] = "1"
pipeline = transformers.pipeline(
    "text-generation",
    model=model_cache_dir,
    model_kwargs={"torch_dtype": torch.bfloat16}, ## load with bf16
    device_map=torch.device("cuda:2"), ## put it on gpu1 
)
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### Run one test

In [9]:
messages = [create_prompt(text, pipeline.tokenizer,system_prompt,fromat=False) for text in prompts]

outputs = pipeline(
    messages,
    max_new_tokens=128,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    pad_token_id=pipeline.tokenizer.eos_token_id)

print(outputs[0][0]["generated_text"][-1])

{'role': 'assistant', 'content': 'Llama 3!'}


In [11]:
def hf_generate_benchmark():
    outputs = pipeline(
            messages,
            max_new_tokens=128,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            pad_token_id=pipeline.tokenizer.eos_token_id)

In [12]:
# Running the timeit 10 times
execution_time = timeit.repeat(stmt=hf_generate_benchmark, 
                                repeat=10, number=1)
print("Time used: {:.2f}".format(sum(execution_time)))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Time used: 48.54
