In [1]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM
import torch

In [3]:
def vllm_completion(prompts, args, llm):
    """Generate one greedy (temperature 0) completion per prompt.

    Every prompt is wrapped in a chat template before generation:
    ``LLAMA3_CHAT`` when ``args.llm`` contains "llama" (case-insensitive),
    ``QWEN_CHAT`` otherwise. Both templates are module-level names that must
    already be defined and are formatted with a ``PROMPT`` keyword.

    Args:
        prompts: Iterable of raw prompt strings.
        args: Object exposing ``llm`` (model name string) and
            ``max_new_tokens`` (forwarded as ``max_tokens``).
        llm: Object whose ``generate(prompts=..., sampling_params=...)``
            returns results exposing ``outputs[0].text``.

    Returns:
        A list with one whitespace-stripped completion string per result.
        For llama models, everything from the first ``<|eot_id|>`` marker
        onwards is dropped.
    """
    config = SamplingParams(temperature=0, max_tokens=args.max_new_tokens)
    is_llama = "llama" in args.llm.lower()
    template = LLAMA3_CHAT if is_llama else QWEN_CHAT
    formatted = [template.format(PROMPT=p) for p in prompts]
    results = llm.generate(prompts=formatted, sampling_params=config)

    completions = []
    for result in results:
        text = result.outputs[0].text
        if is_llama:
            # Truncate at the end-of-turn marker *before* stripping, so that
            # whitespace sitting right in front of the marker is removed too.
            text = text.split("<|eot_id|>")[0]
        completions.append(text.strip())
    return completions

In [None]:
# Instantiate vLLM's LLM with the Llama 3 8B Instruct model id.
LLAMA3_MODEL_ID = 'meta-llama/Meta-Llama-3-8B-Instruct'
llm = LLM(model=LLAMA3_MODEL_ID)

In [None]:
"""
This example shows how to use LoRA with different quantization techniques
for offline inference.

Requires HuggingFace credentials for access.
"""

import gc
from typing import List, Optional, Tuple

import torch
from huggingface_hub import snapshot_download

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.lora.request import LoRARequest


# Configuration for a single QLoRA run: the base model, the quantization
# method, and the Hugging Face repo that holds the LoRA adapter.
# NOTE(review): going by its name, 'timdettmers/qlora-flan-7b' looks like an
# adapter for a 7B base model, while 'model' is Llama-3-8B — confirm the pair
# is actually compatible.
test_config = {
    "name": "qlora_inference_example",
    'model': "meta-llama/Meta-Llama-3-8B-Instruct",
    'quantization': "bitsandbytes",
    'lora_repo': 'timdettmers/qlora-flan-7b',
}

engine_kwargs = dict(
    model=test_config['model'],
    quantization=test_config['quantization'],
    enable_lora=True,
    max_lora_rank=64,
    # set it only in GPUs of limited memory
    enforce_eager=True,
)
if test_config['quantization'] == "bitsandbytes":
    # The adapter path and the bitsandbytes load format are only passed for
    # bitsandbytes (QLoRA); any other quantization value must not be
    # forwarded as a load format.
    engine_kwargs.update(
        qlora_adapter_name_or_path=test_config['lora_repo'],
        load_format="bitsandbytes",
    )

engine_args = EngineArgs(**engine_kwargs)
engine = LLMEngine.from_engine_args(engine_args)

# Download the adapter repo and keep the path returned by snapshot_download,
# for use when building a LoRARequest.
lora_path = snapshot_download(repo_id=test_config['lora_repo'])


In [None]:
# Work queue of (prompt, sampling parameters, lora_request) triples.
# The third slot is None here and is passed through as `lora_request`;
# to attach an adapter, use e.g. LoRARequest("lora-test-1", 1, lora_path).
test_prompts = [
    (
        'Say Hello to me',
        SamplingParams(
            temperature=0.0,
            logprobs=1,
            prompt_logprobs=1,
            max_tokens=128,
        ),
        None,
    ),
]

request_id = 0
# Submit at most one queued prompt per iteration, then advance the engine by
# one step; stop once the queue is empty and the engine reports no
# unfinished requests.
while True:
    if not test_prompts and not engine.has_unfinished_requests():
        break

    if test_prompts:
        prompt, sampling_params, lora_request = test_prompts.pop(0)
        engine.add_request(
            str(request_id),
            prompt,
            sampling_params,
            lora_request=lora_request,
        )
        request_id += 1

    request_outputs: List[RequestOutput] = engine.step()
    for request_output in request_outputs:
        # Only report requests the engine has marked as finished.
        if not request_output.finished:
            continue
        print("----------------------------------------------------")
        print(f"Prompt: {request_output.prompt}")
        print(f"Output: {request_output.outputs[0].text}")