In [None]:
#=======================================================================
# 예제 : sLLM을 이용한 chat봇 예제 
#
#=======================================================================
import torch
import transformers
from peft import PeftModel
import time

from myutils import GPU_info, seed_everything
# Device handle returned by the project helper (presumably a torch device — confirm in myutils).
device = GPU_info()

# Seed setup.
SEED = 111
seed_everything(SEED)

#---------------------------------------------------------------------
# Parameters
lora_weights: str = '../data11/model/LLM/beomi/KoAlpaca-Polyglot-5.8B-LoRA-qa-moco/'
model_path: str = '../data11/model/LLM/beomi/KoAlpaca-Polyglot-5.8B/'

uselora_weight = True  # True when the LoRA adapter should be applied on top of the base model
load_8bit = True       # forwarded as `load_in_8bit` when loading the base model
#---------------------------------------------------------------------

start_time = time.time()

# Load the tokenizer.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)

# Load the base model.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=load_8bit,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Wrap the base model with the LoRA adapter weights when enabled.
if uselora_weight:
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        torch_dtype=torch.float16,
    )

# Cast to half precision only when the model was not loaded in 8-bit.
if not load_8bit:
    model.half()

# Switch to evaluation mode, then print the module structure.
model.eval()
print(model)

# `end_time` holds the elapsed wall-clock seconds for the whole loading step.
end_time = time.time() - start_time
print("time: {:.2f} ms\n".format(end_time * 1000))


In [None]:
import torch
import time
from transformers import GenerationConfig

# Text form of the tokenizer's EOS token; handle_query() removes it from the decoded output before printing.
eos_str = tokenizer.decode(tokenizer.eos_token_id)

def run_query_loop():
    """Repeatedly call ``handle_query()`` until the user stops the session.

    The loop ends quietly (returns ``None``) when either of these is raised
    from ``handle_query()``:

    * ``KeyboardInterrupt`` - the user interrupted the kernel / pressed Ctrl-C.
    * ``EOFError`` - ``input()`` hit end-of-input (e.g. Ctrl-D or a closed
      stdin), which would otherwise surface as an unhandled traceback.

    Any other exception propagates to the caller unchanged.
    """
    try:
        while True:
            handle_query()
    except (KeyboardInterrupt, EOFError):
        # Both signal "the user is done"; leave the loop without a traceback.
        return
        
def handle_query(max_new_tokens:int=256):
    """Read one question from stdin, generate a response and print it.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and ``eos_str``
    objects created by the earlier cells.

    Args:
        max_new_tokens: Upper bound on the number of tokens ``model.generate``
            may produce for this question.

    Returns:
        None. The elapsed time (in ms) and the decoded sequence returned by
        ``model.generate`` (with the EOS string removed) are printed.
        A blank question is skipped without calling the model.

    Raises:
        Whatever ``input()`` raises (e.g. ``KeyboardInterrupt``, ``EOFError``);
        these are not caught here so the calling loop can decide how to stop.
    """
    query = input("질문:")

    # Nothing to answer: skip the (expensive) generation call for blank input.
    if not query.strip():
        return

    start_time = time.time()

    # NOTE(review): the instruction text refers to a provided passage ([문단]),
    # but no passage is inserted into the prompt — confirm this matches the
    # format the LoRA adapter was trained on.
    prompt = f"### 지시: [질의]에 가장 적합한 [응답]을, 제시된 [문단]에서 찾아서 간단한 문장으로 답변해주세요.\n\n### [질의]: {query}\n\n### [응답]:"

    # Fall back to the EOS id when the tokenizer defines no padding token, so
    # generate() always receives a concrete pad_token_id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generation settings.
    generation_config = GenerationConfig(
            do_sample=True,    # required for temperature/top_p/top_k to take effect
            temperature=0.5,
            top_p=0.75,
            top_k=40,
            num_beams=1,
            bos_token_id=tokenizer.bos_token_id,  # start token
            eos_token_id=tokenizer.eos_token_id,  # end token
            pad_token_id=pad_token_id             # padding token
    )

    # Tokenize the prompt and move the tensors to the target device.
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = inputs["input_ids"]
    # Pass the mask explicitly instead of letting generate() infer it from the
    # pad token; .get() keeps this working if the tokenizer returns no mask.
    attention_mask = inputs.get("attention_mask")

    # Non-streaming generation; gradients are not needed for inference.
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=False,
            max_new_tokens=max_new_tokens,
        )

    # Decode the first (only) returned sequence.
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)

    end_time = time.time() - start_time  # elapsed seconds since the question was entered
    print("time: {:.2f} ms\n".format(end_time * 1000))
    print(output.replace(eos_str, ''))
    print()


In [None]:
# Start the interactive question loop; it returns when a KeyboardInterrupt is raised (e.g. the kernel is interrupted).
run_query_loop()