### 오프라인 서빙

#### 1. 기본 배치 추론

In [None]:
import torch
from datasets import load_dataset

def make_prompt(ddl, question, query=''):
    """Build the text-to-SQL prompt for a single example.

    The prompt is the Korean instruction sentence followed by three labelled
    sections (DDL, Question, SQL). Every line after the first carries a
    four-space indent, and the sections are separated by empty lines.

    Args:
        ddl: Table definition text placed under the "### DDL :" heading.
        question: Natural-language question placed under "### Question :".
        query: Text placed under "### SQL :". Defaults to an empty string,
            which leaves the prompt ending right after the heading's indent.

    Returns:
        The assembled prompt string.
    """
    instruction = "당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question을 해결할 수 있는 SQL 쿼리를 생성하세요."
    prompt_lines = [
        instruction,
        "    ### DDL :",
        f"    {ddl}",
        "",
        "    ### Question :",
        f"    {question}",
        "",
        "    ### SQL :",
        f"    {query}",
    ]
    return "\n".join(prompt_lines)

# Load the 'test' split of the Korean text-to-SQL dataset and convert it to a
# pandas DataFrame.
dataset = load_dataset("shangrilar/ko_text2sql", "origin")['test']
dataset = dataset.to_pandas()

# Build all prompts in one pass and assign the column once. This replaces the
# per-row iterrows()/.loc writes and also creates the 'prompt' column when the
# split is empty.
dataset['prompt'] = [
    make_prompt(context, question)
    for context, question in zip(dataset['context'], dataset['question'])
]

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

model_id = "shangrilar/yi-ko-6b-text2sql"

# Pass the 4-bit settings through an explicit BitsAndBytesConfig; handing
# load_in_4bit / bnb_4bit_* directly to from_pretrained is the deprecated path.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The pipeline is later called with batch_size > 1, which requires a padding
# token; fall back to EOS only if the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

hf_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Measure inference time for each batch size
import time

# The prompt list is identical for every run, so build it once and keep the
# conversion out of the timed region.
prompts = dataset['prompt'].tolist()

for batch_size in [1, 2, 4, 8, 16, 32]:
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start_time = time.perf_counter()
    hf_pipeline(prompts, max_new_tokens=128, batch_size=batch_size)
    print(f"{batch_size}: {time.perf_counter() - start_time}")

#### 2. vLLM

In [None]:
from vllm import LLM, SamplingParams

model_id = "shangrilar/yi-ko-6b-text2sql"

# Create the vLLM engine for the same checkpoint, requesting float16 and a
# maximum model length of 1024.
llm = LLM(
    model=model_id,
    dtype=torch.float16,
    max_model_len=1024,
)

In [None]:
# Measure offline inference time with vLLM
import time

# The sampling settings and the prompt list are the same for every run, so
# create them once, outside the timed region.
sampling_params = SamplingParams(temperature=1, top_p=1, max_tokens=128)
prompts = dataset['prompt'].tolist()

for max_num_seqs in [1, 2, 4, 8, 16, 32]:
    # NOTE(review): this overwrites max_num_seqs on the live engine's scheduler
    # config; confirm the installed vLLM version honors a change made after
    # the engine has been constructed.
    llm.llm_engine.scheduler_config.max_num_seqs = max_num_seqs

    # Time only the generate() call, using a monotonic clock.
    start_time = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    print(f"{max_num_seqs}: {time.perf_counter() - start_time}")

### 온라인 서빙

In [None]:
# Launch the vLLM OpenAI-style API server for online serving.
# It serves shangrilar/yi-ko-6b-text2sql on 127.0.0.1:8888 and is started with
# --max-model-len 1024.
# NOTE(review): the command has no trailing `&`, so it looks like it runs in
# the foreground of this cell — confirm how the request cells below are meant
# to run while the server is up (e.g. from a separate session).
!python -m vllm.entrypoints.openai.api_server \
--model shangrilar/yi-ko-6b-text2sql \
--host 127.0.0.1 \
--port 8888 \
--max-model-len 1024

In [None]:
# Check that the API server is running by requesting /v1/models on port 8888
!curl http://localhost:8888/v1/models

In [None]:
# API 요청
import json

json_data = json.dumps(
    {
        "model": "shangrilar/yi-ko-6b-text2sql",
        "prompt": dataset.loc[0, 'prompt'],
        "max_tokens": 128,
        "temperature": 1
    }
)

!curl http://localhost:8888/v1/completions \
    -H "Content-Type: application/json" \
    -d '{json_data}'

In [None]:
# Send the same completion request through the OpenAI Python client
from openai import OpenAI

# A non-empty placeholder is used as the key: with an empty string the client
# builds an "Authorization: Bearer " header, which some openai/httpx versions
# reject as an illegal header value before the request is sent.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8888/v1"
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base
)

# Request a completion for the first prompt of the dataset (up to 128 tokens)
# and print the generated text of the first choice.
completion = client.completions.create(
    model="shangrilar/yi-ko-6b-text2sql",
    prompt=dataset.loc[0, 'prompt'],
    max_tokens=128
)
print(completion.choices[0].text)