실행 위치: Google Colab

# 1. vLLM 라이브러리 설치

In [None]:
# Download the uv installer script from astral.sh and pipe it straight into sh.
!curl -LsSf https://astral.sh/uv/install.sh | sh

In [None]:
# Install exact, pinned versions of vllm, torch and transformers through uv's
# pip interface (--system is passed, so no virtual environment is created here).
!uv pip install --system vllm==0.11.0 torch==2.8.0 transformers==4.57.6

# 2. 모델 로드
 - 모델 로드 중 오류가 발생하면 제보 부탁드립니다. (Colab 기본 라이브러리 버전과 충돌할 가능성이 있습니다.)

In [None]:
from vllm import LLM, SamplingParams

# Build the vLLM engine for the GPT-2 XL checkpoint.
llm = LLM(model='openai-community/gpt2-xl')

# Decoding settings shared by every prompt in the batch:
#   n=1                    -> one completion per prompt
#   temperature=0.0        -> greedy decoding (per the vLLM SamplingParams docs)
#   repetition_penalty=2.0 -> values above 1 penalise already-seen tokens
#   max_tokens=1000        -> upper bound on generated tokens per completion
sampling_params = SamplingParams(
    n=1,
    temperature=0.0,
    repetition_penalty=2.0,
    max_tokens=1000,
)

# 3. 테스트 시퀀스 준비

In [None]:
# Prompt batch for the generation test (11 English prompts).
inputs = [
    "Write a short poem about the beauty of a starry night",
    "Tell me a story about a brave adventurer exploring a mysterious forest",
    "Explain how a computer processes data in simple terms",
    "Describe a futuristic city where robots and humans coexist",
    "What are the benefits of daily meditation for mental health?",
    "Tell a funny story about a cat who thinks it's a detective.",
    "Create a dialogue between two friends planning a hiking trip.",
    "How would you convince someone to try a new hobby like painting?",
    "Write a scene where a pirate discovers a hidden treasure.",
    "What is the science behind rainbows in an easy explanation?",
    "Imagine a world where everyone can fly; describe a day in that world.",
]

# Sanity check: show how many prompts are in the batch.
print(len(inputs))

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from the same Hub repository id that the vLLM engine uses
# ('openai-community/gpt2-xl') so model and tokenizer cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained('openai-community/gpt2-xl')

# 4. vLLM 배치 추론 수행

In [None]:
import time
# Time only the batched generate call. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals, so the result is not
# distorted by wall-clock adjustments the way time.time() can be. Only the
# difference end_time - start_time is meaningful.
start_time = time.perf_counter()

outputs = llm.generate(inputs, sampling_params)

end_time = time.perf_counter()

## 결과 보기

In [None]:
# Dump the raw list of completions held by each request result.
for output in outputs:
    completions = output.outputs
    print(completions)

## 초당 토큰 수 계산

In [None]:
# Count generated tokens only (prompt tokens are excluded). Every completion
# attached to a request is summed, so the total stays correct when the sampling
# parameters ask for more than one completion per prompt (n > 1).
tokens_per_sequence = [
    sum(len(completion.token_ids) for completion in rqst_output.outputs)
    for rqst_output in outputs
]
total_tokens = sum(tokens_per_sequence)

# Elapsed time of the generate call, in seconds.
elapsed_time = end_time - start_time

print(f'{len(inputs)} 개 시퀀스의 총 응답 토큰 수: {total_tokens}')
print(f'응답 소요 시간: {elapsed_time}')
print(f'초당 생성 토큰 수: {total_tokens / elapsed_time}')
