실행 위치: Google Colab

# 1. 모델 로드

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "openai-community/gpt2-xl"

# The GPT-2 architecture ships with the transformers library, so the
# checkpoint is loaded without `trust_remote_code=True`; that flag would opt
# in to executing Python code supplied by the model repository.
model = AutoModelForCausalLM.from_pretrained(model_name).to('cuda')

tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2. KV Cache를 사용했을 때의 추론

In [None]:
# Named `prompt` rather than `input` so the built-in input() is not shadowed
# for the rest of the notebook session.
prompt = "Replace me by any text you'd like"

# Tokenize into PyTorch tensors and move them to the GPU.
use_cache_encoded_input = tokenizer(prompt, return_tensors='pt').to('cuda')

In [None]:
import time

import torch

# CUDA work is queued asynchronously, so wait for pending GPU work before
# reading the clock on either side of the call being measured.
# perf_counter() is a monotonic, high-resolution clock meant for intervals.
torch.cuda.synchronize()
start_time = time.perf_counter()
use_cache_outputs = model.generate(
    **use_cache_encoded_input,
    use_cache=True,  # reuse cached key/value tensors between decoding steps
    max_length=100,  # limit counts prompt tokens plus generated tokens
    repetition_penalty=1.2,
    num_beams=5,
    no_repeat_ngram_size=2,
)
torch.cuda.synchronize()
end_time = time.perf_counter()

In [None]:
# Show the tokenized prompt, a divider line, and then the first row of the
# generate() output, each on its own line.
print(use_cache_encoded_input, '---------', use_cache_outputs[0], sep='\n')

In [None]:
# The generated sequence starts with the prompt tokens; slice them off so
# only newly generated tokens are counted.
input_token_len = len(use_cache_encoded_input['input_ids'][0])
output_tokens = use_cache_outputs[0][input_token_len:]
elapsed_time = end_time - start_time

print(f'출력된 토큰 개수: {len(output_tokens)}')
print(f'KV Cache 사용시 소요 시간: {elapsed_time}')
# Guard the per-token average: if no new tokens were produced, the division
# would raise ZeroDivisionError.
if len(output_tokens) > 0:
    print(f'토큰당 소요시간: {elapsed_time / len(output_tokens)}')
else:
    print('토큰당 소요시간: N/A')

# 3. KV Cache를 사용하지 않았을 때의 추론

In [None]:
# Named `prompt` rather than `input` so the built-in input() is not shadowed
# for the rest of the notebook session.
prompt = "How are you today?"

# Tokenize into PyTorch tensors and move them to the GPU.
no_cache_encoded_input = tokenizer(prompt, return_tensors='pt').to('cuda')

In [None]:
import torch

# CUDA work is queued asynchronously, so wait for pending GPU work before
# reading the clock on either side of the call being measured.
# perf_counter() is a monotonic, high-resolution clock meant for intervals.
torch.cuda.synchronize()
start_time = time.perf_counter()
no_cache_outputs = model.generate(
    **no_cache_encoded_input,
    use_cache=False,  # recompute attention over the full sequence each step
    max_length=100,  # limit counts prompt tokens plus generated tokens
    repetition_penalty=1.2,
    num_beams=5,
    no_repeat_ngram_size=2,
)
torch.cuda.synchronize()
end_time = time.perf_counter()

In [None]:
# Show the tokenized prompt, a divider line, and then the first row of the
# generate() output, each on its own line.
print(no_cache_encoded_input, '---------', no_cache_outputs[0], sep='\n')

In [None]:
# The generated sequence starts with the prompt tokens; slice them off so
# only newly generated tokens are counted.
input_token_len = len(no_cache_encoded_input['input_ids'][0])
output_tokens = no_cache_outputs[0][input_token_len:]
elapsed_time = end_time - start_time

print(f'출력된 토큰 개수: {len(output_tokens)}')
print(f'No Cache 소요 시간: {elapsed_time}')
# Guard the per-token average: if no new tokens were produced, the division
# would raise ZeroDivisionError.
if len(output_tokens) > 0:
    print(f'토큰당 소요시간: {elapsed_time / len(output_tokens)}')
else:
    print('토큰당 소요시간: N/A')