In [1]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"]='0'

In [2]:
def count_gpu_memories(usage=""):
    num_gpus = torch.cuda.device_count()
    gpu_memory_info = []
    
    for i in range(num_gpus):
        allocated = torch.cuda.memory_allocated(i)
        gpu_memory_info.append(
            (i, allocated / 1024**2)  # MB 단위로 변환
        )
    
    print(f"{usage} GPU 메모리 사용량 (단위: MB):")
    for gpu_id, allocated_mb in gpu_memory_info:
        print(f"GPU {gpu_id} - Allocated: {allocated_mb:.4f} MB")

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

prompt = "안녕하세요! Qwen2.5 모델의 구조를 분석해보겠습니다."

# Qwen2.5 모델과 토크나이저 로드
model_name = "../Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', torch_dtype=torch.bfloat16)

count_gpu_memories("Model Load")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Model Load GPU 메모리 사용량 (단위: MB):
GPU 0 - Allocated: 13042.6069 MB


In [4]:
# 입력 텍스트 토큰화
device = 'cuda:0'
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
max_new_tokens = 64

# 텍스트 생성
import time
st = time.time()
with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        max_new_tokens=max_new_tokens,
        temperature=0.5,
        top_p=0.3,
        do_sample=True
    )
print(f"Generate sample used {time.time()-st: .2f} seconds")
# 생성된 텍스트 디코딩
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generate sample used  28.06 seconds


In [5]:
count_gpu_memories("Inference")
print("==========================================================")
print(f"{generated_text}")

Inference GPU 메모리 사용량 (단위: MB):
GPU 0 - Allocated: 13050.7339 MB
안녕하세요! Qwen2.5 모델의 구조를 분석해보겠습니다. Qwen2.5는 130억 개의 파라미터를 가지고 있으며, 이는 대규모 언어 모델입니다. 또한, Qwen2.5는 Transformer 아키텍처를 사용하여 설계되었습니다.

Transformer 아키텍처는 주로 인코
