In [None]:
from typing import Tuple, Optional, List

import os
import warnings
import threading

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

In [None]:
def load_model_and_tokenizer(
    model_path: str,
    device: str = "cuda"
) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """
    Load a causal language model and its tokenizer.

    Args:
        model_path: Identifier or local path of the pretrained model.
        device: Value forwarded as ``device_map`` to ``from_pretrained``,
            e.g. 'cuda', 'cuda:0' or 'cpu'.

    Returns:
        A ``(model, tokenizer)`` pair: the model loaded with float16 weights
        and switched to evaluation mode, and the tokenizer loaded from the
        same ``model_path``.
    """
    # NOTE(review): float16 is requested regardless of `device`. Half
    # precision on CPU may be slow or unsupported depending on the installed
    # PyTorch version -- confirm before relying on the CPU path.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map=device
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.eval()  # Set model to inference (evaluation) mode
    return model, tokenizer


def _run_generation(model, generation_kwargs: dict, errors: list) -> None:
    """Run ``model.generate`` and close the streamer if generation fails.

    Any exception is appended to ``errors`` instead of propagating, because an
    exception raised in a worker thread would otherwise never reach the caller.
    """
    try:
        model.generate(**generation_kwargs)
    except Exception as exc:
        errors.append(exc)
        # Without the end-of-stream signal the consumer iterating over the
        # streamer would block forever waiting for the next chunk.
        generation_kwargs["streamer"].end()


def test_text_model_streaming(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompts: List[str]
) -> None:
    """
    Run a series of prompts through the model and print each response as it streams.

    Each prompt is wrapped as a single user message, rendered with the
    tokenizer's chat template, and generated in a background thread while the
    calling thread prints the decoded chunks as they arrive.

    Args:
        model: The language model for text generation.
        tokenizer: Tokenizer matching the model.
        prompts: List of user prompt strings.

    Raises:
        Exception: Whatever ``model.generate`` raised in the background
            thread is re-raised here after the partial output is printed.
    """
    print("\n=== Text Model Streaming Test ===\n")

    for idx, prompt in enumerate(prompts, start=1):
        print(f"Test #{idx}: {prompt}")
        print("Response: ", end="", flush=True)

        # Prepare chat-style input
        messages = [{"role": "user", "content": prompt}]
        formatted_input = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        # The chat template has already inserted the special tokens it needs;
        # letting the tokenizer add its own again can duplicate them (e.g. BOS).
        inputs = tokenizer(
            formatted_input,
            return_tensors="pt",
            add_special_tokens=False
        ).to(model.device)

        # Set up the streamer
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            skip_special_tokens=True
        )

        # NOTE(review): `do_sample` is not set here, so whether `temperature`
        # has any effect depends on the model's own generation config -- confirm.
        generation_kwargs = dict(
            **inputs,
            max_new_tokens=256,
            temperature=0.2,
            repetition_penalty=1.1,
            streamer=streamer
        )

        # Kick off generation in a background thread; failures are collected
        # in `errors` so they can be surfaced on this thread afterwards.
        errors: List[Exception] = []
        thread = threading.Thread(
            target=_run_generation,
            args=(model, generation_kwargs, errors)
        )
        thread.start()

        # Print tokens as they arrive
        for chunk in streamer:
            print(chunk, end="", flush=True)

        thread.join()
        print("\n" + "-" * 80)

        if errors:
            raise errors[0]

In [None]:
# model_path = 'output/fft_20250722_020908'
model_path = 'Qwen/Qwen3-0.6B'
device = 'cuda:0'

# Check the model path. A missing local path is only a warning, not an error:
# the value is then treated as a model identifier to look up remotely.
if not os.path.exists(model_path):
    warnings.warn(f"Warning: 로컬 모델 경로 '{model_path}'가 존재하지 않습니다. HuggingFace를 탐색합니다.")

# Device selection: fall back to CPU when CUDA is requested but unavailable.
# Use a prefix match so indexed devices such as 'cuda:0' are covered too.
if device.startswith("cuda") and not torch.cuda.is_available():
    print("CUDA를 사용할 수 없습니다. CPU로 전환합니다.")
    device = "cpu"

print(f"모델 로딩 중: {model_path} (디바이스: {device})")

# Load the model and tokenizer
model, tokenizer = load_model_and_tokenizer(model_path, device)
print("✅ 모델 로딩 완료!")
    

In [None]:
# Korean prompts covering small talk, self-introduction, a programming
# question, an ML concept question and a recommendation request.
# Each prompt appears once; the original list repeated the first one verbatim.
test_cases = [
    "안녕하세요! 오늘 날씨가 어떤가요?",
    "안녕하세요! 자기소개를 해주세요.",
    "파이썬에서 리스트와 튜플의 차이점은 무엇인가요?",
    "머신러닝과 딥러닝의 차이를 설명해주세요.",
    "한국의 전통 음식 3가지를 추천해주세요."
]

test_text_model_streaming(model, tokenizer, test_cases)