In [2]:
!pip install fastapi==0.121.2 pydantic==2.12.3 mlflow==2.12.1 \
  torch==2.6.0+cu124 torchvision==0.21.0+cu124 \
  pillow==11.3.0 numpy==1.26.4 requests==2.32.5 uvicorn==0.38.0 \
  python-dotenv==1.0.1 google-auth==2.29.0 \
  transformers peft bitsandbytes datasets scikit-learn openai matplotlib ipykernel \
  --extra-index-url https://download.pytorch.org/whl/cu124


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu124
Collecting fastapi==0.121.2
  Using cached fastapi-0.121.2-py3-none-any.whl.metadata (28 kB)
Collecting pydantic==2.12.3
  Using cached pydantic-2.12.3-py3-none-any.whl.metadata (87 kB)
Collecting mlflow==2.12.1
  Using cached mlflow-2.12.1-py3-none-any.whl.metadata (29 kB)
Collecting torch==2.6.0+cu124
  Using cached https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp312-cp312-linux_x86_64.whl.metadata (28 kB)
Collecting torchvision==0.21.0+cu124
  Using cached https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp312-cp312-linux_x86_64.whl.metadata (6.1 kB)
Collecting pillow==11.3.0
  Using cached pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (9.0 kB)
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting requests==2.32.5
  Using cached requests-2.32.5-py3-n

In [None]:
import torch
from PIL import Image

from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    BitsAndBytesConfig,
)

from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
base_model = "Qwen/Qwen3-VL-8B-Instruct"
adapter_path = "./qlora-adapter"   # QLoRA 결과

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# processor
processor = AutoProcessor.from_pretrained(
    base_model,
    trust_remote_code=True,
)

# base model
base = AutoModelForVision2Seq.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# LoRA adapter 결합
model = PeftModel.from_pretrained(base, adapter_path)
model.eval()


Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.53s/it]


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): Qwen3VLForConditionalGeneration(
      (model): Qwen3VLModel(
        (visual): Qwen3VLVisionModel(
          (patch_embed): Qwen3VLVisionPatchEmbed(
            (proj): Conv3d(3, 1152, kernel_size=(2, 16, 16), stride=(2, 16, 16))
          )
          (pos_embed): Embedding(2304, 1152)
          (rotary_pos_emb): Qwen3VLVisionRotaryEmbedding()
          (blocks): ModuleList(
            (0-26): 27 x Qwen3VLVisionBlock(
              (norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              (norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              (attn): Qwen3VLVisionAttention(
                (qkv): Linear4bit(in_features=1152, out_features=3456, bias=True)
                (proj): Linear4bit(in_features=1152, out_features=1152, bias=True)
              )
              (mlp): Qwen3VLVisionMLP(
                (linear_fc1): Linear4bit(in_features=1152, out_features=4304, bias=True)
  

### 테스트 입력 준비

In [5]:
image_path = "../mixture_data/아토피/H5_924_P7_L5.png"
image = Image.open(image_path).convert("RGB")

In [6]:
# 메시지 구성: chat template + image token 구조
messages = [
    {
        "role": "system",
        "content": "너는 얼굴 이미지를 참고하여 식단 JSON만 생성하는 모델이다."
    },
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {
                "type": "text",
                "text": (
                    "진단: 아토피\n"
                    "키/몸무게/활동/목표: 162cm, 69kg, LOW, GAIN\n"
                    "1식 칼로리 목표: 380 kcal\n"
                    "식이 룰: 오메가-3 우선, 가공식품 제한\n"
                    "반드시 JSON만 출력하라."
                )
            }
        ]
    }
]

In [7]:
# text + image를 동시에 processor에 전달 (핵심)
text = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,   # 문자열로 반환
)

# device 이동
inputs = processor(
    images=[image],
    text=text,
    return_tensors="pt",
)

inputs = {k: v.to(model.device) for k, v in inputs.items()}


In [8]:
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=False,
    )

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [None]:
generated_ids = outputs[0][inputs["input_ids"].shape[-1]:]
decoded = processor.tokenizer.decode(generated_ids, skip_special_tokens=True)

print(decoded)

<|im_start|>system
너는 얼굴 이미지를 참고하여 식단 JSON만 생성하는 모델이다.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|ima