In [1]:
# 라이브러리
from pathlib import Path
import json
from collections import Counter

import torch
import mlflow
from PIL import Image
from datasets import Dataset
from sklearn.model_selection import train_test_split

import transformers
from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    BitsAndBytesConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from transformers.integrations import MLflowCallback

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

print("transformers:", transformers.__version__)
print("torch:", torch.__version__, "cuda:", torch.cuda.is_available())


  from .autonotebook import tqdm as notebook_tqdm


transformers: 4.57.3
torch: 2.3.1+cu121 cuda: True


In [None]:
# 1) 데이터 로드 (synthetic_final.ndjson)->계층화샘플링
data_path = Path("../mixture_data/synthetic_final.ndjson")
rows = [json.loads(l) for l in data_path.open(encoding="utf-8") if l.strip()]

train_rows, eval_rows = train_test_split(
    rows,
    test_size=0.1,
    random_state=42,
    stratify=[r["diagnosis_name"] for r in rows],
)

train_dataset = Dataset.from_list(train_rows)
eval_dataset = Dataset.from_list(eval_rows)

print("train:", Counter(train_dataset["diagnosis_name"]))
print("eval :", Counter(eval_dataset["diagnosis_name"]))

train: Counter({'아토피': 90, '정상': 90, '지루': 90, '건선': 90, '여드름': 90, '주사': 90})
eval : Counter({'아토피': 10, '지루': 10, '여드름': 10, '정상': 10, '주사': 10, '건선': 10})


In [3]:
# from collections import Counter

# splits = full_ds.train_test_split(test_size=0.1, seed=42, stratify_by_column="diagnosis_name")
# train_dataset, eval_dataset = splits["train"], splits["test"]
# print("train:", Counter(train_dataset["diagnosis_name"]))
# print("eval:", Counter(eval_dataset["diagnosis_name"]))

In [3]:
# 2) 모델/프로세서 로드 (4bit) + 이미지 토큰 등록
base_model = "Qwen/Qwen3-VL-8B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

processor = AutoProcessor.from_pretrained(base_model, trust_remote_code=True)

model = AutoModelForVision2Seq.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.50s/it]


In [4]:
# 3) LoRA 준비
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 43,646,976 || all params: 8,810,770,672 || trainable%: 0.4954


In [5]:
# 4) 프롬프트/콜레이터
def build_messages(r):
    sys_prompt = "너는 얼굴 이미지를 참고하여 식단 JSON만 생성하는 모델이다. JSON 외 텍스트는 출력하지 마라."

    user_text = (
        f"진단: {r['diagnosis_name']}\n"
        f"키/몸무게/활동/목표: {r['height_cm']}cm, {r['weight_kg']}kg, {r['activity_level']}, {r['goal_type']}\n"
        f"1식 칼로리 목표: {r['calorie_plan']} kcal\n"
        f"식이 룰: {r['rules_text']}\n"
        "반드시 JSON만 출력하라."
    )

    assistant = json.dumps(r["diet_json"], ensure_ascii=False)

    return [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": user_text}]},
        {"role": "assistant", "content": assistant},
    ]


def collate_fn(batch):
    images = []
    messages = []

    for r in batch:
        img_path = (data_path.parent / r["image"].replace("\\", "/")).resolve()
        img = Image.open(img_path).convert("RGB")
        images.append(img)
        messages.append(build_messages(r))

    # 1) chat template -> "문자열"로 만든다 (여기가 핵심)
    texts = processor.apply_chat_template(
        messages,
        add_generation_prompt=False,
        tokenize=False,                # 변경: tokenize=False 로 문자열 생성
    )

    # 2) processor를 "text+images"로 1번만 호출 (여기가 핵심)
    inputs = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
    )

    labels = inputs["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100

    batch_out = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "pixel_values": inputs["pixel_values"],
        "labels": labels,
    }

    # image_grid_thw 있으면 반드시 포함
    if "image_grid_thw" in inputs:
        batch_out["image_grid_thw"] = inputs["image_grid_thw"]

    return batch_out

In [6]:
# collate_fn 출력과 tokenizer decode가 정상인지 확인
b = collate_fn([train_dataset[0], train_dataset[1]])
print("keys:", b.keys())
print("input_ids:", tuple(b["input_ids"].shape))
print("pixel_values:", tuple(b["pixel_values"].shape))
if "image_grid_thw" in b:
    print("image_grid_thw:", tuple(b["image_grid_thw"].shape))

decoded = processor.tokenizer.decode(b["input_ids"][0], skip_special_tokens=False)
print(decoded[:300])


keys: dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'labels', 'image_grid_thw'])
input_ids: (2, 1377)
pixel_values: (8192, 1536)
image_grid_thw: (2, 3)
<|im_start|>system
너는 얼굴 이미지를 참고하여 식단 JSON만 생성하는 모델이다. JSON 외 텍스트는 출력하지 마라.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|


In [7]:
# 5) Trainer + MLflow
mlflow.set_experiment("qwen3vl-qlora")

training_args = Seq2SeqTrainingArguments(
    output_dir="./qlora-qwen3vl",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    num_train_epochs=1,
    fp16=True,
    logging_steps=5,
    save_steps=50,
    save_total_limit=2,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    report_to="mlflow",
    remove_unused_columns=False,
    predict_with_generate=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
    callbacks=[MLflowCallback()],
)

with mlflow.start_run():
    mlflow.log_params({
        "lora_r": lora_config.r,
        "lora_alpha": lora_config.lora_alpha,
        "base_model": base_model,
    })
    trainer.train()

save_dir = "qlora-adapter"
trainer.save_model(save_dir)
mlflow.log_artifacts(save_dir)


  return FileStore(store_uri, store_uri)
You are adding a <class 'transformers.integrations.integration_utils.MLflowCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
MLflowCallback


2025/12/13 05:28:27 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id 0f4038d34a2c4f87bc0a50fce6aedeaf: Failed to log run data: Exception: Changing param values is not allowed. Param with key='fp16_backend' was already logged with value='' for run ID='0f4038d34a2c4f87bc0a50fce6aedeaf'. Attempted logging new value 'auto'.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
5,18.3134
10,13.3336
15,8.784
20,7.4124
25,6.7477
30,6.4241
35,6.298
40,6.1718
45,6.0833
50,6.0615




In [None]:
# 7) 추론 어댑터 로드
from peft import PeftModel

base_model = AutoModelForVision2Seq.from_pretrained(
    "Qwen/Qwen3-VL-8B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

model = PeftModel.from_pretrained(base_model, "qlora-adapter")
model.eval()
