In [None]:
# 라이브러리

# !pip install -q "transformers>=4.42.0" "accelerate>=0.30.0" bitsandbytes peft datasets pillow mlflow
from pathlib import Path
import json
from datasets import Dataset
from PIL import Image
import torch
import mlflow
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig, TrainingArguments, Trainer
from transformers.integrations import MLflowCallback
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Sanity check: report the installed torch version and whether CUDA is usable.
print(f"torch {torch.__version__} cuda {torch.cuda.is_available()}")

In [None]:
# 1) Load the data (synthetic_final.ndjson)
data_path = Path("../mixture_data/synthetic_final.ndjson")

# Read inside a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector. Blank lines are skipped.
with data_path.open(encoding="utf-8") as ndjson_file:
    rows = [json.loads(line) for line in ndjson_file if line.strip()]

# Fail with a clear message rather than an IndexError on rows[0] below.
if not rows:
    raise ValueError(f"no records found in {data_path}")
print("rows", len(rows), rows[0].keys())

# Hold out 10% of the records for evaluation; the seed keeps the split reproducible.
full_ds = Dataset.from_list(rows)
splits = full_ds.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = splits["train"], splits["test"]
print("train:", len(train_dataset), "eval:", len(eval_dataset))


In [None]:
# 2) Load the model and processor (4-bit)
# NOTE(review): confirm this repo id exists on the Hugging Face Hub before running — TODO verify.
base_model = "Qwen/Qwen3-VL-7B-Instruct"  # adjust to the available VRAM

# 4-bit NF4 weights with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

processor = AutoProcessor.from_pretrained(base_model, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    base_model,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)
model.eval()

In [None]:
# 3) Prepare LoRA
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    # Attention and MLP projection layers that receive LoRA adapters.
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_dropout=0.05,
    bias="none",
    # The base checkpoint is a decoder-only generator, so PEFT must wrap it with
    # its causal-LM wrapper; "SEQ_2_SEQ_LM" selects the encoder-decoder wrapper,
    # which forwards decoder_* arguments this model family does not define.
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# 4) 프롬프트/콜레이터
def build_messages(r, img):
    """Build the three-turn chat (system, user, assistant) for one record.

    Args:
        r: Mapping for one dataset row. The keys read here are
            ``diagnosis_name``, ``height_cm``, ``weight_kg``,
            ``activity_level``, ``goal_type``, ``calorie_plan``,
            ``rules_text`` and ``diet_json``.
        img: Image object placed unchanged in the image part of the user turn.

    Returns:
        A list of three message dicts. The system and assistant turns carry
        plain-string content; the user turn carries an image part followed by
        a text part. The assistant content is ``r["diet_json"]`` serialized
        with ``json.dumps(..., ensure_ascii=False)``.

    Raises:
        KeyError: If ``r`` lacks one of the keys listed above.
    """
    system_text = "".join((
        "너는 얼굴 사진과 진단명을 받아 1) 진단을 보조/검증하고 2) 주어진 건강정보/칼로리/룰에 맞춰 ",
        "식단 JSON만 생성하는 VLM이다. JSON 외 텍스트는 답하지 말 것.",
    ))
    user_text = "".join((
        f"진단: {r['diagnosis_name']}\n",
        f"키/몸무게/활동/목표: {r['height_cm']}cm, {r['weight_kg']}kg, {r['activity_level']}, {r['goal_type']}\n",
        f"1식 칼로리 목표: {r['calorie_plan']} kcal\n",
        f"식이 룰: {r['rules_text']}\n",
        "얼굴 사진을 참고해 진단을 보조/검증하되, 최종 식단 JSON만 반환.",
    ))
    # ensure_ascii=False keeps non-ASCII characters readable in the target text.
    answer_text = json.dumps(r["diet_json"], ensure_ascii=False)

    system_turn = {"role": "system", "content": system_text}
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": user_text},
        ],
    }
    assistant_turn = {"role": "assistant", "content": answer_text}
    return [system_turn, user_turn, assistant_turn]

def collate_fn(batch):
    """Turn a list of dataset rows into one padded training batch.

    For every row the image referenced by ``r["image"]`` is loaded as RGB
    (relative paths are resolved against the directory of ``data_path``) and
    the chat built by ``build_messages`` is rendered to text. Text and images
    are then passed through ``processor`` in a single call so that the image
    placeholders in the text and the image tensors stay aligned.

    Args:
        batch: List of row mappings; each must contain ``image`` plus the keys
            that ``build_messages`` reads.

    Returns:
        A dict with every tensor produced by ``processor`` (``input_ids`` and
        ``attention_mask`` are read here; any image tensors the processor
        emits are passed through untouched) plus ``labels``: a copy of
        ``input_ids`` with padded positions set to -100. All non-padded
        tokens, prompt included, still contribute to the loss.
    """
    images, texts = [], []
    for r in batch:
        img_path = Path(r["image"])
        if not img_path.is_absolute():
            img_path = (data_path.parent / img_path).resolve()
        # convert() returns a fully loaded copy, so the underlying file can be
        # closed immediately instead of leaking one handle per sample.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        images.append(img)
        # Render the chat to a string only. The processor's chat template does
        # not tokenize by default, so tokenization is done explicitly below.
        texts.append(
            processor.apply_chat_template(
                build_messages(r, img),
                tokenize=False,
                add_generation_prompt=False,
            )
        )

    # One joint call tokenizes the text and preprocesses the images together.
    model_inputs = dict(
        processor(text=texts, images=images, padding=True, return_tensors="pt")
    )

    labels = model_inputs["input_ids"].clone()
    # Mask padding via the attention mask rather than by comparing token ids,
    # which also works when the tokenizer reports no pad_token_id.
    labels[model_inputs["attention_mask"] == 0] = -100
    model_inputs["labels"] = labels
    return model_inputs

In [None]:
# 5) Trainer + MLflow
mlflow.set_experiment("qwen3vl-qlora")

# Use bf16 autocast when the GPU supports it so mixed precision matches the
# bfloat16 compute dtype of the 4-bit quantization config; otherwise keep fp16.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./qlora-qwen3vl",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    num_train_epochs=1,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=5,
    # `evaluation_strategy` was renamed to `eval_strategy`; recent releases
    # reject the old keyword.
    eval_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    report_to="mlflow",
    # The collator reads raw columns (image path, diagnosis, ...) that are not
    # arguments of model.forward; the default would strip them before collation.
    remove_unused_columns=False,
)

# report_to="mlflow" already registers the MLflow callback, so no explicit
# MLflowCallback is passed here (a second instance would log everything twice).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
)

In [None]:
# 6) Train and log to MLflow
with mlflow.start_run():
    # Record the LoRA hyper-parameters and the base checkpoint on the run.
    _run_params = {
        "lora_r": lora_config.r,
        "lora_alpha": lora_config.lora_alpha,
        "base_model": base_model,
    }
    mlflow.log_params(_run_params)

    trainer.train()

    # Save the trained model to disk, then attach that directory to the run.
    save_dir = "qlora-adapter"
    trainer.save_model(save_dir)
    mlflow.log_artifacts(save_dir)

In [None]:
# 7) Load the adapter for inference
from peft import PeftModel

# Reload the quantized base checkpoint, then attach the adapter saved under
# "qlora-adapter" on top of it.
base_infer = AutoModelForVision2Seq.from_pretrained(
    base_model,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)
peft_infer = PeftModel.from_pretrained(base_infer, "qlora-adapter")
peft_infer.eval()