### 적용한 것
- 3 epochs
- image size 448
- LoRA 설정: r = 16, alpha = 32
- train 기준 question, 각 선지별로 task type 분류
- task type별로 prompt hint 추가
- task type 기준으로 train/valid 셋 분할

## 모듈 import

In [None]:
# Install the Hugging Face / PEFT / quantization stack plus wandb for experiment logging.
!pip install -q transformers accelerate bitsandbytes peft wandb
# Upgrade bitsandbytes (already installed by the line above) to the newest release.
!pip install -U bitsandbytes



In [None]:
# Mount Google Drive so the dataset and checkpoints under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os, re, math, random
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass
import torch
from typing import Any
from transformers import (
    AutoModelForVision2Seq,
    AutoProcessor,
    BitsAndBytesConfig,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import numpy as np

# NOTE(review): presumably disables Pillow's maximum-image-size safety check
# so very large images can be opened — confirm this is intended.
Image.MAX_IMAGE_PIXELS = None
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [None]:
# Dataset root, base model id and the image/generation/seed settings used by later cells.
BASE_DIR = "/content/drive/MyDrive/AI챌린지/2025-ssafy-14/"
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
IMAGE_SIZE = 448
MAX_NEW_TOKENS = 8
SEED = 42

# Seed Python's `random` and torch (CPU generator and every CUDA device).
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

## question task 분류

In [None]:
# Rule-based task_type classifier applied to train.csv / test.csv under BASE_DIR
import re
import pandas as pd
from pathlib import Path

# ===== 1) Keyword dictionaries =====
# Each *_WORDS dict holds Korean ("ko") and English ("en") keywords for one task
# type. contains_any() matches the "en" entries against the lower-cased text and
# the "ko" entries against the text as-is; both are plain substring tests.

# Colour questions.
COLOR_WORDS = {
    "ko": ["색", "색깔", "무슨 색", "어떤 색", "빛깔", "컬러"],
    "en": ["color", "colour", "what color"]
}
# Counting questions ("how many ...").
COUNT_WORDS = {
    "ko": ["몇 개", "개수", "얼마나", "몇명", "몇 명", "몇마리", "몇 마리", "수량"],
    "en": ["how many", "count", "number of"]
}
# Questions that require reading text in the image (signs, labels, plates).
TEXT_OCR_WORDS = {
    "ko": ["글자", "문구", "표지판", "간판", "텍스트", "문자", "읽", "라벨", "번호판", "표기"],
    "en": ["text", "word", "words", "label", "read", "sign", "plate", "license"]
}
# Clock / time-of-day questions.
CLOCK_TIME_WORDS = {
    "ko": ["시계", "시각", "몇 시", "몇시", "시간", "시침", "분침"],
    "en": ["clock", "time", "o'clock", "what time"]
}
# Spatial position / direction questions.
POSITION_WORDS = {
    "ko": ["왼쪽", "오른쪽", "가운데", "중앙", "상단", "하단", "옆", "뒤", "앞"],
    "en": ["left", "right", "center", "middle", "top", "bottom", "behind", "front", "near"]
}
# Scene / location questions ("where is this?").
SCENE_LOC_WORDS = {
    "ko": ["어디", "어디인가요", "장소", "장면", "도시", "국가", "지역", "현장", "배경"],
    "en": ["where", "place", "scene", "location", "city", "country"]
}
# Object identification questions ("what is this?").
OBJECT_WORDS = {
    "ko": ["무엇", "무엇인가요", "무슨 물체", "어떤 물체", "종류", "타입"],
    "en": ["what is this", "what object", "which object", "what animal", "type of"]
}
# Action / activity questions.
# NOTE(review): short entries such as "중" or "is he" are substring-matched and
# may hit unrelated words — confirm the false-positive rate is acceptable.
ACTION_WORDS = {
    "ko": ["무엇을 하고", "하고 있", "동작", "행동", "중", "하는 중"],
    "en": ["doing", "action", "is he", "is she", "are they", "doing what"]
}
# Attribute questions (size, shape, material, ...).
ATTRIBUTE_WORDS = {
    "ko": ["크기", "길이", "높이", "넓이", "패턴", "모양", "재질", "형태"],
    "en": ["size", "length", "height", "width", "pattern", "shape", "material", "texture"]
}
# Weather questions.
WEATHER_WORDS = {
    "ko": ["날씨", "비", "눈", "맑", "흐림", "구름"],
    "en": ["weather", "rain", "snow", "sunny", "cloudy"]
}

# Vocabularies matched against the answer options (options are stripped and
# lower-cased before the substring test).
COLOR_CHOICES = [
    "red","blue","green","yellow","black","white","gray","grey","brown","pink","purple","orange",
    "빨강","파랑","초록","노랑","검정","흰색","회색","갈색","분홍","보라","주황"
]
DIRECTION_CHOICES = ["left","right","center","middle","top","bottom","왼쪽","오른쪽","가운데","중앙","상단","하단"]
BOOLEAN_CHOICES = ["yes","no","참","거짓","맞다","아니다","있다","없다"]
CITY_COUNTRY_HINT = ["seoul","tokyo","london","paris","new york","베이징","서울","도쿄","런던","파리","뉴욕","중국","한국","일본","프랑스","미국"]

def ratio_numeric(tokens):
    """Return the fraction of ``tokens`` that are plain non-negative numbers.

    Each token is stringified and stripped; it counts as numeric when it is
    made only of ASCII digits with an optional single decimal part
    (e.g. "3", "12.5"). An empty ``tokens`` yields 0.0.
    """
    numeric = re.compile(r"[0-9]+(\.[0-9]+)?")
    hits = sum(1 for tok in tokens if numeric.fullmatch(str(tok).strip()))
    # max(1, ...) guards the division when no tokens were given.
    return hits / max(1, len(tokens))

def contains_any(text, words):
    """Return True if ``text`` contains any keyword from ``words``.

    ``words`` is a dict with "en" and "ko" keyword lists. English keywords are
    searched in the lower-cased text, Korean keywords in the text as given.
    ``None`` is treated as an empty string; other values are stringified.
    """
    raw = str(text) if text is not None else ""
    lowered = raw.lower()
    for keyword in words["en"]:
        if keyword in lowered:
            return True
    for keyword in words["ko"]:
        if keyword in raw:
            return True
    return False

def list_contains_any(options, vocab):
    """Return True if at least one option contains some ``vocab`` entry.

    Options are stringified, stripped and lower-cased before the substring test.
    """
    normalized = [str(option).strip().lower() for option in options]
    for option in normalized:
        for term in vocab:
            if term in option:
                return True
    return False

def majority_in_vocab(options, vocab):
    """Return True when a strict majority of options contain a ``vocab`` entry.

    Options are stringified, stripped and lower-cased. With no options the
    result is False. "Strict majority" means more than half
    (``len(options) // 2 + 1`` or more).
    """
    normalized = [str(option).strip().lower() for option in options]
    if not normalized:
        return False
    # Substring containment already covers exact equality.
    matched = [option for option in normalized if any(term in option for term in vocab)]
    needed = len(normalized) // 2 + 1
    return len(matched) >= needed

# Korean prompt hint appended to the question for each task type. task_hint()
# looks entries up here and falls back to the "unknown" hint.
TASK_HINT = {
    "color": "주요 대상의 표면/의류/신호등 등 색상이 뚜렷한 영역을 우선 주시하고, 그림자/광원 영향이 적은 부분의 색을 비교하세요.",
    "count": "화면 전역을 훑으며 동일 객체 인스턴스를 식별하고 겹침/가림 영역을 주의 깊게 스캔하세요.",
    "text_ocr": "고대비 문양/평평한 표면(간판, 라벨, 표지판, 차량 번호판)을 클로즈업하듯 주시해 글자 윤곽을 확인하세요.",
    "clock_time": "아날로그 시계의 시침/분침 끝을 집중 관찰하거나 디지털의 숫자 세그먼트를 확인하세요.",
    "position": "질문에 언급된 방향(왼쪽/오른쪽/상단/하단/가운데) 영역을 먼저 살피세요.",
    "scene_location": "이미지 전체 레이아웃과 배경 단서를 우선 관찰하세요(건축물 유형, 도로 표지, 자연물 등).",
    "object_classification": "프레임의 중심 또는 가장 salient한 물체 외곽선/형태를 우선 관찰하세요.",
    "action": "사람/동물의 포즈, 팔/손/시선, 상호작용 물체(도구, 공 등) 주변을 집중 관찰하세요.",
    "attribute": "대상의 가장 특징적인 부분(무늬/재질/형태 대비가 큰 부분)을 확대하듯 살피세요.",
    "weather": "하늘/지평선/지면 상태를 확인하고 강수/적설/구름 분포 단서를 관찰하세요.",
    "boolean": "질문의 대상 영역을 먼저 찾고, 존재/부재나 상태를 빠르게 확인하세요.",
    "unknown": "이미지 전역을 스캔한 뒤 질문 키워드와 매칭되는 후보 영역을 단계적으로 좁혀보세요."
}

# Task types listed in priority order.
# NOTE(review): PRIORITY is not referenced by any code visible in this notebook;
# the effective rule order is hard-coded in infer_task_type().
PRIORITY = [
    "color", "text_ocr", "clock_time", "count", "position",
    "scene_location", "action", "attribute", "object_classification",
    "weather", "boolean"
]

def infer_task_type(question, choices):
    """Classify one multiple-choice question into a coarse task type.

    Rules are tried in a fixed order and the first hit wins: vocabulary shared
    by most answer options, then keywords in the question, then a looser
    "some option mentions a colour/direction" fallback.

    Args:
        question: Question text; a falsy value is treated as "".
        choices: Exactly four answer options, unpacked as (a, b, c, d).

    Returns:
        A ``(task_type, reason)`` tuple; ``("unknown", ...)`` if nothing matches.

    NOTE(review): all keyword checks are substring tests, so short keywords can
    match inside unrelated words (e.g. "count" inside "country").
    """
    q = (question or "").strip()
    a, b, c, d = choices
    options = [a, b, c, d]

    # 1) Options-driven rules.
    if majority_in_vocab(options, COLOR_CHOICES):
        return "color", "선지에 색상 단어 다수"
    if majority_in_vocab(options, DIRECTION_CHOICES):
        return "position", "선지에 방향/위치 단어 다수"
    if ratio_numeric(options) >= 0.5:
        # Numeric options mean a clock reading only when the question says so.
        if contains_any(q, CLOCK_TIME_WORDS):
            return "clock_time", "선지 숫자 + 질문에 시계/시간"
        return "count", "선지에 숫자 다수"
    if majority_in_vocab(options, BOOLEAN_CHOICES):
        # Yes/no options: refine by what the question is really about.
        if contains_any(q, TEXT_OCR_WORDS):
            verdict = ("text_ocr", "Yes/No + 텍스트 판독 질문")
        elif contains_any(q, COUNT_WORDS):
            verdict = ("count", "Yes/No지만 개수 관련")
        else:
            verdict = ("boolean", "선지가 예/아니오 계열")
        return verdict
    if list_contains_any(options, CITY_COUNTRY_HINT):
        return "scene_location", "선지에 도시/국가명"

    # 2) Question-driven rules, evaluated top to bottom.
    question_rules = (
        (COLOR_WORDS, "color", "질문에 색 관련 키워드"),
        (TEXT_OCR_WORDS, "text_ocr", "질문에 텍스트/표지/읽기"),
        (CLOCK_TIME_WORDS, "clock_time", "질문에 시계/시간"),
        (COUNT_WORDS, "count", "질문에 개수/몇 개"),
        (POSITION_WORDS, "position", "질문에 위치/방향"),
        (SCENE_LOC_WORDS, "scene_location", "질문에 장소/어디"),
        (ACTION_WORDS, "action", "질문에 동작/무엇을 하고"),
        (ATTRIBUTE_WORDS, "attribute", "질문에 속성(크기/모양/재질)"),
        (OBJECT_WORDS, "object_classification", "질문에 무엇/어떤 물체"),
        (WEATHER_WORDS, "weather", "질문에 날씨"),
    )
    for keywords, label, why in question_rules:
        if contains_any(q, keywords):
            return label, why

    # 3) Fallback: a single option mentioning a colour/direction is enough.
    fallback_rules = (
        (COLOR_CHOICES, "color", "선지에 색상 단어 포함"),
        (DIRECTION_CHOICES, "position", "선지에 방향 단어 포함"),
    )
    for vocab, label, why in fallback_rules:
        if list_contains_any(options, vocab):
            return label, why

    return "unknown", "규칙에 매칭되지 않음"

def task_hint(task_type):
    """Return the prompt hint for ``task_type``, or the "unknown" hint if it has none."""
    fallback = TASK_HINT["unknown"]
    return TASK_HINT.get(task_type, fallback)


In [None]:
def _is_marked_choice_column(col, base):
    """Return True if ``col`` is a "choice"/"보기" column that names option ``base``."""
    name = col.lower()
    if "choice" not in name and "보기" not in name:
        return False
    # Strip the marker word first: "choice" itself contains the letter "c", so
    # testing the full name would make every choice column match option "c".
    remainder = name.replace("choice", "").replace("보기", "")
    return base in remainder


def detect_q_and_choices(df: pd.DataFrame):
    """Heuristically locate the question column and the four choice columns.

    Args:
        df: Frame whose column names are strings.

    Returns:
        ``(q_col, a_col, b_col, c_col, d_col)`` as original column names.

    Raises:
        ValueError: If four choice columns cannot be determined.
    """
    cols = list(df.columns)
    lower_map = {c.lower(): c for c in cols}

    # Question column: the first column whose name contains "question"/"질문";
    # otherwise fall back to the first column of the frame.
    candidate_q_cols = [c for c in cols
                        if "question" in c.lower() or "질문" in c.lower()]
    q_col = candidate_q_cols[0] if candidate_q_cols else cols[0]

    # Choice columns: exact name, then "_a"/"a_" affixes, then marked columns
    # such as "choiceA" or "보기 a".
    choice_candidates = []
    for base in ["a", "b", "c", "d"]:
        if base in lower_map:
            choice_candidates.append(lower_map[base])
            continue
        hit = [c for c in cols
               if c.lower().strip() == base
               or c.lower().endswith("_" + base)
               or c.lower().startswith(base + "_")]
        if not hit:
            # Never hand the same column to two different options.
            hit = [c for c in cols
                   if c not in choice_candidates
                   and _is_marked_choice_column(c, base)]
        if hit:
            choice_candidates.append(hit[0])

    # Last resort: take the first four non-question columns in frame order.
    if len(choice_candidates) < 4:
        remain = [c for c in cols if c != q_col]
        choice_candidates = remain[:4]

    if len(choice_candidates) != 4:
        raise ValueError(f"선지 열 4개(a,b,c,d)를 찾지 못했습니다. 감지된 후보: {choice_candidates} | 전체 컬럼: {cols}")

    a_col, b_col, c_col, d_col = choice_candidates
    return q_col, a_col, b_col, c_col, d_col


def classify_df(df: pd.DataFrame):
    """Return a copy of ``df`` with task_type / task_reason / task_hint columns.

    The question and choice columns are auto-detected, every row is classified
    with infer_task_type(), and the matching hint is attached.

    Returns:
        ``(out, meta)`` where ``out`` is the annotated copy and ``meta`` holds
        the detected column names ("q_col", "choice_cols") and the per-type
        row counts ("counts").
    """
    q_col, a_col, b_col, c_col, d_col = detect_q_and_choices(df)
    choice_cols = [a_col, b_col, c_col, d_col]

    annotations = []
    for _, row in df.iterrows():
        options = tuple(row[col] for col in choice_cols)
        label, why = infer_task_type(str(row[q_col]), options)
        annotations.append((label, why, task_hint(label)))

    out = df.copy()
    out["task_type"] = [label for label, _, _ in annotations]
    out["task_reason"] = [why for _, why, _ in annotations]
    out["task_hint"] = [hint for _, _, hint in annotations]

    meta = {
        "q_col": q_col,
        "choice_cols": choice_cols,
        "counts": out["task_type"].value_counts(dropna=False),
    }
    return out, meta

# Locations of the competition CSVs under BASE_DIR.
train_path = Path(BASE_DIR) / "train.csv"
test_path  = Path(BASE_DIR) / "test.csv"

# Train: load and annotate with task_type / task_reason / task_hint.
df_train = pd.read_csv(train_path)
train_df, meta_train = classify_df(df_train)

# Test: same annotation, so inference can use the same per-type prompt hint.
df_test = pd.read_csv(test_path)
test_df, meta_test = classify_df(df_test)

## LoRA 세팅

In [None]:
# 4-bit NF4 quantization with double quantization for the frozen base weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    # Compute in bfloat16 so the dequantized matmuls use the same dtype as the
    # bfloat16 autocast in the training/validation loops (the wandb config also
    # records "bfloat16"); float16 here forced a dtype round-trip per layer.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# min_pixels == max_pixels asks the processor for a fixed pixel budget of
# IMAGE_SIZE x IMAGE_SIZE per image.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=IMAGE_SIZE*IMAGE_SIZE,
    max_pixels=IMAGE_SIZE*IMAGE_SIZE,
    trust_remote_code=True,
)

base_model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Prepare the quantized model for training, then trade compute for memory.
base_model = prepare_model_for_kbit_training(base_model)
base_model.gradient_checkpointing_enable()

# LoRA (r=16, alpha=32) on every attention and MLP projection.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/390 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]



config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.72G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

trainable params: 43,646,976 || all params: 8,810,770,672 || trainable%: 0.4954


## 프롬프트

In [None]:
# System prompt (Korean) used for training, validation and inference. It asks
# the model to inspect the image and reply with one lowercase letter a/b/c/d.
SYSTEM_INSTRUCT = (
    "당신은 시각 정보를 세밀하게 분석하여 질문에 논리적으로 답하는 비주얼 AI 전문가입니다. "
    "이미지에서 주요 대상과 그 특징(형태, 재질, 위치, 배경)을 관찰하고, "
    "보기 중 가장 가능성이 높은 정답을 선택하세요. "
    "최종 출력은 반드시 a, b, c, d 중 하나의 소문자 한 글자만 작성하세요."
)

def build_mc_prompt(question, a, b, c, d, task_hint):
    """Build the user-turn text: question, four labelled options, instruction, hint.

    Args:
        question: Question text.
        a, b, c, d: The four answer options, shown as (a)..(d).
        task_hint: Task-specific hint appended on its own paragraph.

    Returns:
        The assembled prompt string.
    """
    sections = [
        f"질문: {question}\n\n",
        f"보기:\n(a) {a}\n(b) {b}\n(c) {c}\n(d) {d}\n\n",
        "이미지를 관찰한 후, 가장 알맞은 보기를 고르세요. ",
        "정답은 반드시 소문자 알파벳 a, b, c, d 중 하나로만 답하세요.",
        f"\n\n힌트: {task_hint}",
    ]
    return "".join(sections)

## 데이터셋

In [None]:
class VQAMCDataset(Dataset):
    """Multiple-choice VQA dataset that yields chat-style messages plus the image.

    Each row must provide "path", "question", "a", "b", "c", "d" and
    "task_hint"; when ``train`` is True it must also provide "answer".
    """

    def __init__(self, df, processor, train=True):
        """Store the frame (index reset to 0..n-1), the processor and the mode flag."""
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.train = train
        # Root directory that the relative "path" column is appended to.
        self.path = '/content/drive/MyDrive/AI챌린지/2025-ssafy-14/'

    def __len__(self):
        """Return the number of rows."""
        return len(self.df)

    def __getitem__(self, i):
        """Return ``{"messages": [...], "image": PIL.Image}`` for row ``i``.

        With ``train=True`` the gold answer is appended as the assistant turn.
        """
        row = self.df.iloc[i]
        # convert() returns a fully loaded copy, so the source file can be
        # closed immediately instead of leaking one handle per sample.
        with Image.open(self.path + row["path"]) as raw:
            img = raw.convert("RGB")

        q = str(row["question"])
        a, b, c, d = str(row["a"]), str(row["b"]), str(row["c"]), str(row["d"])
        hint = str(row['task_hint'])
        user_text = build_mc_prompt(q, a, b, c, d, hint)

        # Multimodal chat messages: system text, then user image + text.
        messages = [
            {"role": "system", "content": [{"type": "text", "text": SYSTEM_INSTRUCT}]},
            {"role": "user", "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": user_text}
            ]}
        ]
        if self.train:
            gold = str(row["answer"]).strip().lower()
            messages.append({"role": "assistant", "content": [{"type": "text", "text": gold}]})

        return {"messages": messages, "image": img}

In [None]:
from dataclasses import dataclass

@dataclass
class DataCollator:
    """Collate dataset samples into one processor-encoded batch.

    Attributes are documented inline. Calling the collator renders each
    sample's messages with the chat template, encodes text and images together
    with padding, and (in train mode) adds ``labels``.
    """

    # Processor providing apply_chat_template() and the text+image encoder call.
    processor: Any
    # When True, "labels" are added to the encoded batch.
    train: bool = True

    def __call__(self, batch):
        """Encode ``batch`` (a list of ``{"messages", "image"}`` dicts).

        Returns:
            The processor output; in train mode it also carries ``labels``, a
            copy of ``input_ids`` with padding positions set to -100.
        """
        texts, images = [], []
        for sample in batch:
            text = self.processor.apply_chat_template(
                sample["messages"],
                tokenize=False,
                add_generation_prompt=False
            )
            texts.append(text)
            images.append(sample["image"])

        enc = self.processor(
            text=texts,
            images=images,
            padding=True,
            return_tensors="pt"
        )

        if self.train:
            labels = enc["input_ids"].clone()
            # Padding must not contribute to the loss; -100 is the index that
            # the language-model cross-entropy ignores.
            if "attention_mask" in enc:
                labels[enc["attention_mask"] == 0] = -100
            # NOTE(review): prompt and image-placeholder tokens are still
            # supervised, so the single answer token is a small share of the
            # loss — consider masking everything before the assistant turn.
            enc["labels"] = labels

        return enc

## Validation

In [None]:
# 1) Stratification labels: task_type as strings, missing values -> "unknown".
labels = (
    train_df["task_type"]
    .astype(str)
    .replace({"nan": np.nan})
    .fillna("unknown")
)

# 2) A class with a single sample cannot be stratified; fold it into "unknown".
vc = labels.value_counts()
rare_classes = {cls for cls, n in vc.items() if n < 2}
labels_mapped = np.where(labels.isin(rare_classes), "unknown", labels)

# 3) Stratified 90/10 train/validation split.
train_subset, valid_subset = train_test_split(
    train_df,
    test_size=0.10,
    random_state=42,
    shuffle=True,
    stratify=labels_mapped,  # labels with rare classes merged
)

# 4) Datasets and loaders. Validation also uses train=True so its batches carry
#    the gold answer and labels, which the validation loss needs.
train_ds = VQAMCDataset(train_subset.reset_index(drop=True), processor, train=True)
valid_ds = VQAMCDataset(valid_subset.reset_index(drop=True), processor, train=True)

train_loader = DataLoader(
    train_ds,
    batch_size=1,
    shuffle=True,
    collate_fn=DataCollator(processor, True),
)
valid_loader = DataLoader(
    valid_ds,
    batch_size=1,
    shuffle=False,
    collate_fn=DataCollator(processor, True),
)

## wandb 연결

In [None]:
# Install the wandb client (quiet mode).
!pip install wandb -q

In [None]:
import wandb
# Authenticate this session with Weights & Biases (prompts for an API key when none is stored).
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbaekus2209[0m ([33mssafy_A014[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import math
import torch
from tqdm.auto import tqdm
from transformers import get_linear_schedule_with_warmup
# 2. Initialise the WandB run
wandb.init(
    project="SSAFY_AI_CHALLENGE",       # team project name
    entity="ssafy_A014",       # team entity (team workspace name)
    name="best_qwen3_8b_3epoch_typeprompt",    # run name (can be changed per experiment)
    # Hyper-parameters; the training cell reads epochs, grad_accum and lr from wandb.config.
    config={
        "model": "Qwen3-VL-8B-LoRA",  # model name
        "epochs": 3,
        # "batch_size": 16,
        "grad_accum": 4,
        "lr": 1e-4,
        "scheduler": "linear_warmup_3%",
        "precision": "bfloat16",
    }
)

## 학습

In [None]:
def extract_choice(text: str) -> str:
    """Extract the chosen option letter from decoded model output.

    Only the last non-empty line is inspected (the decoded text may still
    contain the prompt). Lookup order:

    1. the whole line is exactly a/b/c/d;
    2. the first whitespace-separated token that is exactly a/b/c/d;
    3. the first a/b/c/d not adjacent to another ASCII letter, which covers
       decorated answers such as "(b)", "c." or "b입니다".

    Args:
        text: Decoded model output.

    Returns:
        One of "a", "b", "c", "d"; "a" when nothing can be extracted.
    """
    valid = ("a", "b", "c", "d")
    lines = [line.strip() for line in text.strip().lower().splitlines() if line.strip()]
    if not lines:
        return "a"

    last = lines[-1]
    if last in valid:
        return last

    for tok in last.split():
        if tok in valid:
            return tok

    # Previously any punctuated answer fell through to the default "a".
    match = re.search(r"(?<![a-z])[abcd](?![a-z])", last)
    return match.group(0) if match else "a"

In [None]:
# 3. Settings
config = wandb.config
device = "cuda"

model = model.to(device)
GRAD_ACCUM = config.grad_accum
# Hand the optimizer only the parameters that actually receive gradients
# (the LoRA adapters); frozen base weights need no optimizer state.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=config.lr)
# One optimizer step per GRAD_ACCUM micro-batches, plus one for the trailing
# partial group of each epoch (flushed below), hence the ceil.
num_training_steps = config.epochs * math.ceil(len(train_loader)/GRAD_ACCUM)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    int(num_training_steps*0.03),
    num_training_steps
)
scaler = torch.cuda.amp.GradScaler(enabled=True)

# 4. Training loop
global_step = 0
all_outputs = []
for epoch in range(config.epochs):
    running = 0.0      # sum of un-scaled micro-batch losses since the last step
    micro_steps = 0    # number of micro-batches accumulated since the last step
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} [train]", unit="batch")
    for step, batch in enumerate(progress_bar, start=1):
        batch = {k:v.to(device) for k,v in batch.items()}

        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
            outputs = model(**batch)
            loss = outputs.loss / GRAD_ACCUM

        scaler.scale(loss).backward()
        # Track the raw loss; the previous code summed the already-divided loss
        # and divided by GRAD_ACCUM again, under-reporting it by that factor.
        running += outputs.loss.item()
        micro_steps += 1

        # Step on every full accumulation group and also on the last batch of
        # the epoch, so leftover gradients are neither dropped nor carried
        # into the next epoch.
        if step % GRAD_ACCUM == 0 or step == len(train_loader):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()
            global_step += 1

            avg_loss = running / micro_steps
            progress_bar.set_postfix({"loss": f"{avg_loss:.3f}"})
            running = 0.0
            micro_steps = 0

            # Log to WandB
            wandb.log({
                "train/loss": avg_loss,
                "train/learning_rate": scheduler.get_last_lr()[0],
                "global_step": global_step,
                "epoch": epoch + (step / len(train_loader))
            })

    # Validation
    model.eval()
    val_loss = 0.0
    val_steps = 0

    # Sample id, predicted answer and gold answer for every validation sample.
    val_ids, val_preds, val_golds = [], [], []

    with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
        for vstep, vb in enumerate(tqdm(valid_loader, desc=f"Epoch {epoch+1} [valid]", unit="batch"), start=1):
            vb = {k:v.to(device) for k,v in vb.items()}
            loss = model(**vb).loss.item()
            val_loss += loss
            val_steps += 1

            # Generation-based prediction built the same way as test inference
            # (chat template + extract_choice). valid_loader is unshuffled with
            # batch_size=1, so position vstep-1 is the row of the current batch.
            row = valid_subset.iloc[vstep - 1]
            img_path = BASE_DIR + row["path"] if not str(row["path"]).startswith("/") else row["path"]
            # convert() returns a loaded copy, so the file handle can be closed.
            with Image.open(img_path) as raw:
                img = raw.convert("RGB")

            user_text = build_mc_prompt(row["question"], row["a"], row["b"], row["c"], row["d"], row['task_hint'])
            messages = [
                {"role": "system", "content": [{"type": "text", "text": SYSTEM_INSTRUCT}]},
                {"role": "user", "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": user_text}
                ]}
            ]

            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = processor(text=[text], images=[img], return_tensors="pt").to(device)

            out_ids = model.generate(
                **inputs,
                max_new_tokens=2,
                do_sample=False,
                eos_token_id=processor.tokenizer.eos_token_id
            )
            output_text = processor.batch_decode(out_ids, skip_special_tokens=True)[0]
            pred = extract_choice(output_text)

            # Gold answer comes from the same positional row.
            gold = str(row["answer"]).strip().lower()
            val_preds.append(pred)
            val_golds.append(gold)

            val_ids.append(row["id"])

            # Per-sample validation loss, logged as it is computed.
            wandb.log({
                "valid/loss": loss,
                "step": global_step + vstep,  # shown on the same axis as training
                "epoch": epoch + (vstep / len(valid_loader))
            })

    avg_val_loss = val_loss / val_steps
    print(f"[Epoch {epoch+1}] valid loss {avg_val_loss:.4f}")

    # Accuracy over the generated predictions.
    correct = sum(p == g for p, g in zip(val_preds, val_golds))
    val_acc = correct / max(len(val_golds), 1)
    print(f"[Epoch {epoch+1}] valid accuracy {val_acc:.4f}")

    # Epoch-level validation metrics (accuracy was computed but never logged).
    wandb.log({
        "valid/loss": avg_val_loss,
        "valid/accuracy": val_acc,
        "epoch": epoch + 1
    })

    # Per-sample predictions of this epoch as a DataFrame.
    cols = {}
    if len(val_ids) == len(val_preds) and len(val_ids) > 0:
        cols["id"] = val_ids
    cols["gold"] = val_golds
    cols["pred"] = val_preds
    output_df = pd.DataFrame(cols)

    output_df["epoch"] = epoch + 1
    all_outputs.append(output_df)

    print(f"총 {len(output_df)}개 샘플 중 {correct}개 정답 ({val_acc*100:.2f}% 정확도)")

    model.train()

# 5. Save the adapter/processor and upload
all_output_df = pd.concat(all_outputs, ignore_index=True)

SAVE_DIR = "/content/drive/MyDrive/AI챌린지/2025-ssafy-14/best_qwen3_8b_3epochs_typeprompt"
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
print("Saved:", SAVE_DIR)

# Upload the saved files to WandB (optional)
wandb.save(SAVE_DIR + "/*")
wandb.finish()


  scaler = torch.cuda.amp.GradScaler(enabled=True)


Epoch 1 [train]:   0%|          | 0/3498 [00:00<?, ?batch/s]

  with torch.cuda.amp.autocast(dtype=torch.bfloat16):
  with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.bfloat16):


Epoch 1 [valid]:   0%|          | 0/389 [00:00<?, ?batch/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[Epoch 1] valid loss 3.4902
[Epoch 1] valid accuracy 0.9254
총 389개 샘플 중 360개 정답 (92.54% 정확도)


Epoch 2 [train]:   0%|          | 0/3498 [00:00<?, ?batch/s]

  with torch.cuda.amp.autocast(dtype=torch.bfloat16):
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.bfloat16):


Epoch 2 [valid]:   0%|          | 0/389 [00:00<?, ?batch/s]

[Epoch 2] valid loss 3.4849
[Epoch 2] valid accuracy 0.9254
총 389개 샘플 중 360개 정답 (92.54% 정확도)


Epoch 3 [train]:   0%|          | 0/3498 [00:00<?, ?batch/s]

  with torch.cuda.amp.autocast(dtype=torch.bfloat16):
  with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.bfloat16):


Epoch 3 [valid]:   0%|          | 0/389 [00:00<?, ?batch/s]

[Epoch 3] valid loss 3.4846
[Epoch 3] valid accuracy 0.9306
총 389개 샘플 중 362개 정답 (93.06% 정확도)




Saved: /content/drive/MyDrive/AI챌린지/2025-ssafy-14/best_qwen3_8b_3epochs_typeprompt


0,1
epoch,▁▁▁▂▂▃▃▃▂▂▂▃▄▄▅▃▄▄▄▅▆▆▆▆▆▇▇▇▇▇███▇▇█████
global_step,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇█
step,▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▄▅▅▅▅▅▅▇▇▇▇▇▇▇███████
train/learning_rate,▄██████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
train/loss,█▄▂▃▂▃▃▂▃▂▄▃▂▃▃▂▂▂▄▃▂▃▁▁▁▄▃▃▂▃▃▂▂▂▂▂▃▃▃▃
valid/loss,▆▄▆▅▄▅▂▅▅▃▅▇▇▅▇▆▆▆▇▁▄▅▇▆▇▆▄▅▂▂▁▆▂▆▇█▇▅▄▄

0,1
epoch,3.0
global_step,2622.0
step,3011.0
train/learning_rate,0.0
train/loss,0.85305
valid/loss,3.48463


## 추론

In [None]:
# Switch to eval mode (disables dropout, including LoRA dropout) for inference.
model.eval()
preds = []

# Inference loop: one generation per test row, in test_df order.
for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Inference", unit="sample"):
    # Local name chosen so the module-level `test_path` is not overwritten.
    img_path = BASE_DIR + row["path"]
    # convert() returns a loaded copy, so the file handle can be closed.
    with Image.open(img_path) as raw:
        img = raw.convert("RGB")
    user_text = build_mc_prompt(row["question"], row["a"], row["b"], row["c"], row["d"], row["task_hint"])

    messages = [
        {"role":"system","content":[{"type":"text","text":SYSTEM_INSTRUCT}]},
        {"role":"user","content":[
            {"type":"image","image":img},
            {"type":"text","text":user_text}
        ]}
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[img], return_tensors="pt").to(device)

    # Same bfloat16 autocast as the validation loop, so test predictions are
    # produced under the numeric conditions the reported accuracy was measured in.
    with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
        out_ids = model.generate(**inputs, max_new_tokens=2, do_sample=False,
                                 eos_token_id=processor.tokenizer.eos_token_id)
    output_text = processor.batch_decode(out_ids, skip_special_tokens=True)[0]

    preds.append(extract_choice(output_text))

# Write the submission file (id, answer).
submission = pd.DataFrame({"id": test_df["id"], "answer": preds})
submission.to_csv("/content/drive/MyDrive/AI챌린지/2025-ssafy-14/best_qwen3_8b_3epochs_typeprompt.csv", index=False)
print("Saved ")

Inference:   0%|          | 0/3887 [00:00<?, ?sample/s]

Saved 


In [None]:
from google.colab import runtime
# NOTE(review): presumably releases the Colab runtime once all cells above have
# finished, to stop consuming compute — confirm against the Colab docs.
runtime.unassign()