In [None]:
!pip install -q transformers accelerate bitsandbytes peft wandb

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs, images and saved adapters used below live under it
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os, re, math, random
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass
import torch
from typing import Any
from transformers import (
    AutoModelForVision2Seq,
    AutoProcessor,
    BitsAndBytesConfig,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from tqdm import tqdm

# Setting MAX_IMAGE_PIXELS to None removes Pillow's pixel-count limit, so very large images are opened without a size check.
Image.MAX_IMAGE_PIXELS = None
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


# 데이터 불러오기 및 모델 셋업


In [None]:
# Hugging Face Hub id of the vision-language model that is loaded and fine-tuned below
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
# Side length in pixels; IMAGE_SIZE*IMAGE_SIZE is passed to the processor as both min_pixels and max_pixels
IMAGE_SIZE = 448    # IMAGE_SIZE = 384 or 448 or 512 or 1024
# NOTE(review): the visible inference cell hard-codes max_new_tokens=2 and never reads this constant — confirm which is intended
MAX_NEW_TOKENS = 8
SEED = 42
# Seed Python's `random` and PyTorch's CPU and CUDA generators
random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)

In [None]:
# Read the train and test tables (in that order) from the shared Drive folder.
train_df, test_df = (
    pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ssafy_ai_chllenge/" + csv_name)
    for csv_name in ("train.csv", "test.csv")
)

# LoRA 세팅

In [None]:
pip install -U bitsandbytes



## LoRA Parameter 세팅

In [None]:
# Baseline model setup: 4-bit NF4 quantization with double quantization and a float16 compute dtype.
# NOTE(review): the training cells autocast to bfloat16 and the later "parameter 세팅 버전" cell uses
# bnb_4bit_compute_dtype=torch.bfloat16 — confirm that float16 is intended here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# min_pixels == max_pixels, so the processor is given a single fixed pixel budget of IMAGE_SIZE*IMAGE_SIZE.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=IMAGE_SIZE*IMAGE_SIZE,
    max_pixels=IMAGE_SIZE*IMAGE_SIZE,
    trust_remote_code=True,
)

# Load the quantized base model; device_map="auto" leaves device placement to the library.
base_model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Run PEFT's k-bit training preparation on the quantized model, then enable gradient checkpointing.
base_model = prepare_model_for_kbit_training(base_model)
base_model.gradient_checkpointing_enable()

# LoRA adapters (rank 8, alpha 16, dropout 0.05, no bias training) on the modules named in target_modules.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type="CAUSAL_LM",
)
# Wrap the base model with the adapters and print how many parameters are trainable.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/390 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]



config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.72G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

trainable params: 21,823,488 || all params: 8,788,947,184 || trainable%: 0.2483


In [None]:
# Tuned-parameter variant of the model setup: bfloat16 compute and an explicit attention backend.

import importlib.util

# Fall back to PyTorch SDPA unless the flash_attn package can be found.
ATTN_IMPL = "sdpa" if not importlib.util.find_spec("flash_attn") else "flash_attention_2"

# 4-bit NF4 weights with double quantization, computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Same value for the lower and upper pixel bound.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    max_pixels=IMAGE_SIZE*IMAGE_SIZE,
    min_pixels=IMAGE_SIZE*IMAGE_SIZE,
)

# Load the quantized model and immediately run PEFT's k-bit training preparation on it.
base_model = prepare_model_for_kbit_training(
    AutoModelForVision2Seq.from_pretrained(
        MODEL_ID,
        attn_implementation=ATTN_IMPL,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=bnb_config,
    )
)
base_model.gradient_checkpointing_enable()

# Rank-8 LoRA on the attention projections plus the MLP projections.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj"] + ["gate_proj","up_proj","down_proj"],
    bias="none",
    lora_dropout=0.05,
    lora_alpha=16,
    r=8,
)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# Prompt 세팅


In [None]:
# System prompt (Korean) used for both training and inference. It casts the model as a visual-analysis
# expert, asks it to observe the main objects and their attributes (shape, material, position, background),
# to pick the most likely option, and to output exactly one lowercase letter among a, b, c, d.
SYSTEM_INSTRUCT = (
    "당신은 시각 정보를 세밀하게 분석하여 질문에 논리적으로 답하는 비주얼 AI 전문가입니다. "
    "이미지에서 주요 대상과 그 특징(형태, 재질, 위치, 배경)을 관찰하고, "
    "보기 중 가장 가능성이 높은 정답을 선택하세요. "
    "최종 출력은 반드시 a, b, c, d 중 하나의 소문자 한 글자만 작성하세요."
)

def build_mc_prompt(question, a, b, c, d):
    """Render the user-turn text for one multiple-choice question.

    Args:
        question: The question text.
        a, b, c, d: The four answer options, shown as "(a) ..." through "(d) ...".

    Returns:
        A Korean prompt: the question, a blank line, the labelled options,
        a blank line, and a closing instruction to answer with one lowercase
        letter among a, b, c, d.
    """
    option_lines = "\n".join(
        f"({label}) {option}" for label, option in zip("abcd", (a, b, c, d))
    )
    closing = (
        "이미지를 관찰한 후, 가장 알맞은 보기를 고르세요. "
        "정답은 반드시 소문자 알파벳 a, b, c, d 중 하나로만 답하세요."
    )
    return f"질문: {question}\n\n보기:\n{option_lines}\n\n{closing}"

# Dataset 세팅

In [None]:
class VQAMCDataset(Dataset):
    """Multiple-choice VQA rows rendered as chat messages for the processor.

    Each item pairs one RGB image with a system instruction and a user turn
    that holds the image and the question/options prompt. When ``train`` is
    true the gold answer letter is appended as the assistant turn.

    Args:
        df: DataFrame with columns ``path``, ``question``, ``a``, ``b``, ``c``,
            ``d`` and, when ``train`` is true, ``answer``. Its index is reset.
        processor: Stored on the instance; ``__getitem__`` does not use it.
        train: Whether to append the gold answer as an assistant message.
        root: Prefix joined to each row's ``path`` by plain string
            concatenation (so it should end with a separator). ``None`` keeps
            the Drive folder used elsewhere in this notebook.
    """

    DEFAULT_ROOT = '/content/drive/MyDrive/Colab Notebooks/ssafy_ai_chllenge/'

    def __init__(self, df, processor, train=True, root=None):
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.train = train
        self.path = self.DEFAULT_ROOT if root is None else root

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        """Return ``{"messages": [...], "image": PIL.Image}`` for row ``i``."""
        row = self.df.iloc[i]
        # Image.open keeps the file handle open; convert() produces an
        # independent in-memory copy, so the handle can be closed right away.
        with Image.open(self.path + row["path"]) as raw:
            img = raw.convert("RGB")

        q = str(row["question"])
        a, b, c, d = str(row["a"]), str(row["b"]), str(row["c"]), str(row["d"])
        user_text = build_mc_prompt(q, a, b, c, d)

        # Multimodal chat structure: system text, then a user turn with image + text.
        messages = [
            {"role": "system", "content": [{"type": "text", "text": SYSTEM_INSTRUCT}]},
            {"role": "user", "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": user_text}
            ]}
        ]
        if self.train:
            # Normalise the label to a bare lowercase letter.
            gold = str(row["answer"]).strip().lower()
            messages.append({"role": "assistant", "content": [{"type": "text", "text": gold}]})

        return {"messages": messages, "image": img}

In [None]:
from dataclasses import dataclass

@dataclass
class DataCollator:
    """Turn a list of dataset samples into one processor-encoded batch.

    Each sample must provide ``"messages"`` (a chat message list) and
    ``"image"``. The messages are rendered to text with the processor's chat
    template, then text and images are encoded together with padding.

    Attributes are documented inline below.
    """

    # Processor exposing apply_chat_template(...) and __call__(text=..., images=...).
    processor: Any
    # True: samples already contain the assistant answer and "labels" are added.
    # False: samples end at the user turn, so the generation prompt is appended
    # and no labels are produced.
    train: bool = True

    def __call__(self, batch):
        """Encode ``batch`` and, in training mode, attach ``labels``.

        Returns:
            The processor output. In training mode it also carries ``labels``:
            a copy of ``input_ids`` with padding positions set to -100 so the
            loss ignores them.
        """
        texts = [
            self.processor.apply_chat_template(
                sample["messages"],
                tokenize=False,
                # Without an assistant turn the template must open one.
                add_generation_prompt=not self.train,
            )
            for sample in batch
        ]
        images = [sample["image"] for sample in batch]

        enc = self.processor(
            text=texts,
            images=images,
            padding=True,
            return_tensors="pt"
        )

        if self.train:
            labels = enc["input_ids"].clone()
            # Padding must not contribute to the language-modeling loss.
            labels[enc["attention_mask"] == 0] = -100
            # NOTE(review): prompt tokens are still supervised; mask them as
            # well if only the answer letter should be learned.
            enc["labels"] = labels

        return enc

In [None]:
# Hold out the last 10% of rows (in file order) for validation.
split = int(len(train_df)*0.9)
train_subset = train_df.iloc[:split]
valid_subset = train_df.iloc[split:]

# Both datasets keep the gold answer (train=True) so the validation loop can compute a loss.
train_ds, valid_ds = (
    VQAMCDataset(part, processor, train=True)
    for part in (train_subset, valid_subset)
)

# One sample per batch; only the training loader is shuffled.
train_loader = DataLoader(
    train_ds,
    collate_fn=DataCollator(processor, True),
    shuffle=True,
    batch_size=1,
)
valid_loader = DataLoader(
    valid_ds,
    collate_fn=DataCollator(processor, True),
    shuffle=False,
    batch_size=1,
)

print("✅ Qwen3 Dataset 준비 완료!")

✅ Qwen3 Dataset 준비 완료!


# WandB 세팅

In [None]:
# 라이브러리 설치 및 임포트
!pip install wandb -q

In [None]:
import wandb
# Authenticate this session with Weights & Biases; asks for an API key when none is stored yet
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbaekus2209[0m ([33mssafy_A014[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import math
import torch
from tqdm.auto import tqdm
from transformers import get_linear_schedule_with_warmup
# 2. Initialise the W&B run (baseline hyperparameters)
# NOTE(review): the run name says image_size_1024 while the config cell sets IMAGE_SIZE = 448 — confirm.
wandb.init(
    project="SSAFY_AI_CHALLENGE",       # team project name
    entity="ssafy_A014",       # team entity (team workspace name)
    name="qwen3_8b_reason_prompt_epoch_image_size_1024",    # run name (can be built dynamically if desired)
    # hyperparameters recorded for this run; the training cell reads them via wandb.config
    config={
        "model": "Qwen3-VL-8B-LoRA",  # model name
        "epochs": 1,
        "grad_accum": 4,
        "lr": 1e-4,
        "scheduler": "linear_warmup_3%",
        "precision": "bfloat16",
    }
)

In [None]:
import math
import torch
from tqdm.auto import tqdm
from transformers import get_linear_schedule_with_warmup
# 2. Initialise the W&B run (tuned-parameter version)
# NOTE(review): the run name says image_size_1024 while the config cell sets IMAGE_SIZE = 448 — confirm.
wandb.init(
    project="SSAFY_AI_CHALLENGE",       # team project name
    entity="ssafy_A014",       # team entity (team workspace name)
    name="qwen3_8b_reason_prompt_epoch_image_size_1024",    # run name (can be built dynamically if desired)
    # hyperparameters recorded for this run; the training cell reads them via wandb.config
    config={
        "model": "Qwen3-VL-8B-LoRA",
        "epochs": 1,             # increase if desired
        "grad_accum": 4,         # keep if used unchanged (see item 6 below)
        "lr": 5e-4,
        "warmup_ratio": 0.10,
        "weight_decay": 0.0,
        # NOTE(review): the "parameter 세팅 버전" training cell calls get_cosine_schedule_with_warmup,
        # so "linear" may not describe the schedule actually used — confirm.
        "scheduler": "linear",   # value to record when the linear scheduler is really used
        "precision": "bfloat16",
    }
)

# 학습

In [None]:
# 3. Settings (baseline)
# Reads hyperparameters from the active W&B run, fine-tunes the LoRA model with
# gradient accumulation, validates after every epoch, then saves the adapter.
config = wandb.config
device = "cuda"

model = model.to(device)
GRAD_ACCUM = config.grad_accum
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr)

# One optimizer update per GRAD_ACCUM micro-batches. math.ceil counts the
# trailing partial group, and the total spans every configured epoch so the
# schedule ends exactly when training does.
num_training_steps = config.epochs * math.ceil(len(train_loader)/GRAD_ACCUM)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    int(num_training_steps*0.03),  # 3% of the updates are warmup
    num_training_steps
)
# No GradScaler: the forward pass autocasts to bfloat16, and loss scaling is a
# float16 remedy that bfloat16's wider exponent range makes unnecessary.

# 4. Training loop
global_step = 0
num_batches = len(train_loader)
model.train()
optimizer.zero_grad(set_to_none=True)
for epoch in range(config.epochs):
    running = 0.0      # sum of unscaled micro-batch losses since the last update
    micro_steps = 0    # number of micro-batches since the last update
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} [train]", unit="batch")
    for step, batch in enumerate(progress_bar, start=1):
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.autocast(device_type=device, dtype=torch.bfloat16):
            outputs = model(**batch)
            # Scale so the accumulated gradient is the mean over the group.
            loss = outputs.loss / GRAD_ACCUM

        loss.backward()
        running += outputs.loss.item()
        micro_steps += 1

        # Step on every full group, and also on the final batch so a trailing
        # partial group is applied instead of being left in .grad.
        if step % GRAD_ACCUM == 0 or step == num_batches:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad(set_to_none=True)
            global_step += 1

            # Mean of the unscaled losses (they are not divided by GRAD_ACCUM twice).
            avg_loss = running / micro_steps
            progress_bar.set_postfix({"loss": f"{avg_loss:.3f}"})
            running = 0.0
            micro_steps = 0

            # Log training metrics to W&B
            wandb.log({
                "train/loss": avg_loss,
                "train/learning_rate": scheduler.get_last_lr()[0],
                "global_step": global_step,
                "epoch": epoch + (step / len(train_loader))
            })

    # Validation
    model.eval()
    val_loss = 0.0
    val_steps = 0
    with torch.no_grad(), torch.autocast(device_type=device, dtype=torch.bfloat16):
        for vstep, vb in enumerate(tqdm(valid_loader, desc=f"Epoch {epoch+1} [valid]", unit="batch"), start=1):
            vb = {k: v.to(device) for k, v in vb.items()}
            loss = model(**vb).loss.item()
            val_loss += loss
            val_steps += 1

            # Per-batch validation loss, logged as it is computed
            wandb.log({
                "valid/loss": loss,
                "step": global_step + vstep,  # continues the training step axis
                "epoch": epoch + (vstep / len(valid_loader))
            })

    # Guard against an empty validation loader.
    avg_val_loss = val_loss / max(val_steps, 1)
    print(f"[Epoch {epoch+1}] valid loss {avg_val_loss:.4f}")

    # Epoch-level validation loss
    wandb.log({
        "valid/loss": avg_val_loss,
        "epoch": epoch + 1
    })

    model.train()

# 5. Save the adapter and processor, then upload
SAVE_DIR = "/content/drive/MyDrive/Colab Notebooks/ssafy_ai_chllenge/qwen3_vl_8b_lora_pp3_3_image_size_1024"
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
print("Saved:", SAVE_DIR)

# Upload the saved files to W&B (optional)
wandb.save(SAVE_DIR + "/*")
wandb.finish()


  scaler = torch.cuda.amp.GradScaler(enabled=True)


Epoch 1 [train]:   0%|          | 0/3498 [00:00<?, ?batch/s]

  with torch.cuda.amp.autocast(dtype=torch.bfloat16):


In [None]:
# 3. Settings (tuned-parameter version)
# Reads hyperparameters from the active W&B run, fine-tunes the LoRA model with
# gradient accumulation and a cosine schedule, validates after every epoch,
# then saves the adapter.
# The cosine scheduler is not among the notebook's top-level imports, so it is imported here.
from transformers import get_cosine_schedule_with_warmup

config = wandb.config
device = "cuda"

model = model.to(device)
GRAD_ACCUM = config.grad_accum
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=float(config.lr),
    weight_decay=float(config.weight_decay)
)
# One optimizer update per GRAD_ACCUM micro-batches; math.ceil counts the
# trailing partial group, which the loop below also steps on.
num_training_steps = config.epochs * math.ceil(len(train_loader)/GRAD_ACCUM)
# NOTE(review): the W&B config cell records "scheduler": "linear" while this
# cell uses a cosine schedule — align one with the other.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    int(num_training_steps * float(config.warmup_ratio)),
    num_training_steps
)
# No GradScaler: the forward pass autocasts to bfloat16, and loss scaling is a
# float16 remedy that bfloat16's wider exponent range makes unnecessary.

# 4. Training loop
global_step = 0
num_batches = len(train_loader)
model.train()
optimizer.zero_grad(set_to_none=True)
for epoch in range(config.epochs):
    running = 0.0      # sum of unscaled micro-batch losses since the last update
    micro_steps = 0    # number of micro-batches since the last update
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} [train]", unit="batch")
    for step, batch in enumerate(progress_bar, start=1):
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.autocast(device_type=device, dtype=torch.bfloat16):
            outputs = model(**batch)
            # Scale so the accumulated gradient is the mean over the group.
            loss = outputs.loss / GRAD_ACCUM

        loss.backward()
        running += outputs.loss.item()
        micro_steps += 1

        # Step on every full group, and also on the final batch so a trailing
        # partial group is applied instead of being left in .grad.
        if step % GRAD_ACCUM == 0 or step == num_batches:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad(set_to_none=True)
            global_step += 1

            # Mean of the unscaled losses (they are not divided by GRAD_ACCUM twice).
            avg_loss = running / micro_steps
            progress_bar.set_postfix({"loss": f"{avg_loss:.3f}"})
            running = 0.0
            micro_steps = 0

            # Log training metrics to W&B
            wandb.log({
                "train/loss": avg_loss,
                "train/learning_rate": scheduler.get_last_lr()[0],
                "global_step": global_step,
                "epoch": epoch + (step / len(train_loader))
            })

    # Validation
    model.eval()
    val_loss = 0.0
    val_steps = 0
    with torch.no_grad(), torch.autocast(device_type=device, dtype=torch.bfloat16):
        for vstep, vb in enumerate(tqdm(valid_loader, desc=f"Epoch {epoch+1} [valid]", unit="batch"), start=1):
            vb = {k: v.to(device) for k, v in vb.items()}
            loss = model(**vb).loss.item()
            val_loss += loss
            val_steps += 1

            # Per-batch validation loss, logged as it is computed
            wandb.log({
                "valid/loss": loss,
                "step": global_step + vstep,  # continues the training step axis
                "epoch": epoch + (vstep / len(valid_loader))
            })

    # Guard against an empty validation loader.
    avg_val_loss = val_loss / max(val_steps, 1)
    print(f"[Epoch {epoch+1}] valid loss {avg_val_loss:.4f}")

    # Epoch-level validation loss
    wandb.log({
        "valid/loss": avg_val_loss,
        "epoch": epoch + 1
    })

    model.train()

# 5. Save the adapter and processor, then upload
# NOTE(review): same SAVE_DIR as the baseline training cell, so running both overwrites the earlier adapter.
SAVE_DIR = "/content/drive/MyDrive/Colab Notebooks/ssafy_ai_chllenge/qwen3_vl_8b_lora_pp3_3_image_size_1024"
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
print("Saved:", SAVE_DIR)

# Upload the saved files to W&B (optional)
wandb.save(SAVE_DIR + "/*")
wandb.finish()


# 추론

In [None]:
# Answer parser: extracts the chosen option letter from the model's reply
def extract_choice(text: str) -> str:
    """Return the option letter ("a"-"d") found on the last non-empty line.

    Matching is case-insensitive and tried in this order on the last
    non-empty line of ``text``:

    1. the whole line is a single letter;
    2. a whitespace-separated token is exactly one letter;
    3. a letter a-d that is not adjacent to another ASCII letter, which
       covers decorated answers such as "(b)", "c." or "d입니다".

    Args:
        text: Decoded model output.

    Returns:
        The matched lowercase letter, or "a" when nothing matches or the
        text is empty.
    """
    choices = ("a", "b", "c", "d")

    lines = [ln.strip() for ln in text.strip().lower().splitlines() if ln.strip()]
    if not lines:
        return "a"
    last = lines[-1]
    if last in choices:
        return last

    for tok in last.split():
        if tok in choices:
            return tok

    # Fallback for letters wrapped in punctuation or glued to non-Latin text.
    # The look-arounds keep letters inside English words ("answer") from matching.
    standalone = re.search(r"(?<![a-z])[abcd](?![a-z])", last)
    if standalone:
        return standalone.group(0)
    return "a"

# Switch to evaluation mode before generating
model.eval()
preds = []

# Inference loop: one test row at a time, greedy decoding
for i in tqdm(range(len(test_df)), desc="Inference", unit="sample"):
    row = test_df.iloc[i]
    test_path = "/content/drive/MyDrive/Colab Notebooks/ssafy_ai_chllenge/" + row["path"]
    # convert() returns an in-memory copy, so the file handle can be closed immediately.
    with Image.open(test_path) as raw:
        img = raw.convert("RGB")
    user_text = build_mc_prompt(row["question"], row["a"], row["b"], row["c"], row["d"])

    messages = [
        {"role":"system","content":[{"type":"text","text":SYSTEM_INSTRUCT}]},
        {"role":"user","content":[
            {"type":"image","image":img},
            {"type":"text","text":user_text}
        ]}
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[img], return_tensors="pt").to(device)

    with torch.no_grad():
        out_ids = model.generate(**inputs, max_new_tokens=2, do_sample=False,
                                 eos_token_id=processor.tokenizer.eos_token_id)
    # generate() returns prompt + continuation; keep only the continuation so
    # the parser never sees prompt text (which itself contains "a, b, c, d").
    new_ids = out_ids[:, inputs["input_ids"].shape[1]:]
    output_text = processor.batch_decode(new_ids, skip_special_tokens=True)[0]
    preds.append(extract_choice(output_text))

# Write the submission file; the same path is used for saving and for the message
submission_path = "/content/drive/MyDrive/Colab Notebooks/ssafy_ai_chllenge/submission_qwen3_8b_pp3_3_image_1024.csv"
submission = pd.DataFrame({"id": test_df["id"], "answer": preds})
submission.to_csv(submission_path, index=False)
print("Saved " + submission_path)

# 런타임 종료

In [None]:
# Release the Colab runtime once all cells above have finished
from google.colab import runtime
runtime.unassign()