In [None]:
!pip install -q -U "transformers>=4.46" "accelerate>=0.34" "peft>=0.11" "bitsandbytes>=0.43" "sentencepiece" "protobuf>=5.29.1,<6"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m125.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m116.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Sanity check: import the installed packages and print their versions.
# If bitsandbytes is not importable in the running kernel, this cell fails
# with ModuleNotFoundError.
import bitsandbytes as bnb
import transformers

print("bitsandbytes version:", bnb.__version__)
print("transformers version:", transformers.__version__)

ModuleNotFoundError: No module named 'bitsandbytes'

In [None]:
# ==========================
# Cell 1: Imports & Safety
# ==========================
import base64
import csv
import hashlib
import io
import json
import os
import re
import warnings
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import Request, urlopen

import pandas as pd
import torch
from peft import PeftModel
from PIL import Image, ImageFile, UnidentifiedImageError
from tqdm import tqdm
from transformers import AutoModelForImageTextToText, AutoProcessor, AutoTokenizer

# CUDA allocator setting, passed to PyTorch through the environment.
# NOTE(review): this is assigned after `import torch`; presumably it is still
# honoured because no CUDA work has happened yet — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb=128"
# PIL safety knobs: accept truncated image files instead of raising, and
# silence UserWarnings that originate from the PIL module.
ImageFile.LOAD_TRUNCATED_IMAGES = True
warnings.filterwarnings("ignore", category=UserWarning, module="PIL")

In [None]:
# ==========================
# Cell 2: CONFIG & SYSTEM PROMPT
# ==========================

# --- Model and path settings ---
BASE_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Load the fine-tuning results (output directory and adapter checkpoint)
FT_OUT_DIR = "/content/drive/MyDrive/Colab Notebooks/wook/fine-tuning/datasets/qlora/qlora-out"
ADAPTER_DIR = f"{FT_OUT_DIR}/checkpoint-4000_backup"
# NOTE(review): PROC_DIR is not referenced by the other visible cells — confirm whether it is still needed.
PROC_DIR = FT_OUT_DIR

# dtype handed to from_pretrained(torch_dtype=...) when the base model is loaded
DTYPE = torch.bfloat16

TEST_PATH = "/content/drive/MyDrive/Colab Notebooks/wook/deeplearningchallenge/deep_chal_multitask_dataset_test.parquet"
OUT_DIR = "/content/drive/MyDrive/Colab Notebooks/wook/output"
# Prefix joined onto relative image paths in load_image
IMG_BASE = "/content"
# Timeout passed to urlopen when an image is given as an http(s) URL
URL_TIMEOUT = 5
# Upper bound on an image's longer side, enforced by _cap_max_side
MAX_SIDE_HARD = 3500

# --- Image and generation settings ---
# Longer-side target that finalize_image shrinks images to
LONG_SIDE = 1280
MAX_NEW_TOKENS = 512
# infer_one enables sampling only when TEMPERATURE is greater than 0
TEMPERATURE = 0.2
TOP_P = 0.9
# --- System prompt ---
# Sent as the system message of every request built in infer_one.
# The trailing .strip() removes the leading/trailing newline of the literal.
SYSTEM_PROMPT = """
You are a unified vision–language inference assistant for five tasks:
[captioning, vqa, summarization, text_qa, math_reasoning].

Always reply in English and output ONLY the answer according to task-specific rules.
If the answer cannot be determined from the provided input, output exactly: unknown.

Task-specific output rules:

**captioning**:
- MUST start with "The image is" or "The image shows".
- 1–3 sentences (100–200 words), neutral and factual.
- Describe ONLY what is visible: subjects, colors, composition, background, style; include spatial relations (“in the center”, “on the right side”, “in the background”).
- For book/magazine covers: include ALL clearly legible text (title, author, publisher, tagline) verbatim with original casing/punctuation; if a fragment is partially unreadable, acknowledge unreadability and omit the uncertain part.
- For illustrations: state a style label such as “vintage pulp illustration”, “digital illustration”, or “cartoon-like”.
- Avoid speculation (no invented weapons, blood, locations, dates, brands, or narratives).
- Optionally end with a brief overall impression (e.g., “The image has a futuristic and sci-fi feel to it”).

**vqa**:
- Output ONLY the extracted text or answer from the image
- For text extraction: output exactly as shown (preserve capitalization, punctuation)
- For yes/no questions: use lowercase 'yes' or 'no'
- For counting: use digits only
- For names/titles: preserve exact formatting
- No additional words or explanations

**math_reasoning**:
- Show step-by-step calculation using the format: value = <<calculation>>result
- Each line should explain one step
- Use the exact format for calculations: <<operation>>
- End with: #### [final_answer]
- Example format:
  Mimi has 2 x 12 = <<2*12=24>>24 sea shells.
  Kyle has 24 x 2 = <<24*2=48>>48 sea shells.
  Leigh has 48 / 3 = <<48/3=16>>16 sea shells.
  #### 16

**summarization**:
- Single paragraph of 140–180 words.
- Start EXACTLY with “[Act Name] - ” where [Act Name] is copied verbatim from the source.
- Prefer canonical legislative verbs AS WRITTEN in the source: Amends, Authorizes, Directs, Establishes, Requires, Allows, Provides, Designates, Decreases, Earmarks, Extends.
- Copy proper nouns, program/account names, agencies, and fiscal years (e.g., FY2000–FY2001) verbatim. Include fees, dollar amounts, headcounts/limits if present.
- Summarize systematically in this order if available: (1) scope/purpose, (2) appropriations/funding, (3) authorities/requirements, (4) eligibility/coverage, (5) timelines/reporting/oversight, (6) penalties/fees/exemptions.
- If brief section tags like “(Sec. X)” appear in the source, you MAY include them verbatim.
- Do NOT invent provisions; do NOT use “TABLE OF CONTENTS” or headings; no quotations.
- If the act name cannot be determined from the input, output exactly: unknown.

**text_qa**:
- Return a JSON object with exactly these keys:
  {'input_text': [list of answers], 'answer_start': [list of start positions], 'answer_end': [list of end positions]}
- Extract answers directly from the source text
- For multiple questions, maintain order correspondence
- Use exact text spans from the passage
- Format: valid JSON without any additional text

Do not add any explanations, labels, or metadata beyond the specified format for each task.
""".strip()

print("Configuration and System Prompt are set.")

Configuration and System Prompt are set.


SYSTEM_PROMPT = """
You are a unified vision–language inference assistant for five tasks:
[captioning, vqa, summarization, text_qa, math_reasoning].
Always reply in English and output ONLY the answer—no prefaces, labels, or extra words.
If the answer cannot be determined from the provided input, output exactly: unknown.

Task-specific output rules:
- captioning: 1–2 concise sentences describing visible content only. No guesses about time/brand/camera; avoid 'image of'.
- vqa: a single short answer. Use 'yes' or 'no' for polar questions; numbers as digits; one noun phrase otherwise; no punctuation.
- text_qa: extractive-style short answer (1–8 words) from the text; no full sentences unless necessary.
- summarization: one coherent paragraph (~130–180 words), neutral tone; capture main entities, events, and outcomes; no metadata or quotes.
- math_reasoning: give ONLY the final answer (and units if explicitly required). For multiple choice, output just the option letter.
Do not explain steps or reasoning unless explicitly asked in the user message.
""".strip()

In [None]:
# ==========================
# Cell 3: Helper Functions
# ==========================

def safe_to_csv_utf8(df: pd.DataFrame, path: str):
    """Write a two-column submission frame to `path` as UTF-8 CSV.

    Works on a copy of `df`. Column names are cleaned (BOM removed, stripped,
    lower-cased) and must then be exactly ``["id", "output"]``; both columns
    are cast to ``str`` before writing. Fields are quoted only when needed.

    Raises:
        AssertionError: if the cleaned column names are not ``["id", "output"]``.
    """
    frame = df.copy()
    frame.columns = [name.replace("\ufeff", "").strip().lower() for name in frame.columns]
    cleaned_names = frame.columns.tolist()
    assert cleaned_names == ["id", "output"], f"Columns must be ['id','output'], got {cleaned_names}"

    for column in ("id", "output"):
        frame[column] = frame[column].astype(str)

    common = {"index": False, "encoding": "utf-8", "quoting": csv.QUOTE_MINIMAL}
    try:
        # Force "\n" line endings where the installed pandas accepts this keyword.
        frame.to_csv(path, lineterminator="\n", **common)
    except TypeError:
        # Keyword not accepted: fall back to the default line terminator.
        frame.to_csv(path, **common)

def assert_submission_utf8_ok(path: str):
    """Check that the CSV at `path` is valid UTF-8 with an ``id,output`` header.

    The file is read once as bytes, decoded as UTF-8 and parsed with the csv
    module. On success a one-line confirmation with the data-row count is
    printed.

    Raises:
        AssertionError: if the bytes are not valid UTF-8, if the file contains
            no rows at all, or if the (stripped, lower-cased) header is not
            ``["id", "output"]``.
    """
    raw = Path(path).read_bytes()
    try:
        text = raw.decode("utf-8")
    except UnicodeDecodeError as e:
        raise AssertionError(f"Not UTF-8: {e}") from e

    # newline="" keeps line breaks untranslated so the csv module can handle
    # newlines embedded in quoted fields itself.
    rows = list(csv.reader(io.StringIO(text, newline="")))
    if not rows:
        # An empty file has no header row; report it as a validation failure
        # rather than letting rows[0] raise IndexError.
        raise AssertionError(f"Empty submission file: {path}")

    header = rows[0]
    if [h.strip().lower() for h in header] != ["id","output"]:
        raise AssertionError(f"Header mismatch: {header}")
    print(f"Submission '{path}' looks OK. ({len(rows)-1} rows)")

# Characters allowed in a base64 payload, including CR/LF from line-wrapped encodings.
_b64_re = re.compile(r'^[A-Za-z0-9+/=\n\r]+$')
def looks_like_base64(s: str, min_len: int = 128) -> bool:
    """Return True if `s` looks like a base64-encoded PNG or JPEG image.

    Args:
        s: candidate value; anything that is not a ``str`` yields False.
        min_len: minimum length of `s` for it to be considered at all.

    Returns:
        True when `s` is long enough, consists only of base64 characters and
        line breaks, and the decoded start of the payload begins with the PNG
        or JPEG magic bytes; False otherwise.
    """
    if not (isinstance(s, str) and len(s) >= min_len and _b64_re.match(s)):
        return False
    # Line breaks pass the regex but strict decoding (validate=True) rejects
    # them, so remove them first; otherwise line-wrapped payloads are missed.
    compact = s.replace("\n", "").replace("\r", "")
    try:
        # Only the first 4096 characters (a multiple of 4) are decoded: enough
        # to inspect the magic bytes without decoding the whole payload.
        head = base64.b64decode(compact[:4096], validate=True)
    except ValueError:  # binascii.Error is a ValueError subclass
        return False
    return head.startswith((b"\x89PNG", b"\xff\xd8"))

def is_url(path: str) -> bool:
    """Tell whether `path` (converted with ``str``) parses as an http or https URL.

    Any error raised while parsing is treated as "not a URL".
    """
    try:
        scheme = urlparse(str(path)).scheme
    except Exception:
        return False
    return scheme in ("http", "https")

def build_user_prompt(task: str, input_type: str, the_input: str, question: str | None) -> str:
    t = (task or "").strip().lower()
    it = (input_type or "text").strip().lower()
    q = (question or "").strip()
    lines = [
        f"Task: {t}",
        f"InputType: {it}",
        f"Question: {q}" if q else "Question:",
    ]
    lines.append("Input:\n" + (the_input or "") if it == "text" else "Input: <image>")
    return "\n".join(lines)

def _cap_max_side(img: Image.Image, cap=MAX_SIDE_HARD) -> Image.Image:
    """Return `img` unchanged if its longer side is at most `cap`; otherwise
    return a copy shrunk with LANCZOS via ``thumbnail((cap, cap))``."""
    longest_side = max(img.size)
    if longest_side > cap:
        shrunk = img.copy()  # thumbnail() works in place, so keep the caller's image intact
        shrunk.thumbnail((cap, cap), Image.LANCZOS)
        return shrunk
    return img

def _resize_keep_ratio(img: Image.Image, long_side: int) -> Image.Image:
    """Shrink a copy of `img` with LANCZOS ``thumbnail`` so that neither side
    exceeds `long_side`; images already within the limit are returned as-is."""
    needs_resize = max(img.size) > long_side
    if not needs_resize:
        return img
    resized = img.copy()  # thumbnail() mutates in place
    bounding_box = (long_side, long_side)
    resized.thumbnail(bounding_box, Image.LANCZOS)
    return resized

# Directory in which finalize_image stores resized PNGs for reuse.
# It is created as soon as this cell runs.
RESIZE_CACHE_DIR = "/tmp/img_resized_cache"
os.makedirs(RESIZE_CACHE_DIR, exist_ok=True)

def _hash_image_bytes(img: Image.Image) -> str:
    """Return the MD5 hex digest of `img` serialised as a non-optimised PNG."""
    buffer = io.BytesIO()
    try:
        img.save(buffer, format="PNG", optimize=False)
        png_bytes = buffer.getvalue()
    finally:
        buffer.close()
    return hashlib.md5(png_bytes).hexdigest()

def finalize_image(img: Image.Image) -> Image.Image:
    """Bring `img` down to the size fed to the model, using an on-disk cache.

    The image is first capped at MAX_SIDE_HARD. If its longer side is already
    within LONG_SIDE it is returned directly. Otherwise the resized version is
    looked up in RESIZE_CACHE_DIR under a key made of the image hash and the
    target size; on a miss the image is resized, stored and returned.

    Returns:
        The (possibly resized) image. A cache hit is returned converted to RGB.
    """
    img = _cap_max_side(img, MAX_SIDE_HARD)
    target = LONG_SIDE
    if max(img.size) <= target:
        return img

    cache_key = _hash_image_bytes(img) + f"_{target}"
    path = os.path.join(RESIZE_CACHE_DIR, cache_key + ".png")
    if os.path.exists(path):
        # convert() loads the pixel data, so the file can be closed right away.
        with Image.open(path) as cached:
            return cached.convert("RGB")

    out = _resize_keep_ratio(img, target)
    # Write to a temporary name and rename into place. An interrupted save
    # would otherwise leave a partial PNG under the final name, which a later
    # cache hit would read as a valid image when truncated loading is enabled.
    tmp_path = f"{path}.{os.getpid()}.tmp"
    try:
        out.save(tmp_path, format="PNG")
        os.replace(tmp_path, path)
    finally:
        # Only present if save/replace did not complete.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return out

def load_image(input_obj) -> Image.Image:
    """Load an RGB image from bytes, a data URI, base64 text, a URL or a path.

    Supported inputs:
      * ``bytes`` / ``bytearray``: raw encoded image data;
      * ``str`` starting with ``data:image``: the part after the first comma
        is base64-decoded;
      * ``str`` accepted by looks_like_base64: base64-decoded as a whole;
      * ``str`` that is an http(s) URL: downloaded with URL_TIMEOUT;
      * any other ``str``: a file path, joined onto IMG_BASE when relative.

    The decoded image is converted to RGB and capped with _cap_max_side.

    Returns:
        The loaded image. On any failure, or for unsupported input types, a
        1x1 black RGB placeholder is returned and a RuntimeWarning describing
        the cause is emitted, so bad inputs do not go unnoticed.
    """
    def _decode(source) -> Image.Image:
        # `source` is a filesystem path or a binary file-like object.
        # convert() loads the pixel data, so the source can be closed on exit.
        with Image.open(source) as opened:
            return _cap_max_side(opened.convert("RGB"))

    try:
        if isinstance(input_obj, (bytes, bytearray)):
            return _decode(io.BytesIO(input_obj))
        if isinstance(input_obj, str):
            if input_obj.startswith("data:image"):
                b64 = input_obj.split(",", 1)[1]
                return _decode(io.BytesIO(base64.b64decode(b64)))
            if looks_like_base64(input_obj):
                return _decode(io.BytesIO(base64.b64decode(input_obj)))
            p = input_obj
            if is_url(p):
                req = Request(p, headers={"User-Agent": "Mozilla/5.0"})
                with urlopen(req, timeout=URL_TIMEOUT) as r:
                    raw = r.read()
                return _decode(io.BytesIO(raw))
            if not os.path.isabs(p) and IMG_BASE:
                p = os.path.join(IMG_BASE, p)
            return _decode(p)
        reason = f"unsupported input type {type(input_obj).__name__}"
    except Exception as exc:
        # Best-effort by design: one bad sample must not abort the whole run.
        reason = f"{type(exc).__name__}: {exc}"

    # Show only a short prefix of string inputs; payloads can be very large.
    shown = repr(input_obj[:80]) if isinstance(input_obj, str) else type(input_obj).__name__
    warnings.warn(
        f"load_image: using 1x1 placeholder for input {shown} ({reason})",
        RuntimeWarning,
        stacklevel=2,
    )
    return Image.new("RGB", (1, 1), (0, 0, 0))

print("Helper functions are defined.")

Helper functions are defined.


In [None]:
# ==========================
# Cell 4: Load Model & Processor
# ==========================
# Reuse a model/processor left in the kernel by an earlier run of this cell and
# load only when one of them is missing. Unconditionally resetting both to None
# made the guard always true, so the 7B model was reloaded on every re-run.
# To force a reload (e.g. after changing ADAPTER_DIR), set `model = None` first.
model = globals().get("model")
proc = globals().get("proc")
if model is None or proc is None:
    print(f"Loading model '{BASE_ID}'...")
    base = AutoModelForImageTextToText.from_pretrained(
        BASE_ID, device_map="auto", trust_remote_code=True, torch_dtype=DTYPE
    )
    proc = AutoProcessor.from_pretrained(BASE_ID, trust_remote_code=True)
    # If the fine-tuning output directory contains tokenizer files, use that
    # tokenizer in place of the base processor's tokenizer.
    tok_files = ["tokenizer.json", "tokenizer.model", "vocab.json"]
    if any(os.path.exists(os.path.join(FT_OUT_DIR, f)) for f in tok_files):
        tok = AutoTokenizer.from_pretrained(FT_OUT_DIR, trust_remote_code=True, use_fast=False)
        proc.tokenizer = tok
    # Attach the adapter checkpoint when one is configured; otherwise run the base model.
    if ADAPTER_DIR:
        model = PeftModel.from_pretrained(base, ADAPTER_DIR, is_trainable=False)
    else:
        model = base
    model = model.eval()
    print("Model and processor loaded.")
else:
    print("Model and processor are already loaded.")

Loading model 'Qwen/Qwen2.5-VL-7B-Instruct'...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


chat_template.json: 0.00B [00:00, ?B/s]

Model and processor loaded.


In [None]:
# ==========================
# Cell 5: Inference Function
# ==========================
@torch.no_grad()
def infer_one(model, proc, task: str, input_type: str, the_input, question: str = "") -> str:
    """Run one sample through the model and return the stripped generated text.

    Args:
        model: model exposing ``generate`` and ``parameters``.
        proc: processor exposing ``apply_chat_template``, ``batch_decode``,
            ``tokenizer`` and a ``__call__`` that builds the input batch.
        task: task name, forwarded to build_user_prompt.
        input_type: "image" (case/whitespace-insensitive) selects the image
            path; anything else is handled as text.
        the_input: image source accepted by load_image, or the text input.
        question: optional question forwarded to build_user_prompt.

    Returns:
        The decoded newly generated tokens (prompt excluded), stripped.
    """
    # Normalise the same way build_user_prompt does, so the prompt placeholder
    # and the attached image can never disagree because of stray whitespace.
    is_image = (input_type or "text").strip().lower() == "image"

    # build_user_prompt ignores the payload for non-text inputs, so avoid
    # stringifying a potentially large image payload.
    prompt_input = "" if is_image else str(the_input)
    user_prompt = build_user_prompt(task, input_type, prompt_input, question)

    if is_image:
        img = finalize_image(load_image(the_input))
        user_content = [{"type": "image"}, {"type": "text", "text": user_prompt}]
        images = [img]
    else:
        user_content = [{"type": "text", "text": user_prompt}]
        images = None

    messages = [
        {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
        {"role": "user",   "content": user_content},
    ]

    text = proc.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    kwargs = dict(text=[text], return_tensors="pt", padding=True)
    if images is not None:
        kwargs["images"] = images
    batch = proc(**kwargs)

    # Move every tensor of the batch to the device of the model's first parameter.
    device = next(model.parameters()).device
    for k, v in list(batch.items()):
        if isinstance(v, torch.Tensor):
            batch[k] = v.to(device)

    input_len = batch["input_ids"].shape[1]

    sampling = bool(TEMPERATURE and TEMPERATURE > 0.0)
    gen_kwargs = dict(
        do_sample=sampling,
        max_new_tokens=MAX_NEW_TOKENS,
        pad_token_id=proc.tokenizer.eos_token_id,
        eos_token_id=proc.tokenizer.eos_token_id,
    )
    if sampling:
        # Sampling parameters are only meaningful when sampling is enabled.
        gen_kwargs.update(temperature=TEMPERATURE, top_p=TOP_P)

    out_ids = model.generate(**batch, **gen_kwargs)
    # generate() returns prompt + continuation; keep only the new tokens.
    gen_only = out_ids[:, input_len:]
    out = proc.batch_decode(gen_only, skip_special_tokens=True)[0]
    return out.strip()

print("Inference function is defined.")

Inference function is defined.


In [None]:
# ==========================
# Cell 6: Load Data & Run Inference
# ==========================
df = pd.read_parquet(TEST_PATH)

# Tolerate the misspelled column name "input_tpye" by renaming it.
if "input_type" not in df.columns and "input_tpye" in df.columns:
    df = df.rename(columns={"input_tpye": "input_type"})
df = df.reset_index(drop=True)
# Submission ids are the row positions, as strings.
df.insert(0, "id", df.index.astype(str))

print(f"Total rows to process: {len(df):,}")

results = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    # A missing question may arrive as None, NaN or pd.NA depending on how the
    # column was read; build_user_prompt only copes with str/None, so map
    # every non-string value to an empty question.
    question = row.get("question", "")
    if not isinstance(question, str):
        question = ""
    output = infer_one(
        model,
        proc,
        task=row["task"],
        input_type=row["input_type"],
        the_input=row["input"],
        question=question,
    )
    results.append({"id": str(row["id"]), "output": output})

submission_df = pd.DataFrame(results)
print("\nInference complete.")

# ==========================
# Cell 7: Save and Verify Submission
# ==========================
os.makedirs(OUT_DIR, exist_ok=True)
submission_path = os.path.join(OUT_DIR, "submission.csv")

safe_to_csv_utf8(submission_df, submission_path)
try:
    assert_submission_utf8_ok(submission_path)
except AssertionError as e:
    # A failed verification is reported but does not stop the cell.
    print(f"Verification failed: {e}")

print(f"\nSubmission file saved to: {submission_path}")

# Only the last expression of a cell is displayed, so the preview goes here.
submission_df.head()

Total rows to process: 50


100%|██████████| 50/50 [09:54<00:00, 11.88s/it]


Inference complete.
Submission '/content/drive/MyDrive/Colab Notebooks/wook/output/submission.csv' looks OK. (50 rows)

Submission file saved to: /content/drive/MyDrive/Colab Notebooks/wook/output/submission.csv



