In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the BLIP captioning processor and model once, at module level.
CAPTION_MODEL_ID = "Salesforce/blip-image-captioning-large"
caption_device = "cpu" if not torch.cuda.is_available() else "cuda"

caption_processor = BlipProcessor.from_pretrained(CAPTION_MODEL_ID)

# Half precision when running on the GPU, full precision on the CPU.
caption_model = BlipForConditionalGeneration.from_pretrained(
    CAPTION_MODEL_ID,
    torch_dtype=torch.float32 if caption_device != "cuda" else torch.float16,
)
# Move the weights to the selected device and switch to inference mode.
caption_model = caption_model.to(caption_device).eval()

@torch.no_grad()
def caption_image(pil_img, max_new_tokens: int = 40, resize_max: int = 1024):
    """Generate a caption for a PIL image with the module-level BLIP model.

    Args:
        pil_img: PIL image to caption. The caller's image is not modified;
            downscaling is performed on a copy.
        max_new_tokens: Upper bound on the number of tokens to generate.
        resize_max: Maximum width/height (in pixels) of the image handed to the
            processor. Larger images are shrunk first to save captioning time
            and memory. Pass ``None`` to skip this step.

    Returns:
        The decoded caption with surrounding whitespace stripped. A "." is
        appended unless the caption is empty or already ends with ".", "!"
        or "?".
    """
    # Shrink oversized images on a copy so the caller's image stays untouched.
    if resize_max is not None:
        pil_img = pil_img.copy()
        pil_img.thumbnail((resize_max, resize_max))

    inputs = caption_processor(images=pil_img, return_tensors="pt")
    # Move to the model's device AND dtype: on CUDA the model is loaded in
    # float16 while the processor emits float32 pixel values. BatchFeature.to
    # only casts floating-point tensors, so this is a no-op on CPU/float32.
    inputs = inputs.to(caption_device, caption_model.dtype)

    out_ids = caption_model.generate(**inputs, max_new_tokens=max_new_tokens)
    text = caption_processor.tokenizer.decode(out_ids[0], skip_special_tokens=True).strip()

    # Make sure a non-empty caption ends with sentence punctuation.
    if text and text[-1] not in ".!?":
        text += "."
    return text

def one_sentence(text: str):
    """Return ``text`` cut after its first "." (inclusive).

    Optional helper for when only the first sentence of a caption is wanted.
    If ``text`` contains no ".", it is returned unchanged.
    """
    head, dot, _rest = text.partition(".")
    if not dot:
        # No period anywhere: nothing to trim.
        return text
    return head + dot

In [None]:
from tqdm import tqdm
from PIL import Image
import os

# Base directory that relative image paths are joined onto; it is the default
# `base_dir` of augment_questions_inplace.
# NOTE(review): hard-coded absolute path, apparently a Colab Google Drive
# mount -- adjust when running in another environment.
BASE_DIR = "/content/drive/MyDrive/AI챌린지/2025-ssafy-14"

def augment_questions_inplace(df, base_dir=BASE_DIR, store_depiction_col: bool = True, one_sentence_only: bool = False):
    """Prefix every question in ``df`` with a caption of its image.

    For each row, the image at ``os.path.join(base_dir, row["path"])`` is
    captioned with ``caption_image`` and the ``question`` column is overwritten
    with ``"This picture seems like {caption} {question}"``.

    Args:
        df: DataFrame with a ``path`` column (image path relative to
            ``base_dir``) and a ``question`` column. It is modified in place.
        base_dir: Directory the ``path`` values are joined onto.
        store_depiction_col: When True, the captions are also stored in a
            ``depiction`` column for reference.
        one_sentence_only: When True, each caption is trimmed with
            ``one_sentence`` before use.

    Returns:
        The same ``df`` object, after mutation.

    Note:
        The function is not idempotent: calling it again on an already
        augmented frame prepends another caption to each question.
    """
    orig_questions = df["question"].astype(str).tolist()

    # Several rows may point at the same image; caption each distinct path
    # only once and reuse the result for the remaining rows.
    caption_by_path = {}
    depictions = []

    for raw_path in tqdm(df["path"], total=len(df), desc="Captioning"):
        rel_path = str(raw_path).strip()
        dep = caption_by_path.get(rel_path)

        if dep is None:
            img_path = os.path.join(base_dir, rel_path)
            # The context manager closes the underlying file right away;
            # convert() returns an independent, fully loaded image.
            with Image.open(img_path) as src:
                img = src.convert("RGB")

            dep = caption_image(img)  # English caption
            if one_sentence_only:
                dep = one_sentence(dep)
            caption_by_path[rel_path] = dep

        depictions.append(dep)

    if store_depiction_col:
        df["depiction"] = depictions  # for reference only (pass False to skip)

    # Overwrite the existing question column with the caption-prefixed text.
    df["question"] = [f"This picture seems like {d} {q}" for d, q in zip(depictions, orig_questions)]

    return df

In [None]:
# Apply the same caption augmentation to both the train and the test frame.
# Re-running this cell on already-augmented frames prepends the caption again.
train_df = augment_questions_inplace(
    train_df,
    base_dir=BASE_DIR,
    store_depiction_col=False,
    one_sentence_only=False,
)
test_df = augment_questions_inplace(
    test_df,
    base_dir=BASE_DIR,
    store_depiction_col=False,
    one_sentence_only=False,
)