In [None]:
import torch
from transformers import (BlipProcessor,
                          BlipForImageTextRetrieval,
                          BlipForConditionalGeneration,
                          BlipForQuestionAnswering)
from PIL import Image
import requests
import numpy as np
import glob
import os
import io
import json
from tqdm.auto import tqdm

# Root folder of the image data; a landmark sub-folder is joined onto it later.
DATA_DIR = "/content/group5_project/data"

# Pick the compute device: CUDA when torch reports it as available, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the BLIP visual-question-answering checkpoint and its processor, then
# move the model onto the selected device.
model_name = "ybelkada/blip-vqa-base"
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForQuestionAnswering.from_pretrained(model_name)
model = model.to(device)

print(f"BLIP 모델 로드 완료. (사용 장치: {device})")

In [None]:
# Question bank for the VQA model: maps a landmark name to the list of
# English yes/no-style questions asked about each image of that landmark.
# The keys are looked up with LANDMARK_NAME, so they presumably have to match
# the landmark folder names under DATA_DIR.
questions = {
    "피노키오": [
        "Is there a prominent human-shaped statue in the picture?",
        "If there is a statue, is it a character from a fairy tale?",
        "Does the statue have a particularly long nose?",
        "Is the statue wearing green-colored clothes?",
        "Are there flower patterns on the statue's clothes?",
        "Is the statue holding something?",
        "Is the object the statue is holding a book?",
        "Is the hat the statue is wearing a pointed cone shape?",
        "Are the statue's arms relatively thin and long?",
        "Is the statue holding up a finger with its right hand, like the number 1?",
    ],
    "네모탑": [
        "Is this object shaped like a tower?",
        "Is this tower shaped like stacked square boxes?",
        "Is the height much longer than the width?",
        "Is this object made of wood?",
        "Is the color reddish-brown?",
        "Does the color look like rusty metal?",
        "Is the surface smooth?",
        "Does the sculpture only use dark colors?",
        "Does this sculpture have many layers?",
        "Is the number of layers exactly 5?",
        "Do the layers get smaller as they go up?",
        "Is the bottom part the widest?",
        "Does each layer have a square opening on the front?",
        "Does the height of this sculpture look more than twice a person's height?",
    ],
    "지혜의숲 조각상": [
        "Is this picture taken inside a building?",
        "Does the background show a place with many books?",
        "Do wooden bookshelves cover the walls?",
        "Does the room have a high ceiling?",
        "Is the floor made of wood?",
        "Is the main object a human-shaped sculpture?",
        "Is the sculpture sitting down?",
        "Does the sculpture have its hands near its face?",
        "Is the sculpture holding binoculars?",
        "Is the sculpture looking straight ahead?",
        "Does the sculpture wear a suit?",
        "Is the suit covered in many bright colors?",
        "Does the suit have patterns like flowers?",
        "Does the sculpture's suit have a lot of green and yellow?",
        "Is the sculpture's face also painted with patterns?",
        "Is the sculpture sitting on a square base?",
        "Does the base look like it is made of stacked small blocks?",
        "Does the base have letters written on it?",
        "Is a small container placed next to the sculpture?",
        "Does that small container have patterns on it too?",
    ],
}

In [None]:
# Name of the landmark folder to scan; also used as the key into `questions`.
LANDMARK_NAME = "네모탑"
# Directory that is searched for this landmark's images.
landmark_dir = os.path.join(DATA_DIR, LANDMARK_NAME)

In [None]:
# Collect the image files that sit directly inside landmark_dir.
# The recognised formats are kept as glob-style patterns so the list stays
# easy to read and extend.
image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.webp", "*.jfif"]

# Glob patterns are matched case-sensitively on case-sensitive filesystems, so
# "*.jpg" on its own would silently skip "photo.JPG". Derive the bare suffixes
# and compare them against the lower-cased path instead.
image_suffixes = tuple(pattern.lstrip("*").lower() for pattern in image_extensions)

# Sorted so the processing order does not depend on directory listing order.
image_files = sorted(
    path
    for path in glob.glob(os.path.join(landmark_dir, "*"))
    if path.lower().endswith(image_suffixes)
)

if not image_files:
    print(f"경고: '{landmark_dir}' 폴더에서 이미지를 찾을 수 없습니다. 경로를 확인해주세요.")
else:
    print(f"'{LANDMARK_NAME}' 폴더에서 총 {len(image_files)}개의 이미지를 찾았습니다.")

# --- 5. Run VQA and collect the answers ---
# Every distinct answer string produced across all images and questions.
all_answers_set = set()

# Look the question list up once, outside the per-image error handling, so a
# LANDMARK_NAME with no entry in `questions` fails immediately instead of
# being reported once per image.
landmark_questions = questions[LANDMARK_NAME]

# tqdm wraps the file list to show a progress bar.
for img_path in tqdm(image_files, desc=f"'{LANDMARK_NAME}' 이미지 처리 중"):
    try:
        image = Image.open(img_path).convert("RGB")
        print(f"\n--- {os.path.basename(img_path)} 처리 중 ---")
        # `display` is not imported in this file; presumably the notebook
        # (IPython) built-in that renders the thumbnail inline.
        display(image.resize((300, 300)))

        yes_count = 0
        no_count = 0

        for question in landmark_questions:
            # The processor encodes the image together with the question text,
            # so the inputs have to be rebuilt for every question.
            inputs = processor(image, question, return_tensors="pt").to(device)

            # A small max_length nudges the model towards one-word answers.
            out = model.generate(**inputs, max_length=5)
            answer = processor.decode(out[0], skip_special_tokens=True).strip().lower()

            print(f"  Q: {question}")
            print(f"  A: {answer}")

            all_answers_set.add(answer)

            # Tally the plain 'yes' / 'no' answers; anything else is only
            # recorded in all_answers_set.
            if answer == 'yes':
                yes_count += 1
            elif answer == 'no':
                no_count += 1

        print(f"\n  YES: {yes_count}, NO: {no_count}")

    except (OSError, RuntimeError) as e:
        # Best effort per image: unreadable/corrupt files (OSError) and
        # torch runtime failures are reported and skipped. Programming errors
        # such as NameError are deliberately not caught, so they surface
        # instead of being printed once per image.
        print(f"{img_path} 처리 중 오류 발생: {e}")

# --- 6. Print the collected answers ---
# Sort the answers so the printed list is reproducible: a list built straight
# from a set of strings has no stable order from one interpreter run to the
# next.
final_captions = sorted(all_answers_set)

print("\n" + "="*40)
print(f"'{LANDMARK_NAME}' 랜드마크의 최종 Ground Truth 캡션 후보 목록")
print("="*40)
print(final_captions)