# BLIP QA Automated Test Suite

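This notebook runs a full suite of tests against the landmark QA dataset using the BLIP VQA evaluation functions defined inline in Section 2 (the cells shown here do not invoke an external `run_tests.py` script). It covers: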
1.  **Positive Tests:** Testing each landmark's questions against its own image.
2.  **Negative Tests:** Testing each landmark's questions against the images of every other landmark to check for distinctiveness.

The results are collected and displayed in summary tables using pandas.

## 1. Setup and Configuration

In [8]:
# Mount Google Drive when running inside Colab. On a local machine or server the
# google.colab package does not exist, so the mount is skipped automatically
# instead of requiring this cell to be commented out by hand.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [9]:
import os
import json
import pandas as pd
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
import warnings

# --- 1. Candidate project roots, one per known environment ---
COLAB_DRIVE_ROOT = "/content/drive/Othercomputers/내 컴퓨터/데이콘 출판마을 프로젝트/github"
SERVER_ROOT = "/content/group5_project"

# --- 2. Use the first candidate that exists; otherwise fall back to the working directory ---
for _candidate_root, _detected_message in (
    (COLAB_DRIVE_ROOT, "환경 감지: Google Drive Colab"),
    (SERVER_ROOT, "환경 감지: Server (group5_project)"),
):
    if os.path.exists(_candidate_root):
        PROJECT_ROOT = _candidate_root
        print(_detected_message)
        break
else:
    # Neither known root is present, so treat the current directory as the project.
    PROJECT_ROOT = os.getcwd()
    print("환경 감지: Local workspace")

# --- Configuration ---
DATA_ROOT = os.path.join(PROJECT_ROOT, "data")
QA_FILE = os.path.join(DATA_ROOT, "landmark_qa_labeled_updated.json")

# Silence the TypedStorage deprecation warning so it does not clutter the test logs.
warnings.filterwarnings("ignore", message=".*TypedStorage is deprecated.*")

# Load the QA file once; its top-level keys are used as the landmark names.
with open(QA_FILE, 'r', encoding='utf-8') as qa_handle:
    QA_DATA_SNAPSHOT = json.load(qa_handle)
LANDMARK_NAMES = [*QA_DATA_SNAPSHOT.keys()]
print(f"QA landmarks loaded: {len(LANDMARK_NAMES)} entries")

# File suffixes (compared case-insensitively) that count as candidate images.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".jfif", ".heic", ".webp")


def collect_landmark_images(data_root, landmark_names, max_images=5):
    """Collect up to ``max_images`` image paths for each landmark.

    Each landmark is expected to have a folder named after it directly under
    ``data_root``; that folder is searched recursively for files whose name
    ends with one of ``IMAGE_EXTENSIONS``.

    Args:
        data_root: Directory containing one sub-folder per landmark.
        landmark_names: Iterable of landmark (folder) names to scan.
        max_images: Maximum number of paths kept per landmark.

    Returns:
        dict mapping each landmark name to a list of image paths relative to
        ``data_root``. Landmarks with no folder or no images map to an empty
        list, and a warning is printed for them.
    """
    image_map = {}
    for landmark in landmark_names:
        folder = os.path.join(data_root, landmark)
        images = []
        if os.path.isdir(folder):
            for root, dirs, files in os.walk(folder):
                # os.walk visits sub-directories in whatever order the OS lists
                # them. Sorting in place (top-down walk) fixes the traversal
                # order, so the same images are selected on every machine/run.
                dirs.sort()
                for filename in sorted(files):
                    if filename.lower().endswith(IMAGE_EXTENSIONS):
                        rel_path = os.path.relpath(os.path.join(root, filename), data_root)
                        images.append(rel_path)
        if not images:
            print(f"[경고] '{landmark}' 폴더에서 사용할 이미지가 없습니다.")
        image_map[landmark] = images[:max_images]
    return image_map


# Build the per-landmark image pools (at most three images each), then report the setup.
landmark_images = collect_landmark_images(DATA_ROOT, LANDMARK_NAMES, max_images=3)
for _summary_line in (
    "Setup complete.",
    f"QA File: {QA_FILE}",
    f"Data Root: {DATA_ROOT}",
    f"Representative image pools ready for {len(landmark_images)} landmarks (≤3 each)",
):
    print(_summary_line)



환경 감지: Google Drive Colab
QA landmarks loaded: 11 entries
Setup complete.
QA File: /content/drive/Othercomputers/내 컴퓨터/데이콘 출판마을 프로젝트/github/data/landmark_qa_labeled_updated.json
Data Root: /content/drive/Othercomputers/내 컴퓨터/데이콘 출판마을 프로젝트/github/data
Representative image pools ready for 11 landmarks (≤3 each)


## 2. BLIP Model Setup


In [10]:
# Module-level cache: both stay None until load_blip_model() is first called.
processor = None
model = None

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


def load_blip_model():
    """Return the cached BLIP VQA ``(processor, model)`` pair, loading it on first use.

    The pair is stored in the module-level ``processor`` and ``model`` globals;
    the model is moved to ``device`` when it is loaded.
    """
    global processor, model

    # Already loaded: hand back the cached objects.
    if processor is not None and model is not None:
        return processor, model

    checkpoint = "Salesforce/blip-vqa-base"
    processor = BlipProcessor.from_pretrained(checkpoint)
    model = BlipForQuestionAnswering.from_pretrained(checkpoint)
    model.to(device)
    print(f"BLIP VQA model loaded on device: {device}")
    return processor, model


def run_vqa_test(qa_file_path, landmark_name, image_path):
    """Ask every question stored for ``landmark_name`` about one image and score the answers.

    Args:
        qa_file_path: Path to the QA JSON file (read as UTF-8).
        landmark_name: Top-level key in the QA file whose questions are asked.
        image_path: Path of the image shown to the model.

    Returns:
        On success, a dict with ``yes_accuracy``, ``no_accuracy`` and
        ``overall_accuracy`` expressed as percentages (0-100). On failure
        (missing QA file, unknown landmark, missing or unreadable image), a
        dict with a single ``error`` key holding the message.

    Note:
        Each entry under a landmark is unpacked as a ``(question,
        expected_answer)`` pair; entries of any other shape will raise.
        Presumably answers are "yes"/"no" — confirm against the QA file.
    """
    # Validate all inputs first so that a bad path or landmark name fails fast,
    # before paying the cost of loading the model.
    try:
        with open(qa_file_path, 'r', encoding='utf-8') as f:
            qa_data = json.load(f)
    except FileNotFoundError:
        return {"error": f"QA 파일을 찾을 수 없습니다: {qa_file_path}"}

    if landmark_name not in qa_data:
        return {"error": f"랜드마크 '{landmark_name}' 데이터를 찾을 수 없습니다."}

    try:
        # The context manager closes the underlying file handle; convert()
        # returns an independent in-memory copy that stays usable afterwards.
        with Image.open(image_path) as opened_image:
            raw_image = opened_image.convert('RGB')
    except FileNotFoundError:
        return {"error": f"이미지 파일을 찾을 수 없습니다: {image_path}"}
    except OSError as exc:
        # Unreadable, truncated or unsupported files (e.g. formats Pillow has
        # no decoder for) raise OSError subclasses; report them as an error
        # result so one bad image does not abort the whole suite.
        return {"error": f"이미지 파일을 열 수 없습니다: {image_path} ({exc})"}

    processor, model = load_blip_model()

    questions_for_landmark = qa_data[landmark_name]
    correct_yes_count = 0
    total_yes_questions = 0
    correct_no_count = 0
    total_no_questions = 0

    print(f"\n--- Running Evaluation for '{landmark_name}' with image '{os.path.basename(image_path)}' ---")
    for i, (question, expected_answer) in enumerate(questions_for_landmark):
        inputs = processor(raw_image, question, return_tensors="pt").to(device)
        out = model.generate(**inputs, max_new_tokens=10)
        model_answer = processor.decode(out[0], skip_special_tokens=True).strip().lower()

        # Normalize the label the same way as the model output so that values
        # such as "Yes" or "yes " are bucketed and compared correctly.
        expected_answer = str(expected_answer).strip().lower()

        is_correct = (model_answer == expected_answer)
        if expected_answer == 'yes':
            total_yes_questions += 1
            if is_correct:
                correct_yes_count += 1
        else:
            total_no_questions += 1
            if is_correct:
                correct_no_count += 1

        print(f"Q{i+1:02d}: '{question}' -> Model: '{model_answer}', Expected: '{expected_answer}' ({'Correct' if is_correct else 'Incorrect'})")

    # Guard each ratio against an empty bucket to avoid division by zero.
    total_questions = total_yes_questions + total_no_questions
    yes_accuracy = (correct_yes_count / total_yes_questions) if total_yes_questions else 0
    no_accuracy = (correct_no_count / total_no_questions) if total_no_questions else 0
    total_accuracy = ((correct_yes_count + correct_no_count) / total_questions) if total_questions else 0

    print("\n--- Test Summary ---")
    print(f"YES Questions Accuracy: {correct_yes_count}/{total_yes_questions} ({yes_accuracy:.2%})")
    print(f"NO Questions Accuracy: {correct_no_count}/{total_no_questions} ({no_accuracy:.2%})")
    print(f"Overall Accuracy: {total_accuracy:.2%}")
    print("-" * 20)

    # Callers expect percentages rather than 0-1 fractions.
    return {
        "yes_accuracy": yes_accuracy * 100,
        "no_accuracy": no_accuracy * 100,
        "overall_accuracy": total_accuracy * 100
    }



## 3. Helper Function to Run Tests


In [11]:
def run_test(landmark_name, image_name, image_path):
    """Evaluate one landmark's questions on a single image via ``run_vqa_test``.

    ``image_name`` is only used in the log line, to show which landmark the
    image depicts. Returns a dict with "YES Accuracy", "NO Accuracy" and
    "Overall Accuracy" rounded to two decimals, or ``{"error": message}`` when
    the underlying run reports an error (the message is also printed).
    """
    print(f"--- Running test: Questions for [{landmark_name}] with image of [{image_name}] ---")
    raw_scores = run_vqa_test(QA_FILE, landmark_name, image_path)

    if "error" not in raw_scores:
        return {
            "YES Accuracy": round(raw_scores["yes_accuracy"], 2),
            "NO Accuracy": round(raw_scores["no_accuracy"], 2),
            "Overall Accuracy": round(raw_scores["overall_accuracy"], 2)
        }

    # Failure path: surface the message in the log and pass it on to the caller.
    failure_message = raw_scores["error"]
    print(failure_message)
    return {"error": failure_message}



## 4. Execute Positive Tests
각 랜드마크의 대표 이미지 최대 3장을 사용해 테스트를 반복 실행하고, 평균 정확도를 계산합니다.


In [13]:
# Positive tests: ask each landmark's questions about that landmark's own images
# and average the per-image accuracies.
positive_results = []

for landmark in LANDMARK_NAMES:
    # Keep only the runs that produced metrics; runs that returned an error are dropped.
    scored_runs = []
    for rel_path in landmark_images.get(landmark, []):
        outcome = run_test(landmark, landmark, os.path.join(DATA_ROOT, rel_path))
        if "error" not in outcome:
            scored_runs.append(outcome)

    if scored_runs:
        run_count = len(scored_runs)
        yes_avg = round(sum(run["YES Accuracy"] for run in scored_runs) / run_count, 2)
        no_avg = round(sum(run["NO Accuracy"] for run in scored_runs) / run_count, 2)
        overall_avg = round(sum(run["Overall Accuracy"] for run in scored_runs) / run_count, 2)
    else:
        # No usable image for this landmark: leave the averages empty.
        yes_avg = no_avg = overall_avg = None

    positive_results.append({
        "Landmark": landmark,
        "YES Accuracy (mean)": yes_avg,
        "NO Accuracy (mean)": no_avg,
        "Overall Accuracy (mean)": overall_avg,
        "Samples": len(scored_runs)
    })
    print("-" * 50)

print("=== Positive Tests Complete ===")
positive_df = pd.DataFrame(positive_results).set_index("Landmark")
positive_df



--- Running test: Questions for [마법천자문 손오공] with image of [마법천자문 손오공] ---
BLIP VQA model loaded on device: cuda

--- Running Evaluation for '마법천자문 손오공' with image '2.jfif' ---
Q01: 'Is the main object a statue?' -> Model: 'yes', Expected: 'yes' (Correct)
Q02: 'Is the statue located outdoors?' -> Model: 'yes', Expected: 'yes' (Correct)
Q03: 'Is the statue human-shaped?' -> Model: 'yes', Expected: 'yes' (Correct)
Q04: 'Is the statue a boy character?' -> Model: 'yes', Expected: 'yes' (Correct)
Q05: 'Is the statue's clothing purple?' -> Model: 'yes', Expected: 'yes' (Correct)
Q06: 'Is the statue wearing a yellow belt?' -> Model: 'yes', Expected: 'yes' (Correct)
Q07: 'Is the statue's hair red?' -> Model: 'yes', Expected: 'yes' (Correct)
Q08: 'Does the statue have spiky hair?' -> Model: 'yes', Expected: 'yes' (Correct)
Q09: 'Does the statue have a headband?' -> Model: 'yes', Expected: 'yes' (Correct)
Q10: 'Is the headband yellow?' -> Model: 'yes', Expected: 'yes' (Correct)
Q11: 'Does the sta

Unnamed: 0_level_0,YES Accuracy (mean),NO Accuracy (mean),Overall Accuracy (mean),Samples
Landmark,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
마법천자문 손오공,91.67,66.67,79.17,3
로드킬 부엉이,93.33,70.0,81.67,3
나남출판사,88.33,66.67,77.5,3
활판공방 인쇄기,95.0,58.33,76.67,3
활돌이,83.33,51.67,67.5,3
지혜의숲 고양이,95.0,60.0,77.5,3
네모탑,86.67,60.0,73.33,3
지혜의숲 조각상,96.67,41.67,69.17,3
지혜의숲 입구 조각상,96.67,50.0,73.33,3
창틀 피노키오,80.0,60.0,70.0,3


## 5. Execute Negative Tests
모든 랜드마크 조합(자기 자신 제외)에 대해 상대 랜드마크 이미지를 최대 3장까지 사용하여 혼동 여부를 평균 정확도로 확인합니다.


In [14]:
# Negative tests: ask one landmark's questions about another landmark's images,
# for every ordered pair of distinct landmarks.
negative_results = []

cross_pairs = [
    (question_owner, pictured)
    for question_owner in LANDMARK_NAMES
    for pictured in LANDMARK_NAMES
    if question_owner != pictured
]

for questions_for, image_of in cross_pairs:
    # Keep only the runs that produced metrics; runs that returned an error are dropped.
    scored_runs = []
    for rel_path in landmark_images.get(image_of, []):
        outcome = run_test(questions_for, image_of, os.path.join(DATA_ROOT, rel_path))
        if "error" not in outcome:
            scored_runs.append(outcome)

    if scored_runs:
        run_count = len(scored_runs)
        yes_avg = round(sum(run["YES Accuracy"] for run in scored_runs) / run_count, 2)
        no_avg = round(sum(run["NO Accuracy"] for run in scored_runs) / run_count, 2)
        overall_avg = round(sum(run["Overall Accuracy"] for run in scored_runs) / run_count, 2)
    else:
        # No usable image for the pictured landmark: leave the averages empty.
        yes_avg = no_avg = overall_avg = None

    negative_results.append({
        "Questions For": questions_for,
        "Image Of": image_of,
        "YES Accuracy (mean)": yes_avg,
        "NO Accuracy (mean)": no_avg,
        "Overall Accuracy (mean)": overall_avg,
        "Samples": len(scored_runs)
    })
    print("-" * 50)

print("=== Negative Tests Complete ===")
negative_df = pd.DataFrame(negative_results)
negative_df



SyntaxError: unterminated string literal (detected at line 35) (ipython-input-3953595468.py, line 35)

## 6. Results Summary


In [None]:
# Show both result tables, each under its own heading.
for _heading, _table in (
    ("--- Positive Test Results ---", positive_df),
    ("\n--- Negative Test Results ---", negative_df),
):
    print(_heading)
    display(_table)

## 7. Analysis
- Use the positive accuracy table to highlight landmarks where YES accuracy falls below target; add images or clarify hints/questions accordingly.
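- In the negative table, YES accuracy measures how often the model answered "yes" to a landmark's yes-questions while looking at a *different* landmark's image. Any non-zero value therefore points to questions that do not discriminate between that pair—rephrase them or add more discriminative NO samples for that landmark pair.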
- Document follow-up changes and rerun this notebook so new accuracy numbers serve as regression checkpoints.
