In [14]:
import pandas as pd
import json
from pathlib import Path
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ========================
# 1. Result file list
# ========================

# All result CSVs sit in one directory; only the file name differs per prompt.
_RESULTS_DIR = r"C:\Users\hanna\Lectures\Research_Project\Codes\PromptEngineering\Results"

# (prompt identifier, results file name) pairs; dict insertion order follows
# the order written here.
_RESULT_FILE_NAMES = (
    ("clean-3", "qwen3vl_clean-prompts-3_results.csv"),
    ("clean-4", "qwen3vl_clean-prompts-4_results.csv"),
    ("dataset-1", "qwen3vl_dataset-1-prompts_results.csv"),
    ("dataset-2", "qwen3vl_dataset-2-prompts_results.csv"),
    ("dataset-3", "qwen3vl_dataset-3-prompts_results.csv"),
    ("dataset-4", "qwen3vl_dataset-4-prompts_results.csv"),
    ("artifact-1", "qwen3vl_dataset-artifact-1-prompts_results.csv"),
    ("artifact-2", "qwen3vl_dataset-artifact-2-prompts_results.csv"),
    ("artifact-3", "qwen3vl_dataset-artifact-3-prompts_results.csv"),
    ("artifact-4", "qwen3vl_dataset-artifact-4-prompts_results.csv"),
)

# Prompt identifier -> full path (as a plain string) of its results file.
RESULT_FILES = {
    prompt: "\\".join((_RESULTS_DIR, file_name))
    for prompt, file_name in _RESULT_FILE_NAMES
}


# ========================
# 2. Utility functions
# ========================

def load_result_file(path):
    """Load a results file into a pandas DataFrame.

    The loader is selected from the file extension, compared
    case-insensitively so that ``results.CSV`` and ``results.csv`` are
    treated alike.

    Args:
        path: Location of the results file (string or path-like).

    Returns:
        A DataFrame: the result of ``pandas.read_csv`` for ``.csv`` files, or
        ``pandas.DataFrame`` built from the parsed JSON for ``.json`` files.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.json``.
    """
    path = Path(path)
    # Normalize the suffix so upper- or mixed-case extensions are accepted.
    suffix = path.suffix.lower()
    if suffix == ".csv":
        return pd.read_csv(path)
    if suffix == ".json":
        with open(path, "r", encoding="utf-8") as f:
            return pd.DataFrame(json.load(f))
    raise ValueError(f"Unsupported file type: {path}")

def label_to_binary(x):
    """Convert a textual label to its binary code.

    Surrounding whitespace and letter case are ignored, so ``"Disease"`` and
    ``" normal\\n"`` are recognised the same way as the canonical lowercase
    labels.

    Args:
        x: The label value. Anything that is not a string (e.g. ``None`` or
            NaN) is treated as unrecognised.

    Returns:
        ``1`` for "disease", ``0`` for "normal", and ``None`` for any other
        value.
    """
    if not isinstance(x, str):
        return None
    label = x.strip().lower()
    if label == "disease":
        return 1
    if label == "normal":
        return 0
    return None

# ========================
# 3. Per-prompt evaluation
# ========================

# One summary dict per prompt; turned into a DataFrame further down.
rows = []

for prompt_name, file_path in RESULT_FILES.items():
    df = load_result_file(file_path)

    # Convert the model's text label to 0/1 (None when it is not recognised).
    df["model_binary"] = df["model_label"].apply(label_to_binary)

    # Score only rows that have both a reference label and a converted
    # prediction; rows missing either one are left out of every metric.
    eval_df = df.dropna(subset=["binarylabel", "model_binary"])
    y_true = eval_df["binarylabel"].astype(int)
    y_pred = eval_df["model_binary"].astype(int)

    summary = {
        "prompt": prompt_name,
        "num_samples": len(eval_df),
        "pred_disease": (y_pred == 1).sum(),
        "pred_normal": (y_pred == 0).sum(),
        "accuracy": accuracy_score(y_true, y_pred),
    }
    # The three remaining scorers share the same call signature;
    # zero_division=0 is passed to each of them.
    for metric_name, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        summary[metric_name] = scorer(y_true, y_pred, zero_division=0)

    rows.append(summary)

# ========================
# 4. Results table
# ========================

# Display order of the prompts in the final table.
order = [
    "clean-3", "clean-4",
    "dataset-1", "dataset-2", "dataset-3", "dataset-4",
    "artifact-1", "artifact-2", "artifact-3", "artifact-4",
]

# Make "prompt" an ordered categorical so that sorting follows `order`
# rather than alphabetical order.
result_df = (
    pd.DataFrame(rows)
    .assign(
        prompt=lambda frame: pd.Categorical(
            frame["prompt"], categories=order, ordered=True
        )
    )
    .sort_values("prompt")
)

print("\n=== Prompt-wise Performance Comparison ===")
print(result_df.to_string(index=False))





=== Prompt-wise Performance Comparison ===
    prompt  num_samples  pred_disease  pred_normal  accuracy  precision   recall       f1
   clean-3          100           100            0  0.520000   0.520000 1.000000 0.684211
   clean-4          100            63           37  0.670000   0.650794 0.788462 0.713043
 dataset-1           99             1           98  0.494949   1.000000 0.019608 0.038462
 dataset-2          100           100            0  0.520000   0.520000 1.000000 0.684211
 dataset-3          100           100            0  0.520000   0.520000 1.000000 0.684211
 dataset-4          100            64           36  0.640000   0.625000 0.769231 0.689655
artifact-1          100             0          100  0.480000   0.000000 0.000000 0.000000
artifact-2          100           100            0  0.520000   0.520000 1.000000 0.684211
artifact-3          100             2           98  0.500000   1.000000 0.038462 0.074074
