In [1]:
import json
import csv

# JSON 파일 읽기
with open("/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/data/aokvqa/test.json", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]  # jsonl 형식이라면 줄 단위로 읽음

# CSV 파일 쓰기
with open("/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/data/preprocess/aokvqa_test.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # 헤더 작성
    writer.writerow(["question_id", "image_path", "question", "label"])
    
    # 각 항목 작성
    for item in data:
        writer.writerow([
            item["question_id"],
            item["image"],
            item["text"],
            ";".join(item["label"])  # 리스트를 문자열로 합치기
        ])


In [2]:
import json
import os
import re
from collections import Counter
from pathlib import Path
import pandas as pd

# 정규화 함수 (구두점/대소문자/공백 통일)
_punc = re.compile(r"[^\w\s]")
_ws = re.compile(r"\s+")

def normalize_answer(s: str) -> str:
    s = s.strip().lower()
    s = _punc.sub(" ", s)          # 구두점 제거
    s = _ws.sub(" ", s).strip()    # 공백 정리
    return s

def majority_vote(answers):
    norm = [normalize_answer(a.get("answer","")) for a in answers if isinstance(a, dict)]
    norm = [a for a in norm if a]  # 빈 문자열 제거
    if not norm:
        return ""
    cnt = Counter(norm)
    best = sorted(cnt.items(), key=lambda x: (-x[1], len(x[0]), x[0]))[0][0]
    return best

def load_annotations(path):
    if path.endswith(".jsonl"):
        items = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    items.append(json.loads(line))
        return items
    else:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict):
            return [data]
        return data

def vizwiz_to_dataframe(json_path, image_root=""):
    data = load_annotations(json_path)

    rows = []
    for i, item in enumerate(data):
        answ = item.get("answerable", 0)
        if not (answ == 1 or answ is True or str(answ) == "1"):
            continue

        img = item.get("image") or item.get("image_path") or ""
        q = item.get("question", "")
        label = majority_vote(item.get("answers", []))

        qid = item.get("question_id")
        if qid is None:
            qid = Path(img).stem if img else f"q_{i:06d}"

        image_path = os.path.join(image_root, img) if image_root else img

        rows.append({
            "question_id": qid,
            "image_path": image_path,
            "question": q,
            "label": label
        })

    return pd.DataFrame(rows)


In [11]:
# JSON 파일 → DataFrame
df = vizwiz_to_dataframe(
    json_path="/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/data/VizWiz/val.json",
    image_root="/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/data/VizWiz/val"
)

# 확인
df.head()

Unnamed: 0,question_id,image_path,question,label
0,VizWiz_val_00000001,/data3/KJE/code/WIL_DeepLearningProject_2/VLM_...,Can you tell me what this medicine is please?,night time
1,VizWiz_val_00000002,/data3/KJE/code/WIL_DeepLearningProject_2/VLM_...,What is the title of this book?,dog years
2,VizWiz_val_00000003,/data3/KJE/code/WIL_DeepLearningProject_2/VLM_...,Which one is the blue one?,right
3,VizWiz_val_00000005,/data3/KJE/code/WIL_DeepLearningProject_2/VLM_...,What the screen says? Thank you.,windows 7
4,VizWiz_val_00000009,/data3/KJE/code/WIL_DeepLearningProject_2/VLM_...,Can you describe for me what's going on outsid...,nothing


In [12]:
df.to_csv(
    "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/data/preprocess/vizwiz_val.csv",
    index=False
)

In [4]:
import pandas as pd

# CSV 파일 읽기
df = pd.read_csv("/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/data/preprocess/llava-1.5-7b-hf-vizwiz_val-llava_answers.csv")

# label 값 뒤집기 (1 -> 0, 0 -> 1)
df["label"] = df["label"].apply(lambda x: 0 if x == 1 else 1)

# 결과를 새로운 파일로 저장
df.to_csv("/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/data/preprocess/llava-1.5-7b-hf-vizwiz_val-llava_answers_label_change.csv", index=False)


In [12]:
import pandas as pd

# CSV 읽기
df = pd.read_csv("/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/llava-1.5-7b-hf-vizwiz_val-llava_answers.csv")

# 데이터 확인
df.head()

Unnamed: 0,image_path,question,gold_answer,model_answer,label
0,/data3/KJE/code/WIL_DeepLearningProject_2/VLM_...,Can you tell me what this medicine is please?,night time,Night time,0
1,/data3/KJE/code/WIL_DeepLearningProject_2/VLM_...,What is the title of this book?,dog years,Dog Years,0
2,/data3/KJE/code/WIL_DeepLearningProject_2/VLM_...,Which one is the blue one?,right,Right,0
3,/data3/KJE/code/WIL_DeepLearningProject_2/VLM_...,What the screen says? Thank you.,windows 7,Windows home premium,1
4,/data3/KJE/code/WIL_DeepLearningProject_2/VLM_...,Can you describe for me what's going on outsid...,nothing,Trees,1


In [13]:
# label 값 분포 (개수)
print("label 개수 분포:")
print(df["label"].value_counts())

# label 값 비율
print("\nlabel 비율:")
print(df["label"].value_counts(normalize=True))

label 개수 분포:
label
1    1584
0    1350
Name: count, dtype: int64

label 비율:
label
1    0.539877
0    0.460123
Name: proportion, dtype: float64


In [14]:
for _, row in df.iterrows():
    if row["label"] == 1:
        print("gold_answer:", row["gold_answer"])
        print("model_answer:", row["model_answer"])
        print("---")

gold_answer: windows 7
model_answer: Windows home premium
---
gold_answer: nothing
model_answer: Trees
---
gold_answer: unanswerable
model_answer: Finger
---
gold_answer: loading
model_answer: Running
---
gold_answer: dropper bottle
model_answer: Yes
---
gold_answer: starting windows
model_answer: No
---
gold_answer: unanswerable
model_answer: Crash
---
gold_answer: unanswerable
model_answer: Pills
---
gold_answer: unanswerable
model_answer: Force quit
---
gold_answer: unanswerable
model_answer: Yes
---
gold_answer: unanswerable
model_answer: Computer
---
gold_answer: unanswerable
model_answer: 32 inch
---
gold_answer: unanswerable
model_answer: 09/11/2011
---
gold_answer: boobs
model_answer: Person
---
gold_answer: unanswerable
model_answer: Orange
---
gold_answer: unanswerable
model_answer: Store
---
gold_answer: clive cussler dirk cussler
model_answer: Science fiction
---
gold_answer: looking at computer
model_answer: Working
---
gold_answer: unanswerable
model_answer: Pless
---
gol

In [10]:
import os
import json
from glob import glob

# JSON 파일들이 들어있는 디렉토리 경로
base_dir = "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/output/llava1.5"

# 특정 패턴의 파일만 가져오기 (예: layer0_head0scores.json 같은 파일들)
file_pattern = os.path.join(base_dir, "results1.5_Controlled_Images_A_base_0.01_fouroption_True_layer*_head*scores.json")
json_files = glob(file_pattern)

print(f"찾은 파일 개수: {len(json_files)}")

all_correct_ids = []

for file in json_files:
    with open(file, "r") as f:
        data = json.load(f)
        if "correct_id" in data:
            all_correct_ids.append(set(data["correct_id"]))

# 교집합 구하기 (겹치는 값들)
if all_correct_ids:
    common_ids = set.intersection(*all_correct_ids)
    print("모든 파일에 공통으로 있는 correct_id 값들:", common_ids)
else:
    print("파일에서 correct_id 값을 찾을 수 없음")


찾은 파일 개수: 1
모든 파일에 공통으로 있는 correct_id 값들: {1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 20, 21, 23, 27, 28, 29, 30, 32, 33, 38, 40, 42, 44, 46, 48, 49, 51, 52, 53, 54, 56, 57, 58, 64, 68, 72, 73, 74, 75, 77, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92, 93, 94, 97, 98, 99, 100, 103, 104, 105, 106, 107, 110, 113, 114, 116, 117, 118, 120, 122, 125, 126, 127, 128, 130, 131, 132, 136, 138, 140, 142, 143, 144, 145, 146, 148, 151, 152, 153, 154, 155, 159, 160, 162, 163, 164, 165, 166, 169, 171, 174, 175, 176, 179, 180, 181, 182, 184, 185, 186, 190, 192, 194, 196, 197, 198, 199, 200, 201, 202, 203, 206, 208, 209, 213, 214, 215, 216, 220, 221, 223, 225, 226, 228, 231, 232, 233, 234, 235, 236, 237, 241, 242, 243, 245, 247, 248, 252, 254, 255, 257, 258, 259, 260, 262, 264, 265, 266, 267, 269, 271, 272, 273, 275, 277, 279, 284, 285, 286, 287, 288, 291, 293, 295, 297, 298, 302, 305, 306, 307, 309, 311, 314, 315, 317, 318, 320, 321, 323, 325, 326, 328, 329}


In [11]:
import numpy as np 

np.array(common_ids)

array({1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 20, 21, 23, 27, 28, 29, 30, 32, 33, 38, 40, 42, 44, 46, 48, 49, 51, 52, 53, 54, 56, 57, 58, 64, 68, 72, 73, 74, 75, 77, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92, 93, 94, 97, 98, 99, 100, 103, 104, 105, 106, 107, 110, 113, 114, 116, 117, 118, 120, 122, 125, 126, 127, 128, 130, 131, 132, 136, 138, 140, 142, 143, 144, 145, 146, 148, 151, 152, 153, 154, 155, 159, 160, 162, 163, 164, 165, 166, 169, 171, 174, 175, 176, 179, 180, 181, 182, 184, 185, 186, 190, 192, 194, 196, 197, 198, 199, 200, 201, 202, 203, 206, 208, 209, 213, 214, 215, 216, 220, 221, 223, 225, 226, 228, 231, 232, 233, 234, 235, 236, 237, 241, 242, 243, 245, 247, 248, 252, 254, 255, 257, 258, 259, 260, 262, 264, 265, 266, 267, 269, 271, 272, 273, 275, 277, 279, 284, 285, 286, 287, 288, 291, 293, 295, 297, 298, 302, 305, 306, 307, 309, 311, 314, 315, 317, 318, 320, 321, 323, 325, 326, 328, 329},
      dtype=object)

In [12]:
import numpy as np 

np.save('/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/output/llava1.5_base_correct_idx.npy', np.array(common_ids))