In [5]:
import os
import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

# Collect every JSON file located directly inside the results directory
input_dir = "../results/aggre_xsum_coref"
# os.listdir returns names in arbitrary order; sorting makes the printed list and
# the tie order of the later (stable) ranking sort reproducible across runs and machines.
input_files = sorted(
    os.path.join(input_dir, f) for f in os.listdir(input_dir)
    if f.endswith(".json") and os.path.isfile(os.path.join(input_dir, f))
)

print(input_files)

# Per-file metric dicts are accumulated in this list further down
results = []

# Balanced-accuracy-based threshold (picked among the validation scores)
def choose_threshold_by_bacc(scores, labels):
    """Pick the decision threshold that maximises balanced accuracy.

    Every distinct value in ``scores`` is tried as a threshold, in ascending
    order; a sample is predicted 1 when its score is strictly greater than
    the threshold and 0 otherwise.

    Args:
        scores: sequence of comparable score values.
        labels: ground-truth labels aligned with ``scores``.

    Returns:
        ``(threshold, balanced_accuracy)`` of the best candidate. Ties keep
        the lowest threshold because only a strictly better score replaces
        the current best. ``(0.0, 0.0)`` is returned when no candidate scores
        above 0.0, which includes empty ``scores``.
    """
    winner = (0.0, 0.0)  # (threshold, balanced accuracy) of the best candidate so far
    for candidate in sorted(set(scores)):
        predicted = [int(value > candidate) for value in scores]
        achieved = balanced_accuracy_score(labels, predicted)
        if achieved > winner[1]:
            winner = (candidate, achieved)
    return winner

# Test-set evaluation
def evaluate_predictions(labels, preds):
    """Compute the classification metrics reported for one set of predictions.

    Args:
        labels: ground-truth labels.
        preds: predicted labels aligned with ``labels``.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``balanced_accuracy`` (inserted in that order), each holding the value
        returned by the scikit-learn function of the same name.
    """
    # precision / recall / F1 are called with zero_division=0 so an undefined
    # ratio is reported as 0 rather than going through the default warning path.
    zero_safe = {"zero_division": 0}

    report = {}
    report["accuracy"] = accuracy_score(labels, preds)
    report["precision"] = precision_score(labels, preds, **zero_safe)
    report["recall"] = recall_score(labels, preds, **zero_safe)
    report["f1"] = f1_score(labels, preds, **zero_safe)
    report["balanced_accuracy"] = balanced_accuracy_score(labels, preds)
    return report

# Per-file evaluation
def evaluate_file(file_path):
    """Tune a threshold on a file's "val" entries and score its "test" entries.

    The file is read as JSON and iterated as a sequence of dict-like entries.
    Entries whose ``"cut"`` is ``"val"`` or ``"test"`` contribute their
    ``"score"`` and ``"label"``; entries with any other ``"cut"`` are ignored.

    Args:
        file_path: path of the JSON results file.

    Returns:
        A list with a single dict: the metrics from ``evaluate_predictions``
        on the test entries, plus ``filename``, ``strategy``, ``threshold``
        and ``val_score`` (the validation balanced accuracy of the chosen
        threshold). An empty list is returned, after printing the reason,
        when the file is not valid JSON or has no "val" or no "test" entries.

    Raises:
        KeyError: if a "val"/"test" entry lacks ``"score"`` or ``"label"``.
        OSError: if the file cannot be opened.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        print(f"❌ JSON Decode Error 발생: {file_path} 스킵합니다.")
        return []  # nothing to report for an unreadable file

    # Bucket (scores, labels) by split name
    buckets = {"val": ([], []), "test": ([], [])}
    for entry in data:
        bucket = buckets.get(entry.get("cut"))
        if bucket is not None:
            bucket[0].append(entry["score"])
            bucket[1].append(entry["label"])

    val_scores, val_labels = buckets["val"]
    test_scores, test_labels = buckets["test"]

    if not val_scores or not test_scores:
        # Report the skip instead of silently dropping the file from the
        # ranking (same message style as the joint CNN/DM + XSum cell).
        print(f"⚠️ 데이터 부족: {file_path} 스킵")
        return []

    # Strategy: threshold that maximises balanced accuracy on the validation split
    threshold, val_bacc = choose_threshold_by_bacc(val_scores, val_labels)
    test_preds = [1 if s > threshold else 0 for s in test_scores]

    report = evaluate_predictions(test_labels, test_preds)
    report.update({
        "filename": os.path.basename(file_path),
        "strategy": "Balanced Accuracy (val score 기반)",
        "threshold": threshold,
        "val_score": val_bacc
    })

    # Returned as a list so callers can extend() an aggregate result list
    return [report]

# Evaluate every input file and pool the per-file results
for json_path in input_files:
    results += evaluate_file(json_path)

# Rank by test balanced accuracy, best first (stable, so ties keep their input order)
results.sort(key=lambda row: row["balanced_accuracy"], reverse=True)

# Save the report: one fixed-layout text section per result, blank-line separated
output_txt_path = "xsum_coref.txt"
report_template = (
    "File: {filename}\n"
    "Strategy: {strategy}\n"
    "  Threshold (from val set): {threshold:.4f}\n"
    "  Validation Strategy Score: {val_score:.4f}\n"
    "  Test Accuracy: {accuracy:.4f}\n"
    "  Test Precision: {precision:.4f}\n"
    "  Test Recall: {recall:.4f}\n"
    "  Test F1 Score: {f1:.4f}\n"
    "  Test Balanced Accuracy: {balanced_accuracy:.4f}\n"
    "\n"
)
with open(output_txt_path, "w", encoding="utf-8") as fout:
    for row in results:
        fout.write(report_template.format(**row))

print(f"✅ 모든 평가 결과가 {output_txt_path}에 저장되었습니다.")

['../results/aggre_xsum_coref/fenice_original.json', '../results/aggre_xsum_coref/fenice_wr0p3_wb0p7_wcc0_wc1_wm1_ww0_k2.json', '../results/aggre_xsum_coref/fenice_wr0_wb1_wcc1_wc1_wm0_ww1_k1.json', '../results/aggre_xsum_coref/fenice_wr0_wb1_wcc1_wc1_wm0_ww1_k1_ORCA.json', '../results/aggre_xsum_coref/fenice_wr0p3_wb0p7_wcc1_wc1_wm1_ww0_k2.json', '../results/aggre_xsum_coref/fenice_wr0p3_wb0p7_wcc0_wc1_wm0_ww1_k2.json']
✅ 모든 평가 결과가 xsum_coref.txt에 저장되었습니다.


In [30]:
import os
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

# Result directories of the two datasets and the destination of the text report
dir_cnndm = "../results/aggre_cnndm"
dir_xsum = "../results/aggre_xsum"
output_txt_path = "./single_threshold/single_threshold.txt"

# Only file names that exist as .json in BOTH directories are evaluated
files_cnndm = {name for name in os.listdir(dir_cnndm) if name.endswith(".json")}
files_xsum = {name for name in os.listdir(dir_xsum) if name.endswith(".json")}
common_files = sorted(files_cnndm.intersection(files_xsum))

print(f"공통 JSON 파일 수: {len(common_files)}")

def choose_threshold_by_bacc(scores, labels):
    """Return the score threshold with the highest balanced accuracy.

    Candidates are the distinct values of ``scores`` visited in ascending
    order; predictions are 1 where ``score > threshold`` and 0 elsewhere.

    Args:
        scores: sequence of comparable score values.
        labels: ground-truth labels aligned with ``scores``.

    Returns:
        ``(threshold, balanced_accuracy)``. The lowest threshold wins a tie,
        since a candidate must be strictly better to replace the current
        best. Falls back to ``(0.0, 0.0)`` when no candidate exceeds 0.0
        (e.g. ``scores`` is empty).

    NOTE(review): same logic as the definition in the earlier cell;
    presumably repeated so this cell can run on its own.
    """
    top_thresh = 0.0
    top_bacc = 0.0
    candidates = list(set(scores))
    candidates.sort()
    for cut in candidates:
        hard_preds = [1 if score > cut else 0 for score in scores]
        current = balanced_accuracy_score(labels, hard_preds)
        if current > top_bacc:
            top_thresh, top_bacc = cut, current
    return top_thresh, top_bacc

def evaluate_predictions(labels, preds):
    """Score one set of predictions with the five metrics used in the report.

    Args:
        labels: ground-truth labels.
        preds: predicted labels aligned with ``labels``.

    Returns:
        dict keyed ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``balanced_accuracy`` in that insertion order (the report writer
        iterates the dict, so the order is the order of the printed lines).
    """
    metrics = {"accuracy": accuracy_score(labels, preds)}
    # These three take zero_division=0 so an undefined ratio is reported as 0
    for metric_name, metric_fn in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        metrics[metric_name] = metric_fn(labels, preds, zero_division=0)
    metrics["balanced_accuracy"] = balanced_accuracy_score(labels, preds)
    return metrics

# Collects one summary dict per file that could be evaluated
results = []

for filename in common_files:
    path_cnndm = os.path.join(dir_cnndm, filename)
    path_xsum = os.path.join(dir_xsum, filename)
    try:
        with open(path_cnndm, "r", encoding="utf-8") as f1:
            data_cnndm = json.load(f1)
        with open(path_xsum, "r", encoding="utf-8") as f2:
            data_xsum = json.load(f2)
    except json.JSONDecodeError:
        print(f"❌ JSON Decode Error 발생: {filename} 스킵")
        continue

    # Joint validation pool: "val" entries of both datasets, CNN/DM first
    val_pairs = [
        (entry["score"], entry["label"])
        for entry in data_cnndm + data_xsum
        if entry.get("cut") == "val"
    ]
    val_scores = [score for score, _label in val_pairs]
    val_labels = [label for _score, label in val_pairs]

    # Test entries stay separate per dataset so each gets its own metrics
    test_pairs_cnndm = [
        (entry["score"], entry["label"])
        for entry in data_cnndm
        if entry.get("cut") == "test"
    ]
    test_scores_cnndm = [score for score, _label in test_pairs_cnndm]
    test_labels_cnndm = [label for _score, label in test_pairs_cnndm]

    test_pairs_xsum = [
        (entry["score"], entry["label"])
        for entry in data_xsum
        if entry.get("cut") == "test"
    ]
    test_scores_xsum = [score for score, _label in test_pairs_xsum]
    test_labels_xsum = [label for _score, label in test_pairs_xsum]

    # All three pools must be non-empty, otherwise the file is skipped
    if not (val_scores and test_scores_cnndm and test_scores_xsum):
        print(f"⚠️ 데이터 부족: {filename} 스킵")
        continue

    # A single threshold, tuned on the joint validation pool, is applied to both test sets
    threshold, val_bacc = choose_threshold_by_bacc(val_scores, val_labels)

    preds_cnndm = [int(score > threshold) for score in test_scores_cnndm]
    preds_xsum = [int(score > threshold) for score in test_scores_xsum]

    eval_cnndm = evaluate_predictions(test_labels_cnndm, preds_cnndm)
    eval_xsum = evaluate_predictions(test_labels_xsum, preds_xsum)

    # Unweighted mean of the two test balanced accuracies
    avg_bacc = (eval_cnndm["balanced_accuracy"] + eval_xsum["balanced_accuracy"]) / 2

    results.append(dict(
        filename=filename,
        threshold=threshold,
        val_bacc=val_bacc,
        avg_bacc=avg_bacc,
        cnndm=eval_cnndm,
        xsum=eval_xsum,
    ))

# Rank files by the mean of their CNN/DM and XSum test balanced accuracies, best first
results = sorted(results, key=lambda x: x["avg_bacc"], reverse=True)

# Write the report. open(..., "w") does not create missing parent directories,
# so make sure the target directory exists; otherwise a fresh checkout would
# fail with FileNotFoundError only after every file has been evaluated.
output_dir = os.path.dirname(output_txt_path)
if output_dir:  # empty when the path has no directory component
    os.makedirs(output_dir, exist_ok=True)

with open(output_txt_path, "w", encoding="utf-8") as fout:
    for r in results:
        fout.write(f"[{r['filename']}] Threshold from joint val: {r['threshold']:.4f} (Val BACC: {r['val_bacc']:.4f})\n")
        fout.write("  [CNN/DM Test Set]\n")
        for key, val in r["cnndm"].items():
            fout.write(f"    {key}: {val:.4f}\n")
        fout.write("  [XSum Test Set]\n")
        for key, val in r["xsum"].items():
            fout.write(f"    {key}: {val:.4f}\n")
        fout.write(f"  ▶️ 평균 Balanced Accuracy: {r['avg_bacc']:.4f}\n\n")

print(f"✅ 평균 balanced accuracy 기준 정렬된 결과가 {output_txt_path}에 저장되었습니다.")


공통 JSON 파일 수: 13
✅ 평균 balanced accuracy 기준 정렬된 결과가 ./single_threshold/single_threshold.txt에 저장되었습니다.
