In [1]:
import os
import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

# Collect the JSON files that sit directly inside the input directory
def collect_json_files(directory):
    """Return the paths of the ``.json`` files located directly in ``directory``.

    Only regular files whose name ends with ``.json`` are kept; sub-directories
    are not descended into. The result is sorted because ``os.listdir`` returns
    entries in arbitrary order, which would otherwise make the order of the
    printed list (and of anything produced by iterating over it) vary between
    runs and machines.

    Args:
        directory: Directory to scan.

    Returns:
        Sorted list of ``os.path.join(directory, name)`` paths.
    """
    return sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(".json") and os.path.isfile(os.path.join(directory, name))
    )


input_dir = "./"
input_files = collect_json_files(input_dir)

print("▶ 평가 대상 파일:", input_files)

# Starts empty; evaluation results are appended to this list further down
results = []

# Strategy: among the validation scores themselves, pick the threshold that
# maximises balanced accuracy on the validation set
def choose_threshold_from_val_scores(scores, labels):
    """Select the decision threshold with the best balanced accuracy.

    Every distinct value in ``scores`` is tried as a threshold. A sample is
    predicted positive (1) when its score is strictly greater than the
    threshold and negative (0) otherwise.

    Args:
        scores: Non-empty sequence of numeric scores.
        labels: Ground-truth labels, passed together with the predictions to
            ``balanced_accuracy_score``.

    Returns:
        Tuple ``(best_thresh, best_bacc)``. Candidates are visited in
        ascending order and only a strictly better balanced accuracy replaces
        the current best, so ties resolve to the lowest threshold. If no
        candidate scores above 0.0, the smallest score is returned with 0.0.

    Raises:
        ValueError: If ``scores`` is empty.
    """
    score_array = np.asarray(scores)
    if score_array.size == 0:
        # Fail with an explicit message instead of an IndexError on [0] below
        raise ValueError("scores must contain at least one value to choose a threshold")

    unique_thresholds = np.unique(score_array)  # sorted ascending
    best_thresh = unique_thresholds[0]
    best_bacc = 0.0

    for thresh in unique_thresholds:
        # Vectorised comparison; yields the same 0/1 predictions as a
        # per-element Python loop
        preds = (score_array > thresh).astype(int)
        bacc = balanced_accuracy_score(labels, preds)
        if bacc > best_bacc:
            best_bacc = bacc
            best_thresh = thresh

    return best_thresh, best_bacc

# Compute the evaluation metrics for a set of predictions
def evaluate_predictions(labels, preds):
    """Score ``preds`` against ``labels`` with five classification metrics.

    Args:
        labels: Ground-truth labels.
        preds: Predicted labels.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``balanced_accuracy`` (in that order). Precision, recall and F1 are
        computed with ``zero_division=0``.
    """
    metrics = {"accuracy": accuracy_score(labels, preds)}

    # These three scorers share the same zero_division setting
    zero_safe_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in zero_safe_scorers:
        metrics[metric_name] = scorer(labels, preds, zero_division=0)

    metrics["balanced_accuracy"] = balanced_accuracy_score(labels, preds)
    return metrics

# Evaluate a single file
def evaluate_file(file_path):
    """Pick a threshold on the ``val`` entries of a JSON file and score ``test``.

    The file is parsed as JSON and iterated entry by entry. Entries whose
    ``"cut"`` is ``"val"`` or ``"test"`` contribute their ``"score"`` and
    ``"label"`` to the matching split; every other entry is ignored.

    Args:
        file_path: Path of the JSON file to evaluate.

    Returns:
        An empty list (after printing a warning) when either split has no
        entries. Otherwise a one-element list holding the dict returned by
        ``evaluate_predictions`` for the test split, extended with the keys
        ``filename``, ``strategy``, ``threshold`` and ``val_score``.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    # split name -> (scores, labels)
    splits = {"val": ([], []), "test": ([], [])}
    for record in records:
        cut = record.get("cut")
        if cut in ("val", "test"):
            split_scores, split_labels = splits[cut]
            split_scores.append(record["score"])
            split_labels.append(record["label"])

    val_scores, val_labels = splits["val"]
    test_scores, test_labels = splits["test"]

    if not (val_scores and test_scores):
        print(f"⚠️ {file_path} → 'val' 또는 'test' 데이터 부족으로 평가 생략")
        return []

    # The threshold is chosen among the validation scores, then applied to test
    threshold, val_bacc = choose_threshold_from_val_scores(val_scores, val_labels)
    test_preds = [1 if score > threshold else 0 for score in test_scores]

    report = evaluate_predictions(test_labels, test_preds)
    report["filename"] = os.path.basename(file_path)
    report["strategy"] = "Best BACC from val scores"
    report["threshold"] = threshold
    report["val_score"] = val_bacc

    return [report]

# Run the evaluation over every collected file
for file in input_files:
    results.extend(evaluate_file(file))

# Save the results: one text section per result, each followed by a blank line
output_txt_path = "threshold_eval_from_val_scores.txt"
with open(output_txt_path, "w", encoding="utf-8") as fout:
    for r in results:
        fout.writelines((
            f"File: {r['filename']}\n",
            f"Strategy: {r['strategy']}\n",
            f"  Threshold (from val set): {r['threshold']:.6f}\n",
            f"  Validation Balanced Accuracy: {r['val_score']:.4f}\n",
            f"  Test Accuracy: {r['accuracy']:.4f}\n",
            f"  Test Precision: {r['precision']:.4f}\n",
            f"  Test Recall: {r['recall']:.4f}\n",
            f"  Test F1 Score: {r['f1']:.4f}\n",
            f"  Test Balanced Accuracy: {r['balanced_accuracy']:.4f}\n",
            "\n",
        ))

print(f"✅ 모든 평가 결과가 '{output_txt_path}'에 저장되었습니다.")


▶ 평가 대상 파일: ['./fizz_original_aggrefact_cnndm_e_mean.json', './fizz_original_aggrefact_xsum_e-c_min.json', './fizz_original_aggrefact_xsum_e_mean.json', './fizz_original_aggrefact_cnndm_e_min.json', './fizz_original_aggrefact_cnndm_e-c_mean.json', './fizz_original_aggrefact_xsum_e_min.json', './fizz_original_aggrefact_cnndm_e-c_min.json', './fizz_original_aggrefact_xsum_e-c_mean.json']
✅ 모든 평가 결과가 'threshold_eval_from_val_scores.txt'에 저장되었습니다.


In [2]:
import os
import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

# Collect the JSON files that sit directly inside the input directory
def collect_json_files(directory):
    """Return the paths of the ``.json`` files located directly in ``directory``.

    Only regular files whose name ends with ``.json`` are kept; sub-directories
    are not descended into. The result is sorted because ``os.listdir`` returns
    entries in arbitrary order, which would otherwise make the order of the
    printed list (and of anything produced by iterating over it) vary between
    runs and machines.

    Args:
        directory: Directory to scan.

    Returns:
        Sorted list of ``os.path.join(directory, name)`` paths.
    """
    return sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(".json") and os.path.isfile(os.path.join(directory, name))
    )


input_dir = "./"
input_files = collect_json_files(input_dir)

print("▶ 평가 대상 파일:", input_files)

# Starts empty; evaluation results are appended to this list further down
results = []

# Strategy: among the validation scores themselves, pick the threshold that
# maximises balanced accuracy on the validation set
def choose_threshold_from_val_scores(scores, labels):
    """Select the decision threshold with the best balanced accuracy.

    Every distinct value in ``scores`` is tried as a threshold. A sample is
    predicted positive (1) when its score is strictly greater than the
    threshold and negative (0) otherwise.

    Args:
        scores: Non-empty sequence of numeric scores.
        labels: Ground-truth labels, passed together with the predictions to
            ``balanced_accuracy_score``.

    Returns:
        Tuple ``(best_thresh, best_bacc)``. Candidates are visited in
        ascending order and only a strictly better balanced accuracy replaces
        the current best, so ties resolve to the lowest threshold. If no
        candidate scores above 0.0, the smallest score is returned with 0.0.

    Raises:
        ValueError: If ``scores`` is empty.
    """
    score_array = np.asarray(scores)
    if score_array.size == 0:
        # Fail with an explicit message instead of an IndexError on [0] below
        raise ValueError("scores must contain at least one value to choose a threshold")

    unique_thresholds = np.unique(score_array)  # sorted ascending
    best_thresh = unique_thresholds[0]
    best_bacc = 0.0

    for thresh in unique_thresholds:
        # Vectorised comparison; yields the same 0/1 predictions as a
        # per-element Python loop
        preds = (score_array > thresh).astype(int)
        bacc = balanced_accuracy_score(labels, preds)
        if bacc > best_bacc:
            best_bacc = bacc
            best_thresh = thresh

    return best_thresh, best_bacc

# Compute the evaluation metrics for a set of predictions
def evaluate_predictions(labels, preds):
    """Score ``preds`` against ``labels`` with five classification metrics.

    Args:
        labels: Ground-truth labels.
        preds: Predicted labels.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``balanced_accuracy`` (in that order). Precision, recall and F1 are
        computed with ``zero_division=0``.
    """
    metrics = {"accuracy": accuracy_score(labels, preds)}

    # These three scorers share the same zero_division setting
    zero_safe_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in zero_safe_scorers:
        metrics[metric_name] = scorer(labels, preds, zero_division=0)

    metrics["balanced_accuracy"] = balanced_accuracy_score(labels, preds)
    return metrics

# Evaluate a single file
def evaluate_file(file_path):
    """Pick a threshold on the ``val`` entries of a JSON file and score ``test``.

    The file is parsed as JSON and iterated entry by entry. Entries whose
    ``"cut"`` is ``"val"`` or ``"test"`` contribute their ``"score"`` and
    ``"label"`` to the matching split; every other entry is ignored.

    Args:
        file_path: Path of the JSON file to evaluate.

    Returns:
        An empty list (after printing a warning) when either split has no
        entries. Otherwise a one-element list holding the dict returned by
        ``evaluate_predictions`` for the test split, extended with the keys
        ``filename``, ``strategy``, ``threshold`` and ``val_score``.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    # split name -> (scores, labels)
    splits = {"val": ([], []), "test": ([], [])}
    for record in records:
        cut = record.get("cut")
        if cut in ("val", "test"):
            split_scores, split_labels = splits[cut]
            split_scores.append(record["score"])
            split_labels.append(record["label"])

    val_scores, val_labels = splits["val"]
    test_scores, test_labels = splits["test"]

    if not (val_scores and test_scores):
        print(f"⚠️ {file_path} → 'val' 또는 'test' 데이터 부족으로 평가 생략")
        return []

    # The threshold is chosen among the validation scores, then applied to test
    threshold, val_bacc = choose_threshold_from_val_scores(val_scores, val_labels)
    test_preds = [1 if score > threshold else 0 for score in test_scores]

    report = evaluate_predictions(test_labels, test_preds)
    report["filename"] = os.path.basename(file_path)
    report["strategy"] = "Best BACC from val scores"
    report["threshold"] = threshold
    report["val_score"] = val_bacc

    return [report]

# Run the evaluation over every collected file
for file in input_files:
    results.extend(evaluate_file(file))

# Save the results: one text section per result, each followed by a blank line
output_txt_path = "threshold_eval_from_val_scores.txt"
with open(output_txt_path, "w", encoding="utf-8") as fout:
    for r in results:
        fout.writelines((
            f"File: {r['filename']}\n",
            f"Strategy: {r['strategy']}\n",
            f"  Threshold (from val set): {r['threshold']:.6f}\n",
            f"  Validation Balanced Accuracy: {r['val_score']:.4f}\n",
            f"  Test Accuracy: {r['accuracy']:.4f}\n",
            f"  Test Precision: {r['precision']:.4f}\n",
            f"  Test Recall: {r['recall']:.4f}\n",
            f"  Test F1 Score: {r['f1']:.4f}\n",
            f"  Test Balanced Accuracy: {r['balanced_accuracy']:.4f}\n",
            "\n",
        ))

print(f"✅ 모든 평가 결과가 '{output_txt_path}'에 저장되었습니다.")


▶ 평가 대상 파일: ['./fizz_original_aggrefact_cnndm_e_mean.json', './fizz_original_aggrefact_xsum_e-c_min.json', './fizz_original_aggrefact_xsum_e_mean.json', './fizz_original_aggrefact_cnndm_e_min.json', './fizz_original_aggrefact_cnndm_e-c_mean.json', './fizz_original_aggrefact_xsum_e_min.json', './fizz_original_aggrefact_cnndm_e-c_min.json', './fizz_original_aggrefact_xsum_e-c_mean.json']
✅ 모든 평가 결과가 'threshold_eval_from_val_scores.txt'에 저장되었습니다.
