In [1]:
import os
import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

# 디렉토리 내 JSON 파일 수집
input_dir = "./"
input_files = [
    os.path.join(input_dir, f) for f in os.listdir(input_dir)
    if f.endswith(".json") and os.path.isfile(os.path.join(input_dir, f))
]

print("▶ 평가 대상 파일:", input_files)

results = []

# 전략: Balanced Accuracy 기준, val set score 중 하나를 threshold로 선택
def choose_threshold_from_val_scores(scores, labels):
    unique_thresholds = np.unique(scores)
    best_thresh = unique_thresholds[0]
    best_bacc = 0.0

    for thresh in unique_thresholds:
        preds = [1 if s > thresh else 0 for s in scores]
        bacc = balanced_accuracy_score(labels, preds)
        if bacc > best_bacc:
            best_bacc = bacc
            best_thresh = thresh

    return best_thresh, best_bacc

# 예측값으로 평가 지표 계산
def evaluate_predictions(labels, preds):
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0),
        "balanced_accuracy": balanced_accuracy_score(labels, preds)
    }

# 파일 하나 평가
def evaluate_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    val_scores, val_labels = [], []
    test_scores, test_labels = [], []

    for entry in data:
        if entry.get("cut") == "val":
            val_scores.append(entry["score"])
            val_labels.append(entry["label"])
        elif entry.get("cut") == "test":
            test_scores.append(entry["score"])
            test_labels.append(entry["label"])

    if not val_scores or not test_scores:
        print(f"⚠️ {file_path} → 'val' 또는 'test' 데이터 부족으로 평가 생략")
        return []

    strategies = []

    # threshold = val 점수 중에서 선택
    thresh, val_bacc = choose_threshold_from_val_scores(val_scores, val_labels)
    test_preds = [1 if s > thresh else 0 for s in test_scores]
    test_eval = evaluate_predictions(test_labels, test_preds)
    test_eval.update({
        "filename": os.path.basename(file_path),
        "strategy": "Best BACC from val scores",
        "threshold": thresh,
        "val_score": val_bacc
    })
    strategies.append(test_eval)

    return strategies

# 전체 평가 수행
for file in input_files:
    results.extend(evaluate_file(file))

# 결과 저장
output_txt_path = "threshold_eval_from_val_scores.txt"
with open(output_txt_path, "w", encoding="utf-8") as fout:
    for r in results:
        fout.write(f"File: {r['filename']}\n")
        fout.write(f"Strategy: {r['strategy']}\n")
        fout.write(f"  Threshold (from val set): {r['threshold']:.6f}\n")
        fout.write(f"  Validation Balanced Accuracy: {r['val_score']:.4f}\n")
        fout.write(f"  Test Accuracy: {r['accuracy']:.4f}\n")
        fout.write(f"  Test Precision: {r['precision']:.4f}\n")
        fout.write(f"  Test Recall: {r['recall']:.4f}\n")
        fout.write(f"  Test F1 Score: {r['f1']:.4f}\n")
        fout.write(f"  Test Balanced Accuracy: {r['balanced_accuracy']:.4f}\n")
        fout.write("\n")

print(f"✅ 모든 평가 결과가 '{output_txt_path}'에 저장되었습니다.")


▶ 평가 대상 파일: ['./fizz_original_factcc_e-c_min.json', './fizz_original_xsumfaith_softmin_2_0p2_0.json', './fizz_original_cogensumm_softmin_2_0p2_0.json', './fizz_original_frank_softmin_2_0p2_0.json', './fizz_original_factcc_e_mean.json', './fizz_original_cogensumm_e_min.json', './fizz_original_factcc_softmin_3_0p2_0.json', './fizz_original_factcc_e_min.json', './fizz_original_xsumfaith_e_mean.json', './fizz_original_frank_e_min.json', './fizz_original_xsumfaith_e-c_min.json', './fizz_original_frank_e-c_min.json', './fizz_original_frank_softmin_3_0p2_0.json', './fizz_original_factcc_e-c_mean.json', './fizz_original_xsumfaith_e-c_mean.json', './fizz_original_xsumfaith_softmin_3_0p2_0.json', './fizz_original_frank_e_mean.json', './fizz_original_xsumfaith_e_min.json', './fizz_original_cogensumm_softmin_3_0p2_0.json', './fizz_original_frank_e-c_mean.json', './fizz_original_cogensumm_e-c_mean.json', './fizz_original_factcc_softmin_2_0p2_0.json', './fizz_original_cogensumm_e-c_min.json', './fiz

In [2]:
import os
import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

# 디렉토리 내 JSON 파일 수집
input_dir = "./"
output_dir = "./analysis"
os.makedirs(output_dir, exist_ok=True)

input_files = [
    os.path.join(input_dir, f) for f in os.listdir(input_dir)
    if f.endswith(".json") and os.path.isfile(os.path.join(input_dir, f))
]

print("▶ 평가 대상 파일:", input_files)

results = []

# 전략: Balanced Accuracy 기준, val set score 중 하나를 threshold로 선택
def choose_threshold_from_val_scores(scores, labels):
    unique_thresholds = np.unique(scores)
    best_thresh = unique_thresholds[0]
    best_bacc = 0.0

    for thresh in unique_thresholds:
        preds = [1 if s > thresh else 0 for s in scores]
        bacc = balanced_accuracy_score(labels, preds)
        if bacc > best_bacc:
            best_bacc = bacc
            best_thresh = thresh

    return best_thresh, best_bacc

# 예측값으로 평가 지표 계산
def evaluate_predictions(labels, preds):
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0),
        "balanced_accuracy": balanced_accuracy_score(labels, preds)
    }

# 파일 하나 평가 및 오답만 저장
def evaluate_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    val_scores, val_labels = [], []
    test_scores, test_labels = [], []
    test_indices = []

    for idx, entry in enumerate(data):
        if entry.get("cut") == "val":
            val_scores.append(entry["score"])
            val_labels.append(entry["label"])
        elif entry.get("cut") == "test":
            test_scores.append(entry["score"])
            test_labels.append(entry["label"])
            test_indices.append(idx)

    if not val_scores or not test_scores:
        print(f"⚠️ {file_path} → 'val' 또는 'test' 데이터 부족으로 평가 생략")
        return []

    # threshold = val 점수 중에서 선택
    thresh, val_bacc = choose_threshold_from_val_scores(val_scores, val_labels)
    test_preds = [1 if s > thresh else 0 for s in test_scores]

    # prediction 추가 + 오답만 추출
    filtered_wrong = []
    for idx, pred in zip(test_indices, test_preds):
        entry = data[idx]
        entry["prediction"] = pred
        if pred != entry["label"]:
            filtered_wrong.append(entry)

    # 오답 항목만 저장
    output_path = os.path.join(output_dir, os.path.basename(file_path))
    with open(output_path, "w", encoding="utf-8") as f_out:
        json.dump(filtered_wrong, f_out, ensure_ascii=False, indent=2)

    # 성능 기록
    test_eval = evaluate_predictions(test_labels, test_preds)
    test_eval.update({
        "filename": os.path.basename(file_path),
        "strategy": "Best BACC from val scores",
        "threshold": thresh,
        "val_score": val_bacc
    })

    return [test_eval]

# 전체 평가 수행
for file in input_files:
    results.extend(evaluate_file(file))

# 결과 저장
output_txt_path = "threshold_eval_from_val_scores.txt"
with open(output_txt_path, "w", encoding="utf-8") as fout:
    for r in results:
        fout.write(f"File: {r['filename']}\n")
        fout.write(f"Strategy: {r['strategy']}\n")
        fout.write(f"  Threshold (from val set): {r['threshold']:.6f}\n")
        fout.write(f"  Validation Balanced Accuracy: {r['val_score']:.4f}\n")
        fout.write(f"  Test Accuracy: {r['accuracy']:.4f}\n")
        fout.write(f"  Test Precision: {r['precision']:.4f}\n")
        fout.write(f"  Test Recall: {r['recall']:.4f}\n")
        fout.write(f"  Test F1 Score: {r['f1']:.4f}\n")
        fout.write(f"  Test Balanced Accuracy: {r['balanced_accuracy']:.4f}\n")
        fout.write("\n")

print(f"✅ 오답(json)들은 '{output_dir}'에 저장되었고, 평가 결과는 '{output_txt_path}'에 저장되었습니다.")


▶ 평가 대상 파일: ['./fizz_original_factcc_e-c_min.json', './fizz_original_xsumfaith_softmin_2_0p2_0.json', './fizz_original_cogensumm_softmin_2_0p2_0.json', './fizz_original_frank_softmin_2_0p2_0.json', './fizz_original_factcc_e_mean.json', './fizz_original_cogensumm_e_min.json', './fizz_original_factcc_softmin_3_0p2_0.json', './fizz_original_factcc_e_min.json', './fizz_original_xsumfaith_e_mean.json', './fizz_original_frank_e_min.json', './fizz_original_xsumfaith_e-c_min.json', './fizz_original_frank_e-c_min.json', './fizz_original_frank_softmin_3_0p2_0.json', './fizz_original_factcc_e-c_mean.json', './fizz_original_xsumfaith_e-c_mean.json', './fizz_original_xsumfaith_softmin_3_0p2_0.json', './fizz_original_frank_e_mean.json', './fizz_original_xsumfaith_e_min.json', './fizz_original_cogensumm_softmin_3_0p2_0.json', './fizz_original_frank_e-c_mean.json', './fizz_original_cogensumm_e-c_mean.json', './fizz_original_factcc_softmin_2_0p2_0.json', './fizz_original_cogensumm_e-c_min.json', './fiz

In [3]:
import os
import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

# 디렉토리 설정
input_dir = "./"
output_dir = "./analysis"
os.makedirs(output_dir, exist_ok=True)

# 입력 파일 목록 수집
input_files = [
    os.path.join(input_dir, f) for f in os.listdir(input_dir)
    if f.endswith(".json") and os.path.isfile(os.path.join(input_dir, f))
]

print("▶ 평가 대상 파일:", input_files)

results = []

# 전략: Validation score로 Best BACC 기준 threshold 선택
def choose_threshold_from_val_scores(scores, labels):
    unique_thresholds = np.unique(scores)
    best_thresh = unique_thresholds[0]
    best_bacc = 0.0

    for thresh in unique_thresholds:
        preds = [1 if s > thresh else 0 for s in scores]
        bacc = balanced_accuracy_score(labels, preds)
        if bacc > best_bacc:
            best_bacc = bacc
            best_thresh = thresh

    return best_thresh, best_bacc

# 평가 지표 계산
def evaluate_predictions(labels, preds):
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0),
        "balanced_accuracy": balanced_accuracy_score(labels, preds)
    }

# 파일 하나 처리
def evaluate_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    val_scores, val_labels = [], []

    for entry in data:
        if entry.get("cut") == "val":
            val_scores.append(entry["score"])
            val_labels.append(entry["label"])

    if not val_scores:
        print(f"⚠️ {file_path} → 'val' 데이터 부족으로 평가 생략")
        return []

    # Threshold 선택
    thresh, val_bacc_for_thresh = choose_threshold_from_val_scores(val_scores, val_labels)
    val_preds = [1 if s > thresh else 0 for s in val_scores]
    val_eval = evaluate_predictions(val_labels, val_preds)

    result = {
        "filename": os.path.basename(file_path),
        "strategy": "Best BACC from val scores",
        "threshold": thresh,
        "val_bacc_for_threshold_selection": val_bacc_for_thresh,
        "val_accuracy": val_eval["accuracy"],
        "val_precision": val_eval["precision"],
        "val_recall": val_eval["recall"],
        "val_f1": val_eval["f1"],
        "val_balanced_accuracy": val_eval["balanced_accuracy"],
    }

    return [result]

# 전체 평가
for file in input_files:
    results.extend(evaluate_file(file))

# 결과 저장
output_txt_path = "dawon.txt"
with open(output_txt_path, "w", encoding="utf-8") as fout:
    for r in results:
        fout.write(f"File: {r['filename']}\n")
        fout.write(f"Strategy: {r['strategy']}\n")
        fout.write(f"Threshold (from val set): {r['threshold']:.6f}\n")
        fout.write(f"Validation BACC (for threshold selection): {r['val_bacc_for_threshold_selection']:.4f}\n\n")

        fout.write(f"Validation Results:\n")
        fout.write(f"  Accuracy: {r['val_accuracy']:.4f}\n")
        fout.write(f"  Precision: {r['val_precision']:.4f}\n")
        fout.write(f"  Recall: {r['val_recall']:.4f}\n")
        fout.write(f"  F1 Score: {r['val_f1']:.4f}\n")
        fout.write(f"  Balanced Accuracy: {r['val_balanced_accuracy']:.4f}\n")
        fout.write("\n" + "="*50 + "\n\n")

print(f"✅ Validation 평가 결과만 '{output_txt_path}'에 저장 완료!")


▶ 평가 대상 파일: ['./fizz_original_factcc_e-c_min.json', './fizz_original_xsumfaith_softmin_2_0p2_0.json', './fizz_original_cogensumm_softmin_2_0p2_0.json', './fizz_original_frank_softmin_2_0p2_0.json', './fizz_original_factcc_e_mean.json', './fizz_original_cogensumm_e_min.json', './fizz_original_xsumfaith_weighted_aggregate.json', './fizz_original_factcc_softmin_3_0p2_0.json', './fizz_original_factcc_e_min.json', './fizz_original_xsumfaith_e_mean.json', './fizz_original_frank_e_min.json', './fizz_original_xsumfaith_e-c_min.json', './fizz_original_frank_e-c_min.json', './fizz_original_frank_softmin_3_0p2_0.json', './fizz_original_factcc_e-c_mean.json', './fizz_original_xsumfaith_e-c_mean.json', './fizz_original_xsumfaith_softmin_3_0p2_0.json', './fizz_original_frank_e_mean.json', './fizz_original_xsumfaith_e_min.json', './fizz_original_cogensumm_softmin_3_0p2_0.json', './fizz_original_frank_e-c_mean.json', './fizz_original_cogensumm_e-c_mean.json', './fizz_original_factcc_softmin_2_0p2_0.js

In [2]:
import os
import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix

# 평가할 파일 리스트 직접 입력
input_dir = "./"
output_dir = "./analysis"
os.makedirs(output_dir, exist_ok=True)

# 평가할 파일명을 입력
input_files = [
    "fizz_original_xsumfaith_e-c_min.json",
    "fizz_original_xsumfaith_e-c_mean.json",
    "fizz_original_frank_e-c_min.json",
    "fizz_original_frank_e-c_mean.json",
    "fizz_original_factcc_e-c_min.json",
    "fizz_original_factcc_e-c_mean.json",
]

input_files = [os.path.join(input_dir, f) for f in input_files]

print("▶ 평가 대상 파일:")
for file in input_files:
    print(" -", file)

results = []

# 전략: Balanced Accuracy 기준, val set score 중 하나를 threshold로 선택
def choose_threshold_from_val_scores(scores, labels):
    unique_thresholds = np.unique(scores)
    best_thresh = unique_thresholds[0]
    best_bacc = 0.0

    for thresh in unique_thresholds:
        preds = [1 if s > thresh else 0 for s in scores]
        bacc = balanced_accuracy_score(labels, preds)
        if bacc > best_bacc:
            best_bacc = bacc
            best_thresh = thresh

    return best_thresh, best_bacc

# 예측값으로 평가 지표 계산
def evaluate_predictions(labels, preds):
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, zero_division=0)
    rec = recall_score(labels, preds, zero_division=0)
    f1 = f1_score(labels, preds, zero_division=0)
    bacc = balanced_accuracy_score(labels, preds)

    tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "balanced_accuracy": bacc,
        "tp": int(tp),
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn)
    }

# label 비율 계산
def compute_label_distribution(labels):
    labels = np.array(labels)
    total = len(labels)
    ones = np.sum(labels == 1)
    zeros = np.sum(labels == 0)
    return {
        "total": total,
        "ones": ones,
        "zeros": zeros,
        "ratio_1": ones / total if total > 0 else 0,
        "ratio_0": zeros / total if total > 0 else 0
    }

# 파일 하나 평가 및 오답만 저장
def evaluate_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    val_scores, val_labels = [], []
    test_scores, test_labels = [], []
    test_indices = []

    for idx, entry in enumerate(data):
        if entry.get("cut") == "val":
            val_scores.append(entry["score"])
            val_labels.append(entry["label"])
        elif entry.get("cut") == "test":
            test_scores.append(entry["score"])
            test_labels.append(entry["label"])
            test_indices.append(idx)

    if not val_scores or not test_scores:
        print(f"⚠️ {file_path} → 'val' 또는 'test' 데이터 부족으로 평가 생략")
        return []

    # threshold = val 점수 중에서 선택
    thresh, val_bacc = choose_threshold_from_val_scores(val_scores, val_labels)
    test_preds = [1 if s > thresh else 0 for s in test_scores]

    # prediction 추가 + 오답만 추출
    filtered_wrong = []
    for idx, pred in zip(test_indices, test_preds):
        entry = data[idx]
        entry["prediction"] = pred
        if pred != entry["label"]:
            filtered_wrong.append(entry)

    # 오답 항목만 저장
    output_path = os.path.join(output_dir, os.path.basename(file_path))
    with open(output_path, "w", encoding="utf-8") as f_out:
        json.dump(filtered_wrong, f_out, ensure_ascii=False, indent=2)

    # 성능 기록
    test_eval = evaluate_predictions(test_labels, test_preds)
    test_eval.update({
        "filename": os.path.basename(file_path),
        "strategy": "Best BACC from val scores",
        "threshold": thresh,
        "val_score": val_bacc
    })

    # label 분포 정보 추가
    val_dist = compute_label_distribution(val_labels)
    test_dist = compute_label_distribution(test_labels)
    all_dist = compute_label_distribution(val_labels + test_labels)

    test_eval.update({
        "val_label_distribution": val_dist,
        "test_label_distribution": test_dist,
        "all_label_distribution": all_dist
    })

    return [test_eval]

# 전체 평가 수행
for file in input_files:
    results.extend(evaluate_file(file))

# 결과 저장
output_txt_path = "threshold_eval_from_val_scores_confusion_matrix.txt"
with open(output_txt_path, "w", encoding="utf-8") as fout:
    for r in results:
        fout.write(f"File: {r['filename']}\n")
        fout.write(f"Strategy: {r['strategy']}\n")
        fout.write(f"  Threshold (from val set): {r['threshold']:.6f}\n")
        fout.write(f"  Validation Balanced Accuracy: {r['val_score']:.4f}\n")
        fout.write(f"  Test Accuracy: {r['accuracy']:.4f}\n")
        fout.write(f"  Test Precision: {r['precision']:.4f}\n")
        fout.write(f"  Test Recall: {r['recall']:.4f}\n")
        fout.write(f"  Test F1 Score: {r['f1']:.4f}\n")
        fout.write(f"  Test Balanced Accuracy: {r['balanced_accuracy']:.4f}\n")
        fout.write(f"  Confusion Matrix (tp, tn, fp, fn): ({r['tp']}, {r['tn']}, {r['fp']}, {r['fn']})\n")
        fout.write("  Label Distribution:\n")
        fout.write(f"    [val]   total={r['val_label_distribution']['total']}, 1_ratio={r['val_label_distribution']['ratio_1']:.4f}, 0_ratio={r['val_label_distribution']['ratio_0']:.4f}\n")
        fout.write(f"    [test]  total={r['test_label_distribution']['total']}, 1_ratio={r['test_label_distribution']['ratio_1']:.4f}, 0_ratio={r['test_label_distribution']['ratio_0']:.4f}\n")
        fout.write(f"    [all]   total={r['all_label_distribution']['total']}, 1_ratio={r['all_label_distribution']['ratio_1']:.4f}, 0_ratio={r['all_label_distribution']['ratio_0']:.4f}\n")
        fout.write("\n")

print(f"✅ 오답(json)들은 '{output_dir}'에 저장되었고, 평가 결과는 '{output_txt_path}'에 저장되었습니다.")


▶ 평가 대상 파일:
 - ./fizz_original_xsumfaith_e-c_min.json
 - ./fizz_original_xsumfaith_e-c_mean.json
 - ./fizz_original_frank_e-c_min.json
 - ./fizz_original_frank_e-c_mean.json
 - ./fizz_original_factcc_e-c_min.json
 - ./fizz_original_factcc_e-c_mean.json
✅ 오답(json)들은 './analysis'에 저장되었고, 평가 결과는 'threshold_eval_from_val_scores_confusion_matrix.txt'에 저장되었습니다.
