In [8]:
import os
import pickle
import json

# Datasets to process; each one is expected to ship as a
# "<name>_val.pkl" / "<name>_test.pkl" pair inside input_dir.
datasets = [
    "xsumfaith",
    "frank",
    "cogensumm",
    "factcc",
]

# Pickles are read from, and merged JSON files written to, the working directory.
input_dir = "./"
output_dir = "./"

# Ensure the destination exists before anything is written to it.
os.makedirs(output_dir, exist_ok=True)

# The "cut" value is chosen by the caller based on which file is being loaded.
def load_and_tag(path, cut_name):
    """Load a pickled collection of entries and stamp each with its split name.

    Args:
        path: Path of the pickle file to read.
        cut_name: Value stored under the ``"cut"`` key of every entry
            (the caller in this file passes ``"val"`` or ``"test"``).

    Returns:
        The unpickled object itself; its entries are modified in place.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so this should
    only be pointed at trusted files.
    """
    with open(path, "rb") as pickle_file:
        records = pickle.load(pickle_file)

    # Tag in place: the same objects that were unpickled are returned.
    for record in records:
        record["cut"] = cut_name

    return records

# Keep only the fields needed for the JSON export.
def to_json_format(entry, default_dataset):
    """Project one entry onto the fixed set of fields written to JSON.

    Args:
        entry: Mapping for a single example. ``"document"``, ``"claim"``,
            ``"label"`` and ``"cut"`` are required (a missing one raises
            ``KeyError``); the remaining fields are optional.
        default_dataset: Value used for ``"dataset"`` when the entry does not
            carry one.

    Returns:
        A new dict with the export keys in a fixed order. Keys of ``entry``
        not listed here are dropped.
    """
    # Fields are read in output order so the resulting key order (and the
    # first KeyError raised for an incomplete entry) stays stable.
    record = {
        "document": entry["document"],
        "claim": entry["claim"],
    }
    record["bbcid"] = entry.get("bbcid")
    record["model_name"] = entry.get("model_name")
    record["label"] = entry["label"]
    record["cut"] = entry["cut"]
    record["annotations"] = entry.get("annotations", [])
    record["dataset"] = entry.get("dataset", default_dataset)
    # NOTE(review): "xsum" is the fallback origin for every dataset - confirm
    # this is intended for datasets that are not XSum-based.
    record["origin"] = entry.get("origin", "xsum")
    return record

# Merge the validation and test splits of every dataset into one JSON file.
for dataset in datasets:
    output_json_path = os.path.join(output_dir, f"{dataset}.json")
    val_path, test_path = (
        os.path.join(input_dir, f"{dataset}_val.pkl"),
        os.path.join(input_dir, f"{dataset}_test.pkl"),
    )

    # Load each split with its "cut" tag; validation entries come first.
    val_data = load_and_tag(val_path, "val")
    test_data = load_and_tag(test_path, "test")
    merged_data = val_data + test_data

    # Reduce every entry to the export fields, using the dataset name as the
    # fallback "dataset" value.
    final_data = [to_json_format(record, dataset) for record in merged_data]

    # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(final_data, json_file, ensure_ascii=False, indent=4)

    print(f"✅ Saved merged data to {output_json_path} ({len(final_data)} entries)")


✅ Saved merged data to ./xsumfaith.json (2500 entries)
✅ Saved merged data to ./frank.json (2246 entries)
✅ Saved merged data to ./cogensumm.json (1681 entries)
✅ Saved merged data to ./factcc.json (1434 entries)
