In [1]:
""" 处理GAIA """

import json

def process_gaia_data(input_path, output_path):
    """
    Convert a GAIA JSONL file into a question/answer JSONL file.

    Every non-blank line of ``input_path`` is parsed as JSON. For each JSON
    object, the ``question`` field is copied and ``golden_answers`` is
    flattened into a single string (list items are joined with ", ").
    The converted records are written to ``output_path``, one JSON object
    per line, with exactly the keys ``question`` and ``answer``.

    Lines that are not valid JSON, or that are valid JSON but not an object,
    are reported on stdout and skipped instead of aborting the conversion.

    Args:
        input_path: Path of the source JSONL file.
        output_path: Path of the JSONL file to write. Missing parent
            directories are created.

    Returns:
        int: Number of records written to ``output_path``.

    Raises:
        OSError: If the input file cannot be read or the output file
            cannot be written.
    """
    # Function-scope import so this block does not depend on file-level imports.
    import os

    processed_data = []

    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue

            # Keep the try block narrow: only the parse step can raise JSONDecodeError.
            try:
                original_data = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"解析JSON错误: {e}, 行内容: {line}")
                continue

            # A line such as `[1, 2]` or `42` is valid JSON but has no fields;
            # calling .get() on it would raise and abort the whole run.
            if not isinstance(original_data, dict):
                print(f"跳过非JSON对象的行: {stripped}")
                continue

            question = original_data.get('question', '')

            golden_answers = original_data.get('golden_answers', [])
            if isinstance(golden_answers, list):
                # Multiple answers are merged into one comma-separated string;
                # an empty list yields an empty answer.
                answer = ', '.join(str(ans) for ans in golden_answers)
            elif golden_answers or isinstance(golden_answers, (int, float)):
                # Scalar answer. Numeric zero and False are legitimate values
                # and must not be collapsed to an empty string.
                answer = str(golden_answers)
            else:
                # None, empty string, or another empty container.
                answer = ''

            processed_data.append({
                "question": question,
                "answer": answer
            })

    # Make sure the destination directory exists before opening the file.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Written only after the input was read completely, so a failure while
    # reading never leaves a truncated output file behind.
    with open(output_path, 'w', encoding='utf-8') as f:
        for item in processed_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    return len(processed_data)

# Run the GAIA conversion and report the outcome.
input_file = "/share/project/kunluo/Datasets/Reasoning/Benchmark/GAIA/GAIA-processed.jsonl"
output_file = "/share/project/kunluo/Projects/TeamResearcher/storeroom/GAIA.jsonl"

try:
    processed_count = process_gaia_data(input_file, output_file)
    # Both status lines are emitted by one call; sep="\n" keeps them on separate lines.
    print(
        f"数据处理完成！成功处理 {processed_count} 条记录",
        f"输出文件: {output_file}",
        sep="\n",
    )
except Exception as e:
    # Any failure is reported on stdout rather than raised out of the cell.
    print(f"处理过程中出现错误: {e}")

数据处理完成！成功处理 103 条记录
输出文件: /share/project/kunluo/Projects/TeamResearcher/storeroom/GAIA.jsonl


In [2]:
""" 处理BrowseComp-Plus """

import json
import os

# Paths of the source file and of the converted target file.
source_file_path = '/share/project/kunluo/Datasets/Reasoning/Benchmark/BrowseCompPlus/processed.jsonl'
target_file_path = '/share/project/kunluo/Projects/TeamResearcher/storeroom/browsecomp-plus.jsonl'

# Make sure the directory that will hold the target file exists.
# exist_ok=True keeps this from failing if the directory appears between the
# check and the call; the emptiness guard covers a bare file name.
target_dir = os.path.dirname(target_file_path)
if target_dir and not os.path.exists(target_dir):
    os.makedirs(target_dir, exist_ok=True)
    print(f"已创建目录: {target_dir}")

# Number of records written to the target file.
processed_count = 0

print("开始处理数据...")

try:
    # Both files are managed by one `with` statement so they are closed even on error.
    # The source is opened first, so a missing source never truncates the target.
    # encoding='utf-8' ensures non-ASCII text is read and written correctly.
    with open(source_file_path, 'r', encoding='utf-8') as infile, \
         open(target_file_path, 'w', encoding='utf-8') as outfile:

        # Convert the source file line by line.
        for line in infile:
            # Blank lines are not records; skip them without a warning.
            if not line.strip():
                continue

            # Keep the try block narrow: only parsing can raise JSONDecodeError.
            try:
                source_data = json.loads(line)
            except json.JSONDecodeError:
                print(f"警告: 跳过无法解析的 JSON 行: {line.strip()}")
                continue

            # Only JSON objects carry fields. Anything else (list, number, string)
            # is treated as an incomplete record instead of raising on .get().
            if isinstance(source_data, dict):
                question = source_data.get('question')
                golden_answers = source_data.get('golden_answers')
            else:
                question = None
                golden_answers = None

            # 'golden_answers' must be a non-empty list; its first element is the answer.
            # The explicit list check stops a bare string from being indexed
            # (which would yield only its first character) and stops ints or
            # dicts from raising and aborting the whole run.
            if question and isinstance(golden_answers, list) and golden_answers:
                answer = golden_answers[0]

                # Record in the target format.
                target_data = {
                    "question": question,
                    "answer": answer
                }

                # ensure_ascii=False writes non-ASCII characters verbatim instead of
                # escaping them; the trailing newline keeps the JSONL layout.
                outfile.write(json.dumps(target_data, ensure_ascii=False) + '\n')

                processed_count += 1
            else:
                print(f"警告: 跳过格式不完整的数据行: {line.strip()}")

    print("\n处理完成！")
    print(f"总共处理了 {processed_count} 条数据。")
    print(f"源文件: {source_file_path}")
    print(f"目标文件: {target_file_path}")

except FileNotFoundError:
    print(f"错误: 源文件未找到 at '{source_file_path}'")
except Exception as e:
    print(f"处理过程中发生错误: {e}")

开始处理数据...

处理完成！
总共处理了 830 条数据。
源文件: /share/project/kunluo/Datasets/Reasoning/Benchmark/BrowseCompPlus/processed.jsonl
目标文件: /share/project/kunluo/Projects/TeamResearcher/storeroom/browsecomp-plus.jsonl
