In [13]:
# 更新逻辑：保存最短的正确答案的 full predict（推理过程），而不是 boxed 的内容本身

# 新的 data_by_index 数据结构初始化
data_by_index_full = {}

# 重读文件，并保存完整的推理内容
for pred_file, detail_file in zip(prediction_files, detail_files):
    with open(pred_file, 'r', encoding='utf-8') as f_pred, open(detail_file, 'r', encoding='utf-8') as f_detail:
        preds = [json.loads(line) for line in f_pred]
        details = [json.loads(line) for line in f_detail]
        for pred, detail in zip(preds, details):
            idx = detail["index"]
            full_answer = pred["predict"]
            correct = detail["accuracy"] == 100.0
            if idx not in data_by_index_full:
                data_by_index_full[idx] = {
                    "prompt": pred["prompt"],
                    "label": pred["label"],
                    "candidates": []
                }
            data_by_index_full[idx]["candidates"].append((full_answer, correct))

# 构造新的最终选择结果
final_results_full = []
for idx in sorted(data_by_index_full.keys()):
    item = data_by_index_full[idx]
    correct_answers = [ans for ans, ok in item["candidates"] if ok]
    if correct_answers:
        # 如果有多个正确答案，选择字符数最短的完整 predict
        best = min(correct_answers, key=len)
    else:
        # 否则使用参考答案
        best = item["label"]
    final_results_full.append({
        "index": idx,
        "prompt": item["prompt"],
        "final_answer": best
    })

# 保存为新的输出文件
output_path_full = os.path.join(input_dir, "merged_shortest_correct_full_predict.jsonl")
with open(output_path_full, 'w', encoding='utf-8') as f:
    for item in final_results_full:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

output_path_full


'./merged_shortest_correct_full_predict.jsonl'

In [6]:
import os
import json
import re
from typing import List

# 输入和输出路径（你可以自定义）
input_path = "merged_shortest_correct_full_predict.jsonl"
output_path = "final_expanded_prompt_reasoning_dataset.json"
instruction = "Solve the following math problem step by step. Write your reasoning clearly using LaTeX. Box the final answer using \\boxed{}."


def extract_problem_from_prompt(prompt: str) -> str:
    """支持全角或半角竖线的 prompt 提取"""
    match = re.search(r'<[｜\|]User[｜\|]>(.*?)<[｜\|]Assistant[｜\|]>', prompt, re.DOTALL)
    return match.group(1).strip() if match else None


def robust_split_sentences(text: str) -> List[str]:
    """基于标点、换行、LaTeX、强调等进行句子切分"""
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    text = re.sub(r'(?<=[。.\?])\s*', '\n', text)  # 标点断句
    text = re.sub(r'(\n+)', '\n', text)
    text = re.sub(r'(\*\*.*?\*\*)', r'\n\1\n', text)  # 加粗符号
    text = re.sub(r'(\\\[.*?\\\])', r'\n\1\n', text)  # LaTeX 块
    text = re.sub(r'(\\boxed\{.*?\})', r'\n\1\n', text)  # boxed 内容
    parts = text.split('\n')
    return [p.strip() for p in parts if p.strip()]


# 主处理逻辑
expanded_dataset = []

with open(input_path, 'r', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        prompt = item["prompt"]
        full_answer = item["final_answer"]

        problem = extract_problem_from_prompt(prompt)
        if not problem:
            continue

        sentences = robust_split_sentences(full_answer)
        box_index = next((i for i, s in enumerate(sentences) if "\\boxed" in s), len(sentences))

        for i in range(1, min(6, box_index + 1)):
            partial_input = " ".join(sentences[:i])
            remaining_output = " ".join(sentences[i:])
            expanded_dataset.append({
                "instruction": instruction,
                "input": f"{problem}\n{partial_input}",
                "output": remaining_output
            })

# 保存为 JSON 文件
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(expanded_dataset, f, ensure_ascii=False, indent=2)

print(f"✅ Done! {len(expanded_dataset)} samples saved to {output_path}")


✅ Done! 37409 samples saved to final_expanded_prompt_reasoning_dataset.json


In [9]:
import os, re, json, glob
from itertools import zip_longest
from collections import defaultdict
from typing import List

# ---------- 可修改参数 ----------
DATA_DIR   = "."  # 存放所有模型 prediction & detail 文件的目录
OUT_PATH   = "math_pref_pairs.jsonl"  # 输出文件名 (jsonl)
MAX_STEP   = 5   # 每条 good/bad 最多截断到前 N 句
SYSTEM_PROMPT = ("Solve the following math problem step by step. "
                 "Write your reasoning clearly using LaTeX. "
                 "Box the final answer using \\boxed{}.")

# ---------- 你的提取与分句函数 ----------
def extract_problem_from_prompt(prompt: str) -> str:
    match = re.search(r'<[｜\|]User[｜\|]>(.*?)<[｜\|]Assistant[｜\|]>', prompt, re.DOTALL)
    return match.group(1).strip() if match else ""

def robust_split_sentences(text: str) -> List[str]:
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    text = re.sub(r'(?<=[。.\?])\s*', '\n', text)      # 标点后断句
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'(\*\*.*?\*\*)', r'\n\1\n', text)   # **强调**
    text = re.sub(r'(\\\[.*?\\\])', r'\n\1\n', text)   # LaTeX 块
    text = re.sub(r'(\\boxed\{.*?\})', r'\n\1\n', text)
    return [s.strip() for s in text.split('\n') if s.strip()]

# ---------- Pass‑1: 读取所有模型，按 index 聚合 ----------
by_idx = defaultdict(lambda: {"prompt":None, "label":None, "good":[], "bad":[]})

pred_files   = sorted(glob.glob(os.path.join(DATA_DIR, "*-generated-predictions.jsonl")))
detail_files = sorted(glob.glob(os.path.join(DATA_DIR, "*-generated-predictions-detailed-results.jsonl")))
assert len(pred_files) == len(detail_files), "⛔ 文件数不匹配！"

for p_file, d_file in zip(pred_files, detail_files):
    with open(p_file, encoding="utf-8") as fp,\
         open(d_file, encoding="utf-8") as fd:
        for pred_line, det_line in zip_longest(fp, fd):
            if pred_line is None or det_line is None:
                break
            pred = json.loads(pred_line)
            det  = json.loads(det_line)
            idx  = det["index"]

            rec = by_idx[idx]
            rec["prompt"] = pred["prompt"]
            rec["label"]  = pred["label"]

            bucket = "good" if det["accuracy"] == 100.0 else "bad"
            rec[bucket].append(pred["predict"])

# ---------- Pass‑2: 生成 preference 样本并写 jsonl ----------
total = 0
with open(OUT_PATH, "w", encoding="utf-8") as fout:
    for idx, rec in by_idx.items():
        # 题目正文
        problem = extract_problem_from_prompt(rec["prompt"])
        if not problem or not rec["good"] or not rec["bad"]:
            continue   # 需要同时有好答案和坏答案

        # 把 label 也当作 good
        if rec["label"] not in rec["good"]:
            rec["good"].append(rec["label"])

        good_list, bad_list = rec["good"], rec["bad"]
        # 数量对齐
        while len(good_list) < len(bad_list): good_list.append(good_list[-1])
        while len(bad_list) < len(good_list): bad_list.append(bad_list[-1])

        for good, bad in zip(good_list, bad_list):
            g_sents = robust_split_sentences(good)
            b_sents = robust_split_sentences(bad)

            if not g_sents or not b_sents:
                continue

            max_k = min(len(g_sents), len(b_sents), MAX_STEP)
            for k in range(1, max_k + 1):
                obj = {
                    "conversations":[
                        {"from":"system", "value": SYSTEM_PROMPT},
                        {"from":"human",  "value": problem}
                    ],
                    "chosen":   {"from":"gpt", "value":" ".join(g_sents[:k])},
                    "rejected": {"from":"gpt", "value":" ".join(b_sents[:k])}
                }
                fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
                total += 1

print(f"✅ 生成完成：{total} 条样本 → {OUT_PATH}")


✅ 生成完成：0 条样本 → math_pref_pairs.jsonl


In [16]:
#!/usr/bin/env python
# coding: utf-8
"""
Build pair‑preference dataset for math GSM‑like tasks.
Author: ChatGPT (o3) – 2025‑05‑06
"""

import os
import re
import json
import glob
from collections import defaultdict

SYSTEM_PROMPT = (
    "Solve the following math problem step by step. "
    "Write your reasoning clearly using LaTeX. "
    "Box the final answer using \\boxed{}."
)

# ---------- 基础工具 ---------- #
def split_sentences(text: str):
    """按句号和换行符切分，去掉空句子"""
    text = text.strip()
    lines = text.splitlines()          # 先按换行
    sents = []
    for ln in lines:
        # 继续按英文/中文句号分段
        parts = re.split(r'(?<=\.|。)\s+', ln)
        for p in parts:
            p = p.strip()
            if p:
                sents.append(p)
    return sents


def accumulate_until_box(sents):
    """
    生成递增前缀列表，
    一旦某句里出现 \\boxed 则把该前缀作为最后一个元素并停止。
    """
    acc, cur = [], []
    for sent in sents:
        cur.append(sent)
        acc.append(" ".join(cur).strip())
        if r"\boxed" in sent:
            break
    return acc


def extract_question_block(prompt: str) -> str:
    """从 prompt 中截取 user 段（去掉 system 与 assistant 标签）"""
    user_tag = "user\n"
    ass_tag = "\nassistant"
    start = prompt.find(user_tag)
    if start == -1:
        return prompt.strip()
    start += len(user_tag)
    end = prompt.find(ass_tag, start)
    if end == -1:
        end = len(prompt)
    return prompt[start:end].strip()


def extract_pure_problem(prompt: str) -> str:
    """去掉最前面的 meta 指令，只保留题目正文"""
    blk = extract_question_block(prompt)
    # meta 指令与正文之间一般只隔一行
    splitted = blk.split("\n", 1)
    return (splitted[1] if len(splitted) == 2 else blk).strip()


# ---------- 读文件、聚合答案 ---------- #
def load_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            yield json.loads(line)


def collect_all_answers(root="."):
    """
    聚合不同模型的正确 / 错误回答：
    {index: {"problem": str, "correct": set(), "incorrect": set()}}
    """
    pattern = os.path.join(root, "**", "*-generated-predictions.jsonl")
    agg = defaultdict(lambda: {"problem": None, "correct": set(), "incorrect": set()})

    for pred_path in glob.glob(pattern, recursive=True):
        prefix = pred_path.rsplit("-generated-predictions.jsonl", 1)[0]
        detail_path = prefix + "-generated-predictions-detailed-results.jsonl"
        if not os.path.exists(detail_path):
            continue  # 没有详细评测文件就跳过

        acc_iter = (rec.get("accuracy", 0.0) for rec in load_jsonl(detail_path))

        for idx, (pred_rec, acc) in enumerate(zip(load_jsonl(pred_path), acc_iter)):
            problem = extract_pure_problem(pred_rec["prompt"])
            entry = agg[idx]
            entry["problem"] = problem
            # 标准答案永远算对
            entry["correct"].add(pred_rec["label"])
            if acc >= 99.9:  # 认为是正确
                entry["correct"].add(pred_rec["predict"])
            else:
                entry["incorrect"].add(pred_rec["predict"])
    return agg


# ---------- 生成数据集 ---------- #
def build_pair_examples(problem, cor_ans, err_ans):
    """
    给定一对(正确回答, 错误回答) -> 多条增量 prefix 样本
    """
    cor_sents = split_sentences(cor_ans)
    err_sents = split_sentences(err_ans)
    cor_prefixes = accumulate_until_box(cor_sents)
    err_prefixes = accumulate_until_box(err_sents)

    n = min(len(cor_prefixes), len(err_prefixes))
    examples = []
    for i in range(n):
        examples.append(
            {
                "conversations": [
                    {"from": "system", "value": SYSTEM_PROMPT},
                    {"from": "human",  "value": problem},
                ],
                "chosen":   {"from": "gpt", "value": cor_prefixes[i]},
                "rejected": {"from": "gpt", "value": err_prefixes[i]},
            }
        )
    return examples


def build_dataset(root_dir=".", out_file="math_pref_dataset.json"):
    agg = collect_all_answers(root_dir)
    dataset = []

    for idx, info in agg.items():
        cor, err = list(info["correct"]), list(info["incorrect"])
        if not err:                 # 全部答对 -> 忽略
            continue
        problem = info["problem"]
        # 让正确 / 错误数量相同（不足的重复补齐）
        pair_cnt = max(len(cor), len(err))
        for k in range(pair_cnt):
            exs = build_pair_examples(
                problem,
                cor[k % len(cor)],
                err[k % len(err)],
            )
            dataset.extend(exs)

    # ----------- 随机采样 10 % -----------
    import os, re, json, glob, random      # ← 增加 random
    keep_num = 100000
    dataset = random.sample(dataset, keep_num)
            
    # 写文件
    with open(out_file, "w", encoding="utf-8") as fout:
        json.dump(dataset, fout, ensure_ascii=False, indent=2)
    print(f"Finished! 生成 {len(dataset)} 条记录 -> {out_file}")


# if __name__ == "__main__":
#     # 当前目录运行即可；如需指定其它路径，改 build_dataset(root_dir=...)
build_dataset(".")


Finished! 生成 100000 条记录 -> math_pref_dataset.json


In [11]:
import json, itertools, pprint, textwrap

with open("math_pref_dataset.json", encoding="utf-8") as f:
    data = json.load(f)

for i, sample in zip(range(5), data):
    print(f"\n=== SAMPLE #{i} ===")
    pprint.pprint(sample, compact=True, width=100)



=== SAMPLE #0 ===
{'chosen': {'from': 'gpt',
            'value': 'Let $x$ be the number of band members in each row for the original '
                     'formation, when two are left over.'},
 'conversations': [{'from': 'system',
                    'value': 'Solve the following math problem step by step. Write your reasoning '
                             'clearly using LaTeX. Box the final answer using \\boxed{}.'},
                   {'from': 'human',
                    'value': 'A rectangular band formation is a formation with $m$ band members in '
                             'each of $r$ rows, where $m$ and $r$ are integers. A particular band '
                             'has less than 100 band members. The director arranges them in a '
                             'rectangular formation and finds that he has two members left over. '
                             'If he increases the number of members in each row by 1 and reduces '
                             'the number 