# Save each subset and split to a separate file

In [2]:
from datasets import load_dataset
import json
import os
import re

# GSM8K configurations (passed to load_dataset as the config name) and the
# splits exported for each of them.
subsets = ["main", "socratic"]
splits = ["train", "test"]

# Root output directory; files are written to <base_dir>/<split>/<subset>.json
base_dir = "gsm8k"

# Function to replace '#### 72' with '\boxed{72}' at the end
def replace_boxed(answer: str) -> str:
    r"""Turn a ``#### <value>`` answer marker into a trailing ``\boxed{<value>}``.

    The value is taken from the first ``####`` marker found (the rest of that
    line, stripped). Every ``#### ...`` marker is then removed from the text,
    trailing whitespace is dropped, and ``\boxed{<value>}`` is appended on a
    new line.

    Args:
        answer: Solution text, possibly containing a ``#### <value>`` marker.

    Returns:
        The rewritten text, or ``answer`` unchanged when no marker is present.
    """
    found = re.search(r"####\s*(.+)", answer)
    if found is None:
        # Nothing to convert; hand the text back untouched.
        return answer

    boxed_value = found.group(1).strip()
    # Drop the original marker(s) and any whitespace left dangling at the end.
    reasoning = re.sub(r"####\s*.+", "", answer).rstrip()
    return reasoning + f"\n\\boxed{{{boxed_value}}}"

# Export every (subset, split) pair to <base_dir>/<split>/<subset>.json
for subset in subsets:
    for split in splits:
        print(f"Processing subset: {subset} | split: {split} ...")
        try:
            dataset = load_dataset("openai/gsm8k", subset, split=split)
        except Exception as e:
            # Best effort: report the failure and move on to the next pair.
            print(f"Skipping {subset}-{split}: {e}")
            continue

        # One record per example; missing fields become empty strings and the
        # '#### <value>' marker in the answer is rewritten as \boxed{<value>}.
        formatted_data = [
            {
                "instruction": "",
                "input": (item["question"] or "").strip(),
                "output": replace_boxed((item["answer"] or "").strip()),
            }
            for item in dataset
        ]

        # Write the records as pretty-printed UTF-8 JSON.
        out_dir = os.path.join(base_dir, split)
        output_path = os.path.join(out_dir, f"{subset}.json")
        os.makedirs(out_dir, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(formatted_data, f, ensure_ascii=False, indent=2)

        print(f"Exported: {output_path}")

print("All subsets and splits have been processed.")


Processing subset: main | split: train ...
Exported: gsm8k\train\main.json
Processing subset: main | split: test ...
Exported: gsm8k\test\main.json
Processing subset: socratic | split: train ...
Exported: gsm8k\train\socratic.json
Processing subset: socratic | split: test ...
Exported: gsm8k\test\socratic.json
All subsets and splits have been processed.


# Save all subsets merged into one file per split

In [1]:
from datasets import load_dataset
import json
import os
import re

# GSM8K configurations to merge, and the splits to export (one file per split).
subsets = ["main", "socratic"]
splits = ["train", "test"]

# Make sure the output directory exists before any gsm8k/<split>.json is written.
os.makedirs("gsm8k", exist_ok=True)

# Function to replace '#### 72' with '\boxed{72}'
def replace_boxed(answer: str) -> str:
    r"""Replace the ``#### <value>`` marker with ``\boxed{<value>}`` on a new last line.

    Args:
        answer: Solution text that may contain a ``#### <value>`` marker.

    Returns:
        ``answer`` unchanged if it has no marker. Otherwise the text with all
        ``#### ...`` markers removed and trailing whitespace stripped, followed
        by a newline and ``\boxed{<value>}``, where ``<value>`` is the stripped
        text after the first marker.
    """
    hit = re.search(r"####\s*(.+)", answer)
    if not hit:
        return answer

    value = hit.group(1).strip()
    without_marker = re.sub(r"####\s*.+", "", answer)
    return without_marker.rstrip() + f"\n\\boxed{{{value}}}"

# Process each split, merging every subset into a single gsm8k/<split>.json file.
# The instruction text is identical for every record, so define it once.
instruction_text = "Solve the following math problem step by step. Write your reasoning clearly using LaTeX. Box the final answer using \\boxed{}."

for split in splits:
    print(f"Processing split: {split} ...")
    all_data = []
    skipped_subsets = []  # subsets that failed to load for this split

    for subset in subsets:
        print(f"  Loading subset: {subset} ...")
        try:
            dataset = load_dataset("openai/gsm8k", subset, split=split)
        except Exception as e:
            # Best effort: keep going, but remember the gap so it is reported
            # next to the exported file instead of being buried in the log.
            print(f"  Skipping {subset} ({split}): {e}")
            skipped_subsets.append(subset)
            continue

        for item in dataset:
            # Missing fields become empty strings; the '#### <value>' marker in
            # the answer is rewritten as \boxed{<value>}.
            input_text = item["question"].strip() if item["question"] else ""
            output_text = item["answer"].strip() if item["answer"] else ""
            output_text = replace_boxed(output_text)
            all_data.append({
                "instruction": instruction_text,
                "input": input_text,
                "output": output_text
            })

    # Do not overwrite a previous export with an empty list when nothing loaded.
    if not all_data:
        print(f"No data loaded for split {split}; nothing written.")
        continue

    # Save merged file for this split
    output_path = f"gsm8k/{split}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print(f"Exported: {output_path}")
    if skipped_subsets:
        # The file was written, but it does not contain every requested subset.
        print(f"  Warning: {output_path} is missing subset(s): {', '.join(skipped_subsets)}")

print("All splits processed and saved.")


  from .autonotebook import tqdm as notebook_tqdm


Processing split: train ...
  Loading subset: main ...
  Loading subset: socratic ...
Exported: gsm8k/train.json
Processing split: test ...
  Loading subset: main ...
  Loading subset: socratic ...
Exported: gsm8k/test.json
All splits processed and saved.
