In [1]:
from datasets import load_dataset
import json
import os

# Dataset configurations to export and the splits to pull for each of them
subsets = ["main", "socratic"]
splits = ["train", "test"]

# Root folder for the exported files: <base_dir>/<split>/<subset>.json
base_dir = "gsm8k"

# Export every (subset, split) pair as a list of instruction/input/output records
for subset in subsets:
    for split in splits:
        print(f"Processing subset: {subset} | split: {split} ...")
        try:
            dataset = load_dataset("openai/gsm8k", subset, split=split)
        except Exception as err:
            # Best effort: report the failed pair and carry on with the next one
            print(f"Skipping {subset}-{split}: {err}")
            continue

        # Convert rows to records; a missing or empty field becomes ""
        formatted_data = []
        for item in dataset:
            question = item["question"] or ""
            answer = item["answer"] or ""
            formatted_data.append(
                {
                    "instruction": "",
                    "input": question.strip(),
                    "output": answer.strip(),
                }
            )

        # Make sure the per-split folder exists, then write the JSON file
        out_dir = os.path.join(base_dir, split)
        os.makedirs(out_dir, exist_ok=True)
        output_path = os.path.join(out_dir, f"{subset}.json")
        with open(output_path, "w", encoding="utf-8") as out_file:
            json.dump(formatted_data, out_file, ensure_ascii=False, indent=2)

        print(f"Exported: {output_path}")

print("All subsets and splits have been processed.")


  from .autonotebook import tqdm as notebook_tqdm


Processing subset: main | split: train ...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 303474.24 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 263705.94 examples/s]


Exported: gsm8k\train\main.json
Processing subset: main | split: test ...
Exported: gsm8k\test\main.json
Processing subset: socratic | split: train ...


Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 533815.31 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 263819.12 examples/s]


Exported: gsm8k\train\socratic.json
Processing subset: socratic | split: test ...
Exported: gsm8k\test\socratic.json
All subsets and splits have been processed.


In [2]:
from datasets import load_dataset
import json
import os
import re

# Dataset configurations and splits to export. Each (subset, split) pair is
# passed to load_dataset("openai/gsm8k", subset, split=split) in the loop below.
subsets = ["main", "socratic"]
splits = ["train", "test"]

# Root output directory; files are written to <base_dir>/<split>/<subset>.json.
base_dir = "gsm8k"

# Function to replace '#### 72' with '\boxed{72}' at the end
def replace_boxed(answer: str) -> str:
    r"""Move the ``#### <value>`` final-answer marker into a trailing ``\boxed{<value>}``.

    The value is taken from the first ``####`` marker found in ``answer``.
    Every ``#### ...`` marker is then removed, trailing whitespace is trimmed,
    and ``\boxed{<value>}`` is appended on its own line.

    Args:
        answer: Solution text that may contain a ``#### <value>`` marker.

    Returns:
        The rewritten text, or ``answer`` unchanged when no marker is found.
        When nothing but the marker was present, only ``\boxed{<value>}`` is
        returned, without a leading newline.
    """
    match = re.search(r"####\s*(.+)", answer)
    if match is None:
        return answer

    final_answer = match.group(1).strip()
    # Drop the original '#### ...' marker(s) and the whitespace left dangling before them.
    body = re.sub(r"####\s*.+", "", answer).rstrip()
    boxed = f"\\boxed{{{final_answer}}}"
    # Only emit the separating newline when there is text to separate from;
    # otherwise the result would start with a stray blank line.
    return f"{body}\n{boxed}" if body else boxed

# Export every (subset, split) pair, rewriting each answer's '#### <value>'
# marker via replace_boxed before saving
for subset in subsets:
    for split in splits:
        print(f"Processing subset: {subset} | split: {split} ...")
        try:
            dataset = load_dataset("openai/gsm8k", subset, split=split)
        except Exception as load_error:
            # Best effort: report the failed pair and continue with the rest
            print(f"Skipping {subset}-{split}: {load_error}")
            continue

        # Build one record per row; a missing or empty field becomes ""
        formatted_data = []
        for item in dataset:
            input_text = (item["question"] or "").strip()
            output_text = replace_boxed((item["answer"] or "").strip())
            formatted_data.append(
                dict(instruction="", input=input_text, output=output_text)
            )

        # Write <base_dir>/<split>/<subset>.json, creating the folder if needed
        out_dir = os.path.join(base_dir, split)
        os.makedirs(out_dir, exist_ok=True)
        output_path = os.path.join(out_dir, f"{subset}.json")
        with open(output_path, "w", encoding="utf-8") as json_file:
            json.dump(formatted_data, json_file, ensure_ascii=False, indent=2)

        print(f"Exported: {output_path}")

print("All subsets and splits have been processed.")


Processing subset: main | split: train ...
Exported: gsm8k\train\main.json
Processing subset: main | split: test ...
Exported: gsm8k\test\main.json
Processing subset: socratic | split: train ...
Exported: gsm8k\train\socratic.json
Processing subset: socratic | split: test ...
Exported: gsm8k\test\socratic.json
All subsets and splits have been processed.
