# Save each subset to its own JSON file

In [5]:
from datasets import load_dataset
import json
import os


# Every subject configuration of the hendrycks_math dataset that gets exported
subsets = [
    "algebra",
    "counting_and_probability",
    "geometry",
    "intermediate_algebra",
    "number_theory",
    "prealgebra",
    "precalculus",
]

split = "test"

# One JSON file per subset is written below hendrycks_math/<split>/
os.makedirs(f"hendrycks_math/{split}", exist_ok=True)


def _clean(text):
    """Return *text* without surrounding whitespace, or "" when it is empty/None."""
    return text.strip() if text else ""


# Export the subsets one at a time
for subset in subsets:
    print(f"Processing subset: {subset} ...")
    dataset = load_dataset("EleutherAI/hendrycks_math", subset, split=split)

    # Reshape each row into the LLaMA Factory record layout
    # (instruction / input / output); the instruction is left blank here.
    formatted_data = [
        {
            "instruction": "",
            "input": _clean(item["problem"]),
            "output": _clean(item["solution"]),
        }
        for item in dataset
    ]

    # Write the records as pretty-printed UTF-8 JSON, keeping non-ASCII
    # characters unescaped.
    output_path = f"hendrycks_math/{split}/{subset}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(formatted_data, f, ensure_ascii=False, indent=2)

    print(f"Exported: {output_path}")

print("All subsets have been processed.")


Processing subset: algebra ...


Using the latest cached version of the dataset since EleutherAI/hendrycks_math couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'algebra' at C:\Users\fzkuji\.cache\huggingface\datasets\EleutherAI___hendrycks_math\algebra\0.0.0\21a5633873b6a120296cce3e2df9d5550074f4a3 (last modified on Mon Apr 28 17:33:33 2025).


Exported: hendrycks_math/test/algebra.json
Processing subset: counting_and_probability ...


Using the latest cached version of the dataset since EleutherAI/hendrycks_math couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'counting_and_probability' at C:\Users\fzkuji\.cache\huggingface\datasets\EleutherAI___hendrycks_math\counting_and_probability\0.0.0\21a5633873b6a120296cce3e2df9d5550074f4a3 (last modified on Mon Apr 28 17:33:41 2025).


Exported: hendrycks_math/test/counting_and_probability.json
Processing subset: geometry ...


Using the latest cached version of the dataset since EleutherAI/hendrycks_math couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'geometry' at C:\Users\fzkuji\.cache\huggingface\datasets\EleutherAI___hendrycks_math\geometry\0.0.0\21a5633873b6a120296cce3e2df9d5550074f4a3 (last modified on Mon Apr 28 17:33:48 2025).


Exported: hendrycks_math/test/geometry.json
Processing subset: intermediate_algebra ...


Using the latest cached version of the dataset since EleutherAI/hendrycks_math couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'intermediate_algebra' at C:\Users\fzkuji\.cache\huggingface\datasets\EleutherAI___hendrycks_math\intermediate_algebra\0.0.0\21a5633873b6a120296cce3e2df9d5550074f4a3 (last modified on Mon Apr 28 17:33:55 2025).


Exported: hendrycks_math/test/intermediate_algebra.json
Processing subset: number_theory ...


Using the latest cached version of the dataset since EleutherAI/hendrycks_math couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'number_theory' at C:\Users\fzkuji\.cache\huggingface\datasets\EleutherAI___hendrycks_math\number_theory\0.0.0\21a5633873b6a120296cce3e2df9d5550074f4a3 (last modified on Mon Apr 28 17:34:02 2025).


Exported: hendrycks_math/test/number_theory.json
Processing subset: prealgebra ...


Using the latest cached version of the dataset since EleutherAI/hendrycks_math couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'prealgebra' at C:\Users\fzkuji\.cache\huggingface\datasets\EleutherAI___hendrycks_math\prealgebra\0.0.0\21a5633873b6a120296cce3e2df9d5550074f4a3 (last modified on Mon Apr 28 17:34:09 2025).


Exported: hendrycks_math/test/prealgebra.json
Processing subset: precalculus ...


Using the latest cached version of the dataset since EleutherAI/hendrycks_math couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'precalculus' at C:\Users\fzkuji\.cache\huggingface\datasets\EleutherAI___hendrycks_math\precalculus\0.0.0\21a5633873b6a120296cce3e2df9d5550074f4a3 (last modified on Mon Apr 28 17:34:15 2025).


Exported: hendrycks_math/test/precalculus.json
All subsets have been processed.


# Save all subsets together in a single JSON file per split

In [2]:
from datasets import load_dataset
import json
import os

# All math subsets
subsets = [
    "algebra",
    "counting_and_probability",
    "geometry",
    "intermediate_algebra",
    "number_theory",
    "prealgebra",
    "precalculus",
]

# Prompt attached to every exported record.
instruction = "Solve the following math problem step by step. Write your reasoning clearly using LaTeX. Box the final answer using \\boxed{}."


def _clean(text):
    """Return *text* without surrounding whitespace, or "" when it is empty/None."""
    return text.strip() if text else ""


# Create the output directory once up front; it is the same for every split.
os.makedirs("hendrycks_math", exist_ok=True)

# Process both splits, merging all subsets of a split into a single file
for split in ["train", "test"]:
    print(f"Processing split: {split} ...")
    all_data = []
    skipped = []  # subsets that failed to load for this split

    for subset in subsets:
        print(f"  Loading subset: {subset} ...")
        try:
            dataset = load_dataset("EleutherAI/hendrycks_math", subset, split=split)
        except Exception as e:
            # Best-effort export: keep going with the remaining subsets, but
            # remember the failure so it can be reported next to the saved file.
            print(f"  Skipping {subset} ({split}): {e}")
            skipped.append(subset)
            continue

        # Append this subset's rows as instruction / input / output records.
        all_data.extend(
            {
                "instruction": instruction,
                "input": _clean(item["problem"]),
                "output": _clean(item["solution"]),
            }
            for item in dataset
        )

    # Write the merged records as pretty-printed UTF-8 JSON.
    output_path = f"hendrycks_math/{split}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print(f"Saved: {output_path}")
    if skipped:
        # Make an incomplete merge obvious instead of leaving it buried in
        # the per-subset log lines above.
        print(f"  Warning: {output_path} is missing {len(skipped)} subset(s): {', '.join(skipped)}")

print("All done.")


Processing split: train ...
  Loading subset: algebra ...
  Loading subset: counting_and_probability ...
  Loading subset: geometry ...
  Loading subset: intermediate_algebra ...
  Loading subset: number_theory ...
  Loading subset: prealgebra ...


Using the latest cached version of the dataset since EleutherAI/hendrycks_math couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'prealgebra' at C:\Users\fzkuji\.cache\huggingface\datasets\EleutherAI___hendrycks_math\prealgebra\0.0.0\21a5633873b6a120296cce3e2df9d5550074f4a3 (last modified on Mon Apr 28 17:34:09 2025).


  Loading subset: precalculus ...
Saved: hendrycks_math/train.json
Processing split: test ...
  Loading subset: algebra ...
  Loading subset: counting_and_probability ...
  Loading subset: geometry ...
  Loading subset: intermediate_algebra ...
  Loading subset: number_theory ...
  Loading subset: prealgebra ...
  Loading subset: precalculus ...
Saved: hendrycks_math/test.json
All done.
