# Cleanup HENDRYCKS: convert the Hendrycks MATH parquet shards into merged JSONL user/assistant pairs

In [1]:
from pathlib import Path
import json
import pandas as pd
from tqdm import tqdm

# Every input and output location is resolved relative to the notebook's working directory.
ROOT = Path("./")

# Input shards: four train and four validation parquet files, numbered 1-4.
TRAIN_PARQS = [
    ROOT / "train_parquet" / f"hendrycks_math_train_{i}.parquet"
    for i in range(1, 5)
]
VALID_PARQS = [
    ROOT / "valid_parquet" / f"hendrycks_math_valid_{i}.parquet"
    for i in range(1, 5)
]

# Output directories for the converted JSONL files; created up front so the
# conversion cells can write into them without further checks.
OUT_TRAIN_DIR = ROOT / "train_jsonl"
OUT_VALID_DIR = ROOT / "valid_jsonl"
OUT_TRAIN_DIR.mkdir(parents=True, exist_ok=True)
OUT_VALID_DIR.mkdir(parents=True, exist_ok=True)

# sanity: a bare `p` inside a loop body is never displayed by the notebook, so
# print each expected input together with whether it is present on disk.
for p in TRAIN_PARQS + VALID_PARQS:
    status = "found" if p.exists() else "MISSING"
    print(f"[{status}] {p}")


In [2]:
def parquet_to_pairs(in_path: Path, out_jsonl: Path, skip_rows: int = 0) -> int:
    """Convert one parquet file into a JSONL file of user/assistant pairs.

    Reads ``in_path`` with pandas, optionally skips the first ``skip_rows``
    rows, drops every row whose ``problem`` or ``solution`` is missing, and
    writes one JSON object per remaining row to ``out_jsonl`` in the form
    ``{"user": <problem>, "assistant": <solution>}``. Both values are converted
    with ``str`` and stripped of surrounding whitespace.

    Args:
        in_path: Parquet file that must contain ``problem`` and ``solution``
            columns.
        out_jsonl: Destination file; overwritten if it already exists. Its
            parent directory is created when missing.
        skip_rows: Number of leading rows to ignore. Values <= 0 skip nothing.

    Returns:
        The number of pairs written to ``out_jsonl``.

    Raises:
        KeyError: If ``problem`` or ``solution`` is not a column of the file.
    """
    print(f"\n=== {in_path.name} -> {out_jsonl.name} ===")
    df = pd.read_parquet(in_path)

    # required columns
    for col in ("problem", "solution"):
        if col not in df.columns:
            raise KeyError(f"{in_path.name} missing column {col!r}. Columns={list(df.columns)}")

    if skip_rows > 0:
        df = df.iloc[skip_rows:]

    # drop NaNs (counts are taken after any row skipping)
    before = len(df)
    df = df[df["problem"].notna() & df["solution"].notna()]
    after = len(df)
    print(f"Rows kept after NaN drop: {after}/{before}")

    # Make the function usable with any destination, not only pre-created dirs.
    out_jsonl.parent.mkdir(parents=True, exist_ok=True)

    # write JSONL with tqdm; ensure_ascii=False keeps non-ASCII text readable
    with out_jsonl.open("w", encoding="utf-8") as f:
        for q, a in tqdm(zip(df["problem"], df["solution"]),
                         total=after, desc=f"Writing {out_jsonl.name}"):
            obj = {"user": str(q).strip(), "assistant": str(a).strip()}
            json.dump(obj, f, ensure_ascii=False)
            f.write("\n")

    print(f"✅ Wrote {after} pairs → {out_jsonl.resolve()}")
    return after


In [3]:
# Convert every train shard. Each output keeps the input's stem, e.g.
# hendrycks_math_train_1.parquet -> hendrycks_math_train_1.jsonl
for parquet_path in tqdm(TRAIN_PARQS, desc="Train files"):
    jsonl_name = f"{parquet_path.stem}.jsonl"
    parquet_to_pairs(parquet_path, OUT_TRAIN_DIR / jsonl_name)


Train files:   0%|          | 0/4 [00:00<?, ?it/s]


=== hendrycks_math_train_1.parquet -> hendrycks_math_train_1.jsonl ===
Rows kept after NaN drop: 869/869


Writing hendrycks_math_train_1.jsonl: 100%|██████████| 869/869 [00:00<00:00, 85945.20it/s]


✅ Wrote 869 pairs → C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\hendrycks_math_train_1.jsonl

=== hendrycks_math_train_2.parquet -> hendrycks_math_train_2.jsonl ===
Rows kept after NaN drop: 1295/1295


Writing hendrycks_math_train_2.jsonl: 100%|██████████| 1295/1295 [00:00<00:00, 76822.01it/s]
Train files:  50%|█████     | 2/4 [00:00<00:00, 16.30it/s]

✅ Wrote 1295 pairs → C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\hendrycks_math_train_2.jsonl

=== hendrycks_math_train_3.parquet -> hendrycks_math_train_3.jsonl ===
Rows kept after NaN drop: 771/771


Writing hendrycks_math_train_3.jsonl: 100%|██████████| 771/771 [00:00<00:00, 72521.55it/s]


✅ Wrote 771 pairs → C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\hendrycks_math_train_3.jsonl

=== hendrycks_math_train_4.parquet -> hendrycks_math_train_4.jsonl ===
Rows kept after NaN drop: 1744/1744


Writing hendrycks_math_train_4.jsonl: 100%|██████████| 1744/1744 [00:00<00:00, 93880.23it/s]
Train files: 100%|██████████| 4/4 [00:00<00:00, 19.34it/s]

✅ Wrote 1744 pairs → C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\hendrycks_math_train_4.jsonl





In [4]:
# Convert every validation shard. Each output keeps the input's stem, e.g.
# hendrycks_math_valid_1.parquet -> hendrycks_math_valid_1.jsonl
for parquet_path in tqdm(VALID_PARQS, desc="Valid files"):
    jsonl_name = f"{parquet_path.stem}.jsonl"
    parquet_to_pairs(parquet_path, OUT_VALID_DIR / jsonl_name)


Valid files:   0%|          | 0/4 [00:00<?, ?it/s]


=== hendrycks_math_valid_1.parquet -> hendrycks_math_valid_1.jsonl ===
Rows kept after NaN drop: 540/540


Writing hendrycks_math_valid_1.jsonl: 100%|██████████| 540/540 [00:00<00:00, 89721.29it/s]


✅ Wrote 540 pairs → C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\valid_jsonl\hendrycks_math_valid_1.jsonl

=== hendrycks_math_valid_2.parquet -> hendrycks_math_valid_2.jsonl ===
Rows kept after NaN drop: 903/903


Writing hendrycks_math_valid_2.jsonl: 100%|██████████| 903/903 [00:00<00:00, 74018.58it/s]


✅ Wrote 903 pairs → C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\valid_jsonl\hendrycks_math_valid_2.jsonl

=== hendrycks_math_valid_3.parquet -> hendrycks_math_valid_3.jsonl ===
Rows kept after NaN drop: 474/474


Writing hendrycks_math_valid_3.jsonl: 100%|██████████| 474/474 [00:00<00:00, 77255.77it/s]
Valid files:  75%|███████▌  | 3/4 [00:00<00:00, 27.08it/s]

✅ Wrote 474 pairs → C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\valid_jsonl\hendrycks_math_valid_3.jsonl

=== hendrycks_math_valid_4.parquet -> hendrycks_math_valid_4.jsonl ===
Rows kept after NaN drop: 1187/1187


Writing hendrycks_math_valid_4.jsonl: 100%|██████████| 1187/1187 [00:00<00:00, 307608.21it/s]
Valid files: 100%|██████████| 4/4 [00:00<00:00, 27.26it/s]

✅ Wrote 1187 pairs → C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\valid_jsonl\hendrycks_math_valid_4.jsonl





In [5]:
# Peek at the first three records of one converted file (each line truncated
# to 300 characters) to eyeball the output format.
sample_out = OUT_TRAIN_DIR / "hendrycks_math_train_1.jsonl"
print("Sample output:", sample_out.resolve())
# Open through a context manager so the handle is closed deterministically
# instead of whenever the interpreter happens to collect it.
with sample_out.open("r", encoding="utf-8") as f:
    # zip() stops after three lines without reading the rest of the file.
    for i, line in zip(range(3), f):
        print(line.rstrip()[:300])


Sample output: C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\hendrycks_math_train_1.jsonl
{"user": "If $AAA_4$ can be expressed as $33_b$, where $A$ is a digit in base 4 and $b$ is a base greater than 5, what is the smallest possible sum $A+b$?", "assistant": "We can rewrite $AAA_4$ and $33_b$ to get \\begin{align*}\n16A+4A+A&=3b+3\\quad\\Rightarrow\\\\\n21A&=3b+3.\n\\end{align*}The smal
{"user": "A very large number $x$ is equal to $2^23^34^45^56^67^78^89^9$. What is the smallest positive integer that, when multiplied with $x$, produces a product that is a perfect square?", "assistant": "For the product to be a perfect square, all the exponents need to be even. So we don't need to 
{"user": "When the base-16 number $66666_{16}$ is written in base 2, how many base-2 digits (bits) does it have?", "assistant": "We have \\begin{align*}\n66666_{16} &= 6\\cdot 16^4 + 6\\cdot 16^3 + 6\\cdot 16^2 + 6\\cdot 16 + 6 \\\\\n&

In [6]:
from pathlib import Path
import json
from tqdm import tqdm

ROOT = Path("./")
# The four per-shard train JSONL files, in shard order 1-4.
IN_FILES = [
    ROOT / "train_jsonl" / f"hendrycks_math_train_{shard}.jsonl"
    for shard in range(1, 5)
]
# Single merged train file, written next to the shards.
OUT_FILE = ROOT / "train_jsonl" / "hendrycks_math_train.jsonl"
OUT_FILE.parent.mkdir(parents=True, exist_ok=True)

# Echo each expected input and stop at the first one that is absent.
for p in IN_FILES:
    print("Expecting:", p.resolve())
    if p.exists():
        continue
    raise FileNotFoundError(f"Missing: {p}")
print("Output  :", OUT_FILE.resolve())


Expecting: C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\hendrycks_math_train_1.jsonl
Expecting: C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\hendrycks_math_train_2.jsonl
Expecting: C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\hendrycks_math_train_3.jsonl
Expecting: C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\hendrycks_math_train_4.jsonl
Output  : C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\hendrycks_math_train.jsonl


In [8]:
def count_lines(path: Path) -> int:
    """Return how many lines the UTF-8 text file at ``path`` contains."""
    n_lines = 0
    with path.open("r", encoding="utf-8") as handle:
        for _line in handle:
            n_lines += 1
    return n_lines

# Merge the per-shard train files into OUT_FILE. A line is kept only when it
# parses as a JSON object whose "user" and "assistant" values are non-empty
# strings after stripping; every other line is counted as skipped.
total_in = sum(count_lines(p) for p in IN_FILES)
kept = 0
skipped = 0

with OUT_FILE.open("w", encoding="utf-8") as fout:
    # Context manager closes the progress bar even if the merge fails midway.
    with tqdm(total=total_in, desc=f"Merging → {OUT_FILE.name}") as pbar:
        for src in IN_FILES:
            with src.open("r", encoding="utf-8") as fin:
                for line in fin:
                    pbar.update(1)
                    line = line.strip()
                    if not line:
                        skipped += 1
                        continue
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError:
                        # Malformed JSON is skipped; other errors should surface.
                        skipped += 1
                        continue
                    # Valid JSON that is not an object (list, number, ...) has
                    # no "user"/"assistant" keys, so skip it instead of crashing.
                    if not isinstance(obj, dict):
                        skipped += 1
                        continue
                    u = obj.get("user")
                    a = obj.get("assistant")
                    # Missing (None) or non-string values cannot form a pair.
                    if not (isinstance(u, str) and isinstance(a, str)):
                        skipped += 1
                        continue
                    u = u.strip()
                    a = a.strip()
                    if not (u and a):
                        skipped += 1
                        continue
                    json.dump({"user": u, "assistant": a}, fout, ensure_ascii=False)
                    fout.write("\n")
                    kept += 1

print(f"✅ Done. Kept {kept} pairs, skipped {skipped}.")
print("Wrote:", OUT_FILE.resolve())


Merging → hendrycks_math_train.jsonl: 100%|██████████| 4679/4679 [00:00<00:00, 61699.75it/s]

✅ Done. Kept 4679 pairs, skipped 0.
Wrote: C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\hendrycks_math_train.jsonl





In [9]:
# The four per-shard validation JSONL files, in shard order 1-4.
IN_VALID = [
    ROOT / "valid_jsonl" / "hendrycks_math_valid_1.jsonl",
    ROOT / "valid_jsonl" / "hendrycks_math_valid_2.jsonl",
    ROOT / "valid_jsonl" / "hendrycks_math_valid_3.jsonl",
    ROOT / "valid_jsonl" / "hendrycks_math_valid_4.jsonl",
]
# Single merged validation file, written next to the shards.
OUT_VALID = ROOT / "valid_jsonl" / "hendrycks_math_valid.jsonl"
OUT_VALID.parent.mkdir(parents=True, exist_ok=True)

# Merge the shards into OUT_VALID. A line is kept only when it parses as a
# JSON object whose "user" and "assistant" values are non-empty strings after
# stripping; every other line is counted as skipped.
total_in = sum(count_lines(p) for p in IN_VALID)
kept = skipped = 0
with OUT_VALID.open("w", encoding="utf-8") as fout:
    # Context manager closes the progress bar even if the merge fails midway.
    with tqdm(total=total_in, desc=f"Merging → {OUT_VALID.name}") as pbar:
        for src in IN_VALID:
            with src.open("r", encoding="utf-8") as fin:
                for line in fin:
                    pbar.update(1)
                    line = line.strip()
                    if not line:
                        skipped += 1
                        continue
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError:
                        # Malformed JSON is skipped; other errors should surface.
                        skipped += 1
                        continue
                    # Valid JSON that is not an object (list, number, ...) has
                    # no "user"/"assistant" keys, so skip it instead of crashing.
                    if not isinstance(obj, dict):
                        skipped += 1
                        continue
                    u = obj.get("user")
                    a = obj.get("assistant")
                    # Missing (None) or non-string values cannot form a pair.
                    if not (isinstance(u, str) and isinstance(a, str)):
                        skipped += 1
                        continue
                    u = u.strip()
                    a = a.strip()
                    if not (u and a):
                        skipped += 1
                        continue
                    json.dump({"user": u, "assistant": a}, fout, ensure_ascii=False)
                    fout.write("\n")
                    kept += 1

print(f"✅ Valid merged. Kept {kept}, skipped {skipped}.")
print("Wrote:", OUT_VALID.resolve())


Merging → hendrycks_math_valid.jsonl: 100%|██████████| 3104/3104 [00:00<00:00, 73127.56it/s]

✅ Valid merged. Kept 3104, skipped 0.
Wrote: C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\valid_jsonl\hendrycks_math_valid.jsonl





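Finally, delete the per-shard files `train_jsonl/hendrycks_math_train_<N>.jsonl` and `valid_jsonl/hendrycks_math_valid_<N>.jsonl` (N = 1–4). The merged `hendrycks_math_train.jsonl` and `hendrycks_math_valid.jsonl` replace them and should be kept.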