# Cleanup for GSM8K

In [1]:
from pathlib import Path
import json
import pandas as pd
from tqdm import tqdm

# --- paths ---------------------------------------------------------------
# Everything below is relative to the current working directory.
ROOT = Path("./")

# Inputs: the GSM8K train / validation splits stored as parquet.
TRAIN_PARQUET = ROOT / "train_parquet" / "gsm8k_train.parquet"
VALID_PARQUET = ROOT / "valid_parquet" / "gsm8k_valid.parquet"

# Outputs: one JSONL file per split; their folders are created up front.
OUT_TRAIN = ROOT / "train_jsonl" / "gsm8k_train.jsonl"
OUT_VALID = ROOT / "valid_jsonl" / "gsm8k_valid.jsonl"
for out_file in (OUT_TRAIN, OUT_VALID):
    out_file.parent.mkdir(parents=True, exist_ok=True)

# --- sanity check --------------------------------------------------------
# Show where each input is expected and stop at the first one that is absent.
for parquet_path in (TRAIN_PARQUET, VALID_PARQUET):
    resolved = parquet_path.resolve()
    print("Expecting:", resolved)
    if not parquet_path.exists():
        raise FileNotFoundError(f"Missing: {resolved}")


Expecting: C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_parquet\gsm8k_train.parquet
Expecting: C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\valid_parquet\gsm8k_valid.parquet


In [2]:
def parquet_to_pairs(in_path: Path, out_jsonl: Path, skip_rows: int = 0) -> None:
    """Convert a question/answer parquet file into a JSONL file of chat pairs.

    Every kept row becomes one line of the form
    ``{"user": <question>, "assistant": <answer>}``; both values are cast to
    ``str`` and stripped of surrounding whitespace.

    Args:
        in_path: Parquet file that must contain ``question`` and ``answer``
            columns.
        out_jsonl: Destination JSONL file. It is overwritten if it exists and
            its parent directories are created if they are missing.
        skip_rows: Number of leading rows to discard before filtering.
            Values ``<= 0`` skip nothing.

    Raises:
        KeyError: If ``question`` or ``answer`` is not a column of the input.
    """
    print(f"\n=== Processing {in_path.name} → {out_jsonl.name} ===")
    df = pd.read_parquet(in_path)

    # ensure required columns exist
    for col in ("question", "answer"):
        if col not in df.columns:
            raise KeyError(f"Column {col!r} not found in {in_path.name}. Columns={list(df.columns)}")

    # optionally discard the first N rows (e.g. when the leading rows hold
    # column keys rather than real samples)
    if skip_rows > 0:
        df = df.iloc[skip_rows:]

    # drop rows where either field is missing
    before = len(df)
    df = df.dropna(subset=["question", "answer"])
    after = len(df)
    print(f"Rows kept after NaN drop: {after}/{before}")

    # The destination may be any path, so do not rely on an earlier cell
    # having created its folder.
    out_jsonl.parent.mkdir(parents=True, exist_ok=True)

    # write out to JSONL; newline="\n" turns off platform newline translation
    # so the file is LF-terminated on every OS (no CRLF on Windows).
    with out_jsonl.open("w", encoding="utf-8", newline="\n") as f:
        for q, a in tqdm(zip(df["question"], df["answer"]),
                         total=after, desc=f"Writing {out_jsonl.name}"):
            obj = {"user": str(q).strip(), "assistant": str(a).strip()}
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")

    print(f"✅ Wrote {after} pairs to {out_jsonl.resolve()}")


In [3]:
# Convert both splits to JSONL; no leading rows are skipped.
for src_parquet, dst_jsonl in ((TRAIN_PARQUET, OUT_TRAIN), (VALID_PARQUET, OUT_VALID)):
    parquet_to_pairs(src_parquet, dst_jsonl, skip_rows=0)


=== Processing gsm8k_train.parquet → gsm8k_train.jsonl ===
Rows kept after NaN drop: 7473/7473


Writing gsm8k_train.jsonl: 100%|██████████| 7473/7473 [00:00<00:00, 96603.69it/s]


✅ Wrote 7473 pairs to C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\gsm8k_train.jsonl

=== Processing gsm8k_valid.parquet → gsm8k_valid.jsonl ===
Rows kept after NaN drop: 1319/1319


Writing gsm8k_valid.jsonl: 100%|██████████| 1319/1319 [00:00<00:00, 97282.95it/s]

✅ Wrote 1319 pairs to C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\valid_jsonl\gsm8k_valid.jsonl



