# Cleanup SVAMP

In [None]:
from pathlib import Path
import json
from typing import List, Dict, Iterable, Tuple, Any

TRAIN_JSON = Path("./SVAMP/SVAMP_train_origin.json")
VALID_JSON = Path("./SVAMP/SVAMP_valid_origin.json")

OUT_TRAIN_JSONL = Path("./train_jsonl/svamp_train.jsonl")
OUT_VALID_JSONL = Path("./valid_jsonl/svamp_valid.jsonl")

JOINER = "\n\n"

In [None]:
def load_json_records(path: Path) -> List[Dict[str, Any]]:
    """Load records from a JSON file.

    Two layouts are accepted:

    * a single JSON array of objects (detected by a leading ``[``), or
    * line-delimited JSON, one value per non-blank line (the fallback).

    An empty or whitespace-only file yields an empty list.

    Raises:
        ValueError: if the content starts with ``[`` but does not decode to
            a list (``json.JSONDecodeError`` is itself a ``ValueError``).
    """
    content = path.read_text(encoding="utf-8").strip()
    if not content:
        return []

    # Array layout: the whole document is one JSON value.
    if content.startswith("["):
        parsed = json.loads(content)
        if isinstance(parsed, list):
            return parsed
        raise ValueError(f"{path} is not a JSON array.")

    # Fallback layout: decode every non-blank line on its own.
    stripped_lines = (raw.strip() for raw in content.splitlines())
    return [json.loads(line) for line in stripped_lines if line]

def to_clean_str(x: Any) -> str:
    """Convert a raw field value to a clean string.

    * ``None`` becomes the empty string, so a JSON ``null`` never leaks into
      the output as the literal text ``"None"``.
    * Integers are rendered as-is.
    * Floats within ``1e-9`` of a whole number are rendered without the
      fractional part (``1.0`` -> ``"1"``); other floats use ``str``.
    * Non-finite floats (NaN, +/-inf) are rendered with ``str`` instead of
      raising.
    * Anything else is passed through ``str`` and stripped of outer
      whitespace.
    """
    if x is None:
        return ""
    if isinstance(x, int):
        return str(x)
    if isinstance(x, float):
        try:
            nearest = round(x)
        except (ValueError, OverflowError):
            # round() rejects NaN (ValueError) and infinities (OverflowError);
            # there is no integer form to collapse to, so keep the float text.
            return str(x)
        if abs(x - nearest) < 1e-9:
            return str(int(nearest))
        return str(x)
    return str(x).strip()

def rec_to_pair(rec: Dict[str, Any], joiner: str = JOINER) -> Tuple[str, str]:
    """Build a ``(user, assistant)`` pair from one raw SVAMP record.

    * user      = Body + joiner + Question
    * assistant = Equation + " is " + Answer

    Missing fields default to the empty string; each field is cleaned with
    ``to_clean_str`` and both results are stripped of outer whitespace.
    """
    field_names = ("Body", "Question", "Equation", "Answer")
    body, question, equation, answer = (
        to_clean_str(rec.get(name, "")) for name in field_names
    )

    prompt = (body + joiner + question).strip()
    reply = " is ".join((equation, answer)).strip()
    return prompt, reply

def write_jsonl(pairs: Iterable[Tuple[str, str]], out_path: Path) -> int:
    """Write one ``{"user": ..., "assistant": ...}`` JSON object per line.

    The parent directory of ``out_path`` is created when it does not exist,
    so a fresh checkout without the output folders does not fail with
    ``FileNotFoundError``. An existing file is overwritten.

    Args:
        pairs: iterable of ``(user, assistant)`` string pairs.
        out_path: destination ``.jsonl`` file.

    Returns:
        The number of lines written.
    """
    # Make sure the output folder exists before opening the file.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    n = 0
    with out_path.open("w", encoding="utf-8") as f:
        for user, assistant in pairs:
            obj = {"user": user, "assistant": assistant}
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n += 1
    return n

In [None]:
# Read both raw SVAMP splits from disk.
train_raw, valid_raw = (load_json_records(src) for src in (TRAIN_JSON, VALID_JSON))

print(f"Loaded: train={len(train_raw):,}  valid={len(valid_raw):,}")

# Turn every raw record into a (user, assistant) pair.
train_pairs = list(map(rec_to_pair, train_raw))
valid_pairs = list(map(rec_to_pair, valid_raw))

# Persist each split as JSONL and keep the written line counts.
n_train = write_jsonl(train_pairs, OUT_TRAIN_JSONL)
n_valid = write_jsonl(valid_pairs, OUT_VALID_JSONL)

for _written_path, _line_count in ((OUT_TRAIN_JSONL, n_train), (OUT_VALID_JSONL, n_valid)):
    print(f"Wrote:  {_written_path} ({_line_count:,} lines)")

Loaded: train=700  valid=300
Wrote:  train_jsonl\svamp_train.jsonl (700 lines)
Wrote:  train_jsonl\svamp_valid.jsonl (300 lines)


In [9]:
def peek_jsonl(path: Path, k: int = 3) -> None:
    """Print a header followed by the first ``k`` lines of ``path``.

    Lines are printed with trailing whitespace (including the newline)
    removed. Nothing beyond the header is printed when ``k <= 0`` or the
    file is empty.
    """
    print(f"\n--- {path.name} (first {k}) ---")
    # Open via a context manager so the handle is closed even though the
    # loop usually stops before the end of the file.
    with path.open("r", encoding="utf-8") as f:
        for i, ln in enumerate(f):
            if i >= k:
                break
            print(ln.rstrip())

# Preview the first three records of each generated split.
for _preview_path in (OUT_TRAIN_JSONL, OUT_VALID_JSONL):
    peek_jsonl(_preview_path, 3)


--- svamp_train.jsonl (first 3) ---
{"user": "There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups\n\nHow big is each group of bananas?", "assistant": "( 290.0 / 2.0 ) is 145"}
{"user": "Marco and his dad went strawberry picking. Marco's dad's strawberries weighed 11 pounds. If together their strawberries weighed 30 pounds.\n\nHow much did Marco's strawberries weigh?", "assistant": "( 30.0 - 11.0 ) is 19"}
{"user": "Edward spent $ 6 to buy 2 books each book costing him the same amount of money. Now he has $ 12.\n\nHow much did each book cost?", "assistant": "( 6.0 / 2.0 ) is 3"}

--- svamp_valid.jsonl (first 3) ---
{"user": "Mary is baking a cake. The recipe calls for 6 cups of flour 8 cups of sugar and 7 cups of salt. She already put in 5 cups of flour.\n\nHow many more cups of sugar than cups of salt does she need to add now?", "assistant": "( 8.0 - 7.0 ) is 1"}
{"user": "Paul got a box of so

In [10]:
def sanity_svamp_jsonl(path: Path):
    bad = 0
    for i, ln in enumerate(path.open("r", encoding="utf-8")):
        try:
            obj = json.loads(ln)
        except Exception as e:
            print(f"[{i}] JSON error: {e} -> {ln[:200]}")
            bad += 1
            continue
        if "user" not in obj or "assistant" not in obj:
            print(f"[{i}] missing keys: {obj}")
            bad += 1
        if not isinstance(obj.get("user", ""), str) or not isinstance(obj.get("assistant", ""), str):
            print(f"[{i}] non-string values: {obj}")
            bad += 1
    print(f"{path.name}: {('OK' if bad==0 else f'{bad} issues found')}")

# Validate both generated splits.
for _checked_path in (OUT_TRAIN_JSONL, OUT_VALID_JSONL):
    sanity_svamp_jsonl(_checked_path)

svamp_train.jsonl: OK
svamp_valid.jsonl: OK
