# Clean up SciQ

In [None]:
from pathlib import Path
import json
from typing import Any, Dict, Iterable, Tuple, List

from tqdm.auto import tqdm

try:
    import pyarrow.parquet as pq
    import pyarrow as pa
    HAVE_PA = True
except Exception:
    HAVE_PA = False

try:
    import pandas as pd
    HAVE_PD = True
except Exception:
    HAVE_PD = False

# Directory holding the original SciQ parquet splits.
BASE = Path("sciq")
TRAIN_FILE = BASE / "sciq_train_origin.parquet"
VALID_FILE = BASE / "sciq_valid_origin.parquet"

# Destination JSONL files (parent directories are created on demand by the writer).
OUT_TRAIN = Path("./train_jsonl/sciq_train.jsonl")
OUT_VALID = Path("./valid_jsonl/sciq_valid.jsonl")

# Number of rows requested per pyarrow record batch when streaming the parquet file.
BATCH_SIZE = 8192
# When True, exact duplicate (user, assistant) pairs are written only once.
DO_DEDUP = True
# Separator placed between the answer sentence and the supporting passage.
JOINER = "\n\n"


In [None]:
def to_clean_str(x: Any) -> str:
    """Convert a raw cell value to a trimmed string.

    Missing values (``None`` or a float NaN) are mapped to the empty string
    so that callers can detect them with a simple truthiness check.
    Integers are rendered as-is; floats that are within 1e-9 of a whole
    number are rendered without a fractional part (``3.0`` -> ``"3"``).
    Any other value is passed through ``str()`` and stripped of surrounding
    whitespace.

    Args:
        x: The value to convert.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    import math  # local import: keeps this cell self-contained

    if x is None:
        # str(None) would yield the non-empty text "None".
        return ""
    if isinstance(x, int):
        return str(x)
    if isinstance(x, float):
        if math.isnan(x):
            # NaN is how pandas represents a missing cell; round(nan) raises.
            return ""
        if math.isinf(x):
            # round(inf) raises OverflowError, so bypass the whole-number check.
            return str(x)
        if abs(x - round(x)) < 1e-9:
            return str(int(round(x)))
        return str(x)
    return str(x).strip()


def extract_pair_strict(obj: Dict[str, Any]) -> Tuple[str, str] | None:
    """Build a (user, assistant) pair from one record, or return None.

    The record must contain the keys ``question``, ``correct_answer`` and
    ``support``, and each of them must be non-empty after cleaning with
    ``to_clean_str``; otherwise the record is rejected.

    Returns:
        ``(question, "The answer is <answer>." + JOINER + <support>)`` or
        ``None`` when the record is rejected.
    """
    required = ("question", "correct_answer", "support")
    for key in required:
        if key not in obj:
            return None

    question, answer, support = (to_clean_str(obj[key]) for key in required)
    if not (question and answer and support):
        return None

    return question, f"The answer is {answer}.{JOINER}{support}"


def _open_parquet_rows(in_path: Path) -> Tuple[int, Iterable[List[Dict[str, Any]]]]:
    """Open ``in_path`` and return ``(row_count, batches)``.

    ``batches`` is a lazy iterable of lists of row dicts. pyarrow is used
    when available (streaming ``BATCH_SIZE`` rows at a time); otherwise the
    file is loaded with pandas and sliced into ``BATCH_SIZE`` chunks.

    Raises:
        RuntimeError: if neither pyarrow nor pandas could be imported.
    """
    if HAVE_PA:
        pf = pq.ParquetFile(in_path)
        batches = (
            pa.Table.from_batches([batch]).to_pylist()
            for batch in pf.iter_batches(batch_size=BATCH_SIZE)
        )
        return pf.metadata.num_rows, batches
    if HAVE_PD:
        recs = pd.read_parquet(in_path).to_dict(orient="records")
        chunks = (recs[i:i + BATCH_SIZE] for i in range(0, len(recs), BATCH_SIZE))
        return len(recs), chunks
    raise RuntimeError("Neither pyarrow nor pandas is available to read parquet.")


def build_jsonl_from_parquet(in_path: Path, out_path: Path) -> Dict[str, int]:
    """Convert a parquet file into a JSONL file of user/assistant pairs.

    Every row is passed through ``extract_pair_strict``; rows it rejects are
    counted as dropped. When ``DO_DEDUP`` is true, exact repeats of a
    (user, assistant) pair are skipped and counted as deduped. Each kept pair
    is written as one JSON object per line with keys ``user`` and
    ``assistant``.

    The input is opened before the output, so a missing/unreadable input or
    a missing parquet reader raises without truncating an existing output.

    Args:
        in_path: Parquet file to read.
        out_path: JSONL file to write; its parent directory is created.

    Returns:
        Counters: ``total_rows_in_file``, ``seen_rows``, ``written``,
        ``dropped_missing_or_empty`` and ``deduped``.

    Raises:
        RuntimeError: if neither pyarrow nor pandas is available.
    """
    # Open the input first so that failures leave the output untouched.
    total_rows, batches = _open_parquet_rows(in_path)

    out_path.parent.mkdir(parents=True, exist_ok=True)

    seen = set()
    kept = dropped = deduped = total_seen = 0

    desc = f"writing {out_path.name}"
    with out_path.open("w", encoding="utf-8") as f, tqdm(
        total=total_rows, unit="rows", desc=desc, leave=False
    ) as pbar:
        for rows in batches:
            for rec in rows:
                total_seen += 1
                pair = extract_pair_strict(rec)
                if pair is None:
                    dropped += 1
                    continue
                user, assistant = pair

                if DO_DEDUP:
                    if pair in seen:
                        deduped += 1
                        continue
                    seen.add(pair)

                f.write(json.dumps({"user": user, "assistant": assistant}, ensure_ascii=False) + "\n")
                kept += 1

            # Advance the single progress bar once per batch, for either reader.
            pbar.update(len(rows))
            pbar.set_postfix(written=kept, dropped=dropped, deduped=deduped)

    return {
        "total_rows_in_file": int(total_rows),
        "seen_rows": total_seen,
        "written": kept,
        "dropped_missing_or_empty": dropped,
        "deduped": deduped,
    }

In [3]:
# Announce the two source files before converting them.
print("Inputs:", f" - {TRAIN_FILE}", f" - {VALID_FILE}", sep="\n")

# Convert each split; the returned counters are reported below.
train_stats = build_jsonl_from_parquet(TRAIN_FILE, OUT_TRAIN)
valid_stats = build_jsonl_from_parquet(VALID_FILE, OUT_VALID)

print("\n--- Done ---", f"Train: {train_stats}", f"Valid: {valid_stats}", sep="\n")

Inputs:
 - sciq\sciq_train_origin.parquet
 - sciq\sciq_valid_origin.parquet


writing sciq_train.jsonl:   0%|          | 0/11679 [00:00<?, ?rows/s]

writing sciq_valid.jsonl:   0%|          | 0/1000 [00:00<?, ?rows/s]


--- Done ---
Train: {'total_rows_in_file': 11679, 'seen_rows': 11679, 'written': 10481, 'dropped_missing_or_empty': 1198, 'deduped': 0}
Valid: {'total_rows_in_file': 1000, 'seen_rows': 1000, 'written': 887, 'dropped_missing_or_empty': 113, 'deduped': 0}


In [4]:
def peek_jsonl(path: Path, k: int = 3):
    """Print a header followed by the first ``k`` lines of a text file.

    Each line is printed with trailing whitespace (including the newline)
    removed. Fewer than ``k`` lines are printed if the file is shorter.
    """
    print(f"\n--- {path.name} (first {k}) ---")
    with path.open("r", encoding="utf-8") as handle:
        shown = 0
        for line in handle:
            if shown >= k:
                break
            print(line.rstrip())
            shown += 1

# Show the first three lines of each generated JSONL file.
peek_jsonl(OUT_TRAIN, 3)
peek_jsonl(OUT_VALID, 3)


--- sciq_train.jsonl (first 3) ---
{"user": "What type of organism is commonly used in preparation of foods such as cheese and yogurt?", "assistant": "The answer is mesophilic organisms.\n\nMesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine."}
{"user": "What phenomenon makes global winds blow northeast to southwest or the reverse in the northern hemisphere and northwest to southeast or the reverse in the southern hemisphere?", "assistant": "The answer is coriolis effect.\n\nWithout Coriolis Effect the global winds would blow north to south or south to north. But Coriolis makes them blow northeast to southwest or the reverse in the Northern Hemisphere

In [None]:
def sanity_check_jsonl(path: Path, max_lines: int | None = None):
    """Scan a JSONL file and print a one-line summary of basic problems.

    For each line (up to ``max_lines`` when given) the line is parsed as
    JSON and its ``user`` and ``assistant`` fields are inspected.

    Counters reported:
        scanned      lines that parsed and had string ``user``/``assistant``
        bad_json     lines that are not valid JSON
        nonstring    lines whose JSON value is not an object, or whose
                     ``user``/``assistant`` value is not a string
        short_user   ``user`` shorter than 5 characters
        short_asst   ``assistant`` shorter than 3 characters

    Args:
        path: JSONL file to scan.
        max_lines: Optional cap on the number of lines read.
    """
    n = bad_json = nonstring = short_user = short_asst = 0
    with path.open("r", encoding="utf-8") as f:
        for i, ln in enumerate(tqdm(f, unit="lines", desc=f"scan {path.name}", leave=False)):
            if max_lines is not None and i >= max_lines:
                break
            try:
                obj = json.loads(ln)
            except json.JSONDecodeError:
                bad_json += 1
                continue
            if not isinstance(obj, dict):
                # Valid JSON but not an object (e.g. a list or number):
                # there are no user/assistant fields to read.
                nonstring += 1
                continue
            u, a = obj.get("user", ""), obj.get("assistant", "")
            if not isinstance(u, str) or not isinstance(a, str):
                nonstring += 1
                continue
            if len(u) < 5:
                short_user += 1
            if len(a) < 3:
                short_asst += 1
            n += 1
    print(f"{path.name}: scanned {n} | bad_json={bad_json} nonstring={nonstring} "
          f"| short_user(<5)={short_user} short_asst(<3)={short_asst}")

# Run the sanity scan over both generated files (no line limit).
sanity_check_jsonl(OUT_TRAIN)
sanity_check_jsonl(OUT_VALID)

scan sciq_train.jsonl: 0lines [00:00, ?lines/s]

sciq_train.jsonl: scanned 10481 | bad_json=0 nonstring=0 | short_user(<5)=0 short_asst(<3)=0


scan sciq_valid.jsonl: 0lines [00:00, ?lines/s]

sciq_valid.jsonl: scanned 887 | bad_json=0 nonstring=0 | short_user(<5)=0 short_asst(<3)=0
