# Cleanup for OpenOrca

In [1]:
from pathlib import Path
import json
import pandas as pd
from tqdm import tqdm

PARQUET_PATH = Path("./sft_datasets/openorca.parquet")
OUT_DIR      = Path("./sft_datasets")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_JSONL    = OUT_DIR / "openorca.jsonl"

print("Reading:", PARQUET_PATH.resolve())
df = pd.read_parquet(PARQUET_PATH)
print("Shape:", df.shape)
print("Columns:", list(df.columns))


Reading: C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\sft_datasets\openorca.parquet
Shape: (994896, 4)
Columns: ['id', 'system_prompt', 'question', 'response']


In [None]:
# sanity check required columns
for col in ("question", "response"):
    if col not in df.columns:
        raise KeyError(f"Missing required column: {col!r}")

SKIP_ROWS = 1  # skip the first row (metadata/headers)
df_use = df.iloc[SKIP_ROWS:].copy()

# drop rows with missing question/response
before = len(df_use)
df_use = df_use[df_use["question"].notna() & df_use["response"].notna()]
after = len(df_use)
print(f"Kept {after}/{before} rows after dropping NaNs.")

# write JSONL
with OUT_JSONL.open("w", encoding="utf-8") as f:
    for q, a in tqdm(zip(df_use["question"], df_use["response"]),
                     total=len(df_use), desc=f"Writing {OUT_JSONL.name}"):
        obj = {"user": str(q).strip(), "assistant": str(a).strip()}
        json.dump(obj, f, ensure_ascii=False)
        f.write("\n")

print("✅ Wrote:", OUT_JSONL.resolve())


Kept 994895/994895 rows after dropping NaNs.


Writing openorca.jsonl: 100%|██████████| 994895/994895 [00:16<00:00, 59441.73it/s]

✅ Wrote: C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\sft_datasets\openorca.jsonl





In [3]:
# show the first few lines from the JSONL we just wrote
for i, line in zip(range(5), OUT_JSONL.open("r", encoding="utf-8")):
    print(line.rstrip()[:300])


{"user": "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "assistant": "Midsummer House 
{"user": "What happens next in this paragraph?\n\nShe then rubs a needle on a cotton ball then pushing it onto a pencil and wrapping thread around it. She then holds up a box of a product and then pouring several liquids into a bowl. she\nChoose your answer from: A. adds saucepan and shakes up the p
{"user": "Please answer the following question: I want to test the ability of students to read a passage and answer questions about it. Could you please come up with a good question for the passage \"In 1901, the Federation of Australia was the process by which the six separate British self-governin
{"user": "James runs a TV show and there are 5 main characters and 4 minor characters. He pays th

In [None]:
from pathlib import Path
import json, random
from tqdm import tqdm

# INPUT (the file you just created from OpenOrca)
IN_JSONL = Path("./sft_datasets/openorca.jsonl")

# OUTPUTS
OUT_TRAIN = Path("./train_jsonl/openorca_train.jsonl")
OUT_VALID = Path("./valid_jsonl/openorca_valid.jsonl")
OUT_TRAIN.parent.mkdir(parents=True, exist_ok=True)
OUT_VALID.parent.mkdir(parents=True, exist_ok=True)

# SPLIT SETTINGS
TRAIN_FRACTION = 0.95 # 95% train / 5% valid
SEED = 1234 # reproducible shuffle
DEDUP = True # turn off if you don't want dedup


In [5]:
def iter_jsonl_lines(p):
    with p.open("r", encoding="utf-8") as f:
        for ln in f:
            ln = ln.strip()
            if not ln:
                continue
            try:
                obj = json.loads(ln)
            except Exception:
                continue
            yield obj

def normalize_pair(u, a):
    return (u or "").strip(), (a or "").strip()

# load
rows = []
for obj in iter_jsonl_lines(IN_JSONL):
    u, a = obj.get("user"), obj.get("assistant")
    u, a = normalize_pair(u, a)
    if u and a:
        rows.append({"user": u, "assistant": a})

print(f"Loaded: {len(rows)} pairs")

# deduplication
if DEDUP:
    seen = set()
    deduped = []
    for r in rows:
        key = (r["user"], r["assistant"])
        if key in seen:
            continue
        seen.add(key)
        deduped.append(r)
    rows = deduped
    print(f"After dedup: {len(rows)} pairs")


Loaded: 994894 pairs
After dedup: 984770 pairs


In [6]:
random.Random(SEED).shuffle(rows)
n_total = len(rows)
n_train = int(n_total * TRAIN_FRACTION)
train_rows = rows[:n_train]
valid_rows = rows[n_train:]

print({"total": n_total, "train": len(train_rows), "valid": len(valid_rows)})


{'total': 984770, 'train': 935531, 'valid': 49239}


In [7]:
with OUT_TRAIN.open("w", encoding="utf-8") as f:
    for r in tqdm(train_rows, desc=f"Writing {OUT_TRAIN.name}", total=len(train_rows)):
        json.dump(r, f, ensure_ascii=False)
        f.write("\n")

with OUT_VALID.open("w", encoding="utf-8") as f:
    for r in tqdm(valid_rows, desc=f"Writing {OUT_VALID.name}", total=len(valid_rows)):
        json.dump(r, f, ensure_ascii=False)
        f.write("\n")

print("✅ Wrote:")
print("  -", OUT_TRAIN.resolve())
print("  -", OUT_VALID.resolve())


Writing openorca_train.jsonl: 100%|██████████| 935531/935531 [00:16<00:00, 58316.05it/s]
Writing openorca_valid.jsonl: 100%|██████████| 49239/49239 [00:00<00:00, 62204.19it/s]


✅ Wrote:
  - C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\openorca_train.jsonl
  - C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\valid_jsonl\openorca_valid.jsonl


In [8]:
print("Train sample:", json.loads(next(open(OUT_TRAIN, "r", encoding="utf-8"))))
print("Valid sample:", json.loads(next(open(OUT_VALID, "r", encoding="utf-8"))))


Train sample: {'user': "Generate a question about the following movie plot: Eleven-year-old Kevin has a vivid imagination and is fascinated by history, particularly Ancient Greece; his parents ignore his activities, having become more obsessed with buying the latest household gadgets to keep up with their neighbours. One night, as Kevin is sleeping, an armoured knight on a horse bursts out of his wardrobe. Kevin is scared and hides as the knight rides off into a forest setting where once his bedroom wall was; when Kevin looks back out, the room is back to normal and he finds one of his photos on the wall similar to the forest he saw. The next night he prepares a satchel with supplies and a Polaroid camera, but is surprised when six dwarves spill out of the wardrobe. Kevin quickly learns the group has stolen a large, worn map, and are looking for an exit from Kevin's room before they are discovered. They find that Kevin's bedroom wall can be pushed, revealing a long hallway. Kevin is he