# Cleanup for Dolly

In [None]:
from pathlib import Path
import json, random
from tqdm import tqdm

# ---- Run configuration ----------------------------------------------------
TRAIN_FRACTION = 0.95  # share of rows that goes to the train file; the rest is validation
SEED = 1337            # seed for the shuffle performed before splitting
DEDUP = True           # when True, exact duplicate (user, assistant) pairs are dropped

# ---- Paths (relative, so they resolve against the current working directory)
IN_JSONL = Path("./sft_datasets/databricks-dolly-15k.jsonl")
OUT_TRAIN = Path("./train_jsonl/dolly_train.jsonl")
OUT_VALID = Path("./valid_jsonl/dolly_valid.jsonl")

# Make sure both output folders exist before anything is written into them.
for _out_file in (OUT_TRAIN, OUT_VALID):
    _out_file.parent.mkdir(parents=True, exist_ok=True)

print("Reading from:", IN_JSONL.resolve())


Reading from: C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\sft_datasets\databricks-dolly-15k.jsonl


In [None]:
def clean_text(x):
    """Return ``x`` as a string with leading and trailing whitespace removed.

    ``None`` maps to the empty string. Any other value is passed through
    ``str()`` first. Interior whitespace, including newlines, is left
    untouched so the text keeps its structure.
    """
    return "" if x is None else str(x).strip()

def join_user(instr, ctx):
    """Build the user turn from an instruction and an optional context.

    Both inputs are normalised with ``clean_text``. The non-empty pieces are
    joined with a single space (instruction first); if only one piece is
    non-empty it is returned alone, and if both are empty the result is ``""``.
    """
    cleaned = (clean_text(instr), clean_text(ctx))
    non_empty = [piece for piece in cleaned if piece]
    return " ".join(non_empty).strip()

def iter_jsonl_with_total(path: Path):
    """Yield one parsed JSON value per non-blank line of ``path``, with a progress bar.

    The file is read twice: a first pass counts lines so ``tqdm`` can display
    a total, and a second pass parses them. Blank lines and lines that are
    not valid JSON are skipped silently (best-effort read).

    Args:
        path: Location of a UTF-8 encoded JSON Lines file.

    Yields:
        The value decoded by ``json.loads`` for each parseable line.
    """
    # Count inside a ``with`` block so the handle is closed deterministically
    # instead of being left open until garbage collection.
    with path.open("r", encoding="utf-8") as f:
        total = sum(1 for _ in f)

    with path.open("r", encoding="utf-8") as f:
        for line in tqdm(f, total=total, desc=f"Reading {path.name}"):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Malformed line -> skip it.
                continue
            # Yield outside the ``try`` so only parse errors are ever swallowed,
            # never an exception thrown into the generator by its consumer.
            yield record


In [3]:
# Turn every raw record into a {"user", "assistant"} pair, dropping records
# where either side is empty after cleaning.
pairs = []
for record in iter_jsonl_with_total(IN_JSONL):
    user_text = join_user(record.get("instruction"), record.get("context"))
    assistant_text = clean_text(record.get("response"))
    if not (user_text and assistant_text):
        continue
    pairs.append({"user": user_text, "assistant": assistant_text})

print(f"Loaded pairs: {len(pairs)}")

if DEDUP:
    # Keep the first occurrence of each exact (user, assistant) combination.
    # A dict preserves insertion order, so the surviving rows stay in their
    # original relative order.
    first_by_key = {}
    for row in pairs:
        first_by_key.setdefault((row["user"], row["assistant"]), row)
    pairs = list(first_by_key.values())
    print(f"After dedup: {len(pairs)}")


Reading databricks-dolly-15k.jsonl: 100%|██████████| 15011/15011 [00:00<00:00, 164577.31it/s]

Loaded pairs: 15011
After dedup: 14996





In [4]:
def _write_jsonl(rows: list, path: Path) -> None:
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines, one JSON document per line."""
    with path.open("w", encoding="utf-8") as f:
        for row in tqdm(rows, desc=f"Writing {path.name}", total=len(rows)):
            json.dump(row, f, ensure_ascii=False)
            f.write("\n")


# Shuffle a copy rather than `pairs` itself. An in-place shuffle would make
# re-running this cell reshuffle already-shuffled data and silently change the
# train/valid split; with a copy the split depends only on `pairs` and SEED.
shuffled_rows = list(pairs)
random.Random(SEED).shuffle(shuffled_rows)

n_total = len(shuffled_rows)
n_train = int(n_total * TRAIN_FRACTION)  # truncated; the remainder goes to validation
train_rows = shuffled_rows[:n_train]
valid_rows = shuffled_rows[n_train:]

print({"total": n_total, "train": len(train_rows), "valid": len(valid_rows)})

_write_jsonl(train_rows, OUT_TRAIN)
_write_jsonl(valid_rows, OUT_VALID)

print("✅ Wrote:")
print("  -", OUT_TRAIN.resolve())
print("  -", OUT_VALID.resolve())


{'total': 14996, 'train': 14246, 'valid': 750}


Writing dolly_train.jsonl: 100%|██████████| 14246/14246 [00:00<00:00, 75495.23it/s]
Writing dolly_valid.jsonl: 100%|██████████| 750/750 [00:00<00:00, 65123.55it/s]

✅ Wrote:
  - C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\dolly_train.jsonl
  - C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\valid_jsonl\dolly_valid.jsonl





In [5]:
# Sanity check: show the first record of each output file.
# Each file is opened in a `with` block so its handle is closed right away;
# a bare `next(open(...))` leaves the handle open until garbage collection.
for label, out_path in (("Train", OUT_TRAIN), ("Valid", OUT_VALID)):
    with out_path.open("r", encoding="utf-8") as f:
        first_line = f.readline()
    # An empty file (e.g. a split with zero rows) prints None instead of
    # raising StopIteration.
    sample = json.loads(first_line) if first_line.strip() else None
    print(f"{label} sample:", sample)


Train sample: {'user': 'Why do some people like horror films?', 'assistant': "Horror films can provide a safe way for us to mentally rehearse how we'd handle real danger.  Some people actually enjoy the scary emotions that horror films elicit.   It can provide a unique adrenaline rush, somewhat similar to other fearful activities, like white water rafting, jumping out of airplanes, amusement park rides and flying in a small aircraft.  For many, horror films are their least liked movie genre for those same reasons.  They might prefer a romantic comedy or action triller, without all the blood and diabolical antagonists."}
Valid sample: {'user': 'How popular were figs in ancient Rome? Traditional dried fruit such as raisins, figs, dates, apricots and apples have been a staple of Mediterranean diets for millennia. This is due partly to their early cultivation in the Middle Eastern region known as the Fertile Crescent, made up by parts of modern Iran, Iraq, southwest Turkey, Syria, Lebanon,