# UltraChat Cleanup

In [None]:
from pathlib import Path
import json, ast
import pandas as pd
import numpy as np
from tqdm import tqdm

# Every input/output folder is resolved relative to the current directory.
ROOT = Path("./")
# Single shard used for testing while an error was being resolved.
PARQUET_IN = ROOT.joinpath("train_parquet", "ultrachat_train_1.parquet")
OUT_JSONL = ROOT.joinpath("train_jsonl", f"{PARQUET_IN.stem}.jsonl")
OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)

# Load the shard and report its row count and column names.
df = pd.read_parquet(PARQUET_IN, engine="pyarrow")
print("Rows:", len(df), "Columns:", df.columns.tolist())


Rows: 69288 Columns: ['prompt', 'prompt_id', 'messages']


In [2]:
_CHAT_ROLES = frozenset({"user", "assistant"})


def _clean_text(value):
    """Return ``value`` as a stripped string; ``None`` becomes ``""``."""
    return "" if value is None else str(value).strip()


def _messages_from_structured(arr):
    """Turn a NumPy structured array into ``{"role", "content"}`` dicts."""
    names = list(arr.dtype.names)
    out = []
    for rec in arr:
        vals = rec.tolist() if hasattr(rec, "tolist") else list(rec)
        d = dict(zip(names, vals))
        # None must map to "" (not the text "None"), otherwise a missing
        # content would later be treated as a real, non-empty message.
        out.append({"role": _clean_text(d.get("role")),
                    "content": _clean_text(d.get("content"))})
    return out


def _messages_from_tuples(items):
    """Coerce 2-item (content, role) / (role, content) entries into dicts."""
    out = []
    for t in items:
        if not isinstance(t, (tuple, list)) or len(t) != 2:
            continue
        # Strip before matching so padded roles such as " User " are kept.
        a, b = _clean_text(t[0]), _clean_text(t[1])
        if b.lower() in _CHAT_ROLES:
            out.append({"role": b.lower(), "content": a})
        elif a.lower() in _CHAT_ROLES:
            out.append({"role": a.lower(), "content": b})
    return out


def _messages_from_text(text):
    """Parse a string holding a JSON / Python-literal list; else return []."""
    s = text.strip()
    bracketed = (s.startswith("[") and s.endswith("]")) or \
                (s.startswith("{") and s.endswith("}"))
    if not bracketed:
        return []
    try:
        val = json.loads(s)
    except Exception:
        # Not valid JSON (e.g. single-quoted repr) -> try a Python literal.
        try:
            val = ast.literal_eval(s)
        except Exception:
            return []
    # Only a list is accepted; any other parsed value is discarded.
    return val if isinstance(val, list) else []


def maybe_parse_messages(x):
    """Best-effort conversion of a raw ``messages`` cell into a list.

    Handles, in order: objects exposing ``as_py()``, NumPy arrays
    (structured or plain), lists (lists whose first element is a tuple are
    coerced to ``{"role", "content"}`` dicts; other lists are returned
    unchanged), bytes, strings holding a JSON or Python-literal list, and
    finally ``ast.literal_eval(str(x))``.

    Args:
        x: The raw cell value.

    Returns:
        A list (normally of message dicts). An empty list is returned
        whenever the value cannot be interpreted as a list.
    """
    # pyArrow scalar -> python
    if hasattr(x, "as_py"):
        try:
            x = x.as_py()
        except Exception:
            # Best effort: keep the raw value and let the fallbacks try.
            pass

    # NumPy ndarray -> list
    if isinstance(x, np.ndarray):
        if x.dtype.names:  # structured array, e.g. ('content', 'role')
            return _messages_from_structured(x)
        # plain object array -> tolist()
        x = x.tolist()

    # already a list?
    if isinstance(x, list):
        if x and isinstance(x[0], tuple):
            return _messages_from_tuples(x)
        # assume list of dicts
        return x

    # bytes -> string (errors="ignore" drops undecodable bytes)
    if isinstance(x, (bytes, bytearray)):
        x = x.decode("utf-8", errors="ignore")

    # string -> json / ast
    if isinstance(x, str):
        return _messages_from_text(x)

    # last resort: ast on str(x)
    try:
        val = ast.literal_eval(str(x))
    except Exception:
        return []
    return val if isinstance(val, list) else []

def messages_to_pairs(messages):
    """Build ordered user → assistant pairs from a message list.

    Non-dict entries and messages with empty content are ignored. A user
    message is held until the next assistant message arrives; a later user
    message replaces a held one. Assistant messages with no held user
    message, and a trailing unanswered user message, are dropped.

    Returns a list of ``{"user": ..., "assistant": ...}`` dicts.
    """
    result = []
    pending_user = None
    for msg in (messages or []):
        if not isinstance(msg, dict):
            continue

        role = (msg.get("role") or "").strip().lower()
        text = (msg.get("content") or "").strip()
        if not text:
            continue

        if role == "user":
            pending_user = text
            continue
        if role != "assistant" or pending_user is None:
            continue

        result.append({"user": pending_user, "assistant": text})
        pending_user = None
    return result


In [3]:
# Sanity check: parse the first few rows and show one sample pair per row.
has_messages = "messages" in df.columns
preview_count = min(5, len(df))
for i in range(preview_count):
    raw = df.iloc[i]["messages"] if has_messages else None
    msgs = maybe_parse_messages(raw)
    ps = messages_to_pairs(msgs)
    print(f"row {i}: type={type(raw).__name__}  msgs={len(msgs)}  pairs={len(ps)}")
    if not ps:
        continue
    # Show only the first 60 characters of each side, on a single line.
    first_pair = ps[0]
    user_snippet = first_pair["user"][:60].replace("\n", " ")
    assistant_snippet = first_pair["assistant"][:60].replace("\n", " ")
    print("  sample:", user_snippet, "->", assistant_snippet)


row 0: type=ndarray  msgs=6  pairs=3
  sample: What are some useful tips for organizing my closet and keepi -> 1. Start by decluttering: Go through your clothes and access
row 1: type=ndarray  msgs=6  pairs=3
  sample: How does sitting for prolonged periods of time affect overal -> Sitting for prolonged periods of time can have negative impa
row 2: type=ndarray  msgs=8  pairs=4
  sample: Can you summarize the benefits of 3D printing for surgeons a -> Surgeons at the VA Puget Sound Health Care System have great
row 3: type=ndarray  msgs=6  pairs=3
  sample: Write a Java program that creates a singly linked list of in -> Here is one possible implementation of a singly linked list 
row 4: type=ndarray  msgs=14  pairs=7
  sample: Offer a guide to making homemade shakshuka with tomato sauce -> Ingredients: - 1 tablespoon olive oil - 1 onion, chopped - 2


In [4]:
# Convert the test shard: one JSON object per line, one line per pair.
total_pairs = 0
with OUT_JSONL.open("w", encoding="utf-8") as f:
    rows = tqdm(df.itertuples(index=False), total=len(df), desc=f"Rows {PARQUET_IN.name}")
    for row in rows:
        raw_messages = getattr(row, "messages", None)
        pairs = messages_to_pairs(maybe_parse_messages(raw_messages))
        for pair in pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")
        total_pairs += len(pairs)

print(f"✅ Done. Wrote {total_pairs} pairs → {OUT_JSONL.resolve()}")


Rows ultrachat_train_1.parquet: 100%|██████████| 69288/69288 [00:04<00:00, 17125.45it/s]

✅ Done. Wrote 219527 pairs → C:\Users\insoo\Documents\Personal\Projects\LLM\LLM-dummy-documentation\8_Supervised_finetuning_Data_Schema\train_jsonl\ultrachat_train_1.jsonl





In [5]:
# Every parquet shard to convert (three train shards, one validation shard).
FILES = [
    ROOT / "train_parquet" / "ultrachat_train_1.parquet",
    ROOT / "train_parquet" / "ultrachat_train_2.parquet",
    ROOT / "train_parquet" / "ultrachat_train_3.parquet",
    ROOT / "valid_parquet" / "ultrachat_valid_1.parquet",
]

for p in FILES:
    # The output folder is chosen from the file name: names containing
    # "train_" go to train_jsonl, everything else to valid_jsonl.
    split_dir = "train_jsonl" if "train_" in p.name else "valid_jsonl"
    out_path = ROOT / split_dir / f"{p.stem}.jsonl"
    out_path.parent.mkdir(parents=True, exist_ok=True)

    df = pd.read_parquet(p, engine="pyarrow")
    total_pairs = 0
    with out_path.open("w", encoding="utf-8") as f:
        rows = tqdm(df.itertuples(index=False), total=len(df), desc=f"Rows {p.name}")
        for row in rows:
            raw_messages = getattr(row, "messages", None)
            pairs = messages_to_pairs(maybe_parse_messages(raw_messages))
            for pair in pairs:
                f.write(json.dumps(pair, ensure_ascii=False) + "\n")
            total_pairs += len(pairs)
    print(f"✅ {p.name}: wrote {total_pairs} pairs → {out_path}")


Rows ultrachat_train_1.parquet: 100%|██████████| 69288/69288 [00:04<00:00, 17004.67it/s]


✅ ultrachat_train_1.parquet: wrote 219527 pairs → train_jsonl\ultrachat_train_1.jsonl


Rows ultrachat_train_2.parquet: 100%|██████████| 69288/69288 [00:04<00:00, 16352.71it/s]


✅ ultrachat_train_2.parquet: wrote 219225 pairs → train_jsonl\ultrachat_train_2.jsonl


Rows ultrachat_train_3.parquet: 100%|██████████| 69289/69289 [00:04<00:00, 16318.95it/s]


✅ ultrachat_train_3.parquet: wrote 219011 pairs → train_jsonl\ultrachat_train_3.jsonl


Rows ultrachat_valid_1.parquet: 100%|██████████| 23110/23110 [00:01<00:00, 15946.97it/s]

✅ ultrachat_valid_1.parquet: wrote 73149 pairs → valid_jsonl\ultrachat_valid_1.jsonl





In [6]:
from pathlib import Path
import re, json
from tqdm import tqdm

# Folders holding the per-shard JSONL files written by the conversion step.
ROOT = Path("./")

TRAIN_DIR = ROOT.joinpath("train_jsonl")
VALID_DIR = ROOT.joinpath("valid_jsonl")

# find ultrachat shards
def sort_key(p):
    """Return the numeric shard index found in ``p.stem``, or 0 if absent."""
    found = re.search(r"ultrachat_(?:train|valid)_(\d+)", p.stem)
    if found is None:
        return 0
    return int(found.group(1))

# Collect the numbered shards, ordered by shard index rather than by name.
TRAIN_FILES = list(TRAIN_DIR.glob("ultrachat_train_*.jsonl"))
TRAIN_FILES.sort(key=sort_key)
VALID_FILES = list(VALID_DIR.glob("ultrachat_valid_*.jsonl"))
VALID_FILES.sort(key=sort_key)

# Destinations for the merged output, one file per split.
OUT_TRAIN = TRAIN_DIR / "ultrachat_train.jsonl"
OUT_VALID = VALID_DIR / "ultrachat_valid.jsonl"

print("Train shards:")
for shard in TRAIN_FILES:
    print("  -", shard.name)
print("Valid shards:")
for shard in VALID_FILES:
    print("  -", shard.name)

for merged_path in (OUT_TRAIN, OUT_VALID):
    merged_path.parent.mkdir(parents=True, exist_ok=True)


Train shards:
  - ultrachat_train_1.jsonl
  - ultrachat_train_2.jsonl
  - ultrachat_train_3.jsonl
Valid shards:
  - ultrachat_valid_1.jsonl


In [7]:
def count_lines(path: Path) -> int:
    """Return the number of lines in the UTF-8 text file at ``path``."""
    line_count = 0
    with path.open("r", encoding="utf-8") as handle:
        for _ in handle:
            line_count += 1
    return line_count

def merge_jsonls(in_files, out_file):
    """Concatenate JSONL shards into ``out_file``, keeping only valid pairs.

    Each input line must be a JSON object whose ``"user"`` and
    ``"assistant"`` values are non-empty strings after stripping. Blank
    lines, unparseable lines, non-object JSON values and records with
    missing, empty or non-string fields are counted as skipped rather than
    aborting the merge. Kept records are rewritten with exactly the two
    keys ``user`` and ``assistant``.

    Args:
        in_files: Iterable of ``Path`` objects, merged in the given order.
        out_file: ``Path`` of the merged file; it is overwritten.

    Raises:
        ValueError: If ``out_file`` is also one of ``in_files``, because
            opening it for writing would truncate it before it is read.
    """
    # Materialise once: the inputs are walked twice (count, then merge).
    in_files = list(in_files)

    target = Path(out_file).resolve()
    if any(Path(p).resolve() == target for p in in_files):
        raise ValueError(f"Output file {out_file} is also listed as an input.")

    total = sum(count_lines(p) for p in in_files)
    kept = skipped = 0
    with out_file.open("w", encoding="utf-8") as fout:
        # Context manager guarantees the bar is closed even on error.
        with tqdm(total=total, desc=f"Merging → {out_file.name}") as pbar:
            for src in in_files:
                with src.open("r", encoding="utf-8") as fin:
                    for line in fin:
                        pbar.update(1)
                        line = line.strip()
                        if not line:
                            skipped += 1
                            continue
                        try:
                            obj = json.loads(line)
                        except (ValueError, RecursionError):
                            skipped += 1
                            continue
                        # Valid JSON that is not an object has no fields.
                        if not isinstance(obj, dict):
                            skipped += 1
                            continue
                        u = obj.get("user")
                        a = obj.get("assistant")
                        if not (isinstance(u, str) and isinstance(a, str)):
                            skipped += 1
                            continue
                        u, a = u.strip(), a.strip()
                        if not (u and a):
                            skipped += 1
                            continue
                        json.dump({"user": u, "assistant": a}, fout, ensure_ascii=False)
                        fout.write("\n")
                        kept += 1
    print(f"✅ {out_file.name}: kept {kept}, skipped {skipped}.")


In [8]:
# Merge each split, or report that no shards were found for it.
merge_jobs = (
    (TRAIN_FILES, OUT_TRAIN, "No UltraChat train shards found in", TRAIN_DIR),
    (VALID_FILES, OUT_VALID, "No UltraChat valid shards found in", VALID_DIR),
)
for shard_files, merged_path, missing_msg, shard_dir in merge_jobs:
    if not shard_files:
        print(missing_msg, shard_dir)
        continue
    merge_jsonls(shard_files, merged_path)


Merging → ultrachat_train.jsonl: 100%|██████████| 657763/657763 [00:17<00:00, 37098.54it/s]


✅ ultrachat_train.jsonl: kept 657763, skipped 0.


Merging → ultrachat_valid.jsonl: 100%|██████████| 73149/73149 [00:01<00:00, 37368.39it/s]

✅ ultrachat_valid.jsonl: kept 73149, skipped 0.





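After merging, delete the numbered per-shard files (`ultrachat_train_<N>.jsonl` and `ultrachat_valid_<N>.jsonl`), keeping only the merged `ultrachat_train.jsonl` and `ultrachat_valid.jsonl`.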