In [1]:
from tokenizers import ByteLevelBPETokenizer
# Vocabulary and merge-rule files handed to ByteLevelBPETokenizer in the next cell.
# Both are relative paths, so they resolve against the kernel's working directory.
BPE_VOCAB: str = "../bpe/bpe_model-vocab.json"
BPE_MERGES: str = "../bpe/bpe_model-merges.txt"

# Fallback text used when a JSONL row has no `info` key.
INFO_DEFAULT: str = ""

In [2]:
# Shared tokenizer instance built from the BPE_VOCAB / BPE_MERGES files;
# later cells use it to count tokens. Input is not lowercased
# (lowercase=False) and add_prefix_space=True is passed to the library.
tokenizer = ByteLevelBPETokenizer(
    BPE_VOCAB,
    BPE_MERGES,
    add_prefix_space=True,
    lowercase=False,
)

In [None]:
from pathlib import Path
import json, numpy as np

def info_token_stats(jsonl_path: Path):
    """
    Scan a single JSONL file and report token-length stats for the `info` field.

    Every non-blank line is parsed as JSON. The row's `info` value is taken
    (INFO_DEFAULT when the key is missing, "" when it is null, `str(value)`
    for any other non-string), encoded with the module-level `tokenizer`, and
    the number of token ids is recorded. No truncation or cap is applied.

    Lines that fail to parse, or that parse to something other than a JSON
    object (e.g. a list or a bare number), are counted as bad rows and skipped.

    A one-off summary (rows scanned, bad rows, max, mean, median, p90, p95) is
    printed to stdout.

    Args:
        jsonl_path: Location of the JSONL file. A plain `str` is accepted too.

    Returns:
        (max_len, max_row, max_text): the largest token count seen, the
        1-based physical line number of the first row that reached it (blank
        lines still advance the numbering), and that row's `info` text.
        Returns (0, -1, "") when no row could be measured.
    """
    jsonl_path = Path(jsonl_path)  # tolerate callers passing a string path

    max_len = -1
    max_row = -1
    max_text = ""
    lengths = []
    total_rows = 0
    bad_rows = 0

    with jsonl_path.open("r", encoding="utf-8") as f:
        for line_no, raw in enumerate(f, 1):
            if not raw.strip():
                continue
            total_rows += 1

            try:
                obj = json.loads(raw)
            except json.JSONDecodeError:
                bad_rows += 1
                continue

            # Valid JSON that is not an object has no fields to read; treat it
            # as a bad row instead of crashing on `.get`.
            if not isinstance(obj, dict):
                bad_rows += 1
                continue

            info = obj.get("info", INFO_DEFAULT)
            if not isinstance(info, str):
                info = "" if info is None else str(info)

            n_tokens = len(tokenizer.encode(info).ids)
            lengths.append(n_tokens)

            # Strict comparison keeps the first row among ties.
            if n_tokens > max_len:
                max_len = n_tokens
                max_row = line_no
                max_text = info

    if not lengths:
        print(f"No valid rows found. total_rows={total_rows}, bad_rows={bad_rows}")
        return 0, -1, ""

    arr = np.array(lengths, dtype=np.int32)
    p90, p95 = np.percentile(arr, [90, 95])
    print(
        f"[{jsonl_path.name}] rows_scanned={len(arr)}  bad_rows={bad_rows}\n"
        f"max={max_len} tokens (row #{max_row})  "
        f"mean={arr.mean():.1f}  median={np.median(arr):.1f}  p90={p90:.0f}  p95={p95:.0f}"
    )
    return max_len, max_row, max_text

In [4]:
from pathlib import Path

# Measure `info` token lengths on the training split and show the longest entry.
target = Path("./info_datasets/info_train.jsonl")
max_len, max_row, max_text = info_token_stats(jsonl_path=target)

# Cap the preview at 500 characters so a very long entry cannot flood the output.
preview = max_text[:500]
print("\nLongest `info` preview:\n", preview)


[info_train.jsonl] rows_scanned=3000  bad_rows=0
max=69 tokens (row #2395)  mean=61.4  median=61.0  p90=64  p95=65

Longest `info` preview:
 user_name: Drew Gonzalez
city: Miami
country: United States
time_zone: America/Los_Angeles
favorite_color: white
preferred_language: TypeScript
hobby: playing piano
role: QA engineer
primary_interest: UI design
pet: cat
