# Notebook: 03 - Tokenizer

Purpose: regenerate the canonical `urdu_tokenizer_training.txt` from `urdu_stories_final_preprocessed.json`.

Run this after `01-cleaning.ipynb` and before `04-dataset-eda.ipynb`.

In [None]:
# Imports & paths
import json, re
from pathlib import Path

# All paths are resolved relative to the kernel's working directory; the
# input and output files are expected one level above it.
NOTEBOOK_DIR = Path.cwd()
ROOT = NOTEBOOK_DIR.parent
CLEAN_JSON = ROOT / 'urdu_stories_final_preprocessed.json'  # input, read later in this notebook
OUT_TXT = ROOT / 'urdu_tokenizer_training.txt'  # output, written later in this notebook

# Fail fast with an actionable message when the input file is missing.
if not CLEAN_JSON.exists():
    raise FileNotFoundError(f"{CLEAN_JSON} not found — run preprocessing/01-cleaning.ipynb first")

In [None]:
# Regenerate tokenizer training text
# Read the preprocessed stories file as UTF-8 text and parse it as JSON.
stories_src = json.loads(CLEAN_JSON.read_text(encoding='utf-8'))


def remove_author_name(text) -> str:
    """Drop the leading header that ends at the first ``<EOP>`` marker.

    The shortest prefix up to and including the first ``<EOP>`` (plus any
    whitespace directly after it) is removed, then the result is stripped.
    Because ``.`` does not match newlines in this pattern, the prefix is only
    removed when no line break precedes that first marker. Text without a
    marker is returned stripped but otherwise unchanged.

    NOTE(review): the removed prefix is presumably the author-name header
    produced by the cleaning step — confirm. If a story has no such header,
    whatever precedes its first ``<EOP>`` is dropped instead.

    Args:
        text: Raw story content. ``None`` or an empty value is accepted.

    Returns:
        The content without the leading header, or ``''`` when there is
        nothing to process.
    """
    if not text:
        # A JSON ``null`` content value would otherwise make ``re.sub``
        # raise TypeError; treat it like missing content.
        return ''
    return re.sub(r'^.*?<EOP>\s*', '', text).strip()

# Turn each story into one normalised line: header removed, markers padded
# with spaces, and every whitespace run collapsed to a single space.
lines = []
for story in stories_src:
    text = remove_author_name(story.get('content', ''))
    for marker in ('<EOS>', '<EOP>'):
        text = text.replace(marker, f' {marker} ')
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        # Nothing left after normalisation; skip the story.
        continue
    lines.append(text)

# De-duplicate while preserving order: dict keys are unique and keep their
# insertion order, so the first occurrence of each line is the one retained.
uniq_lines = list(dict.fromkeys(lines))

# Write one story per line as UTF-8; every record ends with a newline.
with OUT_TXT.open('w', encoding='utf-8') as sink:
    sink.writelines(f'{story_line}\n' for story_line in uniq_lines)

# Report how many unique lines were written.
print('Wrote', len(uniq_lines), 'lines to', OUT_TXT.name)

Wrote 287 lines to urdu_tokenizer_training.txt
