# Notebook: 01 - Cleaning

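Purpose: conservative, non-destructive cleaning pipeline. The source file `urdu_stories_final.json` is never modified, so any cleaning step can be undone by re-running from it. Run the cells in order (imports & paths → utilities → pipeline) to regenerate `urdu_stories_final_preprocessed.json`.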

In [1]:
# Imports & paths
import json, re, unicodedata
from pathlib import Path
from collections import Counter

# All paths are resolved relative to the current working directory of the process.
NOTEBOOK_DIR = Path.cwd()
PHASE1_ROOT = NOTEBOOK_DIR
# Input dataset for the cleaning pipeline.
SRC_JSON = PHASE1_ROOT / 'urdu_stories_final.json'
# Fail fast with an actionable message rather than erroring later when the file is opened.
if not SRC_JSON.exists():
    raise FileNotFoundError(f"{SRC_JSON} not found — run 01-data-collection.ipynb to create latest dataset")
# The cleaned output goes in the same directory as the source, under a different
# filename, so the source file is never overwritten.
CLEAN_DIR = PHASE1_ROOT
CLEAN_JSON = CLEAN_DIR / 'urdu_stories_final_preprocessed.json'

In [5]:
# Core normalization and cleaning utilities
import re, unicodedata

# Code points stripped as diacritics / annotation marks:
# U+0610-061A, U+064B-065F, U+0670 and U+06D6-06ED.
_DIACRITICS_RE = re.compile('[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]')

# Per-character folding applied in a single pass by str.translate.
# A value of None deletes the character.
_URDU_CHAR_TABLE = {
    0x0622: '\u0627',  # alef with madda above  -> ا
    0x0623: '\u0627',  # alef with hamza above  -> ا
    0x0625: '\u0627',  # alef with hamza below  -> ا
    0x0671: '\u0627',  # alef wasla             -> ا
    0x0643: '\u06A9',  # Arabic kaf ك           -> ک
    0x064A: '\u06CC',  # Arabic yeh ي           -> ی
    0x200C: None,      # zero-width non-joiner
    0x200D: None,      # zero-width joiner
    0xFEFF: None,      # zero-width no-break space / BOM
}


def normalize_urdu(text: str) -> str:
    """Return ``text`` in a canonical form for the Urdu story corpus.

    Steps, in order:
      1. Unicode NFC composition.
      2. Removal of every code point matched by ``_DIACRITICS_RE``.
      3. Folding of alef variants to plain alef, Arabic kaf/yeh to their
         Urdu forms, and deletion of zero-width characters.
      4. Collapsing of whitespace runs to one space and trimming both ends.

    Falsy input (``''`` or ``None``) is returned unchanged.

    NOTE(review): folding U+0622 (آ) to ا and deleting ZWNJ discards
    information that can be meaningful in Urdu; confirm this is intended.
    """
    if not text:
        return text
    composed = unicodedata.normalize('NFC', text)
    without_marks = _DIACRITICS_RE.sub('', composed)
    # Source and target characters of the table are disjoint, so one pass
    # gives the same result as applying each substitution in sequence.
    folded = without_marks.translate(_URDU_CHAR_TABLE)
    return re.sub(r'\s+', ' ', folded).strip()


def collapse_duplicate_markers(text: str) -> "tuple[str, int]":
    """Collapse runs of a repeated structural marker into a single marker.

    For each of ``<EOS>``, ``<EOP>`` and ``<EOT>``, any run of two or more
    copies of the same marker, optionally separated by whitespace, is
    replaced by one copy. Whitespace runs in the result are then collapsed
    to a single space and the text is trimmed.

    Args:
        text: Story text that may contain repeated markers.

    Returns:
        ``(cleaned_text, runs_collapsed)`` where ``runs_collapsed`` is the
        number of duplicate runs replaced, summed over all three markers.
    """
    t = text
    changed = 0
    for tok in ('<EOS>', '<EOP>', '<EOT>'):
        esc = re.escape(tok)
        # Whitespace is allowed between EVERY pair of repeats, so a run such
        # as "<EOS> <EOS> <EOS>" collapses fully instead of leaving a pair.
        pattern = re.compile(esc + r'(?:\s*' + esc + r')+')
        t, n = pattern.subn(tok, t)
        changed += n
    t = re.sub(r'\s+', ' ', t)
    return t.strip(), changed

# A maximal run of sentence terminators (Urdu full stop, Arabic question mark,
# ASCII . ! ?) that is not already followed by an <EOS> marker. The first
# lookahead forbids stopping inside a run, so "..." or "?!" is one match.
_TERMINATOR_RE = re.compile(r'([\u06D4\u061F\.\!\?]+)(?![\u06D4\u061F\.\!\?])(?!\s*<EOS>)')

def insert_missing_eos(text: str, min_len: int = 8, lookahead: int = 10) -> "tuple[str, int]":
    """Append ``' <EOS>'`` after sentence terminators that lack the marker.

    A run of consecutive terminators (for example ``...`` or ``?!``) counts
    as one sentence end, and the marker is placed after the whole run. This
    keeps punctuation intact and makes the function idempotent.

    Args:
        text: Story text to scan.
        min_len: Minimum stripped length of the text between the previous
            insertion point and the end of the terminator run. Shorter
            fragments are skipped and merged into the following sentence.
        lookahead: Number of characters after the run that are searched for
            an existing ``<EOS>``; if one is found, nothing is inserted.

    Returns:
        ``(new_text, inserts)`` where ``inserts`` is the number of markers
        added. Falsy input is returned unchanged with a count of 0.
    """
    if not text:
        return text, 0
    inserts = 0
    parts = []
    last = 0
    for m in _TERMINATOR_RE.finditer(text):
        end = m.end(1)
        # Too little text since the last insertion: treat as a fragment.
        if len(text[last:end].strip()) < min_len:
            continue
        # An <EOS> shortly after the run (e.g. behind a closing quote)
        # already terminates this sentence.
        if '<EOS>' in text[end:end + lookahead]:
            continue
        parts.append(text[last:end] + ' <EOS>')
        last = end
        inserts += 1
    parts.append(text[last:])
    return ''.join(parts), inserts


# Matches a structural marker even when written loosely (e.g. "< EOT >"),
# together with any whitespace around it.
_MARKER_RE = re.compile(r'\s*<\s*(EOP|EOS|EOT)\s*>\s*')


def _canonicalize_markers(text: str) -> str:
    """Rewrite every marker as ``<EOP>``/``<EOS>``/``<EOT>`` padded by single spaces."""
    spaced = _MARKER_RE.sub(r' <\1> ', text)
    return re.sub(r'\s+', ' ', spaced).strip()


def clean_story_text(orig: str) -> "tuple[str, dict]":
    """Run the full cleaning sequence on one story.

    Steps, in order: ``normalize_urdu``; canonicalize marker spelling and
    spacing; ``collapse_duplicate_markers``; ``insert_missing_eos``; append
    ``<EOT>`` when the text contains none; canonicalize marker spacing again.

    Marker spelling is canonicalized BEFORE the marker-aware steps because
    those steps only recognize the exact spellings. Otherwise a loosely
    written ``< EOT >`` would be missed and a second ``<EOT>`` appended.

    Args:
        orig: Raw story text. ``None`` is treated as an empty string.

    Returns:
        ``(cleaned_text, changes)`` where ``changes`` has the keys
        ``normalized`` (bool: ``normalize_urdu`` altered the text),
        ``collapsed_markers`` and ``inserted_eos`` (counts reported by the
        respective helpers) and ``added_eot`` (1 if ``<EOT>`` was appended,
        else 0).
    """
    changes = {'normalized': False, 'collapsed_markers': 0, 'inserted_eos': 0, 'added_eot': 0}
    t = orig or ''
    norm = normalize_urdu(t)
    if norm != t:
        changes['normalized'] = True
        t = norm
    t = _canonicalize_markers(t)
    t, collapsed = collapse_duplicate_markers(t)
    changes['collapsed_markers'] = collapsed
    t, inserted = insert_missing_eos(t)
    changes['inserted_eos'] = inserted
    if '<EOT>' not in t:
        t = t.strip() + ' <EOT>'
        changes['added_eot'] = 1
    # Second pass fixes the spacing around markers added by the steps above.
    t = _canonicalize_markers(t)
    return t, changes

In [None]:
# Run cleaning pipeline
# Reads SRC_JSON, cleans every story's 'content' with clean_story_text, writes
# the result to CLEAN_JSON, and keeps `cleaned`, `change_log`,
# `summary_counts` and `report` in memory.
with open(SRC_JSON,'r',encoding='utf-8') as f:
    orig_stories = json.load(f)

cleaned = []
change_log = []
summary_counts = Counter()
for i,s in enumerate(orig_stories):
    # A JSON null 'content' is handled like a missing key (empty text) so one
    # bad record does not abort the run with a TypeError in the regex helpers.
    content = s.get('content') or ''
    new_content, changes = clean_story_text(content)
    # Shallow copy: the entries loaded from the source stay unmodified.
    new_entry = dict(s)
    new_entry['content'] = new_content
    cleaned.append(new_entry)
    if any(changes.values()):
        change_log.append({'idx': i, 'title': s.get('urdu_title',''), **changes})
    for k,v in changes.items():
        # bool is an int subclass, so a True 'normalized' flag adds 1 here and
        # summary_counts['normalized'] counts the stories that were normalized.
        if isinstance(v, int) and v>0:
            summary_counts[k] += v

# Every input story must yield exactly one cleaned entry.
assert len(cleaned) == len(orig_stories)

# write cleaned JSON (non-destructive)
with open(CLEAN_JSON,'w',encoding='utf-8') as f:
    json.dump(cleaned, f, ensure_ascii=False, indent=2)

report = {
    'stories_total': len(cleaned),
    'stories_changed': len(change_log),
    'inserted_eos_total': summary_counts.get('inserted_eos',0),
    'collapsed_markers_total': summary_counts.get('collapsed_markers',0),
    'added_eot_total': summary_counts.get('added_eot',0)
}

print('Clean complete — stories:', len(cleaned), 'changed:', len(change_log), 'EOS inserted:', summary_counts.get('inserted_eos',0))

Clean complete — stories: 287 changed: 287 EOS inserted: 1543 (artifacts kept in-memory)
