# Notebook: 02 - Segmentation audit

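Purpose: compare marker-based sentence segmentation (splitting on `<EOS>`) with rule-based segmentation (splitting after sentence-final punctuation) by counting, for each story, how many sentences each method produces.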

In [7]:
# Imports & paths
import json, re
from pathlib import Path

# All paths are resolved relative to the current working directory.
NOTEBOOK_DIR = Path.cwd()
PHASE1_ROOT = NOTEBOOK_DIR
DATA_JSON = PHASE1_ROOT.joinpath('urdu_stories_final_preprocessed.json')

# Fail fast with an actionable message when the input file is absent,
# instead of hitting an open() error further down the notebook.
if not DATA_JSON.exists():
    raise FileNotFoundError(f"{DATA_JSON} not found — run 01-cleaning.ipynb to create it")

In [8]:
# Core segmenters
import re

def marker_segment(text: str):
    """Split ``text`` into sentences at ``<EOS>`` markers.

    Each piece between ``<EOS>`` markers is trimmed, any ``<EOP>`` / ``<EOT>``
    markers inside it are deleted (not replaced by a space), and pieces that
    end up empty are discarded.

    Returns:
        A list of non-empty sentence strings in their original order.
    """
    sentences = []
    for chunk in text.split('<EOS>'):
        cleaned = re.sub(r'<EOP>|<EOT>', '', chunk.strip()).strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences

# Sentence boundary: a run of whitespace that directly follows a terminator
# (U+06D4 Arabic/Urdu full stop, U+061F Arabic question mark, or ASCII . ? !).
# The lookbehind keeps the terminator attached to the preceding sentence.
_rule_split_re = re.compile(r'(?<=[\u06D4\u061F\.\?\!])\s+')


def rule_segment(text: str):
    """Split ``text`` into sentences using punctuation rules only.

    The ``<EOP>``, ``<EOS>`` and ``<EOT>`` markers are first replaced by a
    space so they carry no boundary information, whitespace runs are collapsed
    to single spaces, and the result is split at whitespace that follows a
    sentence terminator.

    Returns:
        A list of non-empty, trimmed sentence strings in their original order.
    """
    without_markers = re.sub(r'<EOP>|<EOS>|<EOT>', ' ', text)
    normalized = re.sub(r'\s+', ' ', without_markers).strip()

    sentences = []
    for piece in _rule_split_re.split(normalized):
        piece = piece.strip()
        if piece:
            sentences.append(piece)
    return sentences


In [9]:
# Run segmentation audit
# Load the story records from the preprocessed JSON file.
with DATA_JSON.open('r', encoding='utf-8') as f:
    stories = json.load(f)

# One record per story: sentence counts from both segmenters and their difference.
results = []
for i, s in enumerate(stories):
    content = s.get('content', '')
    n_marker = len(marker_segment(content))
    n_rule = len(rule_segment(content))
    results.append({
        'idx': i,
        'title': s.get('urdu_title', '')[:80],  # truncated to 80 characters
        'n_marker': n_marker,
        'n_rule': n_rule,
        'diff': n_marker - n_rule,
    })

# disagreements: stories where the two sentence counts differ (non-zero diff)
disagree = [r for r in results if r['diff'] != 0]
agree = len(results) - len(disagree)

segmentation_disagreements = disagree
segmentation_audit_summary = {
    'total_stories': len(results),
    'agree_count': agree,
    'disagree_count': len(disagree),
}

print('Segmentation audit — agree:', agree, 'disagree:', len(disagree))

Segmentation audit — agree: 1771 disagree: 9
