# Notebook: 01 - Cleaning

Purpose: conservative, reversible cleaning pipeline. Run the pipeline cell to regenerate `urdu_stories_final_preprocessed.json`.

**Character filtering**: This notebook removes only sentences (text between `<EOS>` markers) that contain non-Urdu characters (Latin letters, digits, etc.). Pure Urdu sentences are preserved intact.

In [10]:
# Imports & paths
import json, re, unicodedata
from pathlib import Path
from collections import Counter

# All paths are resolved relative to the current working directory, so the
# notebook must be launched from the folder that holds the dataset.
NOTEBOOK_DIR = Path.cwd()
PHASE1_ROOT = NOTEBOOK_DIR
# Input JSON read by the pipeline cell.
SRC_JSON = PHASE1_ROOT / 'urdu_stories_final.json'
# Fail fast with an actionable message instead of a late error in json.load.
if not SRC_JSON.exists():
    raise FileNotFoundError(f"{SRC_JSON} not found — run data-collection to create latest dataset")
# The cleaned output is written next to the input file.
CLEAN_DIR = PHASE1_ROOT
CLEAN_JSON = CLEAN_DIR / 'urdu_stories_final_preprocessed.json'

In [11]:
# Core normalization and cleaning utilities
import re, unicodedata

# Code points U+0600..U+06FF (inclusive) are what this notebook treats as Urdu.
URDU_RANGE = range(0x0600, 0x0700)

# Characters accepted in addition to URDU_RANGE.  The Arabic-script
# punctuation marks already fall inside URDU_RANGE and are listed only for
# explicitness; '!' and the plain space are the real additions.
KEEP_CHARS = {'۔', '؟', '،', '؛', '!', ' '}


def is_urdu_or_allowed(char):
    """Return True if ``char`` is whitelisted or its code point is in URDU_RANGE.

    ``char`` is expected to be a single character; the whitelist is consulted
    first, and only non-whitelisted input reaches ``ord``.
    """
    return char in KEEP_CHARS or ord(char) in URDU_RANGE

def sentence_has_non_urdu(sentence):
    """Report whether ``sentence`` contains any character that is not allowed.

    The structural markers ``<EOS>``, ``<EOP>`` and ``<EOT>`` are deleted
    first so that their ASCII characters do not count against the sentence.
    Every remaining character must pass ``is_urdu_or_allowed``; the first one
    that does not makes the result True.
    """
    # Markers are removed one after another, in this fixed order.
    body = sentence.replace('<EOS>', '').replace('<EOP>', '').replace('<EOT>', '')
    return any(not is_urdu_or_allowed(ch) for ch in body)

def remove_sentences_with_non_urdu(text):
    """Drop every ``<EOS>``-delimited segment that contains non-Urdu characters.

    Returns a ``(cleaned_text, removed_count)`` pair.  Surviving segments are
    stripped, empty ones are discarded, and the rest are re-joined with
    ``' <EOS> '``.  If the joined text contains no ``<EOT>`` marker, one is
    appended.

    NOTE(review): a rejected segment is dropped as a whole, so any ``<EOP>``
    or ``<EOT>`` marker inside it disappears together with the sentence.
    """
    segments = text.split('<EOS>')

    # Classify each segment once; True means "contains a disallowed character".
    rejected = [sentence_has_non_urdu(segment) for segment in segments]
    removed_count = sum(rejected)

    survivors = [
        segment.strip()
        for segment, is_rejected in zip(segments, rejected)
        if not is_rejected and segment.strip()
    ]

    result = ' <EOS> '.join(survivors)

    # Guarantee a story terminator somewhere in the output.
    if '<EOT>' not in result:
        result = result.strip() + ' <EOT>'

    return result.strip(), removed_count


_DIACRITICS_RE = re.compile('[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]')

# Single-character rewrites applied in one pass after diacritic removal.
# No replacement target is itself a key, so one pass is equivalent to applying
# the substitutions one after another.
_CHAR_REWRITES = str.maketrans({
    # Alef variants -> bare alef
    '\u0622': '\u0627',
    '\u0623': '\u0627',
    '\u0625': '\u0627',
    '\u0671': '\u0627',
    # Arabic kaf / yeh -> their Urdu forms
    '\u0643': '\u06A9',
    '\u064A': '\u06CC',
    # Zero-width non-joiner, zero-width joiner and BOM are deleted
    '\u200C': None,
    '\u200D': None,
    '\uFEFF': None,
    # Stray SOH control character -> Urdu full stop
    '\u0001': '\u06D4',
})


def normalize_urdu(text: str) -> str:
    """Return ``text`` in the notebook's canonical Urdu form.

    Falsy input (``''`` or ``None``) is returned unchanged.  Otherwise the
    text is NFC-normalized, characters matched by ``_DIACRITICS_RE`` are
    removed, the single-character rewrites in ``_CHAR_REWRITES`` are applied,
    and every whitespace run is collapsed to one space with the ends trimmed.

    NOTE(review): U+0622 (alef with madda) is folded into plain alef along
    with the other variants -- confirm that this is intended for Urdu text.
    """
    if not text:
        return text
    composed = unicodedata.normalize('NFC', text)
    without_marks = _DIACRITICS_RE.sub('', composed)
    rewritten = without_marks.translate(_CHAR_REWRITES)
    return re.sub(r'\s+', ' ', rewritten).strip()


def collapse_duplicate_markers(text: str) -> (str, int):
    """Collapse runs of a repeated structural marker into a single marker.

    A run is two or more occurrences of the same marker (``<EOS>``, ``<EOP>``
    or ``<EOT>``) separated by nothing but optional whitespace.  After the
    markers are collapsed, every whitespace run in the text is reduced to a
    single space and the ends are trimmed.

    Returns:
        ``(cleaned_text, runs_collapsed)`` where ``runs_collapsed`` is the
        number of marker runs that were replaced, summed over all markers.
    """
    t = text
    changed = 0
    for tok in ('<EOS>', '<EOP>', '<EOT>'):
        escaped = re.escape(tok)
        # Whitespace is allowed before EVERY repeated marker, not only before
        # the second one; otherwise "<EOS> <EOS> <EOS>" is only reduced to
        # "<EOS> <EOS>".
        pattern = re.compile(escaped + r'(?:\s*' + escaped + r')+')
        t, n = pattern.subn(tok, t)
        changed += n
    t = re.sub(r'\s+', ' ', t)
    return t.strip(), changed


# A sentence terminator (Urdu full stop, Arabic question mark, '.', '!', '?')
# that is not directly followed by optional whitespace and an <EOS> marker.
_TERMINATOR_RE = re.compile(r'([\u06D4\u061F\.\!\?])(?!\s*<EOS>)')


def insert_missing_eos(text: str) -> (str, int):
    """Append ``' <EOS>'`` after sentence terminators that lack the marker.

    A terminator matched by ``_TERMINATOR_RE`` receives a marker only when

    * the text since the previous insertion point (terminator included) is at
      least 8 characters long once stripped, and
    * no ``<EOS>`` occurs within the 10 characters following the terminator.

    Returns ``(new_text, inserted_count)``; when nothing is inserted the
    original ``text`` object is returned with a count of 0.
    """
    pieces = []
    cursor = 0  # start of the text not yet copied into ``pieces``
    for match in _TERMINATOR_RE.finditer(text):
        stop = match.end(1)
        candidate = text[cursor:stop]
        # Too short to be a sentence of its own (e.g. an abbreviation).
        if len(candidate.strip()) < 8:
            continue
        # A marker is already close by; do not add a second one.
        if '<EOS>' in text[stop:stop + 10]:
            continue
        pieces.append(candidate + ' <EOS>')
        cursor = stop

    if not pieces:
        return text, 0

    inserted = len(pieces)
    pieces.append(text[cursor:])
    return ''.join(pieces), inserted


def clean_story_text(orig: str) -> (str, dict):
    """Run the full cleaning pipeline on one story and report what changed.

    Steps, in order: Urdu normalization, duplicate-marker collapsing, removal
    of sentences containing non-Urdu characters, insertion of missing
    ``<EOS>`` markers, ``<EOT>`` enforcement, and marker/whitespace tidying.

    Returns:
        ``(cleaned_text, changes)`` where ``changes`` has the keys
        ``normalized`` (bool), ``collapsed_markers``, ``inserted_eos``,
        ``added_eot`` and ``removed_sentences`` (ints).
    """
    t = orig
    changes = {'normalized': False, 'collapsed_markers': 0, 'inserted_eos': 0, 'added_eot': 0, 'removed_sentences': 0}

    # Step 1: Normalize Urdu (before sentence removal)
    norm = normalize_urdu(t)
    if norm != t:
        changes['normalized'] = True
        t = norm

    # Step 2: Collapse duplicate markers
    t, collapsed = collapse_duplicate_markers(t)
    changes['collapsed_markers'] = collapsed

    # Step 3 appends a missing <EOT> on its own, so the later check in step 5
    # could never observe its absence.  Record it here so that 'added_eot'
    # reflects whether the story arrived without a terminator.
    had_eot = '<EOT>' in t

    # Step 3: Remove sentences with non-Urdu characters
    t, removed = remove_sentences_with_non_urdu(t)
    changes['removed_sentences'] = removed

    # Step 4: Insert missing EOS
    t, inserted = insert_missing_eos(t)
    changes['inserted_eos'] = inserted

    # Step 5: Add EOT if missing (kept as a safety net for step 3)
    if '<EOT>' not in t:
        t = t.strip() + ' <EOT>'
    if not had_eot:
        changes['added_eot'] = 1

    # Step 6: Put exactly one space around every marker, then squeeze whitespace
    t = re.sub(r'\s*<\s*(EOP|EOS|EOT)\s*>\s*', r' <\1> ', t)
    t = re.sub(r'\s+', ' ', t).strip()
    return t, changes


# Visible confirmation that this cell ran and the helpers above are defined.
print('Cleaning utilities defined')

Cleaning utilities defined


In [12]:
# Run cleaning pipeline
# Loads every story from SRC_JSON, cleans its 'content' field with
# clean_story_text, and collects the cleaned entries in `cleaned`.
with open(SRC_JSON,'r',encoding='utf-8') as f:
    orig_stories = json.load(f)

cleaned = []          # cleaned copies of the stories, in input order
change_log = []       # one record per story in which anything changed
summary_counts = Counter()  # totals of each change type across all stories
for i,s in enumerate(orig_stories):
    content = s.get('content','')
    new_content, changes = clean_story_text(content)
    # Shallow copy so the loaded original entry is left untouched.
    new_entry = dict(s)
    new_entry['content'] = new_content
    cleaned.append(new_entry)
    if any(changes.values()):
        change_log.append({'idx': i, 'title': s.get('urdu_title',''), **changes})
    for k,v in changes.items():
        # bool is a subclass of int, so normalized=True adds 1 per story here.
        if isinstance(v, int) and v>0:
            summary_counts[k] += v

print('Clean complete — stories:', len(cleaned))
print('Sentences removed:', summary_counts.get('removed_sentences', 0))

Clean complete — stories: 1780
Sentences removed: 19561


In [13]:
# Write the cleaned stories to CLEAN_JSON as pretty-printed UTF-8 JSON.
# ensure_ascii=False keeps the Urdu text readable instead of \u escapes.
with CLEAN_JSON.open('w', encoding='utf-8') as f:
    json.dump(cleaned, f, ensure_ascii=False, indent=2)

print('Saved', len(cleaned), 'stories to', CLEAN_JSON.name)

Saved 1780 stories to urdu_stories_final_preprocessed.json
