# 01 — Prepare Texts

In [1]:
from pathlib import Path, PurePath
def find_root(markers=("config.json", "corpus.json", ".git")):
    """Locate the project root by walking upward from the current directory.

    The current working directory is checked first, then each of its
    ancestors in order. The first directory that directly contains at least
    one entry named in ``markers`` is returned.

    Args:
        markers: Names of files or directories whose presence identifies the
            project root.

    Returns:
        The first matching directory as a ``Path``; the current working
        directory if no directory up the tree contains a marker.
    """
    start = Path.cwd()
    candidates = [start]
    candidates.extend(start.parents)
    for folder in candidates:
        for marker in markers:
            if (folder / marker).exists():
                return folder
    # No marker found anywhere up the tree: fall back to the working directory.
    return Path.cwd()
# Resolve the project root once; every other path below is derived from it.
ROOT = find_root()

# Working directories under the root: raw inputs, cleaned texts, outputs.
RAW = ROOT / "data_raw"
CLEAN = ROOT / "data_clean"
OUT = ROOT / "outputs"

# Create each directory if it is absent; an existing directory is not an error.
for folder in (RAW, CLEAN, OUT):
    folder.mkdir(parents=True, exist_ok=True)

print("ROOT:", ROOT)

ROOT: d:\OneDrive\Documents\My Learning Resource\University Courses\DLSU\2025-26\T1\CSC715M\assignments\mc01


In [2]:
import json, re, html
# Load the project configuration: a UTF-8 JSON file at the project root.
config_path = ROOT / "config.json"
cfg = json.loads(config_path.read_text(encoding="utf-8"))

# Values read from the config: the per-language word target and the language list.
TARGET = cfg["target_words_per_language"]
langs = cfg["languages"]

# Bare last expression: display the language list as this cell's output.
langs

['ilocano',
 'kapampangan',
 'maguindanao',
 'ibanag',
 'tausug',
 'pangasinan',
 'kankanaey',
 'tagalog',
 'cebuano',
 'hiligaynon',
 'bikol',
 'maranao',
 'waray',
 'chavacano',
 'spanish',
 'english']

In [3]:
import re

def normalize(s: str) -> str:
    """Lowercase and clean raw text into single-space-separated word tokens.

    Steps, in order:
      1. Lowercase, then compose to Unicode NFC so that a base letter followed
         by a combining mark (e.g. ``n`` + U+0303) becomes one character
         instead of being split apart in step 4.
      2. Replace URLs (``http://``, ``https://``, ``www.``) with a space.
      3. Replace runs of digits with a space.
      4. Replace every character that is not a word character, whitespace, or
         a hyphen with a space. Underscores are replaced too.
      5. Replace hyphens that do not join two word characters with a space
         (e.g. the one left behind by ``covid-19`` after digit removal), so
         they are never counted as tokens or glued to a word.
      6. Collapse whitespace runs to one space and strip both ends.

    Args:
        s: Raw input text.

    Returns:
        The cleaned text; an empty string if nothing survives.
    """
    # Local import so this definition does not depend on another cell's imports.
    import unicodedata

    s = unicodedata.normalize("NFC", s.lower())
    s = re.sub(r"https?://\S+|www\.\S+", " ", s)
    s = re.sub(r"\d+", " ", s)
    # \w is Unicode-aware for str patterns, so accented letters need no explicit
    # listing; "_" is matched by \w and therefore has to be removed explicitly.
    s = re.sub(r"[^\w\s-]|_", " ", s)
    # Keep only word-internal hyphens ("well-known"); drop leading, trailing,
    # and free-standing ones.
    s = re.sub(r"(?<!\w)-+|-+(?!\w)", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Bookkeeping for the summary printed at the end:
#   ok   -> (language, tokens written)
#   warn -> (language, token count) for languages with fewer than TARGET tokens
#   miss -> languages skipped (raw file absent/empty, or nothing left after cleaning)
ok, warn, miss = [], [], []

for lang in langs:
    src_path = RAW / f"{lang}.txt"
    dst_path = CLEAN / f"{lang}.txt"

    # Skip languages whose raw file does not exist or has zero bytes.
    if not src_path.exists() or src_path.stat().st_size == 0:
        miss.append(lang)
        print(f"[skip] {lang}: missing {src_path.name}")
        continue

    # errors="ignore" silently drops bytes that are not valid UTF-8.
    raw_text = src_path.read_text(encoding="utf-8", errors="ignore")
    tokens = normalize(raw_text).split()

    if not tokens:
        miss.append(lang)
        print(f"[skip] {lang}: 0 tokens after normalization")
        continue

    # Cap at the first TARGET tokens; shorter texts are kept whole and flagged.
    if len(tokens) < TARGET:
        warn.append((lang, len(tokens)))
        kept = tokens
    else:
        kept = tokens[:TARGET]

    dst_path.write_text(" ".join(kept), encoding="utf-8")
    ok.append((lang, len(kept)))

print("\nDone.")
print("ok     :", [f"{name}({count})" for name, count in ok])
print("warn   :", [f"{name}({count})" for name, count in warn])  # under TARGET kept
print("missing:", miss)


Done.
ok     : ['ilocano(50000)', 'kapampangan(50000)', 'maguindanao(50000)', 'ibanag(50000)', 'tausug(50000)', 'pangasinan(50000)', 'kankanaey(50000)', 'tagalog(50000)', 'cebuano(50000)', 'hiligaynon(50000)', 'bikol(50000)', 'maranao(50000)', 'waray(50000)', 'chavacano(50000)', 'spanish(50000)', 'english(50000)']
warn   : []
missing: []
