In [16]:
from pathlib import Path

def find_root(markers=("config.json","corpus.json",".git")):
    p = Path.cwd()
    for cand in (p, *p.parents):
        if any((cand/m).exists() for m in markers):
            return cand
    return Path.cwd()

ROOT = find_root()
RAW  = ROOT/"data_raw"
WIKI = RAW/"wikipedia"
RAW.mkdir(parents=True, exist_ok=True)
WIKI.mkdir(parents=True, exist_ok=True)

print("ROOT:", ROOT)
print("RAW :", RAW)

ROOT: d:\OneDrive\Documents\My Learning Resource\University Courses\DLSU\2025-26\T1\CSC715M\assignments\mc01
RAW : d:\OneDrive\Documents\My Learning Resource\University Courses\DLSU\2025-26\T1\CSC715M\assignments\mc01\data_raw


In [17]:
import json

cfg = json.loads((ROOT/"config.json").read_text(encoding="utf-8"))
corp = json.loads((ROOT/"corpus.json").read_text(encoding="utf-8"))

TARGET_WORDS = int(cfg.get("target_words_per_language", 50_000))
LANGS = cfg["languages"]

# Keep only wikipedia entries for languages in config.json
wiki_sources = [s for s in corp["sources"]
                if s.get("source")=="wikipedia" and s["lang"] in LANGS]

print("Languages (config):", len(LANGS), LANGS)
print("Wikipedia sources  :", len(wiki_sources))
wiki_sources[:5]

Languages (config): 16 ['ilocano', 'kapampangan', 'maguindanao', 'ibanag', 'tausug', 'pangasinan', 'kankanaey', 'tagalog', 'cebuano', 'hiligaynon', 'bikol', 'aklanon', 'waray', 'chavacano', 'spanish', 'english']
Wikipedia sources  : 4


[{'lang': 'chavacano', 'source': 'wikipedia', 'dump_code': 'cbk_zam'},
 {'lang': 'waray', 'source': 'wikipedia', 'dump_code': 'war'},
 {'lang': 'kapampangan', 'source': 'wikipedia', 'dump_code': 'pam'},
 {'lang': 'bikol', 'source': 'wikipedia', 'dump_code': 'bcl'}]

In [18]:
import urllib.request, shutil

def dump_url(code: str) -> str:
    base = f"{code}wiki"
    return f"https://dumps.wikimedia.org/{base}/latest/{base}-latest-pages-articles.xml.bz2"

def download(url: str, out_path: Path):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    print("↘", url)
    req = urllib.request.Request(url, headers={"User-Agent":"Mozilla/5.0 (ling-cluster/1.0)"})
    with urllib.request.urlopen(req) as r, open(out_path, "wb") as f:
        shutil.copyfileobj(r, f)
    # quick validity check
    sig = out_path.read_bytes()[:3]
    assert sig == b"BZh", f"Not a bzip2 file (got {sig}); delete and re-run."

In [19]:
import re, bz2, html, unicodedata

def normalize(s: str) -> str:
    s = s.lower()
    s = re.sub(r"https?://\S+|www\.\S+", " ", s)
    s = re.sub(r"\d+", " ", s)
    s = re.sub(r"[^\w\sñáéíóúü-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

TEXT_OPEN  = re.compile(r"<text\b[^>]*>", re.IGNORECASE)
TEXT_CLOSE = re.compile(r"</text>", re.IGNORECASE)

def extract_until_words(xml_bz2: Path, out_txt: Path, target_words: int, margin: float = 1.2) -> dict:
    """
    Streams the bz2 XML, writes one cleaned page per line to out_txt until
    cumulative words >= target_words * margin. Keeps memory tiny.
    """
    out_txt.parent.mkdir(parents=True, exist_ok=True)
    goal = int(target_words * margin)
    words_acc = 0
    pages = 0
    in_text = False
    buf = []

    with bz2.open(xml_bz2, "rt", encoding="utf-8", errors="ignore") as fin, \
         open(out_txt, "w", encoding="utf-8") as fout:
        for line in fin:
            if not in_text:
                m = TEXT_OPEN.search(line)
                if m:
                    in_text = True
                    buf.append(line[m.end():])  # content after <text ...>
            else:
                buf.append(line)
                if TEXT_CLOSE.search(line):
                    # combine, cut at </text>, clean, write
                    chunk = "".join(buf)
                    chunk = TEXT_CLOSE.split(chunk, maxsplit=1)[0]
                    buf.clear(); in_text = False

                    txt = normalize(chunk)
                    if txt:
                        fout.write(txt + "\n")
                        pages += 1
                        words_acc += len(txt.split())
                        if words_acc >= goal:
                            break

    return {"pages": pages, "words": words_acc, "goal": goal}


In [20]:
MARGIN = 1.2  # e.g., collect ~60k words so later normalization can still cap to cleanly

for src in wiki_sources:
    lang, code = src["lang"], src["dump_code"]

    xml_path = WIKI / f"{code}wiki-latest-pages-articles.xml.bz2"
    out_txt  = RAW  / f"{lang}.txt"

    if not xml_path.exists():
        print(f"\nDownloading {lang} ({code}) …")
        try:
            download(dump_url(code), xml_path)
        except Exception as e:
            print(f"[{lang}] download failed → {e}")
            continue
    else:
        print(f"\nUsing existing dump for {lang}: {xml_path.name}")

    # Start fresh (optional): delete prior partial output
    if out_txt.exists(): out_txt.unlink()

    print(f"Extracting {lang} → {out_txt.name} (target ≈ {int(TARGET_WORDS*MARGIN):,} words)")
    stats = extract_until_words(xml_path, out_txt, target_words=TARGET_WORDS, margin=MARGIN)
    size_mb = out_txt.stat().st_size/1_000_000
    print(f"[{lang}] pages={stats['pages']:,}, words≈{stats['words']:,}, file={size_mb:.2f} MB")


Using existing dump for chavacano: cbk_zamwiki-latest-pages-articles.xml.bz2
Extracting chavacano → chavacano.txt (target ≈ 6,000 words)
[chavacano] pages=16, words≈7,776, file=0.05 MB

Using existing dump for waray: warwiki-latest-pages-articles.xml.bz2
Extracting waray → waray.txt (target ≈ 6,000 words)
[waray] pages=12, words≈6,286, file=0.04 MB

Using existing dump for kapampangan: pamwiki-latest-pages-articles.xml.bz2
Extracting kapampangan → kapampangan.txt (target ≈ 6,000 words)
[kapampangan] pages=18, words≈12,662, file=0.07 MB

Downloading bikol (bcl) …
↘ https://dumps.wikimedia.org/bclwiki/latest/bclwiki-latest-pages-articles.xml.bz2
Extracting bikol → bikol.txt (target ≈ 6,000 words)
[bikol] pages=19, words≈6,883, file=0.04 MB
