In [19]:
from pathlib import Path
import os, re, csv, json, tarfile, zipfile, gzip, io, html, unicodedata
from datetime import datetime

def find_root(markers=("config.json", "corpus.json")):
    """
    Locate the project root by walking upward from the current directory.

    The current working directory is checked first, then each of its parents
    in turn. The first directory containing at least one of the `markers`
    file names is returned. If none qualifies, the current working directory
    itself is returned.
    """
    start = Path.cwd()
    for candidate in [start, *start.parents]:
        for marker in markers:
            if (candidate / marker).exists():
                return candidate
    # Nothing matched anywhere up the tree: fall back to where we started.
    return start

# Project layout, resolved once from the detected root.
ROOT = find_root()
RAW = ROOT.joinpath("data_raw")    # extracted per-language text files are written here
LEIPZ = RAW.joinpath("leipzig")    # folder scanned for Leipzig archives
OUT = ROOT.joinpath("outputs")
OUT.mkdir(parents=True, exist_ok=True)

TARGET = 50_000  # word target per rubric

# Echo the resolved locations so a wrong working directory is obvious at a glance.
print(f"ROOT : {ROOT}")
print(f"LEIPZ: {LEIPZ}")
print(f"OUT  : {OUT}")

ROOT : d:\OneDrive\Documents\My Learning Resource\University Courses\DLSU\2025-26\T1\CSC715M\assignments\mc01
LEIPZ: d:\OneDrive\Documents\My Learning Resource\University Courses\DLSU\2025-26\T1\CSC715M\assignments\mc01\data_raw\leipzig
OUT  : d:\OneDrive\Documents\My Learning Resource\University Courses\DLSU\2025-26\T1\CSC715M\assignments\mc01\outputs


In [20]:
# Maps the language code guessed from an archive's file name to the
# human-readable name used for the extracted text file.
MANUAL_MAP = {
    # Philippine languages
    "tgl": "tagalog",
    "ceb": "cebuano",
    "ilo": "ilocano",
    "hil": "hiligaynon",
    "war": "waray",
    "pam": "kapampangan",
    "bcl": "bikol",
    "pag": "pangasinan",
    "cbk": "chavacano",
    # Other languages
    "eng": "english",
    "spa": "spanish",
}

In [21]:
def normalize(s: str) -> str:
    """
    Canonicalise text for word counting.

    Steps, in order:
      1. lower-case the text;
      2. compose it to Unicode NFC;
      3. replace URLs (http://, https://, www.) with a space;
      4. replace runs of digits with a space;
      5. replace every character that is not a word character, whitespace,
         a hyphen, or one of ñáéíóúü with a space;
      6. collapse whitespace runs and trim the ends.

    Returns the cleaned string; an empty or whitespace-only input yields "".
    """
    s = s.lower()
    # Decomposed input stores an accent as base letter + combining mark. The
    # combining mark is not matched by \w, so without composing first the
    # punctuation filter below would turn it into a space and split the word
    # in two (e.g. "man" + U+0303 + "ana" -> "man ana" instead of "mañana").
    s = unicodedata.normalize("NFC", s)
    s = re.sub(r"https?://\S+|www\.\S+", " ", s)
    s = re.sub(r"\d+", " ", s)
    s = re.sub(r"[^\w\sñáéíóúü-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def guess_key_from_name(name: str) -> str:
    """
    Derive a short language key from a Leipzig archive file name.

    The archive extension (.tar.gz, .tgz, .zip or .gz, any letter case) is
    removed, the text before the first '_' is taken, and every character that
    is not an ASCII letter is dropped from it. If 2 to 5 letters remain they
    are returned lower-cased; otherwise the whole extension-less name is
    returned lower-cased.

    e.g., 'tl_news_2012_1M.tar.gz' -> 'tl'
          'ceb_wikipedia_2020_300K.zip' -> 'ceb'
    """
    base = re.sub(r"\.(tar\.gz|tgz|zip|gz)$", "", Path(name).name, flags=re.I)
    prefix = base.partition("_")[0]
    letters = re.sub(r"[^A-Za-z]", "", prefix)
    looks_like_code = 2 <= len(letters) <= 5
    return letters.lower() if looks_like_code else base.lower()

def to_lang_out(archive_name: str) -> str:
    """
    Return the output language name for an archive file name.

    The key guessed from the file name is looked up in MANUAL_MAP; when the
    key has no entry there, the guessed key itself is returned.
    """
    code = guess_key_from_name(archive_name)
    if code in MANUAL_MAP:
        return MANUAL_MAP[code]
    return code

In [22]:
# Gather every supported archive in the Leipzig folder, sorted by path.
# A name ending in ".tar.gz" already has the suffix ".gz", so a single
# suffix test covers .zip, .gz, .tar.gz and .tgz.
archives = []
if LEIPZ.exists():
    archives = [
        entry
        for entry in sorted(LEIPZ.iterdir())
        if entry.suffix.lower() in (".zip", ".gz", ".tgz")
    ]

print("Found archives:", len(archives))
for a in archives:
    print(f" - {a.name}")
# Stop here with a clear instruction when nothing was found.
assert archives, "Place Leipzig archives into data_raw/leipzig/ and rerun."


Found archives: 8
 - bcl_community_2017.tar.gz
 - ceb_community_2017.tar.gz
 - eng_wikipedia_2016_10K.tar.gz
 - ilo_community_2017.tar.gz
 - pag_community_2017.tar.gz
 - pam_community_2017.tar.gz
 - spa_wikipedia_2021_10K.tar.gz
 - tgl_community_2017.tar.gz


In [23]:
def read_sentences_from_member(fobj) -> str:
    """
    Turn the lines of a Leipzig '*-sentences.txt' member into plain text.

    `fobj` is any iterable of lines, given as bytes or str. Bytes are decoded
    as UTF-8 with undecodable bytes dropped. Blank lines are skipped. For a
    tab-separated line (<id>\t<sentence>) only the second column is kept;
    a line without a tab is kept whole. Each kept value is stripped of
    surrounding whitespace and the values are joined with newlines.
    """
    sentences = []
    for record in fobj:
        text = record.decode("utf-8", "ignore") if isinstance(record, bytes) else record
        text = text.rstrip("\n")
        if text.strip() == "":
            continue
        columns = text.split("\t")
        chosen = columns[1] if len(columns) > 1 else text
        sentences.append(chosen.strip())
    return "\n".join(sentences)

def extract_text_from_archive(arc_path: Path) -> str:
    """
    Pull the sentence text out of a single Leipzig download.

    The container type is chosen from the lower-cased file name:
      * '.zip'             -> first '*-sentences.txt' member, otherwise the
                              first '*-sentences.txt.gz' member;
      * '.tar.gz' / '.tgz' -> same preference order, opened as a gzip tar;
      * any other '.gz'    -> the file itself is read as gzip-compressed text.

    Returns whatever read_sentences_from_member produces for the chosen
    member. Raises FileNotFoundError when a ZIP or TAR holds no sentences
    member, and ValueError for any other file extension.
    """
    lowered = arc_path.name.lower()
    plain_suffix = "-sentences.txt"
    packed_suffix = "-sentences.txt.gz"

    if lowered.endswith(".zip"):
        with zipfile.ZipFile(arc_path, "r") as archive:
            listing = archive.namelist()
            plain = [entry for entry in listing if entry.endswith(plain_suffix)]
            packed = [entry for entry in listing if entry.endswith(packed_suffix)]
            # Prefer the uncompressed member when both kinds are present.
            if plain:
                with archive.open(plain[0], "r") as handle:
                    return read_sentences_from_member(handle)
            if packed:
                with archive.open(packed[0], "r") as handle:
                    buffered = io.BytesIO(handle.read())
                    with gzip.open(buffered, "rt", encoding="utf-8", errors="ignore") as text_stream:
                        return read_sentences_from_member(text_stream)
            raise FileNotFoundError("No *-sentences.txt found inside ZIP.")

    if lowered.endswith((".tar.gz", ".tgz")):
        with tarfile.open(arc_path, "r:gz") as archive:
            entries = archive.getmembers()
            plain = [entry for entry in entries if entry.name.endswith(plain_suffix)]
            if plain:
                handle = archive.extractfile(plain[0])
                assert handle is not None
                return read_sentences_from_member(handle)
            packed = [entry for entry in entries if entry.name.endswith(packed_suffix)]
            if packed:
                handle = archive.extractfile(packed[0])
                assert handle is not None
                buffered = io.BytesIO(handle.read())
                with gzip.open(buffered, "rt", encoding="utf-8", errors="ignore") as text_stream:
                    return read_sentences_from_member(text_stream)
            raise FileNotFoundError("No *-sentences.txt found inside TAR.GZ.")

    if lowered.endswith(".gz"):
        # Could be a single *-sentences.txt.gz file
        with gzip.open(arc_path, "rt", encoding="utf-8", errors="ignore") as text_stream:
            return read_sentences_from_member(text_stream)

    raise ValueError(f"Unsupported archive format: {arc_path}")


In [24]:
# Extract each archive to RAW/<language>.txt and collect one summary record
# per archive, whether extraction succeeded or failed.
summary = []
for arc in archives:
    lang_out = to_lang_out(arc.name)
    raw_txt_path = RAW / f"{lang_out}.txt"

    # Baseline record: zero counts until extraction succeeds.
    record = {
        "archive": arc.name,
        "lang_out": lang_out,
        "raw_words": 0,
        "normalized_words": 0,
        "meets_50k": False,
        "output_path": str(raw_txt_path),
    }

    try:
        print(f"\n[{lang_out}] Extracting from {arc.name} …")
        text = extract_text_from_archive(arc)
        if text.strip():
            # Write raw extracted sentences (one per line)
            raw_txt_path.write_text(text, encoding="utf-8")
            # Whitespace-delimited word counts, before and after normalisation
            raw_count = len(text.split())
            norm_count = len(normalize(text).split())
            print(f"  wrote → {raw_txt_path.name}")
            print(f"  words: raw={raw_count:,} | normalized={norm_count:,}")
        else:
            print(f"  [warn] Empty text extracted from {arc.name}. Skipping write.")
            raw_count = 0
            norm_count = 0

        meets = norm_count >= TARGET
        record["raw_words"] = raw_count
        record["normalized_words"] = norm_count
        record["meets_50k"] = bool(meets)
    except Exception as e:
        # Keep going with the remaining archives; the failure is recorded
        # on this archive's summary row instead of aborting the run.
        print(f"  [error] {arc.name}: {e}")
        record["error"] = str(e)

    summary.append(record)

print("\nDone ingesting Leipzig archives.")



[bikol] Extracting from bcl_community_2017.tar.gz …
  wrote → bikol.txt
  words: raw=281,402 | normalized=273,651

[cebuano] Extracting from ceb_community_2017.tar.gz …
  wrote → cebuano.txt
  words: raw=10,666,146 | normalized=10,032,403

[english] Extracting from eng_wikipedia_2016_10K.tar.gz …
  wrote → english.txt
  words: raw=209,518 | normalized=206,131

[ilocano] Extracting from ilo_community_2017.tar.gz …
  wrote → ilocano.txt
  words: raw=391,789 | normalized=381,602

[pangasinan] Extracting from pag_community_2017.tar.gz …
  wrote → pangasinan.txt
  words: raw=64,484 | normalized=58,456

[kapampangan] Extracting from pam_community_2017.tar.gz …
  wrote → kapampangan.txt
  words: raw=300,047 | normalized=288,806

[spanish] Extracting from spa_wikipedia_2021_10K.tar.gz …
  wrote → spanish.txt
  words: raw=208,068 | normalized=203,412

[tagalog] Extracting from tgl_community_2017.tar.gz …
  wrote → tagalog.txt
  words: raw=18,629,220 | normalized=18,651,175

Done ingesting Leip