### Scraper for scraping Burmese corpus texts from BBC Burmese
#### scrape bbc burmese articles, clean text to pure burmese corpus files


In [1]:
import re
import time
import requests
import unicodedata
import polars as pl
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pathlib import Path
from datetime import datetime

In [2]:
# base config
BASE_URL = "https://www.bbc.com/burmese"
TOPIC_URL = "https://www.bbc.com/burmese/topics/cl3rq8rkqgxt"
ARTICLE_PATTERN = "/articles/"
MAX_ARTICLES, MAX_PAGES, DELAY_SEC = 1000, 100, 0.8

In [3]:
# output
OUTPUT_DIR = Path("data/bbcburmese")
DOWNLOADED_FILE = OUTPUT_DIR / "downloaded_urls.txt"
FNAME_TMPL = "bbcburmese_{ts}_{id}.txt"

In [4]:
# cleaning rules
SENT_MIN, SENT_MAX = 5, 4000
KEEP_BURMESE_RE = re.compile(r"[^\u1000-\u109F။၊ ]+")
URL_RE = re.compile(r"http\S+|www\.\S+", re.UNICODE)

In [5]:
# http
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "corpus-scraper/1.0"})

In [6]:
# helpers
log = lambda m: print(f"[log] {m}")
warn = lambda m: print(f"[warn] {m}")

In [7]:
def load_urls() -> set[str]:
    return set(DOWNLOADED_FILE.read_text("utf-8").splitlines()) if DOWNLOADED_FILE.exists() else set()

def save_urls(urls: set[str]):
    all_urls = load_urls().union(urls)
    DOWNLOADED_FILE.parent.mkdir(parents=True, exist_ok=True)
    DOWNLOADED_FILE.write_text("\n".join(sorted(all_urls)), "utf-8")

def norm_unicode(s: str) -> str:
    return unicodedata.normalize("NFC", s)

def clean_burmese(s: str) -> str:
    s = URL_RE.sub(" ", s)
    s = KEEP_BURMESE_RE.sub(" ", s)
    return norm_unicode(re.sub(r"\s+", " ", s).strip())

def split_sents(s: str) -> list[str]:
    return [p.strip() for p in re.split(r"(?<=[။၊])", s) if p.strip()]

In [8]:
def get_links() -> list[str]:
    links = set()
    for page in range(1, MAX_PAGES + 1):
        if len(links) >= MAX_ARTICLES: break
        url = TOPIC_URL if page == 1 else f"{TOPIC_URL}?page={page}"
        try:
            r = SESSION.get(url, timeout=12)
            r.raise_for_status()
        except Exception as e:
            warn(f"topic page {page} failed: {e}")
            time.sleep(DELAY_SEC)
            continue
        soup = BeautifulSoup(r.text, "html.parser")
        for a in soup.find_all("a", href=True):
            if ARTICLE_PATTERN in a["href"]:
                links.add(urljoin(BASE_URL, a["href"]))
        log(f"page {page} -> links {len(links)}")
        time.sleep(DELAY_SEC)
    return sorted(links)[:MAX_ARTICLES]


In [9]:

def scrape(url: str) -> str:
    try:
        r = SESSION.get(url, timeout=12)
        r.raise_for_status()
    except Exception as e:
        warn(f"article {url} failed: {e}")
        return ""
    soup = BeautifulSoup(r.text, "html.parser")
    return "\n".join(p.get_text(" ", strip=True) for p in soup.find_all("p"))


In [10]:

def save_article(text: str, idx: int, ts: str) -> Path:
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    path = OUTPUT_DIR / FNAME_TMPL.format(ts=ts, id=idx)
    path.write_text(text, "utf-8")
    return path

In [11]:
def build_articles(links: list[str], ts: str) -> list[dict]:
    recs = []
    for i, url in enumerate(links, 1):
        log(f"scrape {i}/{len(links)} {url}")
        raw = scrape(url)
        if not raw: continue
        clean = clean_burmese(raw)
        sents = [s for s in split_sents(clean) if SENT_MIN <= len(s) <= SENT_MAX]
        text = "\n".join(sents) if sents else clean
        path = save_article(text, i, ts)
        recs.append({"article_id": i, "url": url, "file_path": str(path),
                     "sent_count": len(sents), "chars": len(text)})
        time.sleep(DELAY_SEC)
    return recs


In [12]:
def analyze(recs: list[dict]) -> pl.DataFrame:
    if not recs: return pl.DataFrame()
    df = pl.DataFrame(recs)
    samples = []
    for p in df["file_path"]:
        try:
            first = Path(p).read_text("utf-8").splitlines()[0]
        except Exception:
            first = ""
        samples.append(first)
    df = df.with_columns(pl.Series("sample", samples))
    log(f"articles: {df.height}")
    log(f"total chars: {df['chars'].sum()}")
    log(f"total sents: {df['sent_count'].sum()}")
    return df


In [13]:
def load_sentences(files: list[str]) -> pl.DataFrame:
    rows = []
    for fp in files:
        try:
            lines = [l.strip() for l in Path(fp).read_text("utf-8").splitlines() if l.strip()]
        except Exception: continue
        for i, l in enumerate(lines, 1):
            rows.append({"file_path": fp, "line": i, "sentence": l, "chars": len(l)})
    return pl.DataFrame(rows).unique(subset=["sentence"]) if rows else pl.DataFrame()

In [14]:
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
urls_old = load_urls()
urls_new = [u for u in get_links() if u not in urls_old]
log(f"new links: {len(urls_new)}")

df_sentences = pl.DataFrame([])

if urls_new:
    recs = build_articles(urls_new, ts)
    if recs:
        save_urls({r["url"] for r in recs})
        df_articles = analyze(recs)
        df_sentences = load_sentences(df_articles["file_path"].to_list())
        log(f"sentences: {df_sentences.height}")

search_str = "မြန်မာ"  # change or set to "" to skip
if not df_sentences.is_empty() and search_str:
    res = df_sentences.filter(pl.col("sentence").str.contains(search_str))
    display(res.select(["file_path", "line", "chars", "sentence"]).head(20))

[log] page 1 -> links 24
[log] page 2 -> links 48
[log] page 3 -> links 72
[log] page 4 -> links 96
[log] page 5 -> links 120
[log] page 6 -> links 144
[log] page 7 -> links 168
[log] page 8 -> links 192
[log] page 9 -> links 216
[log] page 10 -> links 240
[log] page 11 -> links 264
[log] page 12 -> links 288
[log] page 13 -> links 312
[log] page 14 -> links 336
[log] page 15 -> links 360
[log] page 16 -> links 384
[log] page 17 -> links 408
[log] page 18 -> links 432
[log] page 19 -> links 454
[log] page 20 -> links 464
[log] page 21 -> links 464
[log] page 22 -> links 464
[log] page 23 -> links 464
[log] page 24 -> links 464
[log] page 25 -> links 464
[log] page 26 -> links 464
[log] page 27 -> links 464
[log] page 28 -> links 464
[log] page 29 -> links 464
[log] page 30 -> links 464
[log] page 31 -> links 464
[log] page 32 -> links 464
[log] page 33 -> links 464
[log] page 34 -> links 464
[log] page 35 -> links 464
[log] page 36 -> links 464
[log] page 37 -> links 464
[log] page 38 

file_path,line,chars,sentence
str,i64,i64,str
"""data/bbcburmese/bbcburmese_202…",23,163,"""ဓာတ်ပုံ ရင်းမြစ် မြန်မာစစ်တပ်ဟ…"
"""data/bbcburmese/bbcburmese_202…",27,161,"""နယ်မြေလုံခြုံရေးကြောင့် ကွင်းဆ…"
"""data/bbcburmese/bbcburmese_202…",186,185,"""ကိုးကန့်နဲ့ ကချင်တပ်တွေကြားက ပ…"
"""data/bbcburmese/bbcburmese_202…",46,174,"""မစံပယ်တို့ကို နေခိုင်းထားတဲ့ဂိ…"
"""data/bbcburmese/bbcburmese_202…",83,199,"""ဒါအပြင် တရုတ်နိုင်ငံခြားရေးဝန်…"
…,…,…,…
"""data/bbcburmese/bbcburmese_202…",82,136,"""ဆရာမ အယောက် ၅၀ ကျော် အသစ်ခန့်အ…"
"""data/bbcburmese/bbcburmese_202…",68,133,"""ကလေးငယ်တွေကို ကာကွယ်စောင့်ရှော…"
"""data/bbcburmese/bbcburmese_202…",1,101,"""ဓာတ်ပုံ ရင်းမြစ် ၁၉ ရာစု မြန်မ…"
"""data/bbcburmese/bbcburmese_202…",90,224,"""နန်းမြို့ထဲက မြန်မာမှူးမတ်တွေန…"
