In [9]:
import json
import uuid
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

# -------------------------------
# STEP 1: Collect all article links (auto-load)
# -------------------------------
def get_all_links(base_url):
    """Collect article URLs from a listing page, expanding it via "load more".

    Opens ``base_url`` in a Chrome WebDriver session, then repeatedly:
    waits 2 seconds, parses the current page source, records every anchor
    whose ``href`` starts with ``/a/``, and clicks the ``a.link-showMore``
    element. The loop ends when that element does not become clickable
    within 5 seconds, or when the browser reports an error.

    Args:
        base_url: URL of the listing page. Site-relative article links are
            resolved against it.

    Returns:
        A list of unique absolute article URLs (order is not defined, since
        they are gathered in a set).
    """
    # Imported locally so this function does not depend on extra
    # module-level imports beyond those the file already has.
    from urllib.parse import urljoin

    from selenium.common.exceptions import TimeoutException, WebDriverException

    driver = webdriver.Chrome()
    links = set()
    click_count = 0

    # try/finally guarantees the browser process is closed even when the
    # page load, parsing, or a Ctrl-C interrupts the loop.
    try:
        driver.get(base_url)

        while True:
            # Give the previously requested batch time to render.
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            for a in soup.select("a[href*='/a/']"):
                url = a.get("href")
                if url and url.startswith("/a/"):
                    # Resolve against the listing URL instead of a
                    # hard-coded host so other base URLs work too.
                    links.add(urljoin(base_url, url))

            try:
                button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "a.link-showMore"))
                )
                # JavaScript click avoids failures when the button is
                # covered by another element.
                driver.execute_script("arguments[0].click();", button)
            except TimeoutException:
                print("No more button, finished loading all articles.")
                break
            except WebDriverException as exc:
                # Best effort: keep what was collected so far rather than
                # losing it to a browser-side error.
                print(f"Stopped loading more articles after a browser error: {exc}")
                break

            click_count += 1
            print(f"Clicked load more {click_count} times, total links so far: {len(links)}")
    finally:
        driver.quit()

    return list(links)

# -------------------------------
# STEP 2: Scrape each article with retry
# -------------------------------
def scrape_article(url, retries=3):
    """Download one article page and extract its title and body text.

    The request is attempted up to ``retries`` times with a 10 second
    timeout; a failed attempt is reported on stdout and followed by a
    1 second pause before the next try.

    Args:
        url: Absolute URL of the article page.
        retries: Maximum number of attempts.

    Returns:
        A dict with the keys ``id`` (a fresh random UUID string), ``title``
        (text of the first ``<h1>``, or ``""`` when there is none), ``text``
        (body paragraphs joined with newlines, in document order),
        ``domain`` (always ``"News"``) and ``url``; or ``None`` when every
        attempt failed.
    """
    # A single selector list returns each matching <p> exactly once, in
    # document order. Querying the three containers separately and
    # concatenating duplicated paragraphs whenever the containers nest.
    body_selector = "div.wsw p, div.content p, div.article__body p"

    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "html.parser")

            title_el = soup.find("h1")
            title = title_el.get_text(strip=True) if title_el else ""

            paragraphs = [p.get_text(strip=True) for p in soup.select(body_selector)]
            # Drop empty paragraphs and the media-player placeholder text.
            paragraphs = [
                p for p in paragraphs
                if p and "No media source currently available" not in p
            ]
            text = "\n".join(paragraphs)

            return {
                "id": str(uuid.uuid4()),
                "title": title,
                "text": text,
                "domain": "News",
                "url": url
            }

        except Exception as e:
            # Deliberately broad: this runs inside worker threads, and one
            # bad page must not abort the whole crawl. The error is logged.
            print(f"Error scraping {url} attempt {attempt+1}: {e}")
            # No point pausing once there are no attempts left.
            if attempt + 1 < retries:
                time.sleep(1)
    return None

# -------------------------------
# STEP 3: Save JSON chunk
# -------------------------------
def save_chunk(data, chunk_num, output_dir="data"):
    """Write one chunk of scraped articles to a numbered JSON file.

    The file is named ``voa_burmese_part<chunk_num>.json`` inside
    ``output_dir`` and is written as UTF-8 with non-ASCII characters kept
    as-is (``ensure_ascii=False``) and a 2-space indent.

    Args:
        data: JSON-serialisable object to write; its ``len()`` is reported
            in the progress message.
        chunk_num: Number embedded in the file name.
        output_dir: Target directory. It is created, including any missing
            parent directories, when it does not exist.
    """
    out_dir = Path(output_dir)
    # parents=True lets nested targets such as "out/voa/raw" work; without
    # it mkdir raises FileNotFoundError when a parent is missing.
    out_dir.mkdir(parents=True, exist_ok=True)
    filename = out_dir / f"voa_burmese_part{chunk_num}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"💾 Saved {len(data)} articles to {filename}")

# -------------------------------
# STEP 4: Run scraper in parallel & auto-split
# -------------------------------
def scrape_all(base_url, workers=20, chunk_size=1000, output_dir="data"):
    """Scrape every article linked from a listing page and save them in chunks.

    Links are gathered with ``get_all_links``, each one is fetched with
    ``scrape_article`` on a thread pool, and the successful results are
    handed to ``save_chunk`` every time ``chunk_size`` of them have
    accumulated. Whatever is left over at the end is saved as a final,
    smaller chunk. Results arrive in completion order, not link order.

    Args:
        base_url: Listing page URL passed to ``get_all_links``.
        workers: Maximum number of worker threads.
        chunk_size: Number of articles per output file.
        output_dir: Directory passed through to ``save_chunk``.
    """
    links = get_all_links(base_url)
    print(f"Found {len(links)} article links")

    results = []
    chunk_num = 1

    with ThreadPoolExecutor(max_workers=workers) as executor:
        future_to_url = {executor.submit(scrape_article, url): url for url in links}
        for future in as_completed(future_to_url):
            try:
                data = future.result()
            except Exception as exc:
                # A worker that raises must not abort the run and discard
                # the articles already buffered in ``results``.
                print(f"Error scraping {future_to_url[future]}: {exc}")
                continue

            # scrape_article returns None when all of its attempts failed.
            if not data:
                continue
            results.append(data)

            # Save every chunk_size articles
            if len(results) >= chunk_size:
                save_chunk(results, chunk_num, output_dir)
                chunk_num += 1
                results = []

    # Save remaining articles
    if results:
        save_chunk(results, chunk_num, output_dir)

    print("✅ Scraping finished!")

# -------------------------------
# RUN
# -------------------------------
if __name__ == "__main__":
    # Entry point when executed as a script: crawl the Myanmar section
    # with 28 worker threads, writing 1000 articles per output file.
    start_url = "https://burmese.voanews.com/myanmar"
    scrape_all(start_url, workers=28, chunk_size=1000)


Clicked load more 1 times, total links so far: 12
Clicked load more 2 times, total links so far: 24
Clicked load more 3 times, total links so far: 36
Clicked load more 4 times, total links so far: 48
Clicked load more 5 times, total links so far: 60
Clicked load more 6 times, total links so far: 72
Clicked load more 7 times, total links so far: 84
Clicked load more 8 times, total links so far: 96
Clicked load more 9 times, total links so far: 108
Clicked load more 10 times, total links so far: 120
Clicked load more 11 times, total links so far: 132
Clicked load more 12 times, total links so far: 144
Clicked load more 13 times, total links so far: 156
Clicked load more 14 times, total links so far: 168
Clicked load more 15 times, total links so far: 180
Clicked load more 16 times, total links so far: 192
Clicked load more 17 times, total links so far: 204
Clicked load more 18 times, total links so far: 216
Clicked load more 19 times, total links so far: 228
Clicked load more 20 times, t