In [22]:
from urllib.parse import urlsplit, urlunsplit, urljoin
import time
import random
import json
import re
import requests
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from pathlib import Path

In [23]:
# =========================
# HELPERS
# =========================
def normalize_url(url):
    """Return ``url`` with its query string and fragment stripped.

    Scheme, host and path are kept as parsed by ``urlsplit``.
    """
    scheme, netloc, path, _query, _fragment = urlsplit(url)
    # Rebuild the URL with empty query and fragment components.
    return urlunsplit((scheme, netloc, path, "", ""))

# =========================
# CONFIG
# =========================
URL = "https://www.press.bmwgroup.com/global/article"
BASE_URL = "https://www.press.bmwgroup.com"
MAX_SCROLLS = 10   # number of scroll-to-bottom passes
SCROLL_PAUSE = 2   # seconds to wait after each scroll

# =========================
# SETUP DRIVER
# =========================
options = Options()
options.add_argument("--headless=new")
options.add_argument("--window-size=1920,1080")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Defined before the scroll loop so the name exists even if MAX_SCROLLS is 0.
article_links = set()

# try/finally: the browser is always shut down, even when a step below raises.
try:
    wait = WebDriverWait(driver, 20)

    # =========================
    # OPEN PAGE
    # =========================
    driver.get(URL)
    time.sleep(3)

    # =========================
    # ACCEPT COOKIES (IF PRESENT)
    # =========================
    try:
        cookie_btn = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((
                By.XPATH,
                "//button[.//text()[contains(., 'Accept') or contains(., 'Agree')]]"
            ))
        )
        cookie_btn.click()
        time.sleep(1)
    except Exception:
        # Deliberately best-effort: a missing or unclickable banner must not
        # stop the scrape. `Exception` (instead of a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        pass

    # =========================
    # CLICK "SHOW MORE"
    # =========================
    try:
        show_more = wait.until(
            EC.presence_of_element_located((By.ID, "lazy-load-button"))
        )
        driver.execute_script(
            "arguments[0].scrollIntoView({block: 'center'});",
            show_more
        )
        time.sleep(1)
        try:
            show_more.click()
        except ElementClickInterceptedException:
            # Something overlays the button; click it through JavaScript instead.
            driver.execute_script("arguments[0].click();", show_more)
        time.sleep(2)
    except Exception as exc:
        # Report the exception type so a timeout can be told apart from other failures.
        print(f"WARNING: Show more button not found ({type(exc).__name__})")

    # =========================
    # SCROLL TO LOAD ALL ARTICLES
    # =========================
    for i in range(MAX_SCROLLS):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE)

        # Re-parse the whole page on every pass; the set de-duplicates links
        # and normalize_url drops query/fragment differences.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        article_links = {
            normalize_url(urljoin(BASE_URL, a.get("href")))
            for a in soup.select('a[href*="/global/article/detail/"]')
        }

        print(f"Scroll {i+1}/{MAX_SCROLLS}: Found {len(article_links)} articles (total)")
finally:
    driver.quit()

# =========================
# FINAL RESULT
# =========================
# Sorted list so downstream iteration order is deterministic.
article_links = sorted(article_links)

print("\n" + "="*60)
print("SCRAPING COMPLETE")
print("="*60)
print(f"Total articles found: {len(article_links)}")
print("First 10 article links:")
print("="*60)
for i, link in enumerate(article_links[:10], 1):
    print(f"{i:2}. {link}")
print("="*60)

python(24017) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24018) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24019) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24020) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24021) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24022) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24026) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24027) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24028) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24029) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24030) Malloc

Scroll 1/10: Found 68 articles (total)
Scroll 2/10: Found 88 articles (total)
Scroll 3/10: Found 108 articles (total)
Scroll 4/10: Found 128 articles (total)
Scroll 5/10: Found 147 articles (total)
Scroll 6/10: Found 167 articles (total)
Scroll 7/10: Found 187 articles (total)
Scroll 8/10: Found 206 articles (total)
Scroll 9/10: Found 226 articles (total)
Scroll 10/10: Found 246 articles (total)

SCRAPING COMPLETE
Total articles found: 246
First 10 article links:
 1. https://www.press.bmwgroup.com/global/article/detail/T0301185EN/cv-of-ilka-horstmeier-member-of-the-board-of-management-of-bmw-ag-people-and-real-estate-labour-relations-director
 2. https://www.press.bmwgroup.com/global/article/detail/T0443474EN/specifications-of-the-bmw-5-series-sedan-valid-from-03/2025
 3. https://www.press.bmwgroup.com/global/article/detail/T0449749EN/bmw-group-plant-regensburg-pilots-thermal-oil-system-for-heat-generation-in-paint-shop
 4. https://www.press.bmwgroup.com/global/article/detail/T0450128E

In [24]:
def extract_article_id(url: str) -> str:
    """
    Return the article ID (e.g. T0454296EN) found after ``/detail/`` in ``url``.

    When the URL has no ``/detail/T<digits>EN`` part, the last path segment
    (trailing slashes ignored) is returned instead.
    """
    match = re.search(r"/detail/(T\d+EN)", url)
    if match is None:
        # Fallback: everything after the final "/" once trailing slashes are removed.
        return url.rstrip("/").rpartition("/")[2]
    return match.group(1)

# =========================
# FIXED EVAL SET (PINNED)
# =========================
# Article IDs that are always routed to the eval split; every other article
# goes to train.
FIXED_EVAL_IDS = {
    "T0450921EN",
    "T0451220EN",
    "T0452795EN",
    "T0452972EN",
    "T0443474EN"
}

train_links = []
eval_links = []

for url in article_links:
    article_id = extract_article_id(url)
    if article_id in FIXED_EVAL_IDS:
        eval_links.append(url)
    else:
        train_links.append(url)

eval_ids_found = [extract_article_id(u) for u in eval_links]

# Compute the shares from the actual counts instead of hard-coding them;
# guard against an empty link list to avoid a ZeroDivisionError.
total_links = len(train_links) + len(eval_links)
train_pct = 100 * len(train_links) / total_links if total_links else 0.0
eval_pct = 100 * len(eval_links) / total_links if total_links else 0.0

print("="*60)
print("TRAIN/EVAL SPLIT")
print("="*60)
print(f"Train articles: {len(train_links)} ({train_pct:.0f}%)")
print(f"Eval articles:  {len(eval_links)} ({eval_pct:.0f}%)")
print("\nEval article IDs:")
for eval_id in eval_ids_found:
    print(f"  - {eval_id}")
print("="*60)

# Every pinned ID must have been scraped exactly once; name the missing ones
# so a failure is actionable.
missing_eval_ids = sorted(FIXED_EVAL_IDS - set(eval_ids_found))
assert len(eval_links) == len(FIXED_EVAL_IDS), (
    f"Eval set size mismatch! expected {len(FIXED_EVAL_IDS)}, "
    f"got {len(eval_links)}; missing IDs: {missing_eval_ids}"
)

TRAIN/EVAL SPLIT
Train articles: 241 (98%)
Eval articles:  5 (2%)

Eval article IDs:
  - T0443474EN
  - T0450921EN
  - T0451220EN
  - T0452795EN
  - T0452972EN


In [25]:
# Settings shared by every article download.
HEADERS = {"User-Agent": "Mozilla/5.0"}  # HTTP headers sent with each request
TIMEOUT = 20  # value passed as `timeout` to requests.get
SLEEP = 1.0  # pause between downloads - be polite to BMW servers

def scrape_article(url: str) -> dict:
    """
    Download one article page and pull out its title, date and body text.

    The page is fetched with ``requests.get`` using ``HEADERS`` and ``TIMEOUT``;
    ``raise_for_status()`` is called on the response, so HTTP error statuses
    raise. A ``RuntimeError`` is raised when the page has no
    ``<div class="article-detail">``.

    Returns a dict with the keys ``id``, ``url``, ``title``, ``date`` and ``text``.
    """
    response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    response.raise_for_status()

    page = BeautifulSoup(response.text, "html.parser")

    article_div = page.find("div", class_="article-detail")
    if not article_div:
        raise RuntimeError("article-detail div not found")

    # --- title: text of the first <h1>, or empty when there is none
    heading = article_div.find("h1")
    if heading:
        title = heading.get_text(" ", strip=True)
    else:
        title = ""

    # --- date: try each lookup in order and stop at the first non-empty text
    date_lookups = (
        lambda: article_div.select_one("div.article-info span.date"),  # preferred location
        lambda: article_div.select_one("span.date"),                   # any span.date in the article
        lambda: article_div.find("time"),                              # <time> tag, if the page has one
    )
    date = ""
    for lookup in date_lookups:
        node = lookup()
        if node:
            date = node.get_text(" ", strip=True)
        if date:
            break
    # Nothing usable found anywhere
    date = date or "Not specified"

    # --- body: <p> elements inside #article-text, keeping only texts longer than 30 chars
    body_div = article_div.find("div", id="article-text")
    paragraphs = []
    if body_div:
        candidate_texts = (
            p.get_text(" ", strip=True) for p in body_div.find_all("p")
        )
        paragraphs = [text for text in candidate_texts if len(text) > 30]

    body_text = "\n\n".join(paragraphs)

    record = {
        "id": extract_article_id(url),
        "url": url,
        "title": title,
        "date": date,
        "text": body_text
    }
    return record

In [26]:
# Output folders for the scraped JSON files.
train_dir = Path("data/raw/train")
eval_dir = Path("data/raw/eval")

# Start from empty folders: delete each one if it already exists, then recreate it.
for folder in (train_dir, eval_dir):
    if folder.exists():
        shutil.rmtree(folder)
    folder.mkdir(parents=True, exist_ok=True)

# Status report, emitted as one print call.
print("\n".join([
    "="*60,
    "FOLDER SETUP",
    "="*60,
    "Cleared and recreated:",
    "   - data/raw/train/",
    "   - data/raw/eval/",
    "="*60,
]))

FOLDER SETUP
Cleared and recreated:
   - data/raw/train/
   - data/raw/eval/


In [27]:
# Scrape every train URL and store each article as <id>.json under train_dir.
train_articles = []   # articles that were scraped AND written to disk
train_failures = []   # (url, exception) pairs for articles that failed

for i, url in enumerate(train_links, 1):
    try:
        article = scrape_article(url)

        # Write under train_dir rather than repeating the path literal, so the
        # output location is defined in exactly one place.
        with open(train_dir / f"{article['id']}.json", "w", encoding="utf-8") as f:
            json.dump(article, f, ensure_ascii=False, indent=2)

        # Record only after the file was written, so the list mirrors what is on disk.
        train_articles.append(article)

        print(f"[{i:3}/{len(train_links)}] TRAIN: {article['id'][:50]}")

    except Exception as e:
        train_failures.append((url, e))
        print(f"TRAIN FAILED: {url[:60]}... → {str(e)[:40]}")

    finally:
        # Pause after failures too: a failing server is exactly when
        # back-to-back requests should be avoided.
        time.sleep(SLEEP)

if train_failures:
    print(f"WARNING: {len(train_failures)}/{len(train_links)} train articles failed")

[  1/241] TRAIN: T0301185EN
[  2/241] TRAIN: T0449749EN
[  3/241] TRAIN: T0450128EN
[  4/241] TRAIN: T0450130EN
[  5/241] TRAIN: T0450131EN
[  6/241] TRAIN: T0450133EN
[  7/241] TRAIN: T0450299EN
[  8/241] TRAIN: T0450377EN
[  9/241] TRAIN: T0450581EN
[ 10/241] TRAIN: T0450587EN
[ 11/241] TRAIN: T0450605EN
[ 12/241] TRAIN: T0450661EN
[ 13/241] TRAIN: T0450668EN
[ 14/241] TRAIN: T0450698EN
[ 15/241] TRAIN: T0450699EN
[ 16/241] TRAIN: T0450706EN
[ 17/241] TRAIN: T0450747EN
[ 18/241] TRAIN: T0450758EN
[ 19/241] TRAIN: T0450780EN
[ 20/241] TRAIN: T0450782EN
[ 21/241] TRAIN: T0450786EN
[ 22/241] TRAIN: T0450787EN
[ 23/241] TRAIN: T0450833EN
[ 24/241] TRAIN: T0450842EN
[ 25/241] TRAIN: T0450844EN
[ 26/241] TRAIN: T0450856EN
[ 27/241] TRAIN: T0450940EN
[ 28/241] TRAIN: T0450970EN
[ 29/241] TRAIN: T0450977EN
[ 30/241] TRAIN: T0450991EN
[ 31/241] TRAIN: T0450997EN
[ 32/241] TRAIN: T0451008EN
[ 33/241] TRAIN: T0451034EN
[ 34/241] TRAIN: T0451042EN
[ 35/241] TRAIN: T0451047EN
[ 36/241] TRAIN: T04

In [28]:
# Scrape every eval URL and store each article as <id>.json under eval_dir.
eval_articles = []   # articles that were scraped AND written to disk
eval_failures = []   # (url, exception) pairs for articles that failed

for i, url in enumerate(eval_links, 1):
    try:
        article = scrape_article(url)

        # Write under eval_dir rather than repeating the path literal, so the
        # output location is defined in exactly one place.
        with open(eval_dir / f"{article['id']}.json", "w", encoding="utf-8") as f:
            json.dump(article, f, ensure_ascii=False, indent=2)

        # Record only after the file was written, so the list mirrors what is on disk.
        eval_articles.append(article)

        print(f"[{i}/{len(eval_links)}] EVAL:  {article['id'][:50]}")

    except Exception as e:
        eval_failures.append((url, e))
        print(f"EVAL FAILED: {url[:60]}... → {str(e)[:40]}")

    finally:
        # Pause after failures too: a failing server is exactly when
        # back-to-back requests should be avoided.
        time.sleep(SLEEP)

if eval_failures:
    print(f"WARNING: {len(eval_failures)}/{len(eval_links)} eval articles failed")

[1/5] EVAL:  T0443474EN
[2/5] EVAL:  T0450921EN
[3/5] EVAL:  T0451220EN
[4/5] EVAL:  T0452795EN
[5/5] EVAL:  T0452972EN
