In [13]:
from bs4 import BeautifulSoup
import requests
import re
import json
from tqdm import tqdm
from time import sleep
from typing import List, Dict, Any

# Pre-compiled patterns used by the raw-text fallback in
# extract_paradigms_from_html().

# Any bracketed span such as "[1]" or "[citation needed]" (non-greedy, so
# each pair of brackets is matched separately).
CITATION_RE = re.compile(r"\[.+?\]")
# A leading "Multi-paradigm" label, optionally followed by a colon;
# matched case-insensitively and only at the start of the text.
MULTI_PARADIGM_RE = re.compile(r"^\s*Multi-paradigm\s*:?", flags=re.I)
# One or more consecutive separators between paradigm names.
SPLIT_RE = re.compile(r"[,;\u2022\u00B7\n]+")  # comma, semicolon, bullet, middot, newline
# Any run of whitespace; substituted with a single space to normalise names.
WHITESPACE_RE = re.compile(r"\s+")

def extract_paradigms_from_html(html: str) -> List[str]:
    """Parse *html* and list the paradigms named in its infobox.

    The function looks for the first ``<table>`` whose class contains
    ``infobox``, then the first ``<th>`` in it whose text contains
    "paradigm" (case-insensitive), and reads the next ``<td>`` that
    follows that header in document order.

    Returns:
        The paradigm names in order of first appearance, de-duplicated
        case-insensitively (the first spelling wins).  An empty list is
        returned when the infobox, the header or the cell is missing.
    """

    def _is_infobox_class(css_class):
        # Truthiness check first: the value may be None for class-less tables.
        return bool(css_class) and "infobox" in css_class

    def _mentions_paradigm(text):
        return isinstance(text, str) and "paradigm" in text.lower()

    document = BeautifulSoup(html, "html.parser")

    table = document.find("table", class_=_is_infobox_class)
    if table is None:
        return []

    header = table.find("th", string=_mentions_paradigm)
    if header is None:
        return []

    cell = header.find_next("td")
    if cell is None:
        return []

    # Footnote markers live in <sup> elements; drop them before reading text.
    for footnote in cell.find_all("sup"):
        footnote.decompose()

    # First choice: the link texts inside the cell.
    names = []
    for link in cell.select("a[href]"):
        label = link.get_text(" ", strip=True)
        if label:
            names.append(label)

    # Second choice: no usable links, so split the cell's plain text instead.
    if not names:
        text = cell.get_text(" ", strip=True)
        text = CITATION_RE.sub("", text)
        text = MULTI_PARADIGM_RE.sub("", text)
        for chunk in SPLIT_RE.split(text):
            if chunk.strip():
                names.append(WHITESPACE_RE.sub(" ", chunk).strip())

    # Dicts keep insertion order, so setdefault() retains the first spelling
    # of every case-insensitive key in its original position.
    first_seen = {}
    for name in names:
        first_seen.setdefault(name.lower(), name)
    return list(first_seen.values())


def process_records(records: List[Dict[str, Any]], delay: float = 0.5) -> List[Dict[str, Any]]:
    """Enrich every record with a `paradigm` key and return the new list.

    For each record the page at ``rec["url"]`` is downloaded and passed to
    ``extract_paradigms_from_html``.  The input records are not modified;
    each output entry is a shallow copy with one extra key.

    Args:
        records: Input records; a record's ``"url"`` entry (if any) is the
            page to scrape.
        delay: Seconds to pause after each HTTP request.

    Returns:
        A new list of records.  ``"paradigm"`` is always a list of strings:
        the extracted paradigms, or ``["TODO: add manually"]`` when the
        record has no URL, the request fails, or nothing could be extracted.
    """

    placeholder = "TODO: add manually"
    enriched = []
    for rec in tqdm(records, desc="Scraping", unit="page"):
        url = rec.get("url")
        paradigms: List[str] = []
        if url:
            try:
                r = requests.get(url, timeout=20)
                r.raise_for_status()
                # Keep the list as-is: joining it into one string would make
                # the field's type differ from the placeholder branches and
                # blur the boundaries of multi-word paradigm names.
                paradigms = extract_paradigms_from_html(r.text)
            except Exception as e:
                # Deliberately broad: scraping is best-effort, one bad page
                # must not abort the whole run.
                print(f"[WARN] {url}: {e}")
            # Throttle only when a request was actually attempted.
            sleep(delay)
        new_rec = dict(rec)  # shallow copy keeps all original keys
        # A fresh placeholder list per record avoids sharing one mutable list.
        new_rec["paradigm"] = paradigms or [placeholder]
        enriched.append(new_rec)
    return enriched


# Load the source records that are going to be enriched.
with open("/home/bfh/irsed/daten/ProgLang/24/prog_lang.json", mode="r", encoding="utf-8") as f:
    original_records = json.loads(f.read())

# Scrape the paradigm information for every record.
enriched = process_records(original_records)

# Serialise the enriched records as indented, non-ASCII-escaped JSON.
with open("/home/bfh/irsed/daten/ProgLang/out.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(enriched, ensure_ascii=False, indent=2))

# Report how many records were written.
print(f"Written {len(enriched)} records to /home/bfh/irsed/daten/ProgLang")

Scraping: 100%|██████████| 24/24 [00:22<00:00,  1.07page/s]

Written 24 records to /home/bfh/irsed/daten/ProgLang



