In [23]:
from bs4 import BeautifulSoup
import requests
import re
import json
from tqdm import tqdm
from time import sleep
from typing import List, Dict, Any

# Pre-compiled patterns for cleaning up scraped infobox text.
# NOTE(review): in the code visible here only WHITESPACE_RE is actually used; the other
# three look intended for paradigm parsing that is not implemented in this section.

# Shortest possible bracketed span, e.g. a citation marker such as "[2]".
CITATION_RE         = re.compile(r"\[.+?\]")
# Leading "Multi-paradigm" prefix with an optional colon, matched case-insensitively.
MULTI_PARADIGM_RE   = re.compile(r"^\s*Multi-paradigm\s*:?", flags=re.I)
# One or more list separators in a row.
SPLIT_RE            = re.compile(r"[,;\u2022\u00B7\n]+")  # comma, semicolon, bullet, middot, newline
# Any run of whitespace characters.
WHITESPACE_RE       = re.compile(r"\s+")


def extract_infobox_from_html(html: str) -> Dict[str, str]:
    """
    Extract the label/value pairs of the first infobox table found in ``html``.

    The first ``<table>`` whose class contains "infobox" is scanned row by row.
    For each ``<tr>`` the label is taken from ``<th scope="row">`` (falling back to
    a ``<th>`` whose class contains "infobox-label") and the value from a ``<td>``
    whose class contains "infobox-data" (falling back to the first ``<td>``).
    Rows lacking either cell are skipped.

    Returns a dict mapping label text to value text. ``<sup>`` elements are removed
    from the value cell and whitespace runs in the value are collapsed to single
    spaces. Returns an empty dict when no infobox table is present. If a label
    occurs more than once, the last row wins.
    """

    def class_contains(fragment: str):
        # Matcher for BeautifulSoup's class_ filter: truthy when `fragment` occurs in the class.
        return lambda css_class: css_class and fragment in css_class

    document = BeautifulSoup(html, "html.parser")
    table = document.find("table", class_=class_contains("infobox"))
    if not table:
        return {}

    fields: Dict[str, str] = {}
    for tr in table.find_all("tr"):
        # Preferred selector first, looser fallback second.
        label_cell = tr.find("th", scope="row") or tr.find(
            "th", class_=class_contains("infobox-label")
        )
        value_cell = tr.find("td", class_=class_contains("infobox-data")) or tr.find("td")

        if not (label_cell and value_cell):
            continue

        label = label_cell.get_text(" ", strip=True)

        # Drop <sup> elements (footnote markers) before reading the value text.
        for footnote in value_cell.find_all("sup"):
            footnote.decompose()

        text = value_cell.get_text(" ", strip=True)
        fields[label] = WHITESPACE_RE.sub(" ", text).strip()

    return fields


def process_records(records: List[Dict[str, Any]], delay: float = 0.5) -> List[Dict[str, Any]]:
    """
    Enrich each record with the infobox scraped from the page at its "url".

    For every record a shallow copy is made and two keys are added:

    * ``"infobox"``: the dict returned by ``extract_infobox_from_html`` for the
      downloaded page, or ``{}`` when the record has no URL or the download or
      parsing failed (a ``[WARN]`` line is printed in the failure case).
    * ``"paradigm_list"``: currently always ``[]``; no paradigm extraction is
      implemented here, the key is kept so the output shape stays stable.

    Args:
        records: Input records; the "url" key is read with ``.get`` and may be absent.
        delay: Seconds to sleep after each attempted request.

    Returns:
        A new list with one enriched copy per input record, in input order.
        The input records are not modified.
    """
    enriched: List[Dict[str, Any]] = []
    for rec in tqdm(records, desc="Scraping", unit="page"):
        url = rec.get("url")
        # Bind the defaults up front. The previous version only *annotated* `paradigms`
        # and then read it inside the try block, which raised UnboundLocalError on the
        # first record with a URL; the broad except below swallowed it and discarded
        # that record's infobox.
        infobox_fields: Dict[str, str] = {}
        paradigms: List[str] = []

        if url:
            try:
                r = requests.get(url, timeout=20)
                r.raise_for_status()
                infobox_fields = extract_infobox_from_html(r.text)
            except Exception as e:
                # Deliberately best-effort: one bad page must not abort the whole run.
                print(f"[WARN] {url}: {e}")
                infobox_fields = {}
            # Pause only when a request was actually attempted.
            sleep(delay)

        # Shallow copy so the caller's record is left untouched.
        new_rec = dict(rec)
        new_rec["infobox"] = infobox_fields
        new_rec["paradigm_list"] = paradigms

        enriched.append(new_rec)

    return enriched


# Script entry point. Only the start-up message is active; the batch example below is
# commented out.
if __name__ == "__main__":
    print("Starting scraping...")
    # # Example: JSON file with [{"name": "...", "url": "..."}] records
    # with open("/home/bfh/irsed/daten/ProgLang/24/prog_lang.json", "r", encoding="utf-8") as f:
    #     original_records = json.load(f)

    # enriched = process_records(original_records)

    # # Write the output file
    # out_path = "/home/bfh/irsed/daten/ProgLang/out.json"
    # with open(out_path, "w", encoding="utf-8") as f:
    #     json.dump(enriched, f, ensure_ascii=False, indent=2)

    # print(f"Written {len(enriched)} records to {out_path}")


Starting scraping...


In [24]:
# Load the input records from disk.
# NOTE(review): the rest of this cell iterates over the result and calls .get("url") on
# each item, so the file is presumably a JSON list of objects — confirm against the data.
with open("/home/bfh/irsed/daten/ProgLang/24/prog_lang.json", "r", encoding="utf-8") as f:
    original_records = json.load(f)

def load_html(url: str, delay: float = 0.5, timeout: float = 20) -> str:
    """
    Download the page at ``url`` and return its HTML text.

    Args:
        url: Address to fetch. A missing or empty value is reported and yields ""
            without sleeping or issuing a request.
        delay: Seconds to wait before the request, so the server is not hammered.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The response body as text, or an empty string when the URL is missing or
        the request fails (an ``[ERROR]`` line is printed in both cases).
    """
    if not url:
        # Records without a URL used to cost a full delay plus a doomed request.
        print(f"[ERROR] Could not load {url}: no URL given")
        return ""

    sleep(delay)  # short pause to avoid overloading the server
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"[ERROR] Could not load {url}: {e}")
        return ""  # empty string signals failure to the caller
    
    
# One URL per input record (None where a record has no "url" key).
urls = list(map(lambda rec: rec.get("url"), original_records))

infoboxes = []

for url in urls:
    html_content = load_html(url)
    if not html_content:
        # load_html returns "" on failure.
        print(f"Failed to retrieve or parse HTML for {url}")
        continue

    infobox = extract_infobox_from_html(html_content)

    # The MATLAB page is replaced by hand-entered values instead of the scraped result.
    if url == "https://en.wikipedia.org/wiki/MATLAB":
        infobox = {
            "paradigms": "multi-paradigm: functional, imperative, procedural, object-oriented, array",
            "first_appeared": "late 1970s",
            "stable_release": "R2024b[2] Edit this on Wikidata / September 12, 2024; 8 months ago",
            "typing_discipline": "dynamic, weak"
        }

    infoboxes.append({"url": url, "infobox": infobox})
    print(f"Infobox for {url}: {infobox}")


Infobox for https://en.wikipedia.org/wiki/Elixir_(programming_language): {'Paradigms': 'multi-paradigm : functional , concurrent , distributed , process-oriented', 'Designed\xa0by': 'José Valim', 'First\xa0appeared': '2012 ; 13 years ago ( 2012 )', 'Stable release': '1.18.4 / 21 May 2025 ; 11 days ago ( 21 May 2025 )', 'Typing discipline': 'dynamic , strong', 'Platform': 'Erlang', 'License': 'Apache License 2.0', 'Filename extensions': '.ex, .exs', 'Website': 'elixir-lang .org'}
Infobox for https://en.wikipedia.org/wiki/MATLAB: {'paradigms': 'multi-paradigm: functional, imperative, procedural, object-oriented, array', 'first_appeared': 'late 1970s', 'stable_release': 'R2024b[2] Edit this on Wikidata / September 12, 2024; 8 months ago', 'typing_discipline': 'dynamic, weak'}
Infobox for https://en.wikipedia.org/wiki/Rust_(programming_language): {'Paradigms': 'Concurrent functional generic imperative structured', 'Developer': 'The Rust Team', 'First\xa0appeared': 'January 19, 2012 ; 13 ye

In [28]:
# Renames applied after normalisation, so differently spelled labels share one key.
rewrite_key = {"paradigm": "paradigms"}

# Normalise every infobox key: lower-case, whitespace runs -> "_", then apply renames.
# Each entry's "infobox" dict is replaced by the normalised copy.
for entry in infoboxes:
    normalised = {}
    for raw_key, raw_value in entry["infobox"].items():
        snake_key = re.sub(r"\s+", "_", raw_key.lower())
        normalised[rewrite_key.get(snake_key, snake_key)] = raw_value
    entry["infobox"] = normalised

for entry in infoboxes:
    print(entry["infobox"])


{'paradigms': 'multi-paradigm : functional , concurrent , distributed , process-oriented', 'designed_by': 'José Valim', 'first_appeared': '2012 ; 13 years ago ( 2012 )', 'stable_release': '1.18.4 / 21 May 2025 ; 11 days ago ( 21 May 2025 )', 'typing_discipline': 'dynamic , strong', 'platform': 'Erlang', 'license': 'Apache License 2.0', 'filename_extensions': '.ex, .exs', 'website': 'elixir-lang .org'}
{'paradigms': 'multi-paradigm: functional, imperative, procedural, object-oriented, array', 'first_appeared': 'late 1970s', 'stable_release': 'R2024b[2] Edit this on Wikidata / September 12, 2024; 8 months ago', 'typing_discipline': 'dynamic, weak'}
{'paradigms': 'Concurrent functional generic imperative structured', 'developer': 'The Rust Team', 'first_appeared': 'January 19, 2012 ; 13 years ago ( 2012-01-19 )', 'stable_release': '1.87.0 / May 15, 2025 ; 17 days ago ( May 15, 2025 )', 'typing_discipline': 'Affine inferred nominal static strong', 'implementation_language': 'OCaml (2006–20

In [29]:
# Tally in how many infoboxes each field name occurs.
key_count = {}

for infobox in infoboxes:
    for key in infobox["infobox"]:
        key_count[key] = key_count.get(key, 0) + 1

print("\nKey counts:")
for key, count in key_count.items():
    print(f"{key}: {count}")


Key counts:
paradigms: 24
designed_by: 21
first_appeared: 24
stable_release: 23
typing_discipline: 24
platform: 10
license: 17
filename_extensions: 23
website: 22
developer: 17
implementation_language: 8
os: 14
memory_management: 6
preview_release: 6
family: 3
scope: 2


In [27]:
# Fields to keep in the reduced infoboxes.
filter_keys = {"paradigms", "first_appeared", "stable_release", "typing_discipline"}

# Per entry: same URL, infobox restricted to the keys above.
filtered_infoboxes = [
    {
        "url": entry["url"],
        "infobox": {
            field: text
            for field, text in entry["infobox"].items()
            if field in filter_keys
        },
    }
    for entry in infoboxes
]

print(len(filtered_infoboxes), "filtered infoboxes:")

for entry in filtered_infoboxes:
    print(entry)

24 filtered infoboxes:
{'url': 'https://en.wikipedia.org/wiki/Elixir_(programming_language)', 'infobox': {'paradigms': 'multi-paradigm : functional , concurrent , distributed , process-oriented', 'first_appeared': '2012 ; 13 years ago ( 2012 )', 'stable_release': '1.18.4 / 21 May 2025 ; 11 days ago ( 21 May 2025 )', 'typing_discipline': 'dynamic , strong'}}
{'url': 'https://en.wikipedia.org/wiki/MATLAB', 'infobox': {'paradigms': 'multi-paradigm: functional, imperative, procedural, object-oriented, array', 'first_appeared': 'late 1970s', 'stable_release': 'R2024b[2] Edit this on Wikidata / September 12, 2024; 8 months ago', 'typing_discipline': 'dynamic, weak'}}
{'url': 'https://en.wikipedia.org/wiki/Rust_(programming_language)', 'infobox': {'paradigms': 'Concurrent functional generic imperative structured', 'first_appeared': 'January 19, 2012 ; 13 years ago ( 2012-01-19 )', 'stable_release': '1.87.0 / May 15, 2025 ; 17 days ago ( May 15, 2025 )', 'typing_discipline': 'Affine inferred n