In [2]:
from bs4 import BeautifulSoup
import requests
import re
import json
from tqdm import tqdm
from time import sleep
from typing import List, Dict, Any

WHITESPACE_RE       = re.compile(r"\s+")


def extract_infobox_from_html(html: str) -> Dict[str, str]:
    """
    Collect the label/value pairs of the first infobox table found in ``html``.

    Two row layouts are recognised:

    * a header row (``<th class="infobox-header">``) whose value sits in the
      following ``<tr>``, inside a ``<td class="infobox-full-data">`` or,
      failing that, the first ``<td>`` of that following row;
    * a regular row holding a ``<th scope="row">`` (or a ``<th>`` with an
      "infobox-label" class) next to a ``<td>`` with an "infobox-data" class
      or, failing that, the first ``<td>`` of the row.

    ``<sup>`` elements (footnote markers) are removed from a value cell before
    its text is read, and runs of whitespace are collapsed to single spaces.

    Returns a dict mapping label text to value text. The dict is empty when no
    ``<table>`` whose class contains "infobox" exists. A label that occurs
    more than once keeps the value that was seen last.
    """
    def has_class(fragment: str):
        # Predicate for BeautifulSoup's ``class_`` filter: substring match.
        return lambda css: css and fragment in css

    def cell_text(cell) -> str:
        # Strip <sup> nodes (this mutates the parsed tree), then flatten the
        # remaining content to single-spaced text.
        for marker in cell.find_all("sup"):
            marker.decompose()
        return WHITESPACE_RE.sub(" ", cell.get_text(" ", strip=True)).strip()

    table = BeautifulSoup(html, "html.parser").find("table", class_=has_class("infobox"))
    if not table:
        return {}

    fields: Dict[str, str] = {}
    table_rows = table.find_all("tr")
    last_index = len(table_rows) - 1

    # Rows are visited by index so a header row can peek at its successor.
    for index, current in enumerate(table_rows):
        header_cell = current.find("th", class_=has_class("infobox-header"))
        if header_cell:
            label = header_cell.get_text(" ", strip=True)
            if index < last_index:
                following = table_rows[index + 1]
                value_cell = (
                    following.find("td", class_=has_class("infobox-full-data"))
                    or following.find("td")
                )
                if value_cell:
                    fields[label] = cell_text(value_cell)
            # A header row is never read as a label/value row itself; the
            # following row is still visited on the next iteration.
            continue

        label_cell = (
            current.find("th", scope="row")
            or current.find("th", class_=has_class("infobox-label"))
        )
        value_cell = (
            current.find("td", class_=has_class("infobox-data"))
            or current.find("td")
        )
        if label_cell and value_cell:
            label = label_cell.get_text(" ", strip=True)
            fields[label] = cell_text(value_cell)

    return fields

def load_html(url: str) -> str:
    """
    Fetch ``url`` and return the response body as text.

    Waits half a second before every request to go easy on the server. Any
    ``requests.RequestException`` (including HTTP error statuses raised by
    ``raise_for_status``) is reported on stdout and results in an empty string.
    """
    html = ""
    try:
        sleep(0.5)  # throttle: short pause before each request
        reply = requests.get(url, timeout=20)
        reply.raise_for_status()
        html = reply.text
    except requests.RequestException as err:
        print(f"[ERROR] Could not load {url}: {err}")
    return html

In [3]:
# Read the source records and scrape one infobox per record URL.
with open("/home/bfh/irsed/daten/ProgLang/24/prog_lang.json", "r", encoding="utf-8") as f:
    original_records = json.load(f)

urls = [record.get("url") for record in original_records]

infoboxes = []

for url in urls:
    html_content = load_html(url)
    if not html_content:
        # load_html returns "" on any request failure.
        print(f"Failed to retrieve or parse HTML for {url}")
        continue

    infobox = extract_infobox_from_html(html_content)

    # The scraped MATLAB infobox is replaced wholesale by hand-entered values
    # (already using the normalised snake_case keys).
    if url == "https://en.wikipedia.org/wiki/MATLAB":
        infobox = {
            "paradigm": "multi-paradigm: functional, imperative, procedural, object-oriented, array",
            "first_appeared": "late 1970s",
            "stable_release": "R2024b[2] Edit this on Wikidata / September 12, 2024; 8 months ago",
            "typing_discipline": "dynamic, weak",
            "influenced": "Julia, Octave, Scilab",
            "influenced_by": "APL, EISPACK, Fortran, LINPACK, PL/0, Speakeasy",
        }

    infoboxes.append({"url": url, "infobox": infobox})
    print(f"Infobox for {url}: {infobox}")


Infobox for https://en.wikipedia.org/wiki/Elixir_(programming_language): {'Paradigms': 'multi-paradigm : functional , concurrent , distributed , process-oriented', 'Designed\xa0by': 'José Valim', 'First\xa0appeared': '2012 ; 13 years ago ( 2012 )', 'Stable release': '1.18.4 / 21 May 2025 ; 11 days ago ( 21 May 2025 )', 'Typing discipline': 'dynamic , strong', 'Platform': 'Erlang', 'License': 'Apache License 2.0', 'Filename extensions': '.ex, .exs', 'Website': 'elixir-lang .org', 'Influenced by': 'Clojure , Erlang , Ruby', 'Influenced': 'Gleam , LFE'}
Infobox for https://en.wikipedia.org/wiki/MATLAB: {'paradigm': 'multi-paradigm: functional, imperative, procedural, object-oriented, array', 'first_appeared': 'late 1970s', 'stable_release': 'R2024b[2] Edit this on Wikidata / September 12, 2024; 8 months ago', 'typing_discipline': 'dynamic, weak', 'influenced': 'Julia, Octave, Scilab', 'influenced_by': 'APL, EISPACK, Fortran, LINPACK, PL/0, Speakeasy'}
Infobox for https://en.wikipedia.org/

In [4]:
# Key aliases applied after normalisation (normalised name -> canonical name).
rewrite_key = {
    "paradigms": "paradigm"
}

# Normalise every infobox label: lower-case it, turn each whitespace run
# into a single underscore, then apply the alias table above.
for entry in infoboxes:
    normalized = {}
    for label, text in entry["infobox"].items():
        slug = re.sub(r"\s+", "_", label.lower())
        normalized[rewrite_key.get(slug, slug)] = text
    entry["infobox"] = normalized

for entry in infoboxes:
    print(entry["infobox"])


{'paradigm': 'multi-paradigm : functional , concurrent , distributed , process-oriented', 'designed_by': 'José Valim', 'first_appeared': '2012 ; 13 years ago ( 2012 )', 'stable_release': '1.18.4 / 21 May 2025 ; 11 days ago ( 21 May 2025 )', 'typing_discipline': 'dynamic , strong', 'platform': 'Erlang', 'license': 'Apache License 2.0', 'filename_extensions': '.ex, .exs', 'website': 'elixir-lang .org', 'influenced_by': 'Clojure , Erlang , Ruby', 'influenced': 'Gleam , LFE'}
{'paradigm': 'multi-paradigm: functional, imperative, procedural, object-oriented, array', 'first_appeared': 'late 1970s', 'stable_release': 'R2024b[2] Edit this on Wikidata / September 12, 2024; 8 months ago', 'typing_discipline': 'dynamic, weak', 'influenced': 'Julia, Octave, Scilab', 'influenced_by': 'APL, EISPACK, Fortran, LINPACK, PL/0, Speakeasy'}
{'paradigm': 'Concurrent functional generic imperative structured', 'developer': 'The Rust Team', 'first_appeared': 'January 19, 2012 ; 13 years ago ( 2012-01-19 )', '

In [5]:
# Tally in how many infoboxes each (normalised) key occurs.
key_count = {}

for entry in infoboxes:
    for field in entry["infobox"]:
        key_count[field] = key_count.get(field, 0) + 1

print("\nKey counts:")
for field, occurrences in key_count.items():
    print(f"{field}: {occurrences}")


Key counts:
paradigm: 24
designed_by: 21
first_appeared: 24
stable_release: 23
typing_discipline: 24
platform: 10
license: 17
filename_extensions: 23
website: 22
influenced_by: 23
influenced: 20
developer: 17
implementation_language: 8
os: 14
major_implementations: 12
dialects: 5
memory_management: 6
preview_release: 6
family: 3
scope: 2


In [6]:
import re
from datetime import datetime

def parse_date_for_solr(date_str):
    """
    Convert a free-text date into the Solr-compatible ISO-8601 form
    ``YYYY-MM-DDT00:00:00Z``.

    The text may contain surrounding noise; it is searched for the following
    formats, in order of preference:

    1. day, month name, year      e.g. "21 May 2025"
    2. month name, day, year      e.g. "May 21, 2025" or "May 21 2025"
    3. numeric ISO date           e.g. "2025-05-21"
    4. a bare year 19xx / 20xx    -> January 1st of that year

    Month names are matched case-insensitively, either in full or as the
    three-letter abbreviation (plus "Sept"). For each format every occurrence
    in the text is tried in turn, so a candidate with an unknown month word or
    an impossible calendar date does not hide a valid date further along.

    Args:
        date_str: Text to search. ``None`` or a blank string yields "".

    Returns:
        The formatted timestamp string, or "" when nothing parseable is found.
    """
    if not date_str or not date_str.strip():
        return ""

    # Lower-cased month words -> month number (full names and abbreviations).
    full_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    month_numbers = {}
    for number, name in enumerate(full_names, start=1):
        month_numbers[name.lower()] = number
        month_numbers[name[:3].lower()] = number
    month_numbers["sept"] = 9

    def to_solr(year, month, day):
        # Returns the Solr timestamp, or None for an impossible calendar date
        # (e.g. 31 February), so the caller can keep looking.
        try:
            moment = datetime(int(year), int(month), int(day))
        except ValueError:
            return None
        return moment.strftime("%Y-%m-%dT00:00:00Z")

    # Format 1: "21 May 2025"
    for match in re.finditer(r'(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})', date_str):
        day, month_word, year = match.groups()
        month = month_numbers.get(month_word.lower())
        if month:
            stamp = to_solr(year, month, day)
            if stamp:
                return stamp

    # Format 2: "May 21, 2025" or "May 21 2025"
    for match in re.finditer(r'([A-Za-z]+)\s+(\d{1,2})(?:[,\s]+)(\d{4})', date_str):
        month_word, day, year = match.groups()
        month = month_numbers.get(month_word.lower())
        if month:
            stamp = to_solr(year, month, day)
            if stamp:
                return stamp

    # Format 3: "2025-05-21" (ISO date without a time part)
    for match in re.finditer(r'(\d{4})-(\d{2})-(\d{2})', date_str):
        year, month, day = match.groups()
        stamp = to_solr(year, month, day)
        if stamp:
            return stamp

    # Format 4: a lone year; only the year is known, so default to January 1st.
    match = re.search(r'\b(?:19|20)\d{2}\b', date_str)
    if match:
        return f"{match.group(0)}-01-01T00:00:00Z"

    # Nothing recognisable in the text.
    return ""

def split_stable_release(stable_release_str):
    """
    Split an infobox "stable release" value into ``(version, date)``.

    The value is cut at " / " separators: the first piece is the version, the
    second piece (if any) is the date, truncated at its first ";". Both parts
    are stripped of surrounding whitespace.

    Example: "1.18.4 / 21 May 2025 ; 11 days ago ( 21 May 2025 )"
             -> ("1.18.4", "21 May 2025")

    An empty or ``None`` input gives ``("", "")``; a value without " / " gives
    the whole stripped value as version and "" as date.
    """
    if not stable_release_str:
        return "", ""

    # Only the first two pieces are ever used, so stop splitting after them.
    pieces = stable_release_str.split(" / ", 2)
    version_text = pieces[0].strip()
    date_text = pieces[1].partition(";")[0].strip() if len(pieces) > 1 else ""
    return version_text, date_text

# Split every scraped "stable_release" value into a bare version string
# (kept under "stable_release") and a Solr-formatted "stable_release_date".
for entry in infoboxes:
    fields = entry["infobox"]

    # Keep the cell idempotent: once an entry has been split, its
    # "stable_release" holds only the version, so splitting it again would
    # find no date and overwrite the real date with the placeholder below.
    if "stable_release_date" in fields:
        continue

    version, release_date = split_stable_release(fields.get("stable_release", ""))
    if release_date == "":
        # Placeholder used when no release date could be split off.
        # NOTE(review): the choice of 1947 is not explained in this notebook.
        release_date = "1 January 1947"

    fields["stable_release"] = version
    fields["stable_release_date"] = parse_date_for_solr(release_date)

In [12]:
# Hand-maintained "influenced_by" values, keyed by article URL, that replace
# whatever was scraped for those articles.
influenced_by_corrections = {
    "https://en.wikipedia.org/wiki/Rust_(programming_language)": 
        "Alef, BETA, CLU, C#, C++, Cyclone, Elm, Erlang, Haskell, Hermes, Limbo, Mesa, Napier, Newsqueak, NIL, OCaml, Ruby, Sather, Scheme, Standard ML, Swift",
    "https://en.wikipedia.org/wiki/R_(programming_language)": 
        "Lisp, S, Scheme",
    "https://en.wikipedia.org/wiki/Kotlin_(programming_language)": 
        "C#, Eiffel, Gosu, Groovy, Java, JavaScript, ML, Python, Scala"
}

for entry in infoboxes:
    replacement = influenced_by_corrections.get(entry["url"])
    if replacement is not None:
        entry["infobox"]["influenced_by"] = replacement
    
        

In [13]:
# Keep only the fields that are carried over into the enriched records;
# a None value is replaced by an empty string.
filter_keys = {"paradigm", "first_appeared", "stable_release", "typing_discipline", "influenced_by", "influenced", "stable_release_date"}

filtered_infoboxes = [
    {
        "url": entry["url"],
        "infobox": {
            name: ("" if text is None else text)
            for name, text in entry["infobox"].items()
            if name in filter_keys
        },
    }
    for entry in infoboxes
]

print(len(filtered_infoboxes), "filtered infoboxes:")

for entry in filtered_infoboxes:
    print(entry)

for entry in filtered_infoboxes:
    print(f"url: {entry['url']}, influenced_by: {entry['infobox'].get('influenced_by')}")

# print("\n")  
    
# for entry in filtered_infoboxes:
#     print(f"url: {entry['url']}, influenced: {entry['infobox'].get('influenced')}")

24 filtered infoboxes:
{'url': 'https://en.wikipedia.org/wiki/Elixir_(programming_language)', 'infobox': {'paradigm': 'multi-paradigm : functional , concurrent , distributed , process-oriented', 'first_appeared': '2012 ; 13 years ago ( 2012 )', 'stable_release': '1.18.4', 'typing_discipline': 'dynamic , strong', 'influenced_by': 'Clojure , Erlang , Ruby', 'influenced': 'Gleam , LFE', 'stable_release_date': '2025-05-21T00:00:00Z'}}
{'url': 'https://en.wikipedia.org/wiki/MATLAB', 'infobox': {'paradigm': 'multi-paradigm: functional, imperative, procedural, object-oriented, array', 'first_appeared': 'late 1970s', 'stable_release': 'R2024b[2] Edit this on Wikidata', 'typing_discipline': 'dynamic, weak', 'influenced': 'Julia, Octave, Scilab', 'influenced_by': 'APL, EISPACK, Fortran, LINPACK, PL/0, Speakeasy', 'stable_release_date': '2024-09-12T00:00:00Z'}}
{'url': 'https://en.wikipedia.org/wiki/Rust_(programming_language)', 'infobox': {'paradigm': 'Concurrent functional generic imperative st

In [14]:
# Reload the untouched source records, copy the filtered infobox fields of
# the first infobox with the same URL onto each record, and write the result.
with open("/home/bfh/irsed/daten/ProgLang/24/prog_lang.json", "r", encoding="utf-8") as f:
    original_records = json.load(f)

for record in original_records:
    url = record.get("url")
    match = next((item for item in filtered_infoboxes if item["url"] == url), None)
    if match is not None:
        # Infobox fields overwrite same-named keys already on the record.
        record.update(match["infobox"])

print(original_records[0])

with open("/home/bfh/irsed/daten/ProgLang/prog_lang_enriched.json", "w", encoding="utf-8") as f:
    json.dump(original_records, f, ensure_ascii=False, indent=2)

{'title': 'Elixir (programming language)', 'url': 'https://en.wikipedia.org/wiki/Elixir_(programming_language)', 'history': "José Valim created the Elixir programming language as a research and development project at Plataformatec. His goals were to enable higher extensibility and productivity in the Erlang VM while maintaining compatibility with Erlang's ecosystem. Elixir is aimed at large-scale sites and apps. It uses features of Ruby, Erlang, and Clojure to develop a high-concurrency and low-latency language. It was designed to handle large data volumes. Elixir is also used in telecommunications, e-commerce, and finance. In 2021, the Numerical Elixir effort was announced with the goal of bringing machine learning, neural networks, GPU compilation, data processing, and computational notebooks to the Elixir ecosystem.", 'versioning': 'Each of the minor versions supports a specific range of Erlang/OTP versions. The current stable release version is 1.18.3 .', 'features': "Compiles to b