# Setup und Konfiguration

In diesem Abschnitt werden alle benötigten Bibliotheken importiert und grundlegende Einstellungen wie Dateipfade, Filterlisten und Konstanten definiert. Die Filterlisten bestimmen, welche Programmiersprachen berücksichtigt oder ausgeschlossen werden.

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import json
from dicttoxml import dicttoxml
from xml.dom.minidom import parseString
from time import sleep
import math
from tqdm import tqdm
import os
from urllib.parse import urlparse

path = "/home/bfh/irsed/daten/ProgLang"
temp_path = "/home/bfh/irsed/daten/ProgLang/temp"
max_text_len=32765
cPP = r'C%2B%2B'

filter_languages = ["Python_(programming_language)", "Go_(programming_language)", "typescript",
                    "JavaScript", "Java_(programming_language)", "Rust_(programming_language)", "Bash_(Unix_shell)", 
                    "C_Sharp_(programming_language)", cPP , "C_(programming_language)",
                    "PHP", "PowerShell", "Kotlin_(programming_language)", "Lua", 
                    "Dart_(programming_language)", "Assembly_language", "Ruby_(programming_language)",
                    "Swift_(programming_language)", "R_(programming_language)", "MATLAB",
                    "Apache_Groovy", "Scala_(programming_language)", "Haskell", "Elixir_(programming_language)",
                    "Zig_(programming_language)"]
print(len(filter_languages))
filter_not_languages =  ["Lingo", "JavaFX_Script", "Join_Java", "ABC_(programming_language)",
                        "Bc_(programming_language)", "LPC_(programming_language)", "SA-C_(programming_language)"
                        "Orc_(programming_language)", "ACC_(programming_language)", "Arc_(programming_language)",
                        "XC_(programming_language)", "TRAC_(programming_language)", "Planner_(programming_language)",
                        "IBM_Basic_assembly_language", "SA-C_(programming_language)", "Clipper_(programming_language)",
                        "Factor_(programming_language)", "Fj%C3%B6lnir_(programming_language)", "Actor_(programming_language)",
                        "Escher_(programming_language)", "SR_(programming_language)", "Euler_(programming_language)"]

25


# Wikipedia-Seite laden und parsen

Hier wird die Wikipedia-Seite mit der Liste der Programmiersprachen abgerufen und mit BeautifulSoup geparst, um die HTML-Struktur für die weitere Verarbeitung vorzubereiten.

[Link](https://en.wikipedia.org/wiki/List_of_programming_languages)

In [2]:
baseUrl = "https://en.wikipedia.org"
# baseUrl = "https://de.wikipedia.org"
progListUrl = f"{baseUrl}/wiki/List_of_programming_languages"

resp = requests.get(progListUrl)
content = resp.text
soup = BeautifulSoup(content, 'lxml')

# Hilfsfunktionen für Link-Filterung

Diese Funktionen helfen dabei, die gefundenen Links nach gewünschten und unerwünschten Programmiersprachen zu filtern.

In [3]:
def contains_any_language(link: str, languages: list[str]) -> bool:
    """Check if link contains any of the given languages."""
    link = link.lower()
    return any(lang.lower() == link for lang in languages)

def create_url_map(links: list[str]) -> dict:
    """Create a map with the last part of the URL as the key and the full URL as the value."""
    return {link.rsplit('/', 1)[-1]: link for link in links}

def filter_programming_links(
    links: list[str],
    include_languages: list[str] = None,
    exclude_languages: list[str] = None
) -> list[str]:
    include_languages = include_languages or []
    exclude_languages = exclude_languages or []

    if not include_languages and not exclude_languages:
        return links
    
    url_map = create_url_map(links)
    keys = list(url_map.keys())

    if include_languages:
        keys = [key for key in keys if contains_any_language(key, include_languages)]

    # Exclude keys based on exclude_languages
    if exclude_languages:
        keys = [key for key in keys if not contains_any_language(key, exclude_languages)]

    return [url_map[key] for key in keys]

# Links zu Programmiersprachen extrahieren und filtern

Hier werden alle relevanten Links zu einzelnen Programmiersprachen aus dem HTML extrahiert und anhand der Filterlisten eingeschränkt.

In [4]:
a_tags = soup.select("div.mw-content-ltr.mw-parser-output li a")

links = [baseUrl + a['href'] for a in a_tags if a.has_attr('href')]
print(f"Found Links {len(links)}")
print(f"All Links {links}")

filtered_links = set(filter_programming_links(
    links=links,
    include_languages=filter_languages,
    exclude_languages=filter_not_languages
)) 
print(f"Filtered Links {len(filtered_links)}")
print(f"Filtered Links {filtered_links}")
links = filtered_links

Found Links 792
All Links ['https://en.wikipedia.org/wiki/List_of_programming_languages_by_type', 'https://en.wikipedia.org/wiki/Timeline_of_programming_languages', 'https://en.wikipedia.org/wiki/Generational_list_of_programming_languages', 'https://en.wikipedia.org/wiki/Template:Programming_language_lists', 'https://en.wikipedia.org/wiki/Template_talk:Programming_language_lists', 'https://en.wikipedia.org/wiki/Special:EditPage/Template:Programming_language_lists', 'https://en.wikipedia.org#0–9', 'https://en.wikipedia.org#A', 'https://en.wikipedia.org#B', 'https://en.wikipedia.org#C', 'https://en.wikipedia.org#D', 'https://en.wikipedia.org#E', 'https://en.wikipedia.org#F', 'https://en.wikipedia.org#G', 'https://en.wikipedia.org#H', 'https://en.wikipedia.org#I', 'https://en.wikipedia.org#J', 'https://en.wikipedia.org#K', 'https://en.wikipedia.org#L', 'https://en.wikipedia.org#M', 'https://en.wikipedia.org#N', 'https://en.wikipedia.org#O', 'https://en.wikipedia.org#P', 'https://en.wikipe

# HTML-Inhalte speichern

Diese Funktion speichert den HTML-Inhalt jeder Programmiersprachen-Seite als Datei im angegebenen Verzeichnis.

In [5]:
def save_html_content(url, html_content, base_path):    
    # Create a filename from the URL
    parsed_url = urlparse(url)
    filename = parsed_url.netloc.replace('.', '_') + parsed_url.path.replace('/', '_')
    if not filename.endswith('.html'):
        filename += '.html'
    
    # Create the full path
    if not os.path.exists(base_path):
        os.makedirs(base_path)
    full_path = os.path.join(base_path, filename)
    
    # Save the content
    try:
        with open(full_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
        # print(f"Saved HTML content to {full_path}")
        return full_path
    except Exception as e:
        print(f"Error saving HTML content: {e}")
        return None

# Alle Programmiersprachen-Seiten herunterladen

Mit dieser Funktion werden alle gefilterten Wikipedia-Seiten zu den Programmiersprachen heruntergeladen und lokal gespeichert.

In [6]:
def fetch_pages(links: list) -> list:
    file_paths = []
    # Estimate total time (2 seconds per request assuming average response time)
    estimated_time = len(links) * 2
    print(f"Estimated time: {math.ceil(estimated_time/60)} minutes")
    
    # Use tqdm for progress bar
    for link in tqdm(links, desc="Fetching pages"):
        try:
            resp = requests.get(link, timeout=10)
            resp.raise_for_status()  # Raise exception for bad status codes
            
            content = resp.text
            # Save content to file instead of creating soup objects
            file_path = save_html_content(link, content, temp_path)
            if file_path:
                file_paths.append((link, file_path))
            
            # Polite delay between requests
            sleep(1)
            
        except requests.RequestException as e:
            print(f"Error fetching {link}: {str(e)}")
            # Try next link instead of failing
            continue
            
    print(f"Successfully fetched and saved {len(file_paths)} out of {len(links)} pages")
    return file_paths

file_paths = fetch_pages(links)

Estimated time: 1 minutes


Fetching pages:   0%|          | 0/25 [00:00<?, ?it/s]

Fetching pages: 100%|██████████| 25/25 [00:28<00:00,  1.16s/it]

Successfully fetched and saved 25 out of 25 pages





# Funktionen zur Extraktion von Seitentiteln und Inhalten

Diese Funktionen extrahieren den Titel, den Hauptinhalt und die Überschriften einer Wikipedia-Seite. Sie helfen dabei, die Struktur und die relevanten Informationen der Seite zu erfassen.

In [7]:
def find_title(soup):
    return soup.select_one("#firstHeading > span")

def find_main_content(soup):
    return soup.select_one("#mw-content-text > div.mw-content-ltr.mw-parser-output")

def extract_heading_data(element):
    """Extract heading text and ID from element"""
    first_child = element.contents[0] if element.contents else None
    if first_child:
        # Clean spans
        for span in first_child.find_all("span"):
            span.unwrap()
        return {
            "id": first_child.get("id", ""),
            "heading": first_child.text.strip()
        }
    return None

def find_all_headings(main_content):
    """Extract all headings with their IDs using list comprehension"""
    return [
        extract_heading_data(element)
        for element in main_content.select(".mw-heading")
        if element.contents and extract_heading_data(element)
    ]

def get_content_until_next_heading(parent_element):
    """Get all content between current element and next heading"""
    content = []
    current = parent_element.find_next_sibling()
    
    while current and 'mw-heading' not in current.get('class', []):
        if current.name != 'style':
            content.append(current)
        current = current.find_next_sibling()    
        
    return content


def get_all_text_from_heading_id(main_content, heading_id):
    target_element = main_content.select_one(f'[id="{heading_id}"]')
    
    if not target_element:
        return None
    
    
    parent_element = target_element.parent
    if not parent_element:
        return None
    
    content_elements = get_content_until_next_heading(parent_element)
    combined_text = ' '.join(element.text.strip() for element in content_elements)
    clean_text = re.sub(r'\[\d{1,3}\]', '', combined_text)
    clean_text = ' '.join(clean_text.split())
    
    return clean_text

def convert_to_valid_field(heading: str):
    return re.sub(r'[^a-z_]', '', 
                 heading
                 .strip()
                 .lower()
                 .replace(' ', '_')
                 .replace('-', '_')
                 .encode("ascii", "ignore")
                 .decode("ascii"))
    
    

# Verarbeitung und Speicherung der extrahierten Inhalte

Hier werden die gespeicherten HTML-Dateien verarbeitet, die relevanten Inhalte extrahiert und als strukturierte JSON-Datei gespeichert.

In [1]:
def process_soups_content(file_paths):
    field_names = set()
    output_file = f"{path}/prog_lang.json"
    
    # Start with an empty array
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("[\n")
    
    file_count = 0
    
    for link, file_path in file_paths:#tqdm(file_paths, desc="Processing files"):
        try:
            # Read the HTML content from the file
            print(f"Processing {file_path}")
            try:
                print(f"Reading file {file_path}")
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except Exception as e:
                print(f"Error reading file {file_path}: {str(e)}")
                continue
            
            # Create soup object from the content
            try:
                soup = BeautifulSoup(content, 'lxml')
                print(f"Parsed HTML from {file_path}")
            except Exception as e:
                print(f"Error parsing HTML in {file_path}: {str(e)}")
                continue
            
            # Find title element
            try:
                title_element = find_title(soup)
                print(f"Found title in {file_path}")
                if not title_element:
                    print(f"No title found in {file_path}, skipping")
                    continue
                title_text = title_element.text.strip()
            except Exception as e:
                print(f"Error extracting title from {file_path}: {str(e)}")
                continue
            
            # Find main content
            try:
                main_content = find_main_content(soup)
                print(f"Found main content in {file_path}")
                if not main_content:
                    print(f"No main content found in {file_path}, skipping")
                    continue
            except Exception as e:
                print(f"Error finding main content in {file_path}: {str(e)}")
                continue
            
            # Create page object
            page = {
                "title": title_text,
                "url": link
            }
            
            # Process headings
            try:
                headings = find_all_headings(main_content)
                print(f"Found {len(headings)} headings in {file_path}")
            except Exception as e:
                print(f"Error finding headings in {file_path}: {str(e)}")
                headings = []
            
            # Track if we found any content
            found_content = False
            
            # Process each heading - add directly to document
            for heading in headings:
                try:
                    if not heading or "id" not in heading:
                        continue
                        
                    text = get_all_text_from_heading_id(main_content, heading["id"])
                    if text:
                        field_name = convert_to_valid_field(heading["heading"])
                        # Add content directly to the document with the field name
                        page[field_name] = text[:max_text_len].replace("\"", "")
                        
                        # Optionally store the original heading if needed
                        # page[f"heading_{field_name}"] = heading["heading"]
                        
                        # Add field to our set of field names
                        field_names.add(field_name)
                        # field_names.add(f"heading_{field_name}")
                        
                        found_content = True
                except Exception as e:
                    print(f"Error processing heading {heading.get('heading', 'unknown')} in {file_path}: {str(e)}")
                    continue
            
            # Only write pages with actual content
            if found_content:
                # Append to the JSON file with proper commas
                print(f"Writing page to {output_file}")
                with open(output_file, 'a', encoding='utf-8') as f:
                    if file_count > 0:
                        print(f"Writing comma before page {file_count}")
                        f.write(",\n")
                    json_data = json.dumps(page, indent=2, ensure_ascii=False)
                    print(f"Writing page {json_data}")
                    f.write(json_data)
                file_count += 1
            else:
                print(f"No usable content found in {file_path}, skipping")
            
        except Exception as e:
            print(f"Unexpected error processing {file_path}: {str(e)}")
            continue
    
    # Close the JSON array
    with open(output_file, 'a', encoding='utf-8') as f:
        f.write("\n]")
    
    print(f"Successfully processed {file_count} files and wrote them to {output_file}")
    return field_names

# Save the field names instead of pages
field_list = process_soups_content(file_paths)

NameError: name 'file_paths' is not defined

# (Optional) Felder extrahieren und als CSV speichern

Dieser Abschnitt extrahiert die Feldnamen aus den gesammelten Daten und speichert sie als CSV-Datei.

In [None]:
property = "filed"
filed_list = set([item[property] for article in pages for item in article["data"] if property in item]) 
print(filed_list)
        
with open(f"{path}/prog_lang_fields.csv", 'w', encoding='utf-8') as f:
    f.write(",\n".join(filed_list))

{'bibliography', 'block_expressions_and_control_flow', 'external_links', 'function_syntax', 'rustfmt', 'history', 'controversy', 'union_types', 'modules_and_namespaces', 'lexical_closure', 'macros', 'integration_with_build_automation_tools', 'mozilla_sponsorship_', 'promises', 'declaration_files', 'traits', 'javafx_application', 'syntax_and_semantics', 'codedom_provider', 'misplaced_trust_in_the_client', 'pointers', 'libraries_and_frameworks', 'javascript_engine', 'reference_implementation', 'linting_tools', 'design_philosophy_and_features', 'syntax', 'book_sources', 'weakly_typed', 'while_loops', 'libraries', 'cross_site_request_forgery', 'object_composition_and_inheritance', 'api_documentation_generators', 'syntax_and_features', 'creation_at_netscape', 'cargo', 'servlet', 'swing_application', 'clippy', 'criticism', 'others', 'declarative_macros', 'trademark', 'promises_and_asyncawait', 'see_also', 'array_and_object_literals', 'editions', 'type_annotations', 'automatic_memory_manageme

# (Optional) JSON-Ausgabe als XML speichern

Hier kann die gesammelte JSON-Ausgabe optional in das XML-Format konvertiert und gespeichert werden.

In [None]:
# with open(f"{path}/prog_lang.json", 'w', encoding='utf-8') as f:
#     f.write(json.dumps(pages, indent=2, ensure_ascii=False))

In [None]:
# def json_to_xml(json_data):
#     # Convert to XML
#     xml = dicttoxml(json_data, custom_root='root', attr_type=False)
    
#     # Pretty print the XML
#     dom = parseString(xml)
#     return dom.toprettyxml()

# xml_string = json_to_xml(pages)
# with open(f"{path}/prog_lang.xml", 'w', encoding='utf-8') as f:
#     f.write(xml_string)