# Parse sitemap

Copyright 2025 Stephan Kulla ("Kulla")

Licensed under the Apache License, Version 2.0 (the "Apache License") and Creative Commons Attribution 4.0 International (the "CC-BY License"). You may choose either of these licenses to govern your use of this project.

You may obtain a copy of the Apache License at:
    http://www.apache.org/licenses/LICENSE-2.0

You may obtain a copy of the CC-BY License at:
    https://creativecommons.org/licenses/by/4.0/

Unless required by applicable law or agreed to in writing, software and content distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache License for the specific language governing permissions and limitations under the License.

Under the CC-BY License, you are free to share and adapt the material provided you give appropriate credit, provide a link to the license, and indicate if changes were made. See the CC-BY License for full details.

Third-Party Components and Licenses:
This product also includes third-party components which are distributed under their respective licenses. Please refer to the NOTICE file for details.

## Utility functions and imports

In [1]:
%load_ext jupyter_ai_magics

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

## Helper function to download wikitext

In [4]:
import requests

def get_wikitext(page_title, timeout=30):
    """Download the wikitext of a page from the German Wikibooks API.

    Args:
        page_title: Title of the page to fetch, passed as the ``titles``
            query parameter.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        The content of the ``main`` slot of the first revision of the first
        page in the response that has revisions.

    Raises:
        Exception: If the HTTP status code is not 200.
        ValueError: If the response contains no page with revisions.
    """
    url = 'https://de.wikibooks.org/w/api.php'

    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'revisions',
        'titles': page_title,
        'rvprop': 'content',
        'rvslots': '*'
    }

    # Without an explicit timeout, requests waits indefinitely on a server
    # that never answers.
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Error fetching data: {response.status_code}")

    data = response.json()
    # A payload without a "query"/"pages" object is treated like an empty
    # result so the caller gets the descriptive ValueError below instead of
    # a bare KeyError.
    pages = data.get('query', {}).get('pages', {})

    for page in pages.values():
        if 'revisions' in page:
            return page['revisions'][0]['slots']['main']['*']

    raise ValueError(f"No content found for page: {page_title}")

# Demo: fetch one book overview page and show its raw wikitext.
# Any failure is reported by printing the exception instead of aborting.
page_title = "Mathe für Nicht-Freaks: Analysis 1"
try:
    wikitext = get_wikitext(page_title)
    print(wikitext)
except Exception as err:
    print(err)

{{#invoke:Mathe für Nicht-Freaks/Seite|oben}}

{{#invoke:Mathe für Nicht-Freaks/Seite|unten}}


## Download sitemap

In [8]:
# Fetch the wikitext of the project's sitemap page.
raw_sitemap = get_wikitext("Mathe für Nicht-Freaks: Sitemap")

# Preview only the first 50 lines, one per output line.
print(*raw_sitemap.split("\n")[:50], sep="\n")

{{#invoke:Mathe für Nicht-Freaks/Seite|oben}}

Diese Seite listet alle Kapitel des Projekts „Mathe für Nicht-Freaks“ auf. Diese Seite dient zur Übersicht und aus ihr werden die komplette Navigation und die Inhaltsverzeichnisse der einzelnen Bücher generiert.

{| class="wikitable"
|+ Legende
! Symbol
! Bedeutung
|-
| <span style="color:#BA0000;">roter Link</span>
| Link auf ein Kapitel, welches noch nicht existiert und welches noch geschrieben werden muss.
|-
| <span style="color:#0645AD;">blauer Link</span>
| Kapitel wurde bereits angelegt und enthält Inhalt.
|-
| {{Symbol|0%}}
| Fortschritt 0% – Kapitel besitzt keinen oder kaum Inhalt. Das Kapitel muss neu geschrieben bzw. ergänzt und überarbeitet werden.
|-
| {{Symbol|25%}}
| Fortschritt 25% – Kapitel befindet sich in der Entwicklung, muss aber noch wesentlich ergänzt werden.
|-
| {{Symbol|50%}}
| Fortschritt 50% – Wesentliche Inhalte sind vorhanden, es müssen aber noch wichtige Inhalte hinzugefügt werden (oft befinden sich auf der S

## Parse sitemap

In [32]:
import json

from dataclasses import dataclass, asdict, field
from typing import List, Literal


@dataclass
class BookLine:
    """Parse result for a sitemap line that is a ``== [[href|name]] ==`` heading."""

    # Label part of the wiki link (the text after the "|").
    name: str
    # Target part of the wiki link (the text before the "|").
    href: str


@dataclass
class SectionLine:
    """Parse result for a sitemap line that is a ``=== name ===`` heading."""

    # Heading text with the surrounding "=" characters and whitespace removed.
    name: str


@dataclass
class PageLine:
    """Parse result for a sitemap list item of the form ``* [[href|name]]``."""

    # Label part of the wiki link (the text after the "|").
    name: str
    # Target part of the wiki link (the text before the "|").
    href: str


def parse_link(line):
    """Split a leading ``[[target|label]]`` wiki link into its two parts.

    Args:
        line: Text that is expected to begin with a wiki link.

    Returns:
        ``[target, label]`` (a two-element list) when ``line`` starts with
        ``[[``, has a closing ``]]`` after at least one character, and the
        link body contains exactly one ``|``. Otherwise ``(None, None)``.
    """
    no_link = (None, None)

    if not line.startswith("[["):
        return no_link

    close_at = line.find("]]")
    # close_at <= 2 covers both "no closing brackets" (-1) and an empty
    # link body ("[[]]").
    if close_at <= 2:
        return no_link

    pieces = line[2:close_at].split("|")
    return pieces if len(pieces) == 2 else no_link


def parse_sitemap_line(sitemap_line):
    """Classify one line of sitemap wikitext.

    Args:
        sitemap_line: A single line of the sitemap, without its newline.

    Returns:
        A ``SectionLine`` for ``=== name ===`` headings with a non-empty
        name, a ``BookLine`` for ``== [[href|name]] ==`` headings, a
        ``PageLine`` for ``* [[href|name]]`` list items, and ``None`` for
        everything else (including headings or items without a usable link).
    """
    # The three-"=" check must come first: such a line also starts and ends
    # with "==".
    is_section = sitemap_line.startswith("===") and sitemap_line.endswith("===")
    if is_section:
        # str.strip treats its argument as a set of characters, so every
        # leading/trailing "=" is removed.
        title = sitemap_line.strip("===").strip()
        return SectionLine(title) if title else None

    is_book = sitemap_line.startswith("==") and sitemap_line.endswith("==")
    if is_book:
        target, label = parse_link(sitemap_line.strip("==").strip())
        return BookLine(label, target) if target and label else None

    if sitemap_line.startswith("* "):
        # Removes every leading "*" and space character, not just one "* ".
        target, label = parse_link(sitemap_line.lstrip("* "))
        if target and label:
            return PageLine(label, target)

    return None


@dataclass
class Project:
    """Root node of the parsed sitemap tree."""

    # Discriminator identifying the node type.
    kind: Literal["project"] = "project"
    # Fixed project name and link target.
    name: Literal["Mathe für Nicht-Freaks"] = "Mathe für Nicht-Freaks"
    href: Literal["Mathe für Nicht-Freaks"] = "Mathe für Nicht-Freaks"
    # Books in the order they were appended. "Book" is a string forward
    # reference because the class is defined further down in the file; a
    # bare name raises NameError where annotations are evaluated eagerly.
    books: List["Book"] = field(default_factory=list)


@dataclass
class Book:
    """A book node of the sitemap tree, holding an ordered list of sections."""

    # Display name of the book.
    name: str
    # Link target of the book.
    href: str
    # Discriminator identifying the node type.
    kind: Literal["book"] = "book"
    # Sections in the order they were appended. This is a list, not a
    # Literal; "Section" is a string forward reference because the class is
    # defined further down in the file.
    sections: List["Section"] = field(default_factory=list)


@dataclass
class Section:
    """A section node of the sitemap tree, holding an ordered list of pages."""

    # Display name of the section.
    name: str
    # Discriminator identifying the node type.
    kind: Literal["section"] = "section"
    # Pages in the order they were appended. "Page" is a string forward
    # reference because the class is defined further down in the file.
    pages: List["Page"] = field(default_factory=list)


@dataclass
class Page:
    """A page (leaf) node of the sitemap tree."""

    # Display name of the page.
    name: str
    # Link target of the page.
    href: str
    # Discriminator identifying the node type.
    kind: Literal["page"] = "page"

def parse_sitemap(sitemap_text):
    """Build the project tree (books, sections, pages) from sitemap wikitext.

    Each line is classified with ``parse_sitemap_line``. Book lines open a
    new book, section lines open a new section in the current book, and page
    lines are appended to the current section. Unrecognized lines are
    skipped.

    Args:
        sitemap_text: Full wikitext of the sitemap page.

    Returns:
        A ``Project`` whose ``books`` list holds the parsed hierarchy in
        document order.

    Raises:
        ValueError: If a section or page line appears before any book line,
            or a page line appears before any section line in a book that is
            not one of the known section-less books.
    """
    # Books whose pages are listed directly below the book heading; their
    # pages are collected in an implicit section.
    # NOTE(review): the implicit section is named "Buchanfänge" for both
    # books, as in the original code. Confirm this is intended for
    # "Über das Projekt".
    books_without_sections = ("Buchanfänge", "Über das Projekt")

    project = Project()
    book = None
    section = None

    for line in sitemap_text.split("\n"):
        parsed_line = parse_sitemap_line(line)

        if parsed_line is None:
            continue

        if isinstance(parsed_line, BookLine):
            book = Book(name=parsed_line.name, href=parsed_line.href)
            # A new book starts without an open section.
            section = None
            project.books.append(book)
            continue

        if book is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                f"Sitemap entry appears before any book heading: {line!r}"
            )

        if isinstance(parsed_line, SectionLine):
            section = Section(name=parsed_line.name)
            book.sections.append(section)
        elif isinstance(parsed_line, PageLine):
            if section is None:
                if book.name not in books_without_sections:
                    raise ValueError(
                        f"Page appears before any section heading in book "
                        f"{book.name!r}: {line!r}"
                    )

                section = Section(name="Buchanfänge")
                book.sections.append(section)

            section.pages.append(
                Page(name=parsed_line.name, href=parsed_line.href)
            )

    return project

# Parse the downloaded sitemap wikitext into the project tree.
project = parse_sitemap(raw_sitemap)

# Convert the dataclass tree to plain dicts/lists and pretty-print it as JSON.
print(json.dumps(asdict(project), indent=2))

{
  "kind": "project",
  "name": "Mathe f\u00fcr Nicht-Freaks",
  "href": "Mathe f\u00fcr Nicht-Freaks",
  "books": [
    {
      "name": "Grundlagen der Mathematik",
      "href": "Mathe f\u00fcr Nicht-Freaks: Grundlagen der Mathematik",
      "kind": "book",
      "sections": [
        {
          "name": "Was ist Mathematik?",
          "kind": "section",
          "pages": [
            {
              "name": "Was ist Mathematik?",
              "href": "Mathe f\u00fcr Nicht-Freaks: Was ist Mathematik?",
              "kind": "page"
            }
          ]
        },
        {
          "name": "Einf\u00fchrung in die Logik",
          "kind": "section",
          "pages": [
            {
              "name": "Logik und Aussagen",
              "href": "Mathe f\u00fcr Nicht-Freaks: Logik und Aussagen",
              "kind": "page"
            },
            {
              "name": "Junktoren",
              "href": "Mathe f\u00fcr Nicht-Freaks: Junktor",
              "kind": "