# Imports

In [1]:
import time, requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from helper.list_of_all_html import urls
from helper.academicCloudEmbeddings import AcademicCloudEmbeddings
import streamlit as st

# Crawl

In [2]:
# ── 1. Helper utilities ────────────────────────────────────────────────────────
from bs4 import BeautifulSoup, NavigableString
import requests, time
from langchain.schema import Document          # or `from langchain.docstore.document import Document`

def extract_table_as_text(table_tag):
    """
    Flatten an HTML table into a list of text lines.

    The first row and the first column are both treated as candidate header
    axes; plain <td> cells count as headers too, no <th> is required. The axis
    with more non-empty cells becomes the primary header (ties go to the first
    row). Each data cell is emitted as
    ``"<primary header>: <secondary header> = <value>"``.

    If that pairing produces nothing (for example a single-row or
    single-column table, or a table whose secondary header cells are all
    blank), the padded rows are returned as raw ``" | "``-joined lines instead,
    so the table content is not silently dropped.

    Args:
        table_tag: Table element with a BeautifulSoup-style ``find_all``; rows
            must offer ``find_all`` and cells ``get_text(strip=True)``.

    Returns:
        A list of strings. Empty only when the table has no non-blank row.
    """
    # 1) Build a rectangular grid of stripped cell texts; blank rows are skipped.
    grid = []
    for tr in table_tag.find_all("tr"):
        cells = [c.get_text(strip=True) for c in tr.find_all(["td", "th"])]
        if any(cells):
            grid.append(cells)

    if not grid:
        return []  # empty table

    max_cols = max(len(r) for r in grid)
    for r in grid:
        r.extend([""] * (max_cols - len(r)))  # pad short rows

    # 2) Count the non-empty cells on each candidate header axis.
    first_row = grid[0]
    first_col = [r[0] for r in grid]

    row_header_count = sum(bool(c.strip()) for c in first_row)
    col_header_count = sum(bool(c.strip()) for c in first_col)

    # 3) The axis with more entries is the primary header; pair it with the other.
    rows_out = []

    if row_header_count >= col_header_count:
        # Primary headers come from the first row, secondary from the first column.
        # NOTE(review): column 0 is not skipped here (unlike row 0 in the other
        # branch), so each row label is also emitted as "corner: label = label".
        # Kept as-is to preserve existing output; confirm whether it is intended.
        for j, real_h in enumerate(first_row):
            if not real_h.strip():
                continue
            for i, other_h in enumerate(first_col[1:], start=1):
                if not other_h.strip():
                    continue
                rows_out.append(f"{real_h}: {other_h} = {grid[i][j]}")
    else:
        # Primary headers come from the first column, secondary from the first row.
        for i, real_h in enumerate(first_col):
            if i == 0 or not real_h.strip():
                continue
            for j, other_h in enumerate(first_row[1:], start=1):
                if not other_h.strip():
                    continue
                rows_out.append(f"{real_h}: {other_h} = {grid[i][j]}")

    # 4) Fallback: no header/value pair could be formed, so emit the raw matrix.
    if not rows_out:
        return [" | ".join(r) for r in grid]

    return rows_out





def replace_all_links_with_text_and_url(soup: BeautifulSoup):
    """
    Replace every <a> element in ``soup`` with plain text, in place.

    An anchor with a non-empty ``href`` becomes ``"label (href)"``; an anchor
    without one (missing or empty attribute) becomes just its stripped label.
    """
    for anchor in soup.find_all("a"):
        text = anchor.get_text(strip=True)
        target = anchor.get("href")
        anchor.replace_with(f"{text} ({target})" if target else text)

def remove_tools_divs(soup):
    """
    Delete every <div class="tools"> element, including its contents, from
    the given BeautifulSoup tree (in place).
    """
    tool_containers = soup.select("div.tools")
    for container in tool_containers:
        container.decompose()

def remove_ignored_parts(soup):
    """
    Strip page chrome from the BeautifulSoup tree, in place.

    Removed elements (each together with its contents):
      • <div class="tools">
      • <div class="docInfo">
      • <div id="dokuwiki__sitetools">
      • <nav id="dokuwiki__aside">
      • <a href="#dokuwiki__content">
    """
    # Whole containers first, the single skip-link anchor last.
    ignored_selectors = (
        "div.tools",
        "div.docInfo",
        "div#dokuwiki__sitetools",
        "nav#dokuwiki__aside",
        'a[href="#dokuwiki__content"]',
    )
    for css in ignored_selectors:
        for node in soup.select(css):
            node.decompose()





def clean_inline_tags(soup_or_tag):
    """Replace presentational inline elements with their stripped text, in place."""
    inline_names = ["strong", "em", "span", "b", "i", "u"]
    for element in soup_or_tag.find_all(inline_names):
        plain_text = element.get_text(strip=True)
        element.replace_with(plain_text)


In [3]:
# ── 2. Text‑extraction pipeline ────────────────────────────────────────────────
def extract_visible_text(html: str) -> str:
    """
    Convert an HTML page into newline-separated visible text.

    Pipeline:
      1. drop <script>/<style>/<noscript>,
      2. remove ignored page chrome (tools, docInfo, site tools, aside,
         skip-to-content link),
      3. flatten every <table> via ``extract_table_as_text``,
      4. rewrite remaining <a> elements as "label (href)",
      5. unwrap presentational inline tags,
      6. collapse the tree to non-empty, stripped lines.

    Args:
        html: Raw HTML markup of one page.

    Returns:
        The visible text, one non-empty stripped line per row, joined by "\\n".
    """
    soup = BeautifulSoup(html, "html.parser")

    # 1 Drop non-visible nodes early
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    # 2 Remove ignored containers and the skip link BEFORE any rewriting.
    #   This must precede step 4: once every <a> has been replaced by a plain
    #   string, the selector a[href="#dokuwiki__content"] can no longer match
    #   and the skip-link text would stay in the output. Doing it first also
    #   keeps ignored containers nested in tables or inline tags from being
    #   flattened into text. remove_ignored_parts already covers div.tools;
    #   the explicit call is kept for parity with the original pipeline.
    remove_tools_divs(soup)
    remove_ignored_parts(soup)

    # 3 Convert every <table> → readable text block.
    #   Cells are read with get_text, so links inside tables keep only their label.
    for table in soup.find_all("table"):
        table_lines = extract_table_as_text(table)
        table.replace_with(NavigableString("\n".join(table_lines)))

    # 4 Replace <a> with "label (href)"
    replace_all_links_with_text_and_url(soup)

    # 5 Strip presentation-only inline tags
    clean_inline_tags(soup)

    # 6 Collapse to plain text, dropping blank lines
    visible_lines = [
        line.strip() for line in soup.get_text("\n").splitlines() if line.strip()
    ]
    return "\n".join(visible_lines)


In [4]:
# ── 3. Crawler using the new extractor ─────────────────────────────────────────
def crawl_urls(urls, delay: float = 0.4) -> list[Document]:
    """
    Download each URL and wrap its visible text in a ``Document``.

    Any exception raised while fetching or extracting a page is printed and
    that URL is skipped (best effort). The function sleeps ``delay`` seconds
    after every URL, whether it succeeded or failed.

    Args:
        urls: Iterable of page URLs to fetch.
        delay: Pause in seconds after each URL.

    Returns:
        One ``Document`` per successfully processed URL, with the source URL
        stored under ``metadata["url"]``.
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}
    collected: list[Document] = []

    for url in urls:
        try:
            response = requests.get(url, headers=request_headers, timeout=15)
            response.raise_for_status()

            page = Document(
                page_content=extract_visible_text(response.text),
                metadata={"url": url},
            )
            collected.append(page)
        except Exception as err:
            # Deliberately broad: one bad page must not abort the whole crawl.
            print(f"[!!] {url}: {err}")

        time.sleep(delay)

    return collected


## Optional: Write docs to txt

In [5]:
from pathlib import Path

def save_docs_to_txt(docs, filename="data/crawled_pages.txt"):
    """
    Write a list of Document objects to one UTF-8 TXT file.

    Each document is written as a numbered header, its URL (taken from
    ``metadata["url"]``, with a placeholder when absent), its page content,
    and a divider line of 80 dashes. An existing file is overwritten, and
    missing parent directories are created.

    Args:
        docs: Sequence of objects exposing ``metadata`` (dict) and
            ``page_content`` (str).
        filename: Target file path.
    """
    path = Path(filename)
    # Without this, open("w") raises FileNotFoundError when e.g. "data/" does
    # not exist yet.
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        for i, doc in enumerate(docs, 1):
            url = doc.metadata.get("url", "unknown‑url")
            f.write(f"=== Document {i} ===\n")
            f.write(f"URL: {url}\n\n")
            f.write(doc.page_content)
            f.write("\n\n" + "-" * 80 + "\n\n")
    print(f"Wrote {len(docs)} documents → {path.resolve()}")

# Crawl every URL from the imported `urls` list, then dump the resulting
# documents to the default TXT file (data/crawled_pages.txt) for inspection.
docs = crawl_urls(urls)
save_docs_to_txt(docs)

Wrote 257 documents → /Users/jakobeilts/Development/Masterarbeit/2_increment/data/crawled_pages.txt


# Chunking, embedding, and FAISS index

In [6]:
# 1) Fetch the pages → Documents.
#    NOTE(review): this crawls all URLs again even if the optional export cell
#    above already populated `docs`; presumably so this cell can run on its own.
docs = crawl_urls(urls, delay=0.4)

# 2) Split into overlapping chunks – each chunk keeps its source URL as metadata.
#    Separators are tried from coarse (blank line) to fine (single character).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],
)
chunks = splitter.split_documents(docs)

# 3) Embed the chunks and build the FAISS index.
#    The API key and endpoint URL are read from Streamlit secrets.
embedder = AcademicCloudEmbeddings(
    api_key=st.secrets["GWDG_API_KEY"],
    url=st.secrets["BASE_URL_EMBEDDINGS"],
)
store = FAISS.from_documents(chunks, embedder)
# Persist the index to the local "faiss_wiki_index" path.
store.save_local("faiss_wiki_index")