1. Get data

In [1]:
# import all the URLs from list_of_all_html.py
from helper.list_of_all_html import urls
# additional imports
import time, requests
from bs4 import BeautifulSoup
from pathlib import Path
from tqdm.notebook import tqdm

2. Write all the data from all URLs to a txt file

In [2]:
# ------- helper to strip HTML to text -------
def extract_visible_text(html: str) -> str:
    """Return the text of an HTML document, one non-empty line per row.

    The markup is parsed with the built-in ``html.parser``; ``<script>``,
    ``<style>`` and ``<noscript>`` elements are removed before the text is
    extracted. Every remaining line is stripped of surrounding whitespace and
    blank lines are dropped.

    Args:
        html: Raw HTML markup.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    parsed = BeautifulSoup(html, "html.parser")
    for hidden in parsed.find_all(["script", "style", "noscript"]):
        hidden.decompose()

    kept = []
    for raw_line in parsed.get_text(separator="\n").splitlines():
        stripped = raw_line.strip()
        if stripped:
            kept.append(stripped)
    return "\n".join(kept)

In [3]:
def crawl_to_txt(urls, outfile: Path, delay=0.4):
    """Download every URL and write its visible text into one UTF-8 file.

    Each page is written as a ``# <url>`` header line followed by the text
    returned by ``extract_visible_text``. A page that cannot be fetched or
    parsed does not abort the crawl; an ``[!!] Could not fetch ...`` marker
    is written in place of its text.

    Args:
        urls: Iterable of page URLs to download.
        outfile: Destination text file (``Path`` or path string). It is
            overwritten; missing parent directories are created.
        delay: Seconds to sleep after each URL as a politeness pause.
    """
    outfile = Path(outfile)
    print(f"Writing → {outfile.resolve()}")
    # On a fresh checkout the target folder may not exist yet; open("w") would
    # then raise FileNotFoundError before a single page is fetched.
    outfile.parent.mkdir(parents=True, exist_ok=True)
    with outfile.open("w", encoding="utf-8") as fh:
        for url in tqdm(urls, unit="page"):
            try:
                r = requests.get(url,
                                 headers={"User-Agent": "Mozilla/5.0"},
                                 timeout=15)
                r.raise_for_status()
                # With no charset in Content-Type, requests decodes text/*
                # responses as ISO-8859-1, which garbles UTF-8 pages. Fall
                # back to the encoding detected from the body in that case.
                if "charset" not in r.headers.get("Content-Type", "").lower():
                    r.encoding = r.apparent_encoding
                text = extract_visible_text(r.text)
            except Exception as exc:
                # Deliberately broad: one bad page must not stop the crawl.
                text = f"[!!] Could not fetch {url} – {exc}"
            fh.write(f"\n\n# {url}\n\n{text}\n")
            time.sleep(delay)       # polite pause

# Crawl the whole URL list into a single text dump; the path is relative to
# the notebook's working directory.
outfile = Path("data") / "all_text.txt"
crawl_to_txt(urls, outfile=outfile)
print("✅ Done!")

Writing → /Users/jakobeilts/Development/Masterarbeit/1_increment/data/all_text.txt


  0%|          | 0/258 [00:00<?, ?page/s]

✅ Done!


# Alternative (wahrscheinlich besser, weil die Quell-URL jeder Seite als Metadatum erhalten bleibt):

In [1]:
# crawler.py
import time, requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document

def extract_visible_text(html: str) -> str:
    """Reduce an HTML document to its text content.

    ``<script>``, ``<style>`` and ``<noscript>`` elements are removed from the
    parsed tree first. The extracted text is then split into lines, each line
    is stripped, and empty lines are discarded.

    Args:
        html: Raw HTML markup.

    Returns:
        The surviving lines joined with ``"\\n"``.
    """
    tree = BeautifulSoup(html, "html.parser")
    for node in tree(["script", "style", "noscript"]):
        node.decompose()

    stripped_rows = (row.strip() for row in tree.get_text("\n").splitlines())
    return "\n".join(row for row in stripped_rows if row)

def crawl_urls(urls, delay=0.4) -> list[Document]:
    """Download each URL and wrap its visible text in a ``Document``.

    The page text comes from ``extract_visible_text`` and the source URL is
    stored under ``metadata["url"]``. Pages that cannot be fetched or parsed
    are reported on stdout and skipped, so the result may be shorter than
    ``urls``.

    Args:
        urls: Iterable of page URLs to download.
        delay: Seconds to sleep after each URL as a politeness pause.

    Returns:
        One ``Document`` per successfully processed URL, in input order.
    """
    docs: list[Document] = []
    for url in urls:
        try:
            r = requests.get(url,
                             headers={"User-Agent": "Mozilla/5.0"},
                             timeout=15)
            r.raise_for_status()
            # With no charset in Content-Type, requests decodes text/*
            # responses as ISO-8859-1, which garbles UTF-8 pages. Fall back
            # to the encoding detected from the body in that case.
            if "charset" not in r.headers.get("Content-Type", "").lower():
                r.encoding = r.apparent_encoding
            text = extract_visible_text(r.text)
            docs.append(Document(page_content=text,
                                 metadata={"url": url}))
        except Exception as exc:
            # Deliberately broad: one bad page must not stop the crawl.
            print(f"[!!] {url}: {exc}")
        time.sleep(delay)
    return docs

## Jetzt Chunking mit Metadaten-Vererbung

In [2]:
# build_index.py
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from helper.list_of_all_html import urls
from helper.academicCloudEmbeddings import AcademicCloudEmbeddings
import streamlit as st

# Build the FAISS index: crawl -> split -> embed -> save.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# run was aborted at some point — the index on disk may be missing or stale;
# TODO confirm before relying on it.

# 1) Fetch the pages → Documents
docs = crawl_urls(urls, delay=0.4)

# 2) Split – every URL is kept as metadata
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters (the splitter's default length function) — TODO confirm.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],
)
chunks = splitter.split_documents(docs)

# 3) Embeddings and FAISS
# The API key and endpoint URL are read from Streamlit's secrets store.
embedder = AcademicCloudEmbeddings(
    api_key=st.secrets["GWDG_API_KEY"],
    url=st.secrets["BASE_URL_EMBEDDINGS"],
)
store = FAISS.from_documents(chunks, embedder)
# Persist the index under the relative path "faiss_wiki_index".
store.save_local("faiss_wiki_index")

KeyboardInterrupt: 