# Imports

In [3]:
import time, requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from helper.list_of_all_html import urls
from helper.academicCloudEmbeddings import AcademicCloudEmbeddings
import streamlit as st

# Crawl

In [4]:
def extract_visible_text(html: str) -> str:
    """Return the human-visible text of an HTML page, one fragment per line.

    ``<script>``, ``<style>`` and ``<noscript>`` elements are removed before
    the text is extracted. Every resulting line is stripped of surrounding
    whitespace and lines that end up empty are dropped.

    Args:
        html: Raw HTML markup.

    Returns:
        The remaining non-empty text lines joined with ``"\\n"``.
    """
    parsed = BeautifulSoup(html, "html.parser")

    # Remove elements whose content is never rendered as page text.
    for hidden in parsed.find_all(["script", "style", "noscript"]):
        hidden.decompose()

    raw_text = parsed.get_text("\n")
    trimmed = (raw_line.strip() for raw_line in raw_text.splitlines())
    return "\n".join(piece for piece in trimmed if piece)

def crawl_urls(urls, delay=0.4) -> list[Document]:
    """Download each URL and wrap its visible text in a ``Document``.

    Pages are requested one at a time with a browser-like ``User-Agent`` and
    a timeout of 15. The crawl is best-effort: any failure for a URL (network
    error, HTTP error status, parsing problem) is printed and that URL is
    skipped, so a single bad page never aborts the whole run.

    Args:
        urls: Iterable of URL strings to fetch.
        delay: Seconds to pause between two consecutive requests.

    Returns:
        One ``Document`` per successfully fetched URL, in input order, with
        the extracted text as ``page_content`` and ``{"url": url}`` as
        metadata.
    """
    docs: list[Document] = []
    for position, url in enumerate(urls):
        # Throttle only *between* requests; sleeping after the final URL
        # would just delay the caller for no benefit.
        if position:
            time.sleep(delay)
        try:
            r = requests.get(url,
                             headers={"User-Agent": "Mozilla/5.0"},
                             timeout=15)
            r.raise_for_status()
            # When the server declares no charset, requests decodes text/*
            # responses as ISO-8859-1, which garbles UTF-8 pages (umlauts
            # etc.). Fall back to the encoding detected from the body.
            content_type = r.headers.get("Content-Type", "")
            if "charset" not in content_type.lower():
                r.encoding = r.apparent_encoding
            text = extract_visible_text(r.text)
            docs.append(Document(page_content=text,
                                 metadata={"url": url}))
        except Exception as exc:
            # Deliberately broad: report the failing URL and keep crawling.
            print(f"[!!] {url}: {exc}")
    return docs

# Chunking, embedding and FAISS index

In [5]:
# End-to-end index build: crawl the configured pages, split them into
# overlapping chunks, embed the chunks and persist a FAISS index.

# 1) Fetch the pages → Documents (failed URLs are skipped by crawl_urls)
docs = crawl_urls(urls, delay=0.4)

# 2) Split – each chunk keeps the URL of its source page as metadata
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    # Separators are listed from coarsest (blank line) to finest (empty
    # string).
    separators=["\n\n", "\n", " ", ""],
)
chunks = splitter.split_documents(docs)

# 3) Embeddings and FAISS
# Credentials and endpoint are read from the Streamlit secrets store, so
# they are not hard-coded here.
embedder = AcademicCloudEmbeddings(
    api_key=st.secrets["GWDG_API_KEY"],
    url=st.secrets["BASE_URL_EMBEDDINGS"],
)
# Build the vector store from the chunks and save it under
# "faiss_wiki_index" for later loading.
store = FAISS.from_documents(chunks, embedder)
store.save_local("faiss_wiki_index")