In [6]:
from __future__ import annotations

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
import gzip
import pickle
from pathlib import Path
from typing import List, Union
from langchain.schema import Document
from helper.list_of_all_html import urls

## Crawling with Crawl4AI
`https://docs.crawl4ai.com/core/simple-crawling/`

In [7]:
async def crawl_docs(urls: list[str]) -> list[Document]:
    """Crawl ``urls`` with Crawl4AI and wrap each page's markdown in a ``Document``.

    Args:
        urls: Page URLs to fetch. An empty list returns ``[]`` without
            starting a crawler.

    Returns:
        One ``Document`` per usable crawl result, with the page markdown as
        ``page_content`` and ``{"url": ...}`` as metadata. Results that report
        failure or carry no markdown are skipped (and reported via ``print``),
        so the returned list may be shorter than ``urls``.
    """
    # Nothing to crawl: avoid starting a crawler for an empty batch.
    if not urls:
        return []

    # Markdown is generated without image references.
    md_gen = DefaultMarkdownGenerator(options={"ignore_images": True})
    run_cfg = CrawlerRunConfig(
        excluded_tags=["header", "footer"],
        # Selectors for page chrome to drop; the ids look DokuWiki-specific
        # (sidebar, page tools, table of contents, doc info) -- confirm if the
        # target site changes.
        excluded_selector=(
            "nav#dokuwiki__aside, "
            "nav#dokuwiki__pagetools, "
            "a.media, img, "
            "div#dw__toc, "
            "div.docInfo"
        ),
        markdown_generator=md_gen,
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls, config=run_cfg)

    docs: list[Document] = []
    skipped: list[str] = []
    for res in results:
        # Prefer the result's `url`; fall back to `final_url` when it is
        # missing or empty.
        url = getattr(res, "url", None) or getattr(res, "final_url", None)

        # A failed crawl has no markdown to store. Building a Document from it
        # would raise and throw away every page that did succeed, so skip it.
        if not getattr(res, "success", True) or res.markdown is None:
            skipped.append(str(url))
            continue

        docs.append(Document(page_content=res.markdown, metadata={"url": url}))

    if skipped:
        print(f"Skipped {len(skipped)} URL(s) without crawlable content: {skipped}")

    return docs

## Store to Pickle
We save the docs to a gzip-compressed pickle file so that they can be loaded in another Jupyter notebook.

In [8]:
def save_docs_pickle(docs: List[Document], file: str | Path = "docs.pkl.gz") -> Path:
    """Serialize ``docs`` with pickle into a gzip-compressed file.

    Args:
        docs: Documents to persist.
        file: Destination path; defaults to ``docs.pkl.gz`` in the working
            directory.

    Returns:
        The destination as a ``Path`` (not resolved).
    """
    destination = Path(file)

    # gzip.open in binary write mode compresses the pickle stream on the fly.
    with gzip.open(destination, mode="wb") as archive:
        pickle.Pickler(archive).dump(docs)

    summary = f"Saved {len(docs)} documents → {destination.resolve()}"
    print(summary)
    return destination

## Store to txt
This is just for debugging. The text file is not used by the RAG pipeline.

In [9]:
def save_docs_txt(
    docs: List[Document],
    file: Union[str, Path] = "docs.txt",
    *,
    include_metadata: bool = True,
    separator: str = "\n\n" + ("-" * 80) + "\n\n",
) -> Path:
    """Write ``docs`` to a UTF-8 text file, one separator-terminated entry each.

    Args:
        docs: Documents to dump.
        file: Destination path; defaults to ``docs.txt`` in the working
            directory.
        include_metadata: When true, a document with non-empty metadata gets
            a ``[Document #i metadata]`` header (numbered from 1) before its
            content.
        separator: Text appended after every document, including the last.

    Returns:
        The destination as a ``Path`` (not resolved).
    """
    target = Path(file)

    def _render(position, doc):
        # Optional metadata header, then the content without trailing
        # whitespace, then the separator.
        header = ""
        if include_metadata and doc.metadata:
            header = f"[Document #{position} metadata]\n{doc.metadata}\n\n"
        return header + doc.page_content.rstrip() + separator

    with target.open("w", encoding="utf-8") as out:
        out.writelines(
            _render(position, doc) for position, doc in enumerate(docs, start=1)
        )

    print(f"Saved {len(docs)} documents → {target.resolve()}")
    return target

## Usage with contextual retrieval

In [10]:
# Crawl every URL imported from helper.list_of_all_html. The top-level `await`
# relies on the notebook kernel's async support.
docs = await crawl_docs(urls)
# Plain-text dump with default settings (writes docs.txt).
save_docs_txt(docs)
# Gzip-pickle with the default path (docs.pkl.gz). As the last expression in
# the cell, its returned Path is what the notebook displays.
save_docs_pickle(docs)

Saved 257 documents → /Users/jakobeilts/Development/studit-chatbot/4_increment_contextual_retrieval/docs.txt
Saved 257 documents → /Users/jakobeilts/Development/studit-chatbot/4_increment_contextual_retrieval/docs.pkl.gz


PosixPath('docs.pkl.gz')