In [1]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [32]:
import re

from bs4 import BeautifulSoup

def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()


loader = RecursiveUrlLoader(
    "https://docs.python.org/3.9/",
    max_depth=4, # 2: 23 docs; 3: 481; 4: 25 (bug?); 5: 481
    # use_async=False,
    extractor=bs4_extractor,
    # metadata_extractor=None,
    # exclude_dirs=(),
    # timeout=10,
    # check_response_status=True,
    # continue_on_failure=True,
    # prevent_outside=True,
    # base_url=None,
    # ...
)

In [33]:
documents = loader.load()
print(f"{len(documents)} documents loaded, 1st doc: {documents[0]}")
print(documents[0].metadata)


25 documents loaded, 1st doc: page_content='3.9.19 Documentation\n\nDownload\nDownload these documents\nDocs by version\n\nPython 3.13 (in development)\nPython 3.12 (stable)\nPython 3.11 (security-fixes)\nPython 3.10 (security-fixes)\nPython 3.9 (security-fixes)\nPython 3.8 (security-fixes)\nPython 3.7 (EOL)\nPython 3.6 (EOL)\nPython 3.5 (EOL)\nPython 3.4 (EOL)\nPython 3.3 (EOL)\nPython 3.2 (EOL)\nPython 3.1 (EOL)\nPython 3.0 (EOL)\nPython 2.7 (EOL)\nPython 2.6 (EOL)\nAll versions\n\nOther resources\n\nPEP Index\nBeginner\'s Guide\nBook List\nAudio/Visual Talks\nPython Developer’s Guide\n\nNavigation\n\nindex\n\nmodules |\n\nPython »\n\n3.9.19 Documentation »\n    \n\n                     |\n                \n\nPython 3.9.19 documentation\n\n  Welcome! This is the official documentation for Python 3.9.19.\n  \nParts of the documentation:\n\nWhat\'s new in Python 3.9?\n or all "What\'s new" documents since 2.0\nTutorial\nstart here\nLibrary Reference\nkeep this under your pillow\nLangua

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                               chunk_overlap=50)
splits = text_splitter.split_documents(documents)
print(f"{len(splits)} splits")

docs =[]
for idx, split in enumerate(splits):
    split.metadata.update({'id':idx})
    docs.append(split)

embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                                    #'hkunlp/instructor-large', 
                                    #'sentence-transformers/all-MiniLM-L6-v2',
                                   model_kwargs={'device': "cpu"})

db = FAISS.from_documents(docs, embeddings)
db.save_local("weburl.db")