In [1]:
import time, requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
import gzip, pickle
from helper.list_of_all_html import urls

# Crawling
Since our data comes from websites, we build a simple web crawler and store the crawled pages in the *docs* variable.

In [2]:
def extract_visible_text(html: str) -> str:
    """Return the text content of an HTML page, one non-empty line per row.

    The markup is parsed with the built-in ``html.parser``; every
    ``<script>``, ``<style>`` and ``<noscript>`` element is removed before
    the text is extracted. Each resulting line is stripped of surrounding
    whitespace and blank lines are dropped.

    Args:
        html: Raw HTML markup.

    Returns:
        The remaining text, with lines joined by ``"\\n"``.
    """
    parsed = BeautifulSoup(html, "html.parser")

    # Remove elements whose contents should not end up in the extracted text.
    for hidden in parsed.find_all(["script", "style", "noscript"]):
        hidden.decompose()

    # Strip every line first, then keep only the ones that still have content.
    stripped_lines = (raw.strip() for raw in parsed.get_text("\n").splitlines())
    return "\n".join(piece for piece in stripped_lines if piece)

def crawl_urls(urls, delay=0.4) -> list[Document]:
    """Download each URL and wrap its visible text in a ``Document``.

    Pages are fetched one at a time over a shared HTTP session. A URL that
    fails for any reason (network error, HTTP error status, parsing problem)
    is reported on stdout and skipped, so a single bad page never aborts the
    whole crawl.

    Args:
        urls: Iterable of URL strings to fetch, in order.
        delay: Seconds to wait between two consecutive requests.

    Returns:
        One ``Document`` per successfully fetched page, with the visible
        text as ``page_content`` and ``{"url": url}`` as ``metadata``.
    """
    docs: list[Document] = []

    # One session for the whole crawl so connections to the same host are
    # reused instead of being re-established for every page.
    with requests.Session() as session:
        session.headers.update({"User-Agent": "Mozilla/5.0"})

        for position, url in enumerate(urls):
            # Throttle *between* requests only: no pointless wait after the
            # final URL, and this also works when `urls` is a generator.
            if position:
                time.sleep(delay)

            try:
                response = session.get(url, timeout=15)
                response.raise_for_status()

                # Without a charset in Content-Type, requests decodes text/*
                # bodies as ISO-8859-1, which garbles UTF-8 pages. Fall back
                # to the encoding detected from the body in that case.
                content_type = response.headers.get("Content-Type", "")
                if "charset" not in content_type.lower():
                    response.encoding = (response.apparent_encoding
                                         or response.encoding)

                text = extract_visible_text(response.text)
                docs.append(Document(page_content=text,
                                     metadata={"url": url}))
            except Exception as exc:
                # Deliberately broad: the crawl is best-effort, so report
                # the failure and continue with the next URL.
                print(f"[!!] {url}: {exc}")

    return docs

We save the docs to a gzip-compressed pickle file so they can be loaded in another Jupyter notebook.

In [3]:
# Crawl every URL from the helper list, then persist the resulting documents
# as a gzip-compressed pickle.
docs = crawl_urls(urls)
with gzip.open("docs.pkl.gz", mode="wb") as archive:
    pickle.dump(docs, archive)