In [68]:
import os
import requests

from bs4 import BeautifulSoup
from collections import deque
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from urllib.parse import urljoin, urlparse

In [69]:
load_dotenv()
assert os.environ["OPENAI_API_KEY"] != ""

In [23]:
def bfs(url):
    visited = set([url])
    dq = deque([[url, "", 0]])
    max_depth = 5

    while dq:
        base, path, depth = dq.popleft()

        if depth < max_depth:
            try:
                soup = BeautifulSoup(requests.get(base + path).text, "html.parser")

                for link in soup.find_all("a"):
                    href = link.get("href")

                    if href.startswith("#"):
                        continue

                    full_url = urljoin(url, href)
                    if not full_url.startswith(url):
                        continue

                    if "#" in full_url:
                        continue

                    if full_url not in visited:
                        visited.add(full_url)
                        #print("  " * depth + f"at depth {depth}: {full_url}")

                        if full_url.startswith("http"):
                            dq.append([full_url, "", depth + 1])
                        else:
                            dq.append([base, full_url, depth + 1])
            except:
                pass

    return visited

In [24]:
urls = bfs("https://fastapi.tiangolo.com")

at depth 0: https://fastapi.tiangolo.com/newsletter/
at depth 0: https://fastapi.tiangolo.com/
at depth 0: https://fastapi.tiangolo.com/de/
at depth 0: https://fastapi.tiangolo.com/em/
at depth 0: https://fastapi.tiangolo.com/es/
at depth 0: https://fastapi.tiangolo.com/fa/
at depth 0: https://fastapi.tiangolo.com/fr/
at depth 0: https://fastapi.tiangolo.com/he/
at depth 0: https://fastapi.tiangolo.com/id/
at depth 0: https://fastapi.tiangolo.com/ja/
at depth 0: https://fastapi.tiangolo.com/ko/
at depth 0: https://fastapi.tiangolo.com/pl/
at depth 0: https://fastapi.tiangolo.com/pt/
at depth 0: https://fastapi.tiangolo.com/ru/
at depth 0: https://fastapi.tiangolo.com/tr/
at depth 0: https://fastapi.tiangolo.com/zh/
at depth 0: https://fastapi.tiangolo.com/features/
at depth 0: https://fastapi.tiangolo.com/fastapi-people/
at depth 0: https://fastapi.tiangolo.com/python-types/
at depth 0: https://fastapi.tiangolo.com/tutorial/
at depth 0: https://fastapi.tiangolo.com/tutorial/first-steps

  at depth 1: https://fastapi.tiangolo.com/classes-as-dependencies/
  at depth 1: https://fastapi.tiangolo.com/sub-dependencies/
  at depth 1: https://fastapi.tiangolo.com/dependencies-in-path-operation-decorators/
  at depth 1: https://fastapi.tiangolo.com/global-dependencies/
  at depth 1: https://fastapi.tiangolo.com/dependencies-with-yield/
  at depth 1: https://fastapi.tiangolo.com/get-current-user/
  at depth 1: https://fastapi.tiangolo.com/simple-oauth2/
  at depth 1: https://fastapi.tiangolo.com/oauth2-jwt/
  at depth 1: https://fastapi.tiangolo.com/path-operation-advanced-configuration/
  at depth 1: https://fastapi.tiangolo.com/additional-status-codes/
  at depth 1: https://fastapi.tiangolo.com/response-directly/
  at depth 1: https://fastapi.tiangolo.com/custom-response/
  at depth 1: https://fastapi.tiangolo.com/additional-responses/
  at depth 1: https://fastapi.tiangolo.com/response-cookies/
  at depth 1: https://fastapi.tiangolo.com/response-headers/
  at depth 1: https:

In [26]:
scrape_data = []
for url in urls:
    loader = WebBaseLoader(url)
    scrape_data.append(loader.load())

In [27]:
print(f'We have {len(scrape_data)} document(s)')

We have 245 document(s)


In [71]:
docs = []
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
for data in scrape_data:
    docs.extend(text_splitter.split_documents(data))

In [72]:
print(f'We have {len(docs)} document(s)')

We have 4326 document(s)


In [73]:
embeddings = OpenAIEmbeddings()

In [74]:
db = FAISS.from_documents(docs, embeddings)

In [75]:
llm = OpenAI(temperature=0, openai_api_key=os.environ["OPENAI_API_KEY"])
chain = load_qa_chain(llm, chain_type="stuff")

In [77]:
query = "List all the breaking changes from version to version and state what has been broken"
docs = db.similarity_search(query, include_metadata=True)
chain.run(input_documents=docs, question=query)

' Breaking changes from 0.60.2 to 0.61.0 include breaking changes to features. Breaking changes from 0.61.0 to 0.61.2 include breaking changes to security fixes. Breaking changes from 0.62.0 to 0.63.0 include breaking changes to features and fixes. Breaking changes from 0.64.0 to 0.65.0 include breaking changes to upgrade. Breaking changes from 0.65.1 to 0.80.0 include breaking changes to fixes and security fixes. Breaking changes from 0.79.1 to 0.79.0 include breaking changes to fixes. Breaking changes from 0.78.0 to 0.77.1 include breaking changes to upgrades.'