In [1]:
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import requests
from langchain_openai import ChatOpenAI
import bs4
from bs4 import BeautifulSoup
import getpass
import os

In [7]:
# Get API key
os.environ["OPENAI_API_KEY"] = getpass.getpass()

··········


In [3]:
# Define a class for loading web pages based on URLs
class WebBaseLoader:
    def __init__(self, web_paths, bs_kwargs):
        # Initialize with a list of URLs and BeautifulSoup configurations
        self.web_paths = web_paths
        self.bs_kwargs = bs_kwargs

    def load(self):
        # Method to fetch content from each URL
        results = {}
        for url in self.web_paths:
            try:
                # Perform a HTTP GET request to the URL
                response = requests.get(url)
                # If the response is successful, process the text with BeautifulSoup
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser', parse_only=self.bs_kwargs['parse_only'])
                    results[url] = soup.get_text()
                else:
                    # Store None if the request was unsuccessful
                    results[url] = None
            except requests.RequestException as e:
                # Store the exception message if an error occurred during the request
                results[url] = str(e)
                results[url] = str(e)
        return results

    def save_to_document(self, filename='document.txt'):
        # Method to save the fetched content to a file
        results = self.load()
        with open(filename, 'w', encoding='utf-8') as file:
            for url, text in results.items():
                # Write each URL and its content to the file
                file.write(f'# URL: {url}\n{text}\n\n')

# Function to initialize and use the WebBaseLoader
def fetch_website_text(urls, selector_attrs):
    # Define the HTML elements to parse
    strainer = bs4.SoupStrainer(**selector_attrs)
    # Create an instance of the loader with the specified URLs and parsing filter
    loader = WebBaseLoader(
        web_paths=urls,
        bs_kwargs={"parse_only": strainer}
    )
    # Fetch the data
    return loader.load()

# URLs to be fetched
urls = [
    "https://www.theguardian.com/technology/2016/may/03/amazon-fresh-food-deliveries-understood-to-start-this-month",
    "https://www.theguardian.com/media/2016/may/16/bbc-netflix-rival-itv-nbc-universal",
    "https://www.theguardian.com/technology/2016/apr/28/amazon-most-profitable-quarter-sales-up-costs",
    "https://www.theguardian.com/technology/2016/apr/26/amazon-kindle-oasis-review-luxury-e-reader",
    "https://www.theguardian.com/environment/andes-to-the-amazon/2016/may/25/london-stock-exchange-amazon-deforestation",
    "https://www.theguardian.com/media/2016/may/25/netflix-and-amazon-must-guarantee-20-of-content-is-european",
    "https://www.theguardian.com/technology/2016/may/26/amazon-echo-virtual-assistant-child-privacy-law",
]
# HTML class attributes used for filtering content
selector_attrs1 = {"class": "dcr-1qg0p6f"}
selector_attrs2 = {"class": "dcr-1qg0p6f"}
selector_attrs3 = {"class": "dcr-1qg0p6f"}
selector_attrs4 = {"class": "dcr-1qg0p6f"}
selector_attrs5 = {"class": "dcr-1qg0p6f"}
selector_attrs6 = {"class": "dcr-1qg0p6f"}
selector_attrs7 = {"class": "dcr-1qg0p6f"}

# Initialize the loader with the URLs and the specific attribute filter
loader = WebBaseLoader(urls, {"parse_only": bs4.SoupStrainer(**selector_attrs1)})

In [4]:
# After fetching documents using a previously defined loader (WebBaseLoader)
docs = loader.load()

In [5]:
# Define a class that handles the splitting of text into smaller chunks with a defined size and overlap
class RecursiveCharacterTextSplitter:
    def __init__(self, chunk_size, chunk_overlap):
        # Constructor to initialize with chunk size and the overlap between consecutive chunks
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_document(self, text):
        """Method to split a single document into overlapping chunks.
        It generates a list of substrings, each of a specified 'chunk_size', with 'chunk_overlap' characters overlapping between adjacent chunks.
        """
        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]

    def split_documents(self, documents):
        """Method to split multiple documents which may be either strings or objects with a 'page_content' attribute. 
        This method loops through each document, checks its type, extracts text, and applies 'split_document'.
        """
        splits = []
        for doc in documents:
            # Check if 'doc' is a string or has attribute 'page_content'
            if isinstance(doc, str):
                text = doc  # Treat doc as plain text
            else:
                text = getattr(doc, 'page_content', '')
            splits.extend(self.split_document(text))
        return splits

In [None]:
# Instantiate the text splitter and split loaded documents into manageable chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [8]:
# Initialize a vector store with the split texts using embeddings from OpenAI to create semantic search capabilities
vectorstore = Chroma.from_texts(texts=splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

In [9]:
# Load a pre-defined prompt from a hub, intended for use with language models in a retrieval-augmented generation setup
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

In [10]:
# Function to format documents into a uniform structure for processing.
def format_strings(documents):
    """ Formats a list of documents by either extracting 'page_content' from dictionaries or using string directly. 
    Returns a single string with documents separated by two newlines.
    """
    formatted_documents = []
    for doc in documents:
        if isinstance(doc, str):
            formatted_documents.append(doc)
        elif isinstance(doc, dict):
            # Check and extract 'page_content'
            formatted_documents.append(doc.get('page_content', ''))
        else:
            # Fallback or default string if the document type is not supported
            formatted_documents.append('')
    return "\n\n".join(formatted_documents)

# Format the documents for use with the response chain
formatted_context = format_strings(docs)

# Define a retrieval-augmented generation (RAG) chain that combines context retrieval with language model generation
rag_chain = (
    {"context": lambda x: retriever, "question": RunnablePassthrough()} | prompt | llm
)

In [None]:
# Main function to interactively query the system and display responses
def main():
    while True:
        question = input("Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): ")
        if question.lower() == 'exit':
            break
        response = rag_chain.invoke(question)
        response_str = str(response)
        content_start = response_str.find("content='") + len("content='")
        content_end = response_str.find("'", content_start) + 1
        content = response_str[content_start:content_end]
        print("Response:", content)

if __name__ == "__main__":
    main()

Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): What are the news about Amazon between April and June 2016?
Response: "Amazon reported its most profitable quarter in April 2016 with increased sales and lower costs. In May 2016, Amazon was understood to start fresh food deliveries. Additionally, in the same month, there were concerns about Amazon'
Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): Tell me more about Amazon profitable quarter
Response: "Amazon reported its most profitable quarter in April 2016, with sales up and costs down. The increase in profits was attributed to strong demand for Amazon Web Services and growth in its Prime program. This success came despite concerns about the impact of Amazon'
Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): Why is the most profitable quarter?
Response: The most profitable quarter is typically the one where sales are high and costs are co