In [1]:
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import requests
from langchain_openai import ChatOpenAI
import bs4
from bs4 import BeautifulSoup
import getpass
import os

In [7]:
os.environ["OPENAI_API_KEY"] = getpass.getpass()

··········


In [3]:
class WebBaseLoader:
    def __init__(self, web_paths, bs_kwargs):
        self.web_paths = web_paths
        self.bs_kwargs = bs_kwargs

    def load(self):
        results = {}
        for url in self.web_paths:
            try:
                response = requests.get(url)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser', parse_only=self.bs_kwargs['parse_only'])
                    results[url] = soup.get_text()
                else:
                    results[url] = None
            except requests.RequestException as e:
                results[url] = str(e)
        return results

    def save_to_document(self, filename='document.txt'):
        results = self.load()
        with open(filename, 'w', encoding='utf-8') as file:
            for url, text in results.items():
                file.write(f'# URL: {url}\n{text}\n\n')

def fetch_website_text(urls, selector_attrs):
    strainer = bs4.SoupStrainer(**selector_attrs)
    loader = WebBaseLoader(
        web_paths=urls,
        bs_kwargs={"parse_only": strainer}
    )
    return loader.load()

urls = [
    "https://www.theguardian.com/technology/2016/may/03/amazon-fresh-food-deliveries-understood-to-start-this-month",
    "https://www.theguardian.com/media/2016/may/16/bbc-netflix-rival-itv-nbc-universal",
    "https://www.theguardian.com/technology/2016/apr/28/amazon-most-profitable-quarter-sales-up-costs",
    "https://www.theguardian.com/technology/2016/apr/26/amazon-kindle-oasis-review-luxury-e-reader",
    "https://www.theguardian.com/environment/andes-to-the-amazon/2016/may/25/london-stock-exchange-amazon-deforestation",
    "https://www.theguardian.com/media/2016/may/25/netflix-and-amazon-must-guarantee-20-of-content-is-european",
    "https://www.theguardian.com/technology/2016/may/26/amazon-echo-virtual-assistant-child-privacy-law",
]
selector_attrs1 = {"class": "dcr-1qg0p6f"}
selector_attrs2 = {"class": "dcr-1qg0p6f"}
selector_attrs3 = {"class": "dcr-1qg0p6f"}
selector_attrs4 = {"class": "dcr-1qg0p6f"}
selector_attrs5 = {"class": "dcr-1qg0p6f"}
selector_attrs6 = {"class": "dcr-1qg0p6f"}
selector_attrs7 = {"class": "dcr-1qg0p6f"}

loader = WebBaseLoader(urls, {"parse_only": bs4.SoupStrainer(**selector_attrs1)})

In [4]:
docs = loader.load()

In [5]:
class RecursiveCharacterTextSplitter:
    def __init__(self, chunk_size, chunk_overlap):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_document(self, text):
        """ Split a single document based on chunk size and overlap. """
        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]

    def split_documents(self, documents):
        """ Split multiple documents. Documents can be either a list of strings or objects with 'page_content' attribute. """
        splits = []
        for doc in documents:
            # Check if 'doc' is a string or has attribute 'page_content'
            if isinstance(doc, str):
                text = doc  # Treat doc as plain text
            else:
                text = getattr(doc, 'page_content', '')
            splits.extend(self.split_document(text))
        return splits

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [8]:
# Embed into the db
vectorstore = Chroma.from_texts(texts=splits, embedding=OpenAIEmbeddings())

In [9]:
# Set up the retriever
retriever = vectorstore.as_retriever()
# Prompt from langchain hub
prompt = hub.pull("rlm/rag-prompt")
# LLM
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

In [10]:
def format_strings(documents):
    """Formats a list of documents by extracting 'page_content' from dictionary or directly using string documents."""
    formatted_documents = []
    for doc in documents:
        if isinstance(doc, str):
            formatted_documents.append(doc)
        elif isinstance(doc, dict):
            # Check and extract 'page_content'
            formatted_documents.append(doc.get('page_content', ''))
        else:
            # Fallback or default string if the document type is not supported
            formatted_documents.append('')
    return "\n\n".join(formatted_documents)

formatted_context = format_strings(docs)  # Format the loaded documents
#question = "What is the story about Amazon?"

rag_chain = (
    {"context": lambda x: retriever, "question": RunnablePassthrough()} | prompt | llm
)

In [None]:
def main():
    while True:
        question = input("Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): ")
        if question.lower() == 'exit':
            break
        response = rag_chain.invoke(question)
        response_str = str(response)
        content_start = response_str.find("content='") + len("content='")
        content_end = response_str.find("'", content_start) + 1
        content = response_str[content_start:content_end]
        print("Response:", content)

if __name__ == "__main__":
    main()

Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): What are the news about Amazon between April and June 2016?
Response: "Amazon reported its most profitable quarter in April 2016 with increased sales and lower costs. In May 2016, Amazon was understood to start fresh food deliveries. Additionally, in the same month, there were concerns about Amazon'
Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): Tell me more about Amazon profitable quarter
Response: "Amazon reported its most profitable quarter in April 2016, with sales up and costs down. The increase in profits was attributed to strong demand for Amazon Web Services and growth in its Prime program. This success came despite concerns about the impact of Amazon'
Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): Why is the most profitable quarter?
Response: The most profitable quarter is typically the one where sales are high and costs are co