In [1]:
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import requests
from langchain_openai import ChatOpenAI
import bs4
from bs4 import BeautifulSoup
import getpass
import os

In [2]:
os.environ["OPENAI_API_KEY"] = getpass.getpass()

In [3]:
class WebBaseLoader:
    def __init__(self, web_paths, bs_kwargs):
        self.web_paths = web_paths
        self.bs_kwargs = bs_kwargs

    def load(self):
        results = {}
        for url in self.web_paths:
            try:
                response = requests.get(url)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser', parse_only=self.bs_kwargs['parse_only'])
                    results[url] = soup.get_text()
                else:
                    results[url] = None
            except requests.RequestException as e:
                results[url] = str(e)
        return results

    def save_to_document(self, filename='document.txt'):
        results = self.load()
        with open(filename, 'w', encoding='utf-8') as file:
            for url, text in results.items():
                file.write(f'# URL: {url}\n{text}\n\n')

def fetch_website_text(urls, selector_attrs):
    strainer = bs4.SoupStrainer(**selector_attrs)
    loader = WebBaseLoader(
        web_paths=urls,
        bs_kwargs={"parse_only": strainer}
    )
    return loader.load()

urls = [
    "https://www.theguardian.com/commentisfree/2016/jun/10/amazon-fresh-culture-instant-gratification",
    "https://www.theguardian.com/technology/2016/dec/11/amazon-accused-of-intolerable-conditions-at-scottish-warehouse"
]
selector_attrs1 = {"class_":"dcr-1bgz3hb"}
selector_attrs2 = {"class": "dcr-iy9ec7"}

loader = WebBaseLoader(urls, {"parse_only": bs4.SoupStrainer(**selector_attrs1)})

In [4]:
docs = loader.load()

In [5]:
class RecursiveCharacterTextSplitter:
    def __init__(self, chunk_size, chunk_overlap):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_document(self, text):
        """ Split a single document based on chunk size and overlap. """
        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]

    def split_documents(self, documents):
        """ Split multiple documents. Documents can be either a list of strings or objects with 'page_content' attribute. """
        splits = []
        for doc in documents:
            # Check if 'doc' is a string or has attribute 'page_content'
            if isinstance(doc, str):
                text = doc  # Treat doc as plain text
            else:
                text = getattr(doc, 'page_content', '')
            splits.extend(self.split_document(text))
        return splits

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [None]:
# Embed into the db
vectorstore = Chroma.from_texts(texts=splits, embedding=OpenAIEmbeddings())

In [None]:
# Set up the retriever
retriever = vectorstore.as_retriever()
# Prompt from langchain hub
prompt = hub.pull("rlm/rag-prompt")
# LLM
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

In [None]:
def format_strings(documents):
    """Formats a list of documents by extracting 'page_content' from dictionary or directly using string documents."""
    formatted_documents = []
    for doc in documents:
        if isinstance(doc, str):
            formatted_documents.append(doc)
        elif isinstance(doc, dict):
            # Check and extract 'page_content'
            formatted_documents.append(doc.get('page_content', ''))
        else:
            # Fallback or default string if the document type is not supported
            formatted_documents.append('')
    return "\n\n".join(formatted_documents)

formatted_context = format_strings(docs)  # Format the loaded documents
#question = "What is the story about Amazon?"

rag_chain = (
    {"context": lambda x: retriever, "question": RunnablePassthrough()} | prompt | llm
)

In [None]:
def main():
    while True:
        question = input("Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): ")
        if question.lower() == 'exit':
            break
        response = rag_chain.invoke(question)
        response_str = str(response)
        content_start = response_str.find("content='") + len("content='")
        content_end = response_str.find("'", content_start) + 1
        content = response_str[content_start:content_end]
        print("Response:", content)

if __name__ == "__main__":
    main()

Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): WHAT HAPPENED TO THE AMAZON WAREHOUSE AND WAREHOUSE IN SCOTLAND?
Response: Workers at the Amazon warehouse in Scotland were reportedly subjected to intolerable conditions, according to accusations. The company has faced criticism regarding its treatment of employees in various locations. Specific details of what happened at the Amazon warehouse in Scotland were not provided in the retrieved context.'
Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): CAN YOU GIVE ME THE LINK ABOUT THIS NEW?
Response: Here are two links related to Amazon: \n1. https://www.theguardian.com/commentisfree/2016/jun/10/amazon-fresh-culture-instant-gratification\n2. https://www.theguardian.com/technology/2016/dec/11/amazon-accused-of-intolerable-conditions-at-scottish-warehouse'
Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): QUIT
Response: "I'


KeyboardInterrupt: Interrupted by user