In [1]:
# !pip install langchain langchain-chroma langchain-openai chroma langchainhub

In [2]:
# Standard Library Imports
import os
import requests
import getpass

# BeautifulSoup for HTML Parsing
import bs4
from bs4 import BeautifulSoup

# NLTK for Natural Language Processing
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

# langchain for Language Model Operations
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

# NLTK Setup
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Get API key
os.environ["OPENAI_API_KEY"] = getpass.getpass()

··········


In [4]:
# Define the WebBaseLoader class
class WebBaseLoader:
    # Constructor to initialize the WebBaseLoader object with web_paths and bs_kwargs
    def __init__(self, web_paths, bs_kwargs):
        self.web_paths = web_paths  # Stores a list of URLs to be processed
        self.bs_kwargs = bs_kwargs  # Stores additional arguments for BeautifulSoup

    # Method to load data from each web path and parse the HTML content
    def load(self):
        results = {}  # Dictionary to store the results of web scraping
        for url in self.web_paths:  # Iterating over each URL in the web_paths list
            try:
                response = requests.get(url)  # Sending a GET request to the URL
                if response.status_code == 200:  # Checking if the request was successful
                    # Parsing the HTML content with BeautifulSoup using the provided arguments
                    soup = BeautifulSoup(response.text, 'html.parser', **self.bs_kwargs)
                    results[url] = soup.get_text()  # Extracting text from the parsed HTML and storing it in the results dictionary
                else:
                    results[url] = None  # Storing None if the response was unsuccessful
            except requests.RequestException as e:  # Handling exceptions that may occur during the GET request
                results[url] = str(e)  # Storing the exception message as the result for the URL
        return results  # Returning the dictionary containing the results of the web scraping

In [5]:
# Function definition to fetch and return text content from specified website URLs using a given set of selector attributes
def fetch_website_text(urls, selector_attrs):
    # Creating a SoupStrainer object that filters out all unnecessary data except for elements matching the provided attributes
    strainer = bs4.SoupStrainer(**selector_attrs)
    # Initializing the WebBaseLoader with the URLs and the strainer object to parse only necessary parts of HTML
    loader = WebBaseLoader(web_paths=urls, bs_kwargs={"parse_only": strainer})
    # Calling the 'load' method from the WebBaseLoader instance to fetch and parse the web pages
    return loader.load()

# List of URLs from which to scrape data
urls = [
    "https://www.theguardian.com/technology/2016/may/03/amazon-fresh-food-deliveries-understood-to-start-this-month",
    "https://www.theguardian.com/media/2016/may/16/bbc-netflix-rival-itv-nbc-universal",
    "https://www.theguardian.com/technology/2016/apr/28/amazon-most-profitable-quarter-sales-up-costs",
    "https://www.theguardian.com/technology/2016/apr/26/amazon-kindle-oasis-review-luxury-e-reader",
    "https://www.theguardian.com/environment/andes-to-the-amazon/2016/may/25/london-stock-exchange-amazon-deforestation",
    "https://www.theguardian.com/media/2016/may/25/netflix-and-amazon-must-guarantee-20-of-content-is-european",
    "https://www.theguardian.com/technology/2016/may/26/amazon-echo-virtual-assistant-child-privacy-law",
]
# Dictionary specifying the attributes to filter HTML elements using SoupStrainer
selector_attrs = {"class": "article-body-commercial-selector"}

### The function fetch_website_text is now ready to be called with the list of URLs and selector attributes

In [6]:
# Initialize the WebBaseLoader with URLs and BeautifulSoup keyword arguments
loader = WebBaseLoader(urls, {"parse_only": bs4.SoupStrainer(**selector_attrs)})

# Load the content from the specified URLs
docs = loader.load()

In [7]:
# Definition of the RecursiveCharacterTextSplitter class
class RecursiveCharacterTextSplitter:
    # Constructor to initialize the RecursiveCharacterTextSplitter object with chunk_size and chunk_overlap
    def __init__(self, chunk_size, chunk_overlap):
        self.chunk_size = chunk_size  # The number of characters in each text chunk
        self.chunk_overlap = chunk_overlap  # The number of characters each chunk overlaps with the next

    # Method to split a single text string into smaller chunks based on chunk_size and chunk_overlap
    def split_document(self, text):
        # Creates a list of text chunks by iterating over the text. The starting index for each chunk is adjusted by the chunk_size minus the chunk_overlap.
        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]

    # Method to split multiple documents, where each document is a string or an object containing a string attribute
    def split_documents(self, documents):
        splits = []  # List to hold all chunks from all documents
        for doc in documents:  # Iterating over each document in the provided list
            if isinstance(doc, str):
                text = doc  # Directly assigns the document to text if it is a string
            else:
                text = getattr(doc, 'page_content', '')  # Attempts to fetch 'page_content' from the document object; defaults to empty string if not found
            splits.extend(self.split_document(text))  # Adds the chunks from the current document to the splits list
        return splits  # Returns the list of all chunks from all documents

In [8]:
# Creating an instance of RecursiveCharacterTextSplitter with a chunk size of 1000 characters and an overlap of 200 characters
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# Splitting a list of documents into smaller, overlapping chunks to maintain context between sections
splits = text_splitter.split_documents(docs)

# Initializing a vector store to enable semantic search capabilities using embeddings from OpenAI
vectorstore = Chroma.from_texts(texts=splits, embedding=OpenAIEmbeddings())
# Creating a retriever from the vector store for efficient information retrieval
retriever = vectorstore.as_retriever()

# Retrieving a pre-defined prompt designed for use with language models in a retrieval-augmented generation setup
prompt = hub.pull("rlm/rag-prompt")
# Initializing a ChatOpenAI instance with the specified model to use for generating responses based on the prompt
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

In [9]:
# Function to format strings from a list of documents into a single string
def format_strings(documents):
    formatted_documents = []  # List to hold formatted documents
    for doc in documents:  # Iterate through each document in the input list
        if isinstance(doc, str):
            formatted_documents.append(doc)  # Add the string directly if the document is a string
        elif isinstance(doc, dict):
            # If the document is a dictionary, retrieve the value of 'page_content', defaulting to an empty string if not found
            formatted_documents.append(doc.get('page_content', ''))
        else:
            # Append an empty string if the document is neither a string nor a dictionary
            formatted_documents.append('')
    # Join all formatted documents into a single string, separated by two newlines
    return "\n\n".join(formatted_documents)

# Usage of the function to format a list of documents
formatted_context = format_strings(docs)

# Define a retrieval-augmented generation chain using a context retriever and a question passthrough in a pipeline setup
rag_chain = (
    # Lambda function to fetch relevant documents based on the context provided by the retriever
    {"context": lambda x: retriever.get_relevant_documents(x), "question": RunnablePassthrough()} |
    # Incorporates the prompt, likely intended to direct the language model's response generation
    prompt |
    # Final component of the chain, a language model from OpenAI configured to generate responses
    llm
)

In [10]:
# References and Summaries (manually sourced and created) for BLEU score calculation
references = {
    "https://www.theguardian.com/technology/2016/may/03/amazon-fresh-food-deliveries-understood-to-start-this-month": "Amazon is believed to be planning to start delivering fresh food in the UK this month.",
    "https://www.theguardian.com/media/2016/may/16/bbc-netflix-rival-itv-nbc-universal": "BBC and ITV partner with NBC Universal to rival Netflix.",
    "https://www.theguardian.com/technology/2016/apr/28/amazon-most-profitable-quarter-sales-up-costs": "Amazon reports its most profitable quarter with increased sales.",
    "https://www.theguardian.com/technology/2016/apr/26/amazon-kindle-oasis-review-luxury-e-reader": "Review of Amazon Kindle Oasis, a luxury e-reader.",
    "https://www.theguardian.com/environment/andes-to-the-amazon/2016/may/25/london-stock-exchange-amazon-deforestation": "London Stock Exchange faces scrutiny over Amazon deforestation.",
    "https://www.theguardian.com/media/2016/may/25/netflix-and-amazon-must-guarantee-20-of-content-is-european": "Netflix and Amazon must ensure 20% of their content is European.",
    "https://www.theguardian.com/technology/2016/may/26/amazon-echo-virtual-assistant-child-privacy-law": "Amazon Echo virtual assistant faces child privacy law issues."
}

In [11]:
# Function to calculate the BLEU score between a hypothesis and a reference text
def calculate_bleu(hypothesis, reference):
    # Tokenizing the reference text into words
    reference_tokens = word_tokenize(reference)
    # Tokenizing the hypothesis text into words
    hypothesis_tokens = word_tokenize(hypothesis)
    # Creating a smoothing function to handle cases where precision is 0 (common in short texts or specific domains)
    smoothing_function = SmoothingFunction().method1
    # Calculating the BLEU score using the sentence-level BLEU function, suitable for comparing a pair of sentences
    return sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)

In [12]:
# Function to interact with a retrieval-augmented generation system and evaluate BLEU scores
def main():
    bleu_scores = []  # List to store the BLEU scores for each response

    # Looping over a dictionary 'references' that contains URLs and their corresponding reference summaries
    for url, ref_summary in references.items():
        print(f"Question about URL: {url}")  # Printing the URL being processed
        # Invoking the RAG chain to generate a summary for the content of the URL
        response = rag_chain.invoke(f"Summarize the main content of the article from URL: {url}")
        response_str = str(response)  # Converting the response to a string

        # Extracting the content from the response string based on predefined format
        content_start = response_str.find("content='") + len("content='")
        content_end = response_str.find("'", content_start)
        content = response_str[content_start:content_end]

        print("Generated Response:", content)  # Printing the generated content
        # Calculating the BLEU score for the generated content against the reference summary
        bleu_score = calculate_bleu(content, ref_summary)
        bleu_scores.append(bleu_score)  # Appending the BLEU score to the list
        print(f"BLEU Score: {bleu_score}\n")  # Printing the BLEU score

    # Calculating the average BLEU score from all scores computed
    average_bleu = sum(bleu_scores) / len(bleu_scores)
    print(f"Average BLEU Score: {average_bleu}")  # Printing the average BLEU score

if __name__ == "__main__":
    main()

Question about URL: https://www.theguardian.com/technology/2016/may/03/amazon-fresh-food-deliveries-understood-to-start-this-month


  warn_deprecated(


Generated Response: The article from The Guardian discusses the launch of Amazon Fresh food deliveries expected to start in the same month. This new service will allow customers to order fresh groceries online. Amazon aims to compete with other food delivery services by offering a convenient and efficient option for customers.
BLEU Score: 0.02465289405820389

Question about URL: https://www.theguardian.com/media/2016/may/16/bbc-netflix-rival-itv-nbc-universal
Generated Response: "The article discusses the BBC
BLEU Score: 0.01774239756616722

Question about URL: https://www.theguardian.com/technology/2016/apr/28/amazon-most-profitable-quarter-sales-up-costs
Generated Response: "The article discusses Amazon
BLEU Score: 0.01976560930094397

Question about URL: https://www.theguardian.com/technology/2016/apr/26/amazon-kindle-oasis-review-luxury-e-reader
Generated Response: "The article discusses the review of the Amazon Kindle Oasis, highlighting its luxury features and design. It provides

In [13]:

# Main function to interactively query the system and display responses
def main():
    while True:
        question = input("Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): ")
        if question.lower() == 'exit':
            break
        response = rag_chain.invoke(question)
        response_str = str(response)
        content_start = response_str.find("content='") + len("content='")
        content_end = response_str.find("'", content_start) + 1
        content = response_str[content_start:content_end]
        print("Response:", content)

if __name__ == "__main__":
    main()

Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): Summarise the news
Response: The news covers topics such as the competition between BBC and Netflix, concerns about deforestation linked to the London Stock Exchange and Amazon, and the call for Netflix and Amazon to ensure a percentage of their content is European. Additionally, there is a review of the Amazon Kindle Oasis as a luxury e-reader.'
Ask a question about Amazon between April and June 2016 (or type 'exit' to quit): What are the concerns?
Response: "The concerns raised in the retrieved context include child privacy laws in relation to Amazon Echo, the impact of Amazon Fresh food deliveries starting, the deforestation issues linked to Amazon, and Amazon'


KeyboardInterrupt: Interrupted by user