In [1]:
# !pip install langchain langchain-chroma langchain-openai chroma langchainhub datasets

In [None]:
# Standard Libraries
import os
import getpass

# Web Scraping and HTTP Requests
import requests
import bs4
from bs4 import BeautifulSoup

# NLP
import nltk
from transformers import pipeline

# Data Handling and Analysis
import pandas as pd
import numpy as np
import datasets # For the metric

# Langchain for RAG
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# To experiment with the plain (not fine-tuned) version of the model, uncomment the line below and use it in place of the fine-tuned pipeline; that single change is enough to see the difference.

# qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

In [2]:
# Hub id of the fine-tuned question-answering checkpoint used throughout the notebook
FINE_TUNED_MODEL = "kgntmr/RoBERTa-SQuAD2.0-SubjQA"

# Build the question-answering pipeline on top of the fine-tuned checkpoint
qa_pipeline = pipeline(task="question-answering", model=FINE_TUNED_MODEL)

In [3]:
# Get the OpenAI API key.
# Reuse a key that is already present in the environment so that re-running the
# notebook (or running it non-interactively) does not prompt again; otherwise
# ask for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [4]:
# Define the WebBaseLoader class
class WebBaseLoader:
    """Minimal web page loader: fetch each URL and extract its text with BeautifulSoup.

    NOTE: this local definition shadows the ``WebBaseLoader`` name imported from
    ``langchain_community.document_loaders`` at the top of the notebook; every
    later use of ``WebBaseLoader`` refers to this class.

    Args:
        web_paths: Iterable of URLs to fetch.
        bs_kwargs: Extra keyword arguments forwarded to ``BeautifulSoup``
            (for example ``{"parse_only": SoupStrainer(...)}``). ``None`` means
            no extra arguments.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            single unresponsive server cannot block the notebook indefinitely.
    """

    def __init__(self, web_paths, bs_kwargs=None, timeout=10):
        self.web_paths = web_paths  # URLs to be processed
        self.bs_kwargs = {} if bs_kwargs is None else bs_kwargs  # Extra BeautifulSoup arguments
        self.timeout = timeout  # Per-request timeout in seconds

    def load(self):
        """Fetch every URL and return a dict mapping URL -> result.

        The value stored for each URL is:
            * the extracted page text when the response status is 200,
            * ``None`` when the server answered with any other status,
            * the exception message (a string) when the request itself failed,
              including when it timed out.
        """
        results = {}
        for url in self.web_paths:
            try:
                # A timeout is always passed: requests waits forever without one
                response = requests.get(url, timeout=self.timeout)
                if response.status_code == 200:
                    # Parse the HTML with the caller-provided BeautifulSoup arguments
                    soup = BeautifulSoup(response.text, 'html.parser', **self.bs_kwargs)
                    results[url] = soup.get_text()
                else:
                    results[url] = None  # Unsuccessful response
            except requests.RequestException as e:
                # Best effort: keep going and record why this URL failed
                results[url] = str(e)
        return results

#### The URLs below point to news articles that were selected according to the Semantic and Sentiment Analysis results.

In [5]:
# Function definition to fetch and return text content from specified website URLs using a given set of selector attributes
def fetch_website_text(urls, selector_attrs):
    """Load the given URLs, parsing only the elements that match ``selector_attrs``.

    ``selector_attrs`` is expanded into a ``bs4.SoupStrainer`` which is handed to
    the loader as BeautifulSoup's ``parse_only`` argument. The value returned is
    whatever ``WebBaseLoader.load()`` returns.
    """
    return WebBaseLoader(
        web_paths=urls,
        bs_kwargs={"parse_only": bs4.SoupStrainer(**selector_attrs)},
    ).load()

# CSS class of the element that should be kept when parsing an article page
ARTICLE_BODY_CLASS = "article-body-commercial-selector"

# Attribute filter expanded into a SoupStrainer by the loading code
selector_attrs = {"class": ARTICLE_BODY_CLASS}

# Guardian articles to scrape
urls = [
    "https://www.theguardian.com/technology/2016/may/03/amazon-fresh-food-deliveries-understood-to-start-this-month",
    "https://www.theguardian.com/media/2016/may/16/bbc-netflix-rival-itv-nbc-universal",
    "https://www.theguardian.com/technology/2016/apr/28/amazon-most-profitable-quarter-sales-up-costs",
    "https://www.theguardian.com/technology/2016/apr/26/amazon-kindle-oasis-review-luxury-e-reader",
    "https://www.theguardian.com/environment/andes-to-the-amazon/2016/may/25/london-stock-exchange-amazon-deforestation",
    "https://www.theguardian.com/media/2016/may/25/netflix-and-amazon-must-guarantee-20-of-content-is-european",
    "https://www.theguardian.com/technology/2016/may/26/amazon-echo-virtual-assistant-child-privacy-law",
]

### The function fetch_website_text is now ready to be called with the list of URLs and selector attributes

In [6]:
# Keep only the elements matching `selector_attrs` while parsing each page
article_filter = bs4.SoupStrainer(**selector_attrs)
loader = WebBaseLoader(web_paths=urls, bs_kwargs={"parse_only": article_filter})

# Fetch and parse every URL
docs = loader.load()

In [7]:
# Definition of the RecursiveCharacterTextSplitter class
class RecursiveCharacterTextSplitter:
    """Fixed-width character splitter that produces overlapping chunks.

    NOTE: this local definition shadows the ``RecursiveCharacterTextSplitter``
    name imported from ``langchain_text_splitters`` at the top of the notebook.

    Args:
        chunk_size: Maximum number of characters in each chunk (must be > 0).
        chunk_overlap: Number of characters each chunk shares with the next one
            (must satisfy ``0 <= chunk_overlap < chunk_size``).

    Raises:
        ValueError: If the sizes would make the sliding window stand still,
            move backwards, or skip text.
    """

    def __init__(self, chunk_size, chunk_overlap):
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of characters")
        if not 0 <= chunk_overlap < chunk_size:
            # overlap >= size would give range() a zero/negative step;
            # a negative overlap would silently drop text between chunks
            raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")
        self.chunk_size = chunk_size  # The number of characters in each text chunk
        self.chunk_overlap = chunk_overlap  # The number of characters each chunk overlaps with the next

    def split_document(self, text):
        """Return the overlapping chunks of a single string (empty list for '')."""
        step = self.chunk_size - self.chunk_overlap
        return [text[start:start + self.chunk_size] for start in range(0, len(text), step)]

    def split_documents(self, documents):
        """Split every document and return all chunks in one flat list.

        ``documents`` may be an iterable of documents or a dict whose values are
        the documents (e.g. a URL -> text mapping); for a dict the values are
        split, not the keys. Each document is either a string or an object with
        a ``page_content`` attribute; anything else (including ``None``)
        contributes no chunks.
        """
        if isinstance(documents, dict):
            # Iterating a dict directly would split its keys instead of the texts
            documents = documents.values()
        splits = []  # All chunks from all documents
        for doc in documents:
            if isinstance(doc, str):
                text = doc
            else:
                # Missing or None page_content is treated as empty text
                text = getattr(doc, 'page_content', '') or ''
            splits.extend(self.split_document(text))
        return splits

In [8]:
# Chunking parameters: 1000-character chunks, each sharing 200 characters with the next
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Cut the loaded documents into overlapping chunks so context carries across chunk boundaries
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
splits = text_splitter.split_documents(docs)

# Embed the chunks with OpenAI embeddings and index them in a Chroma vector store
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_texts(texts=splits, embedding=embeddings)

# Retriever view over the vector store
retriever = vectorstore.as_retriever()

# Pull the "rlm/rag-prompt" prompt from the LangChain hub
# NOTE(review): in the cells visible in this notebook, neither `retriever` nor
# `prompt` is referenced again after this point - confirm whether that is intended.
prompt = hub.pull("rlm/rag-prompt")

In [9]:
# Function to format strings from a list of documents into a single string
def format_strings(documents):
    """Join the text of several documents into one string separated by blank lines.

    Args:
        documents: An iterable of documents, or a dict whose values are the
            documents (e.g. a URL -> text mapping). For a dict the values are
            used, not the keys. Each document may be:
              * a string, used as is;
              * a dict, whose 'page_content' entry is used ('' if absent);
              * an object with a ``page_content`` attribute;
              * anything else (including ``None``), which contributes ''.

    Returns:
        All document texts joined with two newlines.
    """
    if isinstance(documents, dict):
        # Iterating a dict directly would join its keys instead of the document texts
        documents = documents.values()

    formatted_documents = []
    for doc in documents:
        if isinstance(doc, str):
            formatted_documents.append(doc)
        elif isinstance(doc, dict):
            formatted_documents.append(doc.get('page_content', ''))
        else:
            # Document-like objects expose their text as `page_content`;
            # unsupported values fall back to an empty string
            formatted_documents.append(getattr(doc, 'page_content', None) or '')
    return "\n\n".join(formatted_documents)

# Build the single context string that the interactive QA loop passes to the model
formatted_context = format_strings(documents=docs)

In [10]:
# Define the User Interface for RAG based on the user's queries
def user_interface():
    """Interactive question loop over the scraped article context.

    Reads questions from standard input until the user types 'exit' (case and
    surrounding whitespace are ignored) or input ends. Each non-blank question
    is answered with ``rag_answer`` against the module-level ``formatted_context``
    and the answer is printed.
    """
    while True:
        try:
            # User input for the question
            question = input("Ask a question about Amazon between 25 April and 15 June 2016 (or type 'exit' to quit): ")
        except EOFError:
            # Input stream closed (e.g. non-interactive run): leave cleanly
            print("Exiting the program.")
            break

        question = question.strip()
        if question.lower() == 'exit':
            print("Exiting the program.")
            break
        if not question:
            # Nothing was asked; prompt again instead of querying the model
            continue

        # Answer the question against the prepared context
        response = rag_answer(question, formatted_context)
        print("Answer:", response)

# Function to generate answers using the RAG pipeline and provided context
def rag_answer(question, context):
    """Answer ``question`` from ``context`` using the module-level ``qa_pipeline``.

    Only the 'answer' entry of the pipeline result is returned.
    """
    return qa_pipeline(question=question, context=context)['answer']

# # Main function to handle the application's execution flow
# if __name__ == "__main__":
#     user_interface()

## Experiment

#### Here is a brief summary of the experiment on Fine-Tuning and the RAG System:

##### Fine-Tuning:

- Before fine-tuning, the QA model scored 2% Exact Match (EM) and 9.41% F1.
- After fine-tuning on the SubjQA training data, the QA model was evaluated on unseen data from the SubjQA test dataset.
- Fine-tuning improved performance to 62.52% EM and 64.62% F1 on the test data.

##### RAG System:

- In this RAG system, from this point, the model is tested with the same dataset and metric used in the fine-tuning stage.
- The plain version of the model, "deepset/roberta-base-squad2", scored 4.5% EM and 24.26% F1.
- The fine-tuned version of the model resulted in a 17% EM score and a 21.85% F1 score, showing an improvement in the EM score but a decrease in the F1 score.

###  Test the RAG System
- The code below is exactly the same as in the fine-tuning stage. The point at which the code starts to change will be indicated.

In [None]:
# Path of the SubjQA test split, relative to the notebook's working directory
SUBJQA_TEST_CSV = 'subjqa-test.csv'

# Read the test split into a DataFrame
data = pd.read_csv(SUBJQA_TEST_CSV)

In [13]:
# Keep only the columns needed for evaluation
kept_columns = ['question', 'human_ans_indices', 'review', 'human_ans_spans']
data = data[kept_columns]

# Add a running id per row: evenly spaced values 0 .. n-1 (stored as floats by linspace)
row_count = len(data)
data['id'] = np.linspace(0, row_count - 1, row_count)

In [14]:
# Display the reduced frame (selected columns plus the new 'id' column)
data

Unnamed: 0,question,human_ans_indices,review,human_ans_spans,id
0,Is this storyline interesting or strong?,"(1850, 1886)","Spoilers thar be, Maytees.Is a man created by ...",just barely above mildly interesting,0.0
1,Is this storyline interesting or strong?,"(240, 249)","Spoilers thar be, Maytees.Is a man created by ...",important,1.0
2,Is the sound of the movie a reason to recommen...,"(111, 135)",This was my first bluray of a Disney classic a...,sound were crystal clear,2.0
3,Is the sound of the movie a reason to recommen...,"(394, 408)",This was my first bluray of a Disney classic a...,ANSWERNOTFOUND,3.0
4,How do you rate the sound?,"(5806, 5820)","In the realm of big Hollywood filmmaking, a fe...",ANSWERNOTFOUND,4.0
...,...,...,...,...,...
577,How is it act ?,"(539, 561)",del Toro's visual imagination is present in Pa...,the dialogue is clunky,577.0
578,How is the view?,"(2034, 2048)","Now I am not a religious person, even tough I ...",ANSWERNOTFOUND,578.0
579,How is the view?,"(2034, 2048)","Now I am not a religious person, even tough I ...",ANSWERNOTFOUND,579.0
580,How is the costume?,"(26, 55)","The sets are spectucular, the costumes are so ...",the costumes are so authentic,580.0


In [15]:
# Seed the 'answers' column from the human answer spans; a later cell replaces
# each value with an answer dict ('text' / 'answer_start').
# (Two scratch expressions that parsed row 0's indices and discarded the
# result were removed - they had no effect on the data.)
data['answers']=data['human_ans_spans']

In [16]:
# Extract the answer text and start offset for every row and store them in the 'answers' column
def _parse_span(span_text):
    """Parse a '(start, end)' string into a ``(start, end)`` pair of ints.

    Whitespace around the numbers is ignored, so '(1, 2)' and '(1,2)' both work.
    """
    start, end = (int(part) for part in span_text.strip().strip('()').split(','))
    return start, end


answers = []
for span_text, review in zip(data['human_ans_indices'], data['review']):
    si, ei = _parse_span(span_text)
    # One-element lists: the answer text cut out of the review, and where it starts
    answers.append({'text': [review[si:ei]], 'answer_start': [si]})

# Assign the whole column at once: no dependence on row labels or on the
# column having been created by an earlier cell
data['answers'] = answers

In [17]:
# Rename 'review' to 'context' by name rather than by reassigning every column
# label positionally: this cannot mislabel columns if their order or number
# changes, and re-running the cell is harmless.
data = data.rename(columns={'review': 'context'})

In [18]:
# Preview the first rows, including the 'answers' column
data.head()

Unnamed: 0,question,human_ans_indices,context,human_ans_spans,id,answers
0,Is this storyline interesting or strong?,"(1850, 1886)","Spoilers thar be, Maytees.Is a man created by ...",just barely above mildly interesting,0.0,{'text': ['just barely above mildly interestin...
1,Is this storyline interesting or strong?,"(240, 249)","Spoilers thar be, Maytees.Is a man created by ...",important,1.0,"{'text': ['important'], 'answer_start': [240]}"
2,Is the sound of the movie a reason to recommen...,"(111, 135)",This was my first bluray of a Disney classic a...,sound were crystal clear,2.0,"{'text': ['sound were crystal clear'], 'answer..."
3,Is the sound of the movie a reason to recommen...,"(394, 408)",This was my first bluray of a Disney classic a...,ANSWERNOTFOUND,3.0,"{'text': ['ANSWERNOTFOUND'], 'answer_start': [..."
4,How do you rate the sound?,"(5806, 5820)","In the realm of big Hollywood filmmaking, a fe...",ANSWERNOTFOUND,4.0,"{'text': ['ANSWERNOTFOUND'], 'answer_start': [..."


In [None]:
# Load the 'squad' metric
# NOTE(review): `datasets.load_metric` has been deprecated in favour of the
# separate `evaluate` package - confirm it is still available in the installed
# `datasets` version.
metric = datasets.load_metric('squad') # Same metric as used in the fine-tuning stage

- The code remains unchanged up to this point from the fine-tuning stage.

In [19]:
# Collect gold answers and model predictions, skipping rows without an answer
actual_answers, predicted_answers = [], []

for question, context, answer in zip(data['question'], data['context'], data['answers']):
    # The gold answer is the first entry of the 'text' list in the answer dict
    gold = answer['text'][0]
    if gold == 'ANSWERNOTFOUND':
        continue
    # Predict with rag_answer, using the row's own context text
    guess = rag_answer(question, context)
    actual_answers.append(gold.lower().strip())
    predicted_answers.append(guess.lower().strip())

In [21]:
# Shape the answers into the records passed to metric.compute
def _as_reference(idx, answer_text):
    """Reference record: string id plus a one-answer dict with start offset 0."""
    return {'id': str(idx), 'answers': {'text': [answer_text], 'answer_start': [0]}}


def _as_prediction(idx, answer_text):
    """Prediction record: string id plus the predicted text."""
    return {'id': str(idx), 'prediction_text': answer_text}


# Ids are the list positions, so references and predictions pair up by position
references = [_as_reference(idx, gold) for idx, gold in enumerate(actual_answers)]
predictions = [_as_prediction(idx, guess) for idx, guess in enumerate(predicted_answers)]

In [22]:
# Compute the metrics from the prepared prediction and reference records;
# the result is read by key ('exact_match', 'f1') when the scores are printed
results = metric.compute(predictions=predictions, references=references)

In [23]:
# Report both scores, one per line, in the order EM then F1
for label, key in (("Exact Match Score:", 'exact_match'), ("F1-Score:", 'f1')):
    print(label, results[key], "%")

Exact Match Score: 17.0 %
F1-Score: 21.850642921455368 %
