In [11]:
# HTTP Requests
import requests

# BeautifulSoup for HTML Parsing
from bs4 import BeautifulSoup

# Hugging Face Transformers for NLP Pipelines
from transformers import pipeline

# NLTK for Natural Language Processing
import nltk
from nltk.translate.bleu_score import sentence_bleu

# NLTK Setup
nltk.download('punkt')

# Initialize the question-answering pipeline from the Fine-Tuned model
qa_pipeline = pipeline("question-answering", model="kgntmr/RoBERTa-SQuAD2.0-SubjQA")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# Previously determined URLs to scrape content from
urls = [
    "https://www.theguardian.com/technology/2016/may/03/amazon-fresh-food-deliveries-understood-to-start-this-month",
    "https://www.theguardian.com/media/2016/may/16/bbc-netflix-rival-itv-nbc-universal",
    "https://www.theguardian.com/technology/2016/apr/28/amazon-most-profitable-quarter-sales-up-costs",
    "https://www.theguardian.com/technology/2016/apr/26/amazon-kindle-oasis-review-luxury-e-reader",
    "https://www.theguardian.com/environment/andes-to-the-amazon/2016/may/25/london-stock-exchange-amazon-deforestation",
    "https://www.theguardian.com/media/2016/may/25/netflix-and-amazon-must-guarantee-20-of-content-is-european",
    "https://www.theguardian.com/technology/2016/may/26/amazon-echo-virtual-assistant-child-privacy-law",
]

# Selector attributes for the specific part of the HTML to scrape
selector_attrs = {"class": "article-body-commercial-selector"}

# Function to scrape the text content from a given URL
def get_article_text(url, selector_attrs):
    # Sends a GET request to the URL and retrieves the response
    response = requests.get(url)
    # Parses the HTML content of the response with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Finds all paragraph elements (<p>) within the HTML
    paragraphs = soup.find_all('p')
    # Extracts the text from each paragraph and joins them into a single string
    return ' '.join([p.get_text() for p in paragraphs])

In [13]:
# Scrape the content from each URL
contexts = []
for url in urls:
    try:
        # Fetch and scrape the article text from the URL
        context = get_article_text(url, selector_attrs)
        # Check if context is not empty
        if context:
            contexts.append(context)
        else:
            # Placeholder for empty context
            contexts.append("Content not found.")
    except Exception as e:
        # Print error message and add placeholder for the error
        print(f"Error fetching {url}: {e}")
        contexts.append("Content not found.")

# Print the list of scraped content
print(contexts)



In [14]:
# Define a set of questions and reference answers for each context
qa_examples = [
    {
        "question": "Summarize the main content of the article",
        "reference": "Amazon is believed to be planning to start delivering fresh food in the UK this month.",
        "context": contexts[0]
    },
    {
        "question": "Summarize the main content of the article",
        "reference": "BBC and ITV partner with NBC Universal to rival Netflix.",
        "context": contexts[1]
    },
    {
        "question": "Summarize the main content of the article",
        "reference": "Amazon reports its most profitable quarter with increased sales.",
        "context": contexts[2]
    },
    {
        "question": "Summarize the main content of the article",
        "reference": "Review of Amazon Kindle Oasis, a luxury e-reader.",
        "context": contexts[3]
    },
    {
        "question": "Summarize the main content of the article",
        "reference": "London Stock Exchange faces scrutiny over Amazon deforestation.",
        "context": contexts[4]
    },
    {
        "question": "Summarize the main content of the article",
        "reference": "Netflix and Amazon must ensure 20% of their content is European.",
        "context": contexts[5]
    },
    {
        "question": "Summarize the main content of the article",
        "reference": "Amazon Echo virtual assistant faces child privacy law issues.",
        "context": contexts[6]
    },
]

In [15]:
# List to store BLEU scores for each generated answer
bleu_scores = []

for example in qa_examples:
    if example['context'] and example['context'] != "Content not found.":
        # Use the QA pipeline to generate an answer based on the question and context
        result = qa_pipeline(question=example['question'], context=example['context'])
        generated_answer = result['answer']  # Extract the generated answer
        reference_answer = example['reference']  # Reference answer for comparison
        # Tokenize both generated and reference answers
        generated_tokens = nltk.word_tokenize(generated_answer)
        reference_tokens = [nltk.word_tokenize(reference_answer)]
        # Calculate the BLEU score to measure the similarity
        bleu_score = sentence_bleu(reference_tokens, generated_tokens)
        bleu_scores.append(bleu_score)  # Add the BLEU score to the list
        print(f"Generated Answer: {generated_answer}, Reference Answer: {reference_answer}, BLEU Score: {bleu_score}")
    else:
        # Skip example if context is empty or "Content not found."
        print(f"Skipping example due to empty context for question: {example['question']}")

# Calculate the average BLEU score
if bleu_scores:
    average_bleu_score = sum(bleu_scores) / len(bleu_scores)  # Average of all BLEU scores
else:
    average_bleu_score = 0  # No valid BLEU scores, set average to 0
print(f"Average BLEU Score: {average_bleu_score}")

Generated Answer: the next few weeks, Reference Answer: Amazon is expected to start fresh food deliveries soon., BLEU Score: 0
Generated Answer: The BBC has traditionally been wary of introducing subscription services in the UK, Reference Answer: BBC and ITV partner with NBC Universal to rival Netflix., BLEU Score: 9.594503055152632e-232
Generated Answer: better-than-expected profits, Reference Answer: Amazon reports its most profitable quarter with increased sales., BLEU Score: 0
Generated Answer: influenced by advertisers or affiliates and is not written to promote these products, Reference Answer: Review of Amazon Kindle Oasis, a luxury e-reader., BLEU Score: 0
Generated Answer: substantially more land, Reference Answer: London Stock Exchange faces scrutiny over Amazon deforestation., BLEU Score: 0
Generated Answer: the European Union, Reference Answer: Netflix and Amazon must ensure 20% of their content is European., BLEU Score: 4.938325780438735e-233
Generated Answer: for everyone