In [2]:
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import numpy as np

[nltk_data] Downloading package punkt to /Users/letriluan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/letriluan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('/Users/letriluan/Downloads/NLP/news_dataset.csv', encoding="latin1")
df

Unnamed: 0,id,author,date,year,month,topic,article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB..."
...,...,...,...,...,...,...,...
995,18460,Gerry Mullany,14/03/2017,2017,3,accidents,HONG KONG ? Hundreds of pilot whales that s...
996,18461,Rory Smith,10/02/2017,2017,2,sports,"NICE, France ? Riv?re accepts the complim..."
997,18462,Jack Ewing,9/02/2017,2017,2,business,FRANKFURT ? Germans who never really warmed...
998,18463,Scott Cacciola,10/02/2017,2017,2,sports,Charles Oakley has strong feelings about compe...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       1000 non-null   int64 
 1   author   994 non-null    object
 2   date     1000 non-null   object
 3   year     1000 non-null   object
 4   month    1000 non-null   object
 5   topic    1000 non-null   object
 6   article  1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


In [5]:
df.describe()

Unnamed: 0,id
count,1000.0
mean,17878.532
std,341.50184
min,17283.0
25%,17582.75
50%,17881.5
75%,18182.25
max,18465.0


In [6]:
df.isnull().sum()

id         0
author     6
date       0
year       0
month      0
topic      0
article    0
dtype: int64

**Pre-process**

In [7]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import spacy
import neuralcoref

nlp = spacy.load("en_core_web_sm")
neuralcoref.add_to_pipe(nlp)

def clean_text(text):
    tokenizer = RegexpTokenizer(r'\b\w+\b') 
    tokens = tokenizer.tokenize(text)
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    doc = nlp(" ".join(filtered_tokens))
    lemmatized_tokens = [token.lemma_ if token.lemma_ != '-PRON-' else token.text for token in doc]
    
    return " ".join(lemmatized_tokens)

df_new = df.copy()
df_new["article"] = df_new["article"].apply(clean_text)


**Coreference Resolution utility**

In [8]:
def resolve_coreferences(text):
    doc = nlp(text)
    if doc._.has_coref:
        return doc._.coref_resolved
    return text

**Text matching utility**

In [10]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

def find_most_relevant_sentence(question, article_text):
    text = resolve_coreferences(article_text)
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    question_embedding = model.encode(question)
    sentence_embeddings = model.encode(sentences)

    similarities = util.pytorch_cos_sim(question_embedding, sentence_embeddings).squeeze()
    most_similar_index = similarities.argmax().item()
    confidence = similarities[most_similar_index].item()
    
    return sentences[most_similar_index], confidence


In [11]:
nlp = spacy.load("en_core_web_sm")
def extract_relevant_snippets(question, relevant_sentence):
    doc = nlp(relevant_sentence)
    question_doc = nlp(question)

    target_label = 'PERSON'  # Default to PERSON
    
    if any(word in question.lower() for word in ['who', 'name']):
        target_label = 'PERSON'
    elif any(word in question.lower() for word in ['when', 'date', 'year', 'time']):
        target_label = 'DATE'
    elif any(word in question.lower() for word in ['where', 'city', 'country', 'place', 'location']):
        target_label = 'GPE'
    elif any(word in question.lower() for word in ['what', 'company', 'organization']):
        target_label = 'ORG'
    elif 'how many' in question.lower():
        target_label = 'CARDINAL' 

    entities = {}
    for ent in doc.ents:
        if ent.label_ == target_label:
            if ent.text in entities:
                entities[ent.text] += 1
            else:
                entities[ent.text] = 1

    if entities:
        sorted_entities = sorted(entities.items(), key=lambda item: (-item[1], relevant_sentence.index(item[0])))
        return sorted_entities[0][0]

    return "No relevant information found."


In [14]:
def answer_question_from_article(article_id, question, df):
    try:
        article_text = df.loc[df['id'] == article_id, 'article'].values[0]
    except IndexError:
        return "Article not found."

    relevant_sentence, confidence = find_most_relevant_sentence(question, article_text)
    
    confidence_threshold = 0.4
    if confidence < confidence_threshold:
        return "High confidence answer not found."
    
    answer_snippet = extract_relevant_snippets(question, relevant_sentence)
    return answer_snippet, confidence

article_id = 17574  
question = "Who is the vice chairman of Samsung?"
answer = answer_question_from_article(article_id, question, df_new)
print("Answer snippet:", answer)

Answer snippet: ('Jay Lee', 0.5828641653060913)


In [None]:
Article ID:  17574
Your question:  Who run the Samsung effectively?
Answer: High confidence answer not found.
Article ID:  17574
Your question:  Who is vice the chairman of Samsung?
Answer: ('Jay Lee', 0.5779139995574951)
Article ID:  17574
Your question:  Who is vice the chairman of Samsung?
Answer: ('Jay Lee', 0.5779139995574951)

**Interaction**

In [16]:
def user_interaction():
    while True:
        article_id = input("Enter the article ID or type 'quit' to exit: ")
        if article_id.lower() == 'quit':
            break
        print( 'Article ID: ', article_id)
        question = input("Enter your question: ")

        if question.lower() == 'quit':
            break
        print('Your question: ', question)
        try:
            article_id = int(article_id)
            answer = answer_question_from_article(article_id, question, df_new)
            print("Answer:", answer)
        except ValueError:
            print("Invalid article ID. Please enter a numeric ID.")

# To run the interaction
user_interaction()

Article ID:  17
Your question:  Who I am?
Answer: Article not found.
Article ID:  17574
Your question:  Who I am?
Answer: High confidence answer not found.
Article ID:  17574
Your question:  Who is the vice chairman of Samsung?
Answer: ('Jay Lee', 0.5828641653060913)
Article ID:  17574
Your question:  When the vice Chaiman of Samsung will be questioned?
Answer: ('Thursday', 0.5326929688453674)
Article ID:  17574
Your question:  When Jay Lee will be questioned?
Answer: ('November', 0.5102343559265137)


In [17]:
user_interaction() #Tranformers based

Article ID:  18460
Your question:  How many pilot whales that swam into a shallow New Zealand bay died overnight?
Answer: ('Hundreds', 0.8101339936256409)
Article ID:  18460
Your question:  How many rescuers tried frantically to send the pilot whales back out to sea?
Answer: ('500', 0.7037572264671326)
Article ID:  18460
Your question:  Where is has one of the highest rates of whale strandings?
Answer: ('New Zealand', 0.6138548851013184)


**BERT**

In [20]:
from transformers import BertTokenizer, BertForQuestionAnswering
import torch
from torch.nn.functional import softmax

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

def answer_question_bert(question, context):
    """Function to answer questions using BERT directly from the context."""
    inputs = tokenizer(question, context, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    outputs = model(**inputs, return_dict=True)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    return answer

def answer_question_bert(question, context):
    """Function to answer questions using BERT directly from the context, including confidence score."""
    inputs = tokenizer(question, context, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    with torch.no_grad(): 
        outputs = model(**inputs)
        answer_start_scores = outputs.start_logits
        answer_end_scores = outputs.end_logits

    start_probs = softmax(answer_start_scores, dim=-1)
    end_probs = softmax(answer_end_scores, dim=-1)
    answer_start = torch.argmax(start_probs)
    answer_end = torch.argmax(end_probs) + 1
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer

def answer_question_from_article(article_id, question, df):
    """Retrieve an article by ID and use BERT to answer a question based on the article's text, including confidence."""
    try:
        article_text = df.loc[df['id'] == article_id, 'article'].values[0]
    except IndexError:
        return "Article not found."

    answer = answer_question_bert(question, article_text)
    return answer

Answer snippet: new zealand


In [21]:
user_interaction() #BERT model

Article ID:  15
Your question:  Who I am?
Answer: Article not found.
Article ID:  17574
Your question:  Who I am?
Answer: [CLS]
Article ID:  17574
Your question:  Who is the vice chairman of Samsung?
Answer: jay lee
Article ID:  17574
Your question:  Who run the Samsung effectively?
Answer: mr lee
Article ID:  17574
Your question:  When the vice Chaiman of Samsung will be questioned?
Answer: thursday
Article ID:  17574
Your question:  When Jay Lee will be questioned?
Answer: thursday


In [23]:
user_interaction() #BERT model

Article ID:  18460
Your question:  How many pilot whales that swam into a shallow New Zealand bay died overnight?
Answer: hundreds
Article ID:  18460
Your question:  How many rescuers tried frantically to send the pilot whales back out to sea?
Answer: 500
Article ID:  18460
Your question:  Where is has one of the highest rates of whale strandings?
Answer: new zealand
