In [1]:
import pandas as pd
from readability import Document


# Named-entity recognition (NER): load the spaCy English model `en_core_web_sm`.
# `nlp` is the pipeline object that entity_search() calls on the query text.
import en_core_web_sm
nlp= en_core_web_sm.load()
import re
import sys
import nltk
import wikipediaapi
from nltk.tokenize import sent_tokenize, RegexpTokenizer

In [None]:
# Ensure the necessary NLTK resources are available.
# 'punkt' and 'averaged_perceptron_tagger' are the resource names older NLTK
# releases look up; NLTK 3.9+ looks up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead. Fetch both generations so that
# sent_tokenize / word_tokenize / pos_tag work regardless of the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

In [20]:
import spacy

# Module-level state shared between cells: main() rebinds `query` and
# entity_search() rebinds `entity`.
query = ""
entity = ""

# Word tokenizer: keeps runs of word characters, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')

# One pattern per supported question word (capitalised or lowercase first letter).
who_pat, where_pat, what_pat, when_pat = (
    r"\b[Ww]ho\b",
    r"\b[Ww]here\b",
    r"\b[Ww]hat\b",
    r"\b[Ww]hen\b",
)

# spaCy pipeline used for named-entity recognition.
nlp = spacy.load("en_core_web_sm")

#classify the query so we can know what the user wants answered
def classify_query(query):
    """Tokenize a question and classify it by its leading question word.

    The query is split with the module-level ``tokenizer``; each token is then
    tested against the module-level ``who_pat`` / ``where_pat`` / ``what_pat`` /
    ``when_pat`` patterns. The first token that matches decides the type, and
    scanning stops there so that a later wh-word inside a relative clause
    (e.g. "When was the man who ... born") cannot override the actual question
    word or produce a second, contradictory message.

    Args:
        query: The user's question as plain text.

    Returns:
        A ``(query_toks, query_type)`` tuple where ``query_toks`` is the list of
        tokens and ``query_type`` is "Person", "Location", "Definition", "Time",
        or "" when no question word is present.
    """
    query_toks = tokenizer.tokenize(query)

    # (pattern, resulting type, message printed), in the original priority order.
    rules = (
        (who_pat, "Person", "Person Query"),
        (where_pat, "Location", "Location Query"),
        (what_pat, "Definition", "Definition Query"),
        (when_pat, "Time", "When Query"),
    )

    query_type = ""
    for word in query_toks:
        matched = next((rule for rule in rules if re.search(rule[0], word)), None)
        if matched is not None:
            _, query_type, message = matched
            print(message)
            # The first question word wins; do not let later tokens overwrite it.
            break

    return query_toks, query_type

In [21]:
# In the case of a named entity, this will find it.

def entity_search(tokenized_query):
    """Run NER over the query tokens and remember the first entity found.

    The tokens are re-joined with single spaces and parsed with the
    module-level ``nlp`` pipeline. The text of the first recognised entity is
    stored in the global ``entity`` ("" when none is recognised) and printed.
    Nothing is returned.
    """
    global entity

    parsed = nlp(" ".join(tokenized_query))
    found = parsed.ents
    # Only the first entity is kept; an empty string marks "no entity".
    entity = found[0].text if found else ""

    print(f"Entity: {entity}")

When did the Berlin Wall fall


In [22]:
#in the case there isn't a named entity, we need to single out the key words (nouns/verbs)
def find_key_words(query):
    """Strip the question phrase from a query, leaving only its key words.

    Removes every "<where|what|who|when> <is|was|did>" phrase together with an
    optional following article ("a", "an" or "the"), then normalises the
    whitespace left behind so the result has no leading, trailing or repeated
    spaces. The result is also printed.

    Args:
        query: The user's question as plain text.

    Returns:
        The remaining text, e.g. "When did the Berlin Wall fall" ->
        "Berlin Wall fall". A query without a question phrase is returned with
        only its whitespace normalised.
    """
    # `an?` covers both "a" and "an"; the trailing \b keeps words that merely
    # start with an article (e.g. "theory") from being truncated.
    unimportant_words = r"\b(([Ww]here|[Ww]hat|[Ww]ho|[Ww]hen) (is|was|did))( (an?|the))?\b"
    stripped = re.sub(unimportant_words, "", query)

    # Removing the phrase leaves the space that followed it; tidy that up.
    key_words = " ".join(stripped.split())

    print(f"Key Words: {key_words}")
    return key_words

In [23]:
#sometimes the POS tags were wrong. This will try to correct it.

def correct_tags(key_words):
    """POS-tag the key words and patch a few known verb mis-tags.

    The text is tokenized and tagged with NLTK. Any of the words "fall", "die",
    "born" or "start" (case-insensitive) that directly follows a token tagged
    NN, NNS, NNP or NNPS is re-tagged as 'VB'; every other (word, tag) pair is
    kept as NLTK produced it. The neighbour check uses the original tags, not
    the corrected ones.

    Args:
        key_words: Text to tag.

    Returns:
        A list of (word, tag) tuples, which is also printed.
    """
    noun_like = ('NN', 'NNS', 'NNP', 'NNPS')
    verbs_to_fix = ("fall", "die", "born", "start")

    tags = nltk.pos_tag(nltk.word_tokenize(key_words))

    corrected_tags = []
    for idx, pair in enumerate(tags):
        token = pair[0]
        follows_noun = idx > 0 and tags[idx - 1][1] in noun_like
        if follows_noun and token.lower() in verbs_to_fix:
            corrected_tags.append((token, 'VB'))
        else:
            corrected_tags.append(pair)

    print(f"Corrected Tags: {corrected_tags}")
    return corrected_tags

When Query


In [24]:
#Get the nouns from the POS tags

def find_noun(corrected_tags):
    """Build the search phrase from the tagged words that should be kept.

    Words tagged NN, NNS, NNP, NNPS, JJ or IN are kept in their original order
    and joined with single spaces; everything else is dropped. The phrase is
    printed as well as returned.

    Args:
        corrected_tags: Iterable of (word, tag) pairs.

    Returns:
        The kept words joined by spaces ("" when nothing is kept).
    """
    keep = ('NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'IN')

    kept_words = []
    for token, tag in corrected_tags:
        if tag in keep:
            kept_words.append(token)
    noun = " ".join(kept_words)

    print(f"Noun: {noun}")
    return noun

['When', 'did', 'the', 'Berlin', 'Wall', 'fall']


In [26]:
# Leah's scraping code from here
def fetch_wikipedia_summary(topic):
    """Return the Wikipedia summary for ``topic``, or "" if no page exists.

    A new ``wikipediaapi.Wikipedia`` client for language 'en' is created on
    every call, identifying itself through a custom User-Agent header.
    """
    client = wikipediaapi.Wikipedia(
        'en',
        headers={'User-Agent': 'PA2/1.0 (lantler@gmu.edu)'},
    )
    article = client.page(topic)
    return article.summary if article.exists() else ""

the Berlin Wall


In [27]:
def generate_response(question, summary):
    """Return the first summary sentence that mentions the question's subject.

    The subject is the lower-cased question minus its first two words
    (e.g. "when did"), with "?" removed. When that leaves nothing, because the
    text has two words or fewer (a bare search term such as "Berlin Wall"), the
    whole text is used as the subject instead. Without this fallback the
    subject would be "", and since "" is a substring of every string the first
    sentence would be returned regardless of its content.

    Args:
        question: The question, or a bare entity / search term.
        summary: Text to search; it is split into sentences with sent_tokenize.

    Returns:
        The first sentence containing the subject (case-insensitive), stripped
        of surrounding whitespace, or "I'm sorry, I don't know the answer."
        when there is no subject or no sentence mentions it.
    """
    no_answer = "I'm sorry, I don't know the answer."

    question_words = question.lower().split()
    subject = " ".join(question_words[2:]).replace('?', '').strip()
    if not subject:
        # Short input: treat the whole text as the subject.
        subject = " ".join(question_words).replace('?', '').strip()
    if not subject:
        return no_answer

    for sentence in sent_tokenize(summary):
        # `subject` is already lower-cased, so only the sentence needs folding.
        if subject in sentence.lower():
            return sentence.strip()

    return no_answer

In [34]:
def main():
    """Interactive question-answering loop.

    Starts from the current value of the global ``query`` and repeats until the
    query equals 'exit' (case-insensitive). For each query it classifies the
    question, runs entity search, extracts and tags the key words, builds a
    search term, fetches the Wikipedia summary for it, and prints both the
    summary and the answer chosen by generate_response(). The entity found by
    entity_search() is preferred over the search term when picking the answer.

    Each follow-up query is cleaned (punctuation removed, surrounding
    whitespace stripped) before it is processed, so inputs such as "Exit." or
    "exit " end the loop and stray punctuation never reaches the pipeline.
    """
    global query, entity

    while query.lower() != 'exit':
        tokens, _query_type = classify_query(query)
        entity_search(tokens)
        key_words = find_key_words(query)
        corrected_tags = correct_tags(key_words)
        search_term = find_noun(corrected_tags)

        # Nothing to look up when no key word survived the filtering.
        content_summary = fetch_wikipedia_summary(search_term) if search_term else ""
        print(content_summary)

        answer = generate_response(entity or search_term, content_summary)
        print(answer)

        raw_query = input("Please ask a question. Type 'exit' to exit: ")
        # Drop punctuation first, then trim, so "exit !" also becomes "exit".
        query = re.sub(r'[^\w\s]', '', raw_query).strip()

if __name__ == "__main__":
    # Matches every character that is neither a word character nor whitespace.
    pattern = r'[^\w\s]'

    # Read the first question, trim it, and drop punctuation before the loop starts.
    query = re.sub(
        pattern,
        '',
        input("Please ask a question, Type 'Exit' to exit: ").strip(),
    )

    main()

In [None]:
# All that's left to do is parse through the content summary and find relevant information for more complicated questions.
# Having the query type and associated verb will help to specify what the user wants answered.