In [1]:
import pandas as pd
#from readability import Document


#for NER
import en_core_web_sm
nlp= en_core_web_sm.load()
import re
import sys
import nltk
import wikipediaapi
from nltk.tokenize import sent_tokenize, RegexpTokenizer

In [2]:
import logging

# Configure logging
logging.basicConfig(
    filename='my_log.log',    # Log file name
    filemode='a',             # Append mode
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log message format
    level=logging.INFO        # Log level
)

logger = logging.getLogger()

In [3]:
# Ensure the necessary NLTK resources are available
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to C:\Users\Rockin
[nltk_data]     Randal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rockin Randal\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
# Initialize the tokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Define patterns
who_pat = r"\b[Ww]ho\b"
where_pat = r"\b[Ww]here\b"
what_pat = r"\b[Ww]hat\b"
when_pat = r"\b[Ww]hen\b"
born_pat = r"\b[Bb]orn\b"

# Initialize the spaCy model
import spacy
nlp = spacy.load("en_core_web_sm")

# Initialize global variables
query = ""
entity = ""

#classify the query so we can know what the user wants answered
def classify_query(query):
    query_toks = tokenizer.tokenize(query)
    
    query_type = ""
    for word in query_toks:
        if re.search(who_pat, word):
            query_type = "Person"
            logger.info("Person Query")
        elif re.search(where_pat, word):
            query_type = "Location"
            logger.info("Location Query")
        elif re.search(what_pat, word):
            query_type = "Definition"
            logger.info("Definition Query")
        elif re.search(when_pat, word):
            query_type = "Time"
            logger.info("When Query")
        elif re.search(born_pat, word) and query_type == "Time":
            query_type = "Birth"
            logger.info("Birth Query")
        
    return query_toks, query_type

In [5]:
#in the case of a named Entity, this fill find it

def entity_search(tokenized_query):
    global entity
    # Join the tokens into a sentence
    sent = " ".join(tokenized_query)
    # Perform NER on the sentence
    doc = nlp(sent)
    # Get the named entities
    if doc.ents:
        # Assign the first named entity to the global variable entity
        entity = doc.ents[0].text
    else:
        # If no named entities are found, set entity to an empty string or other default value
        entity = ""
    
    logger.info(f"Entity: {entity}")

In [6]:
#in the case there isn't a named entity, we need to single out the key words (nouns/verbs)
def find_key_words(query):
    unimportant_words = r"\b(([Ww]here|[Ww]hat|[Ww]ho|[Ww]hen) (is|was|did))( (a|the))?\b"
    key_words = re.sub(unimportant_words, "", query)
    
    logger.info(f"Key Words: {key_words}")
    return key_words

In [7]:
#sometimes the POS tags were wrong. This will try to correct it.

def correct_tags(key_words):
    corrected_tags = []
    pos_tokens = nltk.word_tokenize(key_words)
    tags = nltk.pos_tag(pos_tokens)

    # Rule-based disambiguation
    for i, (word, tag) in enumerate(tags):
        if word.lower() == "fall" and i > 0 and tags[i-1][1] in ['NN', 'NNS', 'NNP', 'NNPS']:
            corrected_tags.append((word, 'VB'))
        elif word.lower() == "die" and i > 0 and tags[i-1][1] in ['NN', 'NNS', 'NNP', 'NNPS']:
            corrected_tags.append((word, 'VB'))
        elif word.lower() == "born" and i > 0 and tags[i-1][1] in ['NN', 'NNS', 'NNP', 'NNPS']:
            corrected_tags.append((word, 'VB'))
        elif word.lower() == "start" and i > 0 and tags[i-1][1] in ['NN', 'NNS', 'NNP', 'NNPS']:
            corrected_tags.append((word, 'VB'))
        else:
            corrected_tags.append((word, tag))
            
    logger.info(f"Corrected Tags: {corrected_tags}")
    return corrected_tags

In [8]:
#Get the nouns from the POS tags

def find_noun(corrected_tags):
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'IN']

    # Filter out only the nouns
    nouns = [word for word, pos in corrected_tags if pos in noun_tags]

    noun = " ".join(nouns)
    
    logger.info(f"Noun: {noun}")
    return noun

In [9]:
#Leahs scraping code from here
def fetch_wikipedia_summary(topic):
    user_agent = 'PA2/1.0 (lantler@gmu.edu)'
    wiki_api = wikipediaapi.Wikipedia('en', headers={'User-Agent': user_agent})
    page = wiki_api.page(topic)
    if page.exists():
        return page.summary
    else:
        return ""

In [10]:
#function to find the birthday (also finds death day but right now only prints birth)
def find_birthday(answer):    
    #also need some sort of pattern for detecting (Dates)
    months = ["January", "February", "March", "April", "May", "June", 
              "July", "August", "September", "October", "November", "December", 
                  "Jan", "Feb", "Mar", "Apr", "May", "Jun", 
                     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

    escaped_month_patterns = [re.escape(month) for month in months]

    #joining it like (January|February|March|...)
    month_regex = '|'.join(escaped_month_patterns)

    #pattern for matching specific dates formatted in 2 different ways
    #doesnt work for anyone born before 1000... I think it will be okay
    date_regex = rf'\b((?:{month_regex})\s\d{{1,2}},?\s\d{{3,4}}|\d{{1,2}}\s(?:{month_regex}),?\s\d{{3,4}})\b'

    #match[0] will correspond to the birthday, match[1] will correspond to the death day if there is one
    match = re.findall(date_regex, answer, re.IGNORECASE)

    # Print the first match if found
    if match:
        logger.info(f"{entity} was born on {match[0]}.")
    else:
        logger.info("Sorry, I don't know the answer")

In [11]:
#Grab the response based on the question
def generate_response(question, summary, query_type):
    location_patterns = r"\b((is|was|did)* (near|around|at|spanning)+)\b"
    #when patterns should hopefully also end in a number
    when_patterns = r"\b((began|happened|established)+)\b"
    #also need some sort of pattern for detecting (Dates)
    born_patterns = r"\b([Bb]orn|(.*\d+))\b"

    #Need extra logic to check for name of person or thing
    def_patterns = r"\b(is|was)+\b"
    person_patterns = r"\b(is|was)+\b"
    use_pattern = ""
    
    
    #question_words = question.lower().split()
    #subject = " ".join(question_words[2:]).replace('?', '')
    if query_type == "Location":
        use_pattern = location_patterns
    elif query_type == "Time":
        use_pattern = when_patterns
    elif query_type == "Born":
        use_pattern = born_patterns
    elif query_type == "Definition":
        use_pattern = def_patterns
    elif query_type == "Person":
        use_pattern = person_patterns

    sentences = sent_tokenize(summary)
    
    for sentence in sentences:
        if re.search(use_pattern, sentence.lower()) != None:
            return sentence.strip()

    return "I'm sorry, I don't know the answer."

In [None]:
def main():
    global query, entity

    while query.lower() != 'exit':
        tokens, query_type = classify_query(query)
        entity_search(tokens)
        key_words = find_key_words(query)
        corrected_tags = correct_tags(key_words)
        noun = find_noun(corrected_tags)
        logger.info(query_type)
        
        search_term = "".join(noun)
        content_summary = fetch_wikipedia_summary(search_term)
        logger.info(content_summary)
        
        if entity:
            answer = generate_response(entity, content_summary, query_type)
        else:
            answer = generate_response(search_term, content_summary, query_type)
        
        print("-----\n\n{}\n\n-----".format(answer))

        if query_type == "Birth":
            find_birthday(answer)
        
        query = input("Please ask a question. Type 'exit' to exit: ")

if __name__ == "__main__":
    query = input("Please ask a question, Type 'Exit' to exit: ").strip()
    pattern = r'[^\w\s]'
    query = re.sub(pattern, '', query)

    main()

Please ask a question, Type 'Exit' to exit:  When was George Washington born?


-----

George Washington (February 22, 1732 – December 14, 1799) was an American Founding Father, military officer, and politician who served as the first president of the United States from 1789 to 1797.

-----


Please ask a question. Type 'exit' to exit:  What is a bicycle?


-----

A bicycle, also called a pedal cycle, bike, push-bike or cycle, is a human-powered or motor-assisted, pedal-driven, single-track vehicle, with two wheels attached to a frame, one behind the other.

-----


Please ask a question. Type 'exit' to exit:  When was George Washington born?


-----

George Washington (February 22, 1732 – December 14, 1799) was an American Founding Father, military officer, and politician who served as the first president of the United States from 1789 to 1797.

-----


Please ask a question. Type 'exit' to exit:  Where is Minnesota?


-----

Minnesota is known as the "Land of 10,000 Lakes" but actually has 14,380 bodies of fresh water covering at least ten acres each; roughly a third of the state is forested; much of the remainder is prairie and farmland.

-----


Please ask a question. Type 'exit' to exit:  Who was George Washington?


-----

George Washington (February 22, 1732 – December 14, 1799) was an American Founding Father, military officer, and politician who served as the first president of the United States from 1789 to 1797.

-----


Please ask a question. Type 'exit' to exit:  Where was Joseph Biden born?


-----

I'm sorry, I don't know the answer.

-----


Location Query
Entity: Russia
Key Words:  Russia?
Corrected Tags: [('Russia', 'NNP'), ('?', '.')]
Noun: Russia
Location
Russia, or the Russian Federation, is a country spanning Eastern Europe and North Asia. It is the largest country in the world by area, extending across eleven time zones and sharing land borders with fourteen countries. It is the world's ninth-most populous country and Europe's most populous country. Russia is a highly urbanized country including 16 population centers with over a million inhabitants. Its capital as well as its largest city is Moscow. Saint Petersburg is Russia's second-largest city and its cultural capital. 
The East Slavs emerged as a recognised group in Europe between the 3rd and 8th centuries CE. The first East Slavic state, Kievan Rus', arose in the 9th century, and in 988, it adopted Orthodox Christianity from the Byzantine Empire. Rus' ultimately disintegrated, with the Grand Duchy of Moscow growing to become the Tsardom of Russia. By the early

Please ask a question. Type 'exit' to exit:  When was russia?


When Query
Entity: russia
Key Words:  russia?
Corrected Tags: [('russia', 'NN'), ('?', '.')]
Noun: russia
Time
Russia, or the Russian Federation, is a country spanning Eastern Europe and North Asia. It is the largest country in the world by area, extending across eleven time zones and sharing land borders with fourteen countries. It is the world's ninth-most populous country and Europe's most populous country. Russia is a highly urbanized country including 16 population centers with over a million inhabitants. Its capital as well as its largest city is Moscow. Saint Petersburg is Russia's second-largest city and its cultural capital. 
The East Slavs emerged as a recognised group in Europe between the 3rd and 8th centuries CE. The first East Slavic state, Kievan Rus', arose in the 9th century, and in 988, it adopted Orthodox Christianity from the Byzantine Empire. Rus' ultimately disintegrated, with the Grand Duchy of Moscow growing to become the Tsardom of Russia. By the early 18th cen

Please ask a question. Type 'exit' to exit:  When was ths USA?


When Query
Entity: USA
Key Words:  ths USA?
Corrected Tags: [('ths', 'NNS'), ('USA', 'NNP'), ('?', '.')]
Noun: ths USA
Time

I'm sorry, I don't know the answer.


Please ask a question. Type 'exit' to exit:  When was the United States of America?


When Query
Entity: the United States of America
Key Words:  United States of America?
Corrected Tags: [('United', 'NNP'), ('States', 'NNPS'), ('of', 'IN'), ('America', 'NNP'), ('?', '.')]
Noun: United States of America
Time
The United States of America (USA or U.S.A.), commonly known as the United States (US or U.S.) or America, is a country primarily located in North America. It is a federation of 50 states, a federal capital district (Washington, D.C.), and 326 Indian reservations. Outside the union of states, it asserts sovereignty over five major unincorporated island territories and various uninhabited islands. The country has the world's third-largest land area, second-largest exclusive economic zone, and third-largest population, exceeding 334 million.
Paleo-Indians migrated across the Bering land bridge more than 12,000 years ago, and went on to form various civilizations and societies. British colonization led to the first settlement of the Thirteen Colonies in Virginia in 160

Please ask a question. Type 'exit' to exit:  What is the Chernobyl Disaster?


Definition Query
Entity: 
Key Words:  Chernobyl Disaster?
Corrected Tags: [('Chernobyl', 'NNP'), ('Disaster', 'NNP'), ('?', '.')]
Noun: Chernobyl Disaster
Definition
The Chernobyl disaster began on 26 April 1986 with the explosion of the No. 4 reactor of the Chernobyl Nuclear Power Plant near the city of Pripyat in the north of the Ukrainian SSR, close to the border with the Byelorussian SSR, in the Soviet Union. It is one of only two nuclear energy accidents rated at seven—the maximum severity—on the International Nuclear Event Scale, the other being the 2011 Fukushima nuclear accident. The initial emergency response and subsequent mitigation efforts involved more than 500,000 personnel and cost an estimated 18 billion roubles—roughly US$68 billion in 2019, adjusted for inflation. It was the worst nuclear disaster in history, and the costliest disaster in human history, costing an estimated US$700 billion.
The accident occurred during a test of the steam turbine's ability to power the

Please ask a question. Type 'exit' to exit:  When did the Chernobyl Disaster happen?


When Query
Entity: 
Key Words:  Chernobyl Disaster happen?
Corrected Tags: [('Chernobyl', 'NNP'), ('Disaster', 'NNP'), ('happen', 'VB'), ('?', '.')]
Noun: Chernobyl Disaster
Time
The Chernobyl disaster began on 26 April 1986 with the explosion of the No. 4 reactor of the Chernobyl Nuclear Power Plant near the city of Pripyat in the north of the Ukrainian SSR, close to the border with the Byelorussian SSR, in the Soviet Union. It is one of only two nuclear energy accidents rated at seven—the maximum severity—on the International Nuclear Event Scale, the other being the 2011 Fukushima nuclear accident. The initial emergency response and subsequent mitigation efforts involved more than 500,000 personnel and cost an estimated 18 billion roubles—roughly US$68 billion in 2019, adjusted for inflation. It was the worst nuclear disaster in history, and the costliest disaster in human history, costing an estimated US$700 billion.
The accident occurred during a test of the steam turbine's ability

Please ask a question. Type 'exit' to exit:  exit


In [None]:
#All that's left to do is parse through the content summary and find relavent information for more complicated questions.
#having the query type and associated verb will help to specify what the user wants answered