In [1]:
import bs4 as bs
import urllib.request
import re
import spacy
import pandas as pd
from bs4 import BeautifulSoup
from readability import Document
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import RegexpTokenizer

#for NER
import en_core_web_sm
nlp= en_core_web_sm.load()

from spacy import displacy

In [2]:
import wikipediaapi

In [3]:
from pprint import pprint

In [19]:
# Read the user's question from standard input; later cells clean and classify it.
# NOTE(review): the prompt mentions typing 'Exit', but no visible cell checks for it — confirm intended.
query = input("Please ask a question, Type 'Exit' to exit: ")

Please ask a question, Type 'Exit' to exit:  When did the Berlin Wall fall?


In [20]:
# Remove every character that is neither a word character nor whitespace
# (for example the trailing "?") before the query is tokenised.
pattern = r'[^\w\s]'
query = re.compile(pattern).sub('', query)

In [21]:
# Show the query after punctuation removal.
print(query)

When did the Berlin Wall fall


In [22]:
# Splits text into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

# One pattern per supported question word; the first letter may be upper or lower case.
who_pat = r"\b[Ww]ho\b"
where_pat = r"\b[Ww]here\b"
what_pat = r"\b[Ww]hat\b"
when_pat = r"\b[Ww]hen\b"


def classify_query(query):
    """Tokenise a query and label it by the question word it contains.

    Each token is tested against the who/where/what/when patterns in that
    order. Every matching token prints a message and overwrites the label,
    so the last question word in the query decides the result.

    Returns a tuple ``(tokens, query_type)`` where ``query_type`` is one of
    "Person", "Location", "Definition", "Time", or "" when no token matched.
    """
    # (pattern, label, message) in the order the checks are applied.
    rules = (
        (who_pat, "Person", "Person Query"),
        (where_pat, "Location", "Location Query"),
        (what_pat, "Definition", "Definition Query"),
        (when_pat, "Time", "When Query"),
    )

    query_toks = tokenizer.tokenize(query)
    query_type = ""
    for tok in query_toks:
        for pat, label, message in rules:
            if re.search(pat, tok):
                query_type = label
                print(message)
                break  # only the first matching rule applies to a token

    return query_toks, query_type

In [23]:
# Tokenise the cleaned query and record which kind of question it is.
tokens, query_type= classify_query(query)

When Query


In [24]:
# Inspect the tokens returned by classify_query.
print(tokens)

['When', 'did', 'the', 'Berlin', 'Wall', 'fall']


In [25]:
#now that we have the query type, we can join the query and perform NER on it...

In [26]:
# Text of the first named entity found in the most recent query ("" if none).
entity = ""

def entity_search(tokenized_query, nlp_model=None):
    """Find the first named entity in a tokenised query.

    The tokens are joined with single spaces and passed through an NLP
    pipeline. The text of the first entity is stored in the module-level
    ``entity`` variable (which later cells read), printed, and returned.

    Parameters
    ----------
    tokenized_query : list of str
        Query tokens, e.g. the first value returned by ``classify_query``.
    nlp_model : callable, optional
        Callable taking a string and returning an object with an ``ents``
        sequence whose items expose a ``text`` attribute. Defaults to the
        notebook's global ``nlp`` pipeline.

    Returns
    -------
    str
        Text of the first named entity, or ``""`` when none is found.
    """
    global entity
    pipeline = nlp if nlp_model is None else nlp_model

    # Use the argument, not the notebook-level ``tokens`` variable, so the
    # function analyses whatever query it is actually given.
    doc = pipeline(" ".join(tokenized_query))

    # Keep only the first entity; fall back to "" when there is none.
    entity = doc.ents[0].text if doc.ents else ""

    print(entity)
    return entity

# Run named-entity recognition for the current query; the result is stored in `entity` and printed.
entity_search(tokens)

the Berlin Wall


In [27]:
#have to have a different entity identifier if there is no proper noun
#example query: What is a bicycle
#want to return "bicycle"
key_words= ""

def find_key_words(tokens):
    """Strip question phrases from a query, leaving the words to look up.

    Removes each occurrence of a question word (where/what/who/when)
    followed by "is", "was" or "did" and an optional "a"/"the", for example
    "What is a " or "When did the ".

    Parameters
    ----------
    tokens : str or sequence of str
        The query, either as a single string or as a list of tokens
        (which are joined with single spaces).

    Returns
    -------
    str
        The query text with the question phrase removed.
    """
    unimportant_words = r"\b(([Ww]here|[Ww]hat|[Ww]ho|[Ww]hen) (is|was|did))( (a|the))? \b"

    # Work on the argument rather than the notebook-level ``query`` variable,
    # so the function gives the right answer for any query passed in.
    text = tokens if isinstance(tokens, str) else " ".join(tokens)

    return re.sub(unimportant_words, "", text)

In [34]:
# Drop the question phrase from the cleaned query, keeping the words to look up.
key_words= find_key_words(query)

In [35]:
# Show the remaining key words.
print(key_words)

Berlin Wall fall


In [36]:
from nltk import pos_tag

# POS-tag the key words that remain after the question phrase was removed.
pos_tokens= nltk.word_tokenize(key_words)
tags= nltk.pos_tag(pos_tokens)

print(tags)

#For more complex questions we have to tag potential verbs. A lot of verbs were being mistagged so I am going in with rule-based disambiguation here.

# Tags counted as "noun" when looking at the word before a candidate verb.
NOUN_POS = ('NN', 'NNS', 'NNP', 'NNPS')
# Words that are re-tagged as verbs when they directly follow a noun.
AMBIGUOUS_VERBS = ('fall', 'die', 'born', 'start')

# Start from an empty list on every run: without this the cell raises
# NameError on a fresh kernel and accumulates duplicates when re-executed.
corrected_tags = []

for i, (word, tag) in enumerate(tags):
    # The rule looks at the tagger's original tag of the previous word.
    follows_noun = i > 0 and tags[i-1][1] in NOUN_POS
    if word.lower() in AMBIGUOUS_VERBS and follows_noun:
        # e.g. "Berlin Wall fall": "fall" after a noun is treated as a verb (VB)
        corrected_tags.append((word, 'VB'))
    else:
        corrected_tags.append((word, tag))

print(corrected_tags)


[('Berlin', 'NNP'), ('Wall', 'NNP'), ('fall', 'NN')]
[('Berlin', 'NNP'), ('Wall', 'NNP'), ('fall', 'VB')]


In [37]:
# POS tags whose words are kept for the search phrase: the four noun tags,
# plus 'JJ' and 'IN', which were added for certain cases.
noun_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'IN']

# Keep the word of every (word, tag) pair whose tag is listed above ...
nouns = [pair[0] for pair in corrected_tags if pair[1] in noun_tags]

# ... and join the kept words into one space-separated phrase.
noun = " ".join(nouns)

In [38]:
# Show the phrase that will be used as the search term.
print(noun)

Berlin Wall


In [16]:
#Leah's code for scraping and response generation

In [39]:
def fetch_wikipedia_summary(topic):
    """Look up ``topic`` on English Wikipedia and return the page summary.

    Returns an empty string when no page with that title exists.
    """
    request_headers = {'User-Agent': 'PA2/1.0 (lantler@gmu.edu)'}
    client = wikipediaapi.Wikipedia('en', headers=request_headers)
    article = client.page(topic)
    return article.summary if article.exists() else ""

In [40]:
# `noun` is already a single string (built with " ".join(...) in an earlier cell),
# so this join leaves it unchanged; it is simply stored under the lookup name.
search_term = "".join(noun)
# Fetch the Wikipedia summary for the search term and display it.
content_summary = fetch_wikipedia_summary(search_term)
print(content_summary)

The Berlin Wall (German: Berliner Mauer, pronounced [bɛʁˌliːnɐ ˈmaʊɐ] ) was a guarded concrete barrier that encircled West Berlin of the Federal Republic of Germany (FRG; West Germany) from 1961 to 1989, separating it from East Berlin and the German Democratic Republic (GDR; East Germany). Construction of the Berlin Wall was commenced by the government of the GDR on 13 August 1961. It included guard towers placed along large concrete walls, accompanied by a wide area (later known as the "death strip") that contained anti-vehicle trenches, beds of nails and other defenses. The primary intention for the Wall's construction was to prevent East German citizens from fleeing to the West.
The Soviet Bloc propaganda portrayed the Wall as protecting its population from "fascist elements conspiring to prevent the will of the people" from building a communist state in the GDR. The authorities officially referred to the Berlin Wall as the Anti-Fascist Protection Rampart (German: Antifaschistischer

In [41]:
def generate_response(question, summary):
    """Return the first sentence of ``summary`` that mentions the subject.

    ``question`` may be a full question ("When did the Berlin Wall fall?")
    or just a subject phrase ("the Berlin Wall"). For a full question the
    leading question word and the word after it are dropped; a bare subject
    phrase is used as is. Matching is case-insensitive.

    Parameters
    ----------
    question : str
        The question or subject phrase to look for.
    summary : str
        Text to search, split into sentences with ``sent_tokenize``.

    Returns
    -------
    str
        The first matching sentence (stripped), or an apology message when
        there is no subject to search for or no sentence mentions it.
    """
    no_answer = "I'm sorry, I don't know the answer."
    leading_question_words = ("who", "what", "when", "where", "why", "how", "which")

    question_words = question.lower().replace('?', '').split()

    # Only strip the "<question word> <auxiliary>" prefix when it is really
    # there. Callers also pass plain entities such as "the Berlin Wall", and
    # blindly dropping two words would reduce those to "wall" or to nothing.
    if question_words and question_words[0] in leading_question_words:
        question_words = question_words[2:]

    subject = " ".join(question_words)
    if not subject:
        # An empty subject is a substring of every sentence, so it would
        # "match" the first sentence regardless of content.
        return no_answer

    for sentence in sent_tokenize(summary):
        if subject in sentence.lower():
            return sentence.strip()

    return no_answer

In [42]:
#sometimes the entity output is more accurate than the stripped search term
# Prefer the named entity when one was found; otherwise fall back to the
# stripped search term.
answer = generate_response(entity or search_term, content_summary)

In [43]:
# Show the sentence selected as the answer.
print(answer)

The Berlin Wall (German: Berliner Mauer, pronounced [bɛʁˌliːnɐ ˈmaʊɐ] ) was a guarded concrete barrier that encircled West Berlin of the Federal Republic of Germany (FRG; West Germany) from 1961 to 1989, separating it from East Berlin and the German Democratic Republic (GDR; East Germany).


In [None]:
#All that's left to do is parse through the content summary and find relevant information for more complicated questions.
#Having the query type and associated verb will help to specify what the user wants answered.