# Yes/No Question Answering

For a quick test, start the CoreNLP server with:

java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,parse,depparse,lemma -status_port 9000 -port 9000 -timeout 15000 &

For the full system, start the CoreNLP server with:

java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,ner,parse,depparse -status_port 9000 -port 9000 -timeout 15000 &

# Examples

In [43]:
#Example Dependency Parse
#Talks to a CoreNLP server at http://localhost:9000 (see the startup commands at the top)

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
#The sentence is split on whitespace before being handed to the parser
parses = dep_parser.parse('oranges used to be considered poisonous'.split())
#One list of (governor, dep, dependent) triples per returned parse
[[(governor, dep, dependent) for governor, dep, dependent in p.triples()] for p in parses]

[[(('used', 'VBN'), 'nsubj', ('oranges', 'NNS')),
  (('used', 'VBN'), 'xcomp', ('considered', 'VBN')),
  (('considered', 'VBN'), 'mark', ('to', 'TO')),
  (('considered', 'VBN'), 'auxpass', ('be', 'VB')),
  (('considered', 'VBN'), 'xcomp', ('poisonous', 'JJ'))]]

In [44]:
#Second dependency parse example, reusing dep_parser from the previous cell
#NOTE(review): 'organges' is a misspelling of 'oranges'; left as-is because the recorded output below contains it
parses = dep_parser.parse('organges were considered poisonous'.split())
#One list of (governor, dep, dependent) triples per returned parse
[[(governor, dep, dependent) for governor, dep, dependent in p.triples()] for p in parses]

[[(('considered', 'VBN'), 'nsubjpass', ('organges', 'NNS')),
  (('considered', 'VBN'), 'auxpass', ('were', 'VBD')),
  (('considered', 'VBN'), 'xcomp', ('poisonous', 'JJ'))]]

In [45]:
#Lemmatizer Example
#NOTE: this module-level lemmatizer is also used by the yes/no comparison cell further down
lemmatizer = WordNetLemmatizer() 
  
#The first two calls use the default part of speech; the last one passes pos="v"
print("rocks :", lemmatizer.lemmatize("rocks")) 
print("corpora :", lemmatizer.lemmatize("corpora")) 
print("appears :", lemmatizer.lemmatize("appears", pos="v")) 

rocks : rock
corpora : corpus
appears : appear


# Define Functions

In [2]:
import nltk
import os
import string
from nltk.parse import CoreNLPParser, CoreNLPDependencyParser
from nltk.stem import WordNetLemmatizer 

In [47]:
#From Jesse's question_generation.ipynb
from nltk.tree import Tree

def list_to_string(word_list):
    """Return the words of ``word_list`` joined by single spaces."""
    separator = ' '
    return separator.join(word_list)

def tree_to_string(parsed_tree):
    """Return the leaf tokens of ``parsed_tree`` as one space-separated string.

    ``parsed_tree`` must provide a ``leaves()`` method returning the tokens
    in order; a bare string is not accepted.
    """
    leaf_tokens = parsed_tree.leaves()
    return list_to_string(leaf_tokens)

In [48]:
#Input Structure Example
'''
(WHNP (WP what))
(SQ (VBZ is) (NP (NNP Gyarados)) (VP (VBN known) (PP (IN as))))
(. ?)
'''

#Take question tree and place verb in location after first NP with a following phrase
#If no following phrase, place verb at end of sentence
def recurse_make_binary_declarative(question, verb, lvl):
    """Rebuild a yes/no question as a declarative string with the verb moved.

    The verb is placed directly after the first noun phrase found by walking
    down the tree: at the top level (``lvl == 1``) child 0 is the verb, so the
    candidate is child 1 and at least three children are needed; at deeper
    levels the candidate is child 0 and at least two children are needed.
    When a level has too few children the search descends into the candidate.

    Args:
        question: parse (sub)tree supporting ``len()``, integer indexing and
            ``label()`` on its subtrees; leaves are plain strings.
        verb: the leading verb token to re-insert after the noun phrase.
        lvl: recursion depth, 1 for the initial call.

    Returns:
        The rearranged sentence, or ``""`` when no position for the verb can
        be determined (the caller treats ``""`` as the failure signal).
    """
    no_noun_msg = "ERROR: COULD NOT DETERMINE WHERE TO PLACE VERB. NO NOUN PHRASE."

    # A bare string is a leaf token: there is nothing left to descend into.
    # Without this guard the recursion indexes into the string's characters
    # and either calls .label() on a str or recurses forever.
    if isinstance(question, str):
        print(no_noun_msg)
        return ""

    at_top = lvl == 1
    ind = 1 if at_top else 0  # position of the noun-phrase candidate
    min_children = 3 if at_top else 2  # eg VBZ, NP, VP for lvl 1

    if len(question) >= min_children:
        candidate = question[ind]
        # Can't handle this form if the candidate isn't a noun phrase/noun
        if isinstance(candidate, str) or not candidate.label().startswith("N"):
            print(no_noun_msg)
            return ""

        parts = [tree_to_string(candidate), verb]
        parts.extend(
            tree_to_string(question[i]) for i in range(ind + 1, len(question))
        )
        return list_to_string(parts)

    # Too few children to rearrange here; make sure the candidate exists
    # before descending (a one-child top-level tree has no child 1).
    if len(question) <= ind:
        print(no_noun_msg)
        return ""

    return recurse_make_binary_declarative(question[ind], verb, lvl + 1)
        

#Return string form of reformed question as a declarative statement
def make_binary_declarative(question):
    """Return the yes/no question tree ``question`` as a declarative string.

    The first leaf under the first child is taken as the verb and handed to
    ``recurse_make_binary_declarative``. If that helper reports failure with
    an empty string, the error is printed and the question's own tokens are
    returned unchanged.
    """
    leading_verb = question[0][0]
    declarative = recurse_make_binary_declarative(question, leading_verb, 1)

    if declarative != "":
        return declarative

    #FIXME: Should "" be returned here instead to signal the failure?
    print("ERROR: COULD NOT RECONFIGURE")
    print(question)
    return tree_to_string(question)

In [49]:
#Input Structure Example
'''
  (SBARQ
    (WHNP (WP what))
    (SQ (VBZ is) (NP (NNP Gyarados)) (VP (VBN known) (PP (IN as))))
    (. ?))
'''
#Instead of making wh-questions declarative, just give Sojeong the wh question word and the sentence with it removed (can leave it in if necessary)

#Return a string of the wh word from the question
def find_wh_word(question):
    """Return the first wh word found in ``question``, or ``""`` if none.

    The tree is walked depth-first, left to right. A node labelled WP, WDT,
    WP$ or WRB contributes its first child; string leaves contribute nothing.
    """
    wh_tags = ("WP", "WDT", "WP$", "WRB")

    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped in their original left-to-right order.
    pending = [question]
    while pending:
        node = pending.pop()
        if isinstance(node, str):
            continue

        if node.label() in wh_tags:
            word = node[0]
            if word != "":
                return word
            continue  # an empty wh word counts as "not found"; keep looking

        pending.extend(reversed(node))

    return ""

#Return strings of wh word and new sentence in the same order just without the wh word
def get_wh_word(question):
    """Return ``[wh_word, sentence_without_wh_word]`` for a wh-question tree.

    The remaining tokens keep their original order and are joined by single
    spaces. If no wh word is found, ``wh_word`` is ``""`` and the sentence is
    returned with all of its tokens.

    Args:
        question: parse tree of the question.

    Returns:
        A two-element list: the wh word and the sentence with the first
        token equal to that word removed.
    """
    wh_word = find_wh_word(question)
    tokens = tree_to_string(question).split()

    #FIXME: Can remove if this step is not necessary
    # Remove the wh word as a whole token. A substring search on the joined
    # sentence would cut "what" out of an earlier word such as "Somewhat".
    if wh_word in tokens:
        tokens.remove(wh_word)  # only the first matching token is dropped

    #print("WH Word: " + str(wh_word))
    return [wh_word, ' '.join(tokens)]

In [50]:
#Input Structure Example
'''
(ROOT
  (SBARQ
    (WHNP (WP what))
    (SQ (VBZ is) (NP (NNP Gyarados)) (VP (VBN known) (PP (IN as))))
    (. ?)))
'''
def make_declarative(parsed_tree):
    """Turn a parsed question into ``[question_word, was_declarative, new_sentence]``.

    ``parsed_tree`` is the ROOT tree; its first child decides the handling:

    * ``SBARQ`` whose first child is ``SQ``: treated as a yes/no question.
    * any other ``SBARQ``: the wh word is extracted and removed.
    * ``SQ``: yes/no question, rearranged into declarative order.
    * ``S``: already declarative, tokens returned as-is.
    * anything else: an error is printed and the tokens are returned as-is.

    ``was_declarative`` is True for the last two cases.
    """
    question = parsed_tree[0]  #Indexing the ROOT
    label = question.label()

    question_word = ""
    #FIXME: was_declarative should also apply if the sentence could not be parsed
    was_declarative = False  #Mark if the sentence was already considered declarative

    if label == "SBARQ":
        first_child = question[0]
        #Check to make sure "SQ" doesnt follow in case it was just misclassified
        if first_child.label() == "SQ":
            new_sentence = make_binary_declarative(first_child)  #Gets rid of random previous "SBARQ"
        else:
            question_word, new_sentence = get_wh_word(question)
    elif label == "SQ":
        new_sentence = make_binary_declarative(question)
    else:
        if label != "S":
            #FIXME: Possibly include some other signal when this does not parse correctly
            #FIXME: Recursively run through subtrees of question to find these parts?
            print("ERROR: Not equipt to handle questions parsed as: " + str(label))
        #NOTE: For unhandled labels this flags the caller to just do a closest match return
        was_declarative = True
        #NOTE: May not want to use this new sentence as some tokens may be altered,
        #example "(" and ")" become "-LRB-" and "-RRB-" respectively
        new_sentence = tree_to_string(question)

    return [question_word, was_declarative, new_sentence]
            

# Preprocessing

In [51]:
#Preprocess document for queries
#Splits the article into newline-separated paragraphs and then into "."-separated
#fragments, collecting every fragment that contains non-whitespace text.
sentences = []

#FIXME: In actual implementation, have this be a line from article.txt, argument 1 of the command line
#Explicit encoding so the non-ASCII text is read the same way on every platform
#(assumes the article is UTF-8 encoded - TODO confirm)
with open("noun_counting_data/a1.txt", encoding="utf-8") as f:
    text = f.read()
    paragraphs = text.split("\n")
    for entry in paragraphs:
        split_entries = entry.split(".")
        for se in split_entries:
            #Get rid of empty lines, including fragments that are only whitespace
            #(e.g. the space left after a trailing ".")
            if se.strip():
                sentences.append(se)

In [52]:
#Example Questions

#Preprocessing: make first word lowercase if it is ["is", "does", "was", etc], but keep the question mark; test removing all other punctuation (",", ";", etc)
#Preprocessing NOTE: making every word lowercase messes up the classification done by the parser

#Constituency parser client; talks to the CoreNLP server at http://localhost:9000
parser = CoreNLPParser(url='http://localhost:9000')

#query1-query4 are yes/no questions; the first word is lowercase (see the preprocessing notes above)
query1 = "is a red Gyarados found in Pokémon Gold?"
query2 = "is Gyarados a Pokémon species?"
query3 = "does Gyarados appear multiple times in Pokémon under various trainers such as Misty, Lance, Crasher Wake, and Nurse Joy?"
query4 = "did Author Ash Dekirk write that Gyarados and Magikarp were inspired by the Asiatic myth of the Dragon Gate?"
#query5 and query6 are wh-questions
query5 = "who is Gyarados voiced by?" #NOTE: For Sojeong
query6 = "in Pokémon Adventures, what debuts in the Red, Green & Blue chapter in Gyarados Splashes In!?" #NOTE: For Sojeong
#test is a declarative statement rather than a question
test = "Gyarados (ギャラドス, Gyaradosu,  or ) is a Pokémon species in Nintendo and Game Freak's Pokémon franchise." #NOTE: For Sojeong

#Parse every example query and keep the materialized parse lists
queryList = [query1, query2, query3, query4, query5, query6, test]
entries = []      #raw objects returned by parser.parse
listEntries = []  #the same results converted to lists
for query_text in queryList:
    parse_result = parser.parse(query_text.split())
    entries.append(parse_result)
    listEntries.append(list(parse_result))

#Show the first parse of each query next to its position in queryList
for position, parsed in enumerate(listEntries):
    print(position)
    print(parsed[0]) #0 index to get to actual tree
    print()

0
(ROOT
  (SBARQ
    (SQ
      (VBZ is)
      (NP
        (NP (DT a) (JJ red) (NN Gyarados))
        (VP (VBN found) (PP (IN in) (NP (NNP Pokémon) (NNP Gold))))))
    (. ?)))

1
(ROOT
  (SBARQ
    (SQ
      (VBZ is)
      (NP (NNP Gyarados) (NP (DT a) (NNP Pokémon) (NNS species))))
    (. ?)))

2
(ROOT
  (SQ
    (VBZ does)
    (NP (NNP Gyarados))
    (VP
      (VB appear)
      (NP (JJ multiple) (NNS times))
      (PP (IN in) (NP (NNP Pokémon)))
      (PP
        (IN under)
        (NP
          (NP (JJ various) (NNS trainers))
          (PP
            (JJ such)
            (IN as)
            (NP
              (NP (NNP Misty))
              (, ,)
              (NP (NNP Lance))
              (, ,)
              (NP (NNP Crasher))))))
      (VP (VBP Wake)))
    (, ,)
    (CC and)
    (NP (NN Nurse) (NNP Joy))
    (. ?)))

3
(ROOT
  (SQ
    (VBD did)
    (NP (NN Author) (NNP Ash) (NNP Dekirk))
    (VP
      (VB write)
      (SBAR
        (IN that)
        (S
          (NP (NNP Gyarados)

In [53]:
#Convert every parsed query to its declarative form and report the result
new_sentence_list = []
for i, parsed in enumerate(listEntries):
    print(i)
    question_word, was_declarative, new_sentence = make_declarative(parsed[0])
    if new_sentence.endswith("?"):
        new_sentence = new_sentence[:-1] #Remove question mark
    new_sentence_list.append(new_sentence)

    print(queryList[i])
    print("Question Word: " + question_word)
    print("Was Declarative?: " + str(was_declarative))
    print("New Sentence: " + new_sentence)
    print()

0
is a red Gyarados found in Pokémon Gold?
Question Word: 
Was Declarative?: False
New Sentence: a red Gyarados is found in Pokémon Gold

1
is Gyarados a Pokémon species?
Question Word: 
Was Declarative?: False
New Sentence: Gyarados is a Pokémon species

2
does Gyarados appear multiple times in Pokémon under various trainers such as Misty, Lance, Crasher Wake, and Nurse Joy?
Question Word: 
Was Declarative?: False
New Sentence: Gyarados does appear multiple times in Pokémon under various trainers such as Misty , Lance , Crasher Wake , and Nurse Joy 

3
did Author Ash Dekirk write that Gyarados and Magikarp were inspired by the Asiatic myth of the Dragon Gate?
Question Word: 
Was Declarative?: False
New Sentence: Author Ash Dekirk did write that Gyarados and Magikarp were inspired by the Asiatic myth of the Dragon Gate 

4
who is Gyarados voiced by?
Question Word: who
Was Declarative?: False
New Sentence: is Gyarados voiced by 

5
in Pokémon Adventures, what debuts in the Red, Green & 

In [54]:
#Dependency parser client; talks to the CoreNLP server at http://localhost:9000
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
#Article sentences, one per yes/no query, in the same order as query1-query4
corresponding_sentences = []
#The bare string below is not used; it repeats the four yes/no queries for reference
'''
query1 = "is a red Gyarados found in Pokémon Gold?"
query2 = "is Gyarados a Pokémon species?"
query3 = "does Gyarados appear multiple times in Pokémon under various trainers such as Misty, Lance, Crasher Wake, and Nurse Joy?"
query4 = "did Author Ash Dekirk write that Gyarados and Magikarp were inspired by the Asiatic myth of the Dragon Gate?"
'''
s1 = "In Pokémon Gold, Silver, and Crystal, and their remakes, a red Gyarados is found in the Lake of Rage." 
s2 = "Gyarados (ギャラドス, Gyaradosu,  or ) is not a Pokémon species in Nintendo and Game Freak's Pokémon franchise."
s3 = "Gyarados does not appear multiple times in the anime under various trainers such as Misty, Lance, Crasher Wake, and Nurse Joy."
s4 = "Author Ash Dekirk wrote that Gyarados and Magikarp were inspired by the Asiatic myth of the Dragon Gate."
corresponding_sentences.append(s1)
corresponding_sentences.append(s2) #NOTE: This was negated from original text
corresponding_sentences.append(s3) #NOTE: This was negated from original text
corresponding_sentences.append(s4)

#Dependency parses of the rearranged questions (dep_new) and of their
#corresponding article sentences (dep_sentences), index-aligned
dep_new = []
dep_sentences = []

#Only the yes/no questions have a corresponding sentence, so pairing with zip
#stops after them without a hard-coded cut-off index
for i, (rearranged, statement) in enumerate(zip(new_sentence_list, corresponding_sentences)):
    print(i)

    #Rearranged Questions then Corresponding Sentences
    for target, s in ((dep_new, rearranged), (dep_sentences, statement)):
        parses = dep_parser.parse(s.split())
        print(s)
        result = [[(governor, dep, dependent) for governor, dep, dependent in p.triples()] for p in parses]
        target.append(result)

        #Own loop variable here: reusing "i" would clobber the outer index
        for triple in result[0]:
            print(triple)

    print()

0
a red Gyarados is found in Pokémon Gold
(('found', 'VBN'), 'nsubjpass', ('Gyarados', 'NNPS'))
(('Gyarados', 'NNPS'), 'det', ('a', 'DT'))
(('Gyarados', 'NNPS'), 'amod', ('red', 'JJ'))
(('found', 'VBN'), 'auxpass', ('is', 'VBZ'))
(('found', 'VBN'), 'nmod', ('Gold', 'NNP'))
(('Gold', 'NNP'), 'case', ('in', 'IN'))
(('Gold', 'NNP'), 'compound', ('Pokémon', 'NNP'))
In Pokémon Gold, Silver, and Crystal, and their remakes, a red Gyarados is found in the Lake of Rage.
(('Gold', 'NNP'), 'case', ('In', 'IN'))
(('Gold', 'NNP'), 'compound', ('Pokémon', 'NNP'))
(('Gold', 'NNP'), 'punct', (',', ','))
(('Gold', 'NNP'), 'appos', ('Silver', 'NNP'))
(('Gold', 'NNP'), 'punct', (',', ','))
(('Gold', 'NNP'), 'cc', ('and', 'CC'))
(('Gold', 'NNP'), 'conj', ('Crystal', 'NNP'))
(('Gold', 'NNP'), 'punct', (',', ','))
(('Gold', 'NNP'), 'cc', ('and', 'CC'))
(('Gold', 'NNP'), 'conj', ('found', 'VBN'))
(('found', 'VBN'), 'nsubjpass', ('remakes', 'NNS'))
(('remakes', 'NNS'), 'nmod:poss', ('their', 'PRP$'))
(('remak

In [55]:
#Use dependency parses to compare restructured question and corresponding statement to answer yes or no
#NOTE: dep_new and dep_sentences should be the same length

#Relations that describe verb conjugation/negation; their dependents are also pooled under "verb"
VERB_RELATIONS = ("cop", "aux", "auxpass", "neg")

#Relations that must also exist in the statement when the question has them, with the message printed on failure
REQUIRED_RELATIONS = (
    ("nsubj", "NO: reason 2"),
    ("nsubjpass", "NO: reason 3"),
    ("dobj", "NO: reason 4"),
)

for i, dep_n in enumerate(dep_new):
    dep_s = dep_sentences[i]

    #Map each governor word (verbs in base form) to a dict of relation -> dependent
    #NOTE: dependents are tuples while governors (keys) and relations are strings
    n_governors = {}
    s_governors = {}

    for triples, govs in ((dep_n[0], n_governors), (dep_s[0], s_governors)):
        for (g_word, g_pos), dep, dependent in triples:
            is_verb = g_pos.startswith("V")

            #If the governor is a verb, store its base form as the key
            if is_verb:
                g_word = lemmatizer.lemmatize(g_word, pos="v")

            #First sighting of this governor; verbs also record their original tag under "lem"
            if g_word not in govs:
                govs[g_word] = {"lem": g_pos} if is_verb else {}
            relations = govs[g_word]

            #Special case to handle verb conjugations
            if dep in VERB_RELATIONS:
                relations.setdefault("verb", set()).add(dependent)
            relations[dep] = dependent

    #Check for existence of each governor in sentence (with corresponding dependents when necessary)
    bool_yes = True
    for key, n_relations in n_governors.items():
        failure = None

        if key not in s_governors:
            #If head is not in other parse, then it is false
            failure = "NO: reason 1"
        else:
            s_relations = s_governors[key]

            #If subject and direct object are not present, then it is false
            for relation, message in REQUIRED_RELATIONS:
                if relation in n_relations and relation not in s_relations:
                    failure = message
                    break

            #If key is a verb in its lemmatized form or has copula,
            #check to see if one is negated and the other is not
            #FIXME: Can make more complex check of verb tenses as well
            if failure is None and ("lem" in n_relations or "verb" in n_relations):
                if ("neg" in n_relations) != ("neg" in s_relations):
                    failure = "NO: reason 5"

        if failure is not None:
            print(failure)
            bool_yes = False
            break

    if bool_yes:
        print("YES")


YES
NO: reason 5
NO: reason 5
YES


In [31]:
from nltk.corpus import wordnet
#https://www.geeksforgeeks.org/nlp-synsets-for-a-word-in-wordnet/
#https://www.geeksforgeeks.org/get-synonymsantonyms-nltk-wordnet-python/

#NOTE: With current system, will only be able to check for one word replacements
# Therefore multi-word replacements may still be missed

def _single_word_lemmas(synsets):
    """Return the lemma names of ``synsets`` that are a single word.

    Underscores are replaced by spaces first, so multi-word lemmas such as
    ``domestic_dog`` are recognised and left out.
    """
    names = set()
    for synset in synsets:
        for lemma in synset.lemmas():
            phrase = lemma.name().replace("_", " ") #Eliminating underscores to show one word vs 2 word statements
            if len(phrase.split()) == 1: #Identifying single word replacements
                names.add(phrase)
    return names


syns = wordnet.synsets('dog')
first = syns[0]

#Every lemma name of every synset of the word
synonyms = {lemma.name() for syn in syns for lemma in syn.lemmas()}
print(synonyms)

print("Synset name :  ", first.name())

#More abstract terms of the first synset
hyper = first.hypernyms()
print("\nSynset abstract term :  ", hyper)
hyper_words = _single_word_lemmas(hyper)

#More specific terms of the first synset
hypo = first.hyponyms()
print("\nSynset specific term :  ", hypo)
hypo_words = _single_word_lemmas(hypo)

print("\nHyper: ", hyper_words)
print("\nHypo: ", hypo_words)



{'go_after', 'Canis_familiaris', 'frank', 'track', 'frump', 'hound', 'give_chase', 'wienerwurst', 'pawl', 'hot_dog', 'click', 'chase', 'blackguard', 'dog-iron', 'tag', 'andiron', 'hotdog', 'wiener', 'detent', 'frankfurter', 'dog', 'trail', 'bounder', 'domestic_dog', 'chase_after', 'weenie', 'firedog', 'heel', 'tail', 'cad'}
Synset name :   dog.n.01

Synset abstract term :   [Synset('canine.n.02'), Synset('domestic_animal.n.01')]

Synset specific term :   [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), Synset('great_pyrenees.n.01'), Synset('griffon.n.02'), Synset('hunting_dog.n.01'), Synset('lapdog.n.01'), Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'), Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), Synset('pug.n.01'), Synset('puppy.n.01'), Synset('spitz.n.01'), Synset('toy_dog.n.01'), Synset('working_dog.n.01')]

Hyper:  {'canine', 'canid'}

Hypo:  {'basenji', 'mutt', 'spitz', 'Leonberg', 'Newfoundland', 