# Question Generation

Purpose: Given an article output a list of sentences
1. Parse Article into sentences
2. From each sentence, generate Stanford dependency parse tree
3. From each parse tree, use rule based method to generate question from sentence.
4. Refine the sentences using language models.

### Article -> Sentences

In [1]:
import nltk

In [2]:
content = []
for i in range(1, 2):
    with open(f'./Development_data/set2/a{i}.txt', 'r') as f:
        content.append(f.read())

In [3]:
sentences = []
for file in content:
    sentences.extend(nltk.sent_tokenize(file))


### Sentences -> Parse Trees

In [4]:
from nltk.parse.corenlp import CoreNLPServer
from nltk.parse.corenlp import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.stem import WordNetLemmatizer 
# nltk.download('wordnet')
import os
import requests
import spacy

In [5]:
STANFORD = os.path.join("models", "stanford-corenlp-full-2018-10-05")

# Create the server
server = CoreNLPServer(
   os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
   os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"),    
)
server.start()

CoreNLPServerError: Could not connect to the server.

In [5]:
requests.post('http://[::]:9000/?properties={"annotators":"tokenize,ssplit,pos","outputFormat":"json"}', data = {'data': "tmp"}).text

'{\n  "sentences": [\n    {\n      "index": 0,\n      "tokens": [\n        {\n          "index": 1,\n          "word": "data",\n          "originalText": "data",\n          "characterOffsetBegin": 0,\n          "characterOffsetEnd": 4,\n          "pos": "NN",\n          "before": "",\n          "after": ""\n        },\n        {\n          "index": 2,\n          "word": "=",\n          "originalText": "=",\n          "characterOffsetBegin": 4,\n          "characterOffsetEnd": 5,\n          "pos": "JJ",\n          "before": "",\n          "after": ""\n        },\n        {\n          "index": 3,\n          "word": "tmp",\n          "originalText": "tmp",\n          "characterOffsetBegin": 5,\n          "characterOffsetEnd": 8,\n          "pos": "NN",\n          "before": "",\n          "after": ""\n        }\n      ]\n    }\n  ]\n}\n'

Download Stanford Parser: https://nlp.stanford.edu/software/lex-parser.shtml#Download Version 3.9.2

In [108]:
from nltk.tree import Tree
parser = CoreNLPParser()
lemmatizer = WordNetLemmatizer() 
sp = spacy.load('en_core_web_sm')

In [89]:
invertible_aux_verb = {'am', 'are', 'is', 'was', 'were', 'can', 'could', 'may', 'might',
                       'must', 'shall', 'should', 'will', 'would'}
invertible_special = {'does', 'did', 'has', 'had', 'have'}

purge_tree = {"PRN", "ADVP", "RB"} #WHNP with parent SBAR #JJ, JJR, ADJP, S with parent NP

np_requirements = {"NNP", "NNPS"}

#Do/Did for I only
#Does/Did for everything else

def is_invertible(s, next_phrase):
    if isinstance(s, str):
        return (s.lower() in invertible_aux_verb or 
                s.lower() in invertible_special and next_phrase == "VP")
    return False

def list_to_string(word_list):
    return ' '.join(word_list)

def tree_to_string(parsed_tree, lower = False):
#     if isinstance(parsed_tree, str):
#         return parsed_tree
#     words = []
#     for subtree in parsed_tree:
#         words.append(tree_to_string(subtree))
    leaves = parsed_tree.leaves()
    if lower:
        leaves[0] = leaves[0].lower()
    return list_to_string(leaves)

def first(parsed_tree):
    if isinstance(parsed_tree[0][0], str):
        return parsed_tree[0], parsed_tree
    return first(parsed_tree[0])

def is_in(parsed_tree, label_set):
    if isinstance(parsed_tree[0], str):
        return parsed_tree.label() in label_set
    contain = False
    for tree in parsed_tree:
        contain = contain or is_in(tree, label_set)
    return contain

#purges tree based on the set purge_trees
def purge(parsed_tree):
    if isinstance(parsed_tree, str):
        return False
    length = len(parsed_tree)
    i = 0
    if parsed_tree.label() in purge_tree:
        return True
    while i < length:
        res = purge(parsed_tree[i])
        if res:
            del parsed_tree[i]
            length -= 1
        else:
            i += 1
    return False

#keeps the first subtree that is of PP
#pass in tree and a bool that checks for first

#each recursive call we return tuple 
#(whether subtree needs to be deleted, what the new first is after running it on the tree)
def purge_rest_helper(parsed_tree, first):
    if isinstance(parsed_tree, str):
        #is leaf
        return (False, True)
    if parsed_tree.label() == "PP":
        #supposed to be purged
        if first:
            #delay purging
            new_first = False
        else:
            #if its not the first, return first/False
            return (True, False)
    else:
        new_first = first #true if first hasn't occured, false if it has
        
    length = len(parsed_tree)
    #if its a first that is supposed to be purged you still look at the children
    is_first = first #if this is in the first tree, we set the first in the loop to True else False
    i = 0
    while i < length:
        (res, res_first) = purge_rest_helper(parsed_tree[i], is_first)
        if res and not is_first:
            del parsed_tree[i]
            length -= 1
        else:
            i += 1
        is_first = res_first
    if not new_first:
        return (False, new_first)
    else:
        return (False, is_first)

def purge_except_first(parsed_tree):
    purge_rest_helper(parsed_tree, True)

# TODO: take care of situations where vp is inside a np
# def binary_question_from_tree(parsed_tree):
#     sentence = parsed_tree[0]
#     assert(sentence.label() == 'S')
#     np = sentence[0]
#     vp = sentence[1]
#     noun_label = first(np).label()
#     #print("NL", noun_label)
#     assert(np.label() == 'NP')
#     assert(vp.label() == 'VP')
#     #print(parsed_tree)
#     if is_in(np, np_requirements):
#         subject = tree_to_string(np, True) if noun_label == "DT" else tree_to_string(np)
#         #print(parsed_tree)
#         #purge(vp)
#         remain = vp.leaves()[1:]
#         first_node, first_parent = first(vp)
#         if is_invertible(first_node[0], first_parent[1].label()): #checks if is aux word
#             return list_to_string([first(vp)[0].capitalize(), subject] + remain) + '?'
#         else:
#             #Add Does/Did/Do
#             verb_label = first(vp).label()
#             if isinstance(first(vp)[0], str):
#                 lemmas = sp(first(vp)[0])
#                 lemma = lemmas[0].lemma_
#             if verb_label in ["VBP", "VBZ","VBG"]: #present tense
#                 return list_to_string(["Does", subject, lemma] + remain) + "?"
#             elif verb_label in ["VBD", "VBN"]: #past tense
#                 return list_to_string(["Did", subject, lemma] + remain) + "?"
#     return None

def binary_question_from_tree(parsed_tree):
    sentence = parsed_tree[0]
    assert(sentence.label() == 'S')
    np = sentence[0]
    vp = sentence[1]
    if not isinstance(vp[0][0], str) or not isinstance(np[0][0], str):
        return None
    noun_label = np[0].label()
    
    assert(np.label() == 'NP')
    assert(vp.label() == 'VP')
   
    if is_in(np, np_requirements) or :
        subject = tree_to_string(np, True) if noun_label == "DT" else tree_to_string(np)
        remain = vp.leaves()[1:]
        if is_invertible(vp[0][0],vp[1].label()): #checks if is aux word
            return list_to_string([vp[0][0].capitalize(), subject] + remain) + '?'
        else:
            #Add Does/Did/Do
            verb_label = vp[0].label()
            lemmas = sp(vp[0][0])
            lemma = lemmas[0].lemma_
            if verb_label in ["VBP", "VBZ","VBG"]: #present tense
                return list_to_string(["Does", subject, lemma] + remain) + "?"
            elif verb_label in ["VBD", "VBN"]: #past tense
                return list_to_string(["Did", subject, lemma] + remain) + "?"
    return None

In [90]:
#Sentence Structure Tree
class SST():
    def __init__(self, label, children):
        self.label = label
        self.children = children

#Sentence Structure Leaf
class SSL():
    def __init__(self, label):
        self.label = label
        
simple_predicate = SST('ROOT', [SST('S', [SSL('NP'), SSL('VP'), SSL('.')])])

def satisfies_structure(parsed_tree, structure):
    if isinstance(structure, SSL):
        return parsed_tree.label() == structure.label
    else:
        if parsed_tree.label() != structure.label or len(parsed_tree) != len(structure.children): return False
        for i in range(len(parsed_tree)):
            if satisfies_structure(parsed_tree[i], structure.children[i]) == False:
                return False
        return True

In [91]:
#Testing
# sentences = ["Joe has an apple", "Joe has done something wrong"]

In [92]:
import copy
parse_list = []
count = 0
for sentence in sentences:
    if len(sentence) < 500:
        parse = next(parser.raw_parse(sentence))
        #print(parse)
        purge(parse)
        tmp_parse = copy.deepcopy(parse)
        #purge_except_first(parse)
        if satisfies_structure(parse, simple_predicate):
            question = binary_question_from_tree(parse)
            if question != None:
                count += 1
                #print("=========================== Sentence ======================")
                #print("Sentence:", sentence)
                #print(tmp_parse)
    #             print(parse.label())
                #print(sentence) 
                print(question)
                parse_list.append(parse)
                if count == 0:
                    break


print(count)  
    
#parse.draw()

Are the Indus cities noted for their urban planning , baked brick houses , elaborate drainage systems , water supply systems , and clusters of large non-residential buildings?
Is the Indus Valley Civilisation named the Harappan civilisation after Harappa , the first of its sites to be excavated in the 1920s , in what was the Punjab province of British India?
Is the Indus Valley Civilisation named the Harappan civilisation after Harappa , the first of its sites to be excavated in the 1920s , in what was the Punjab province of British India?
Has the Indus Valley Civilisation been called by some the `` Sarasvati culture '' , the `` Sarasvati Civilisation '' , the `` Indus-Sarasvati Civilisation '' or the `` Sindhu-Saraswati Civilisation '' , as the Ghaggar-Hakra river is identified by some with the mythological Sarasvati river , suggesting that the Indus Valley Civilisation was the Vedic civilisation as perceived by traditional Hindu beliefs?
Has an Indus Valley site been found on the Oxu

In [16]:
from nltk.tree import Tree
import neuralcoref

sp = spacy.load('en_core_web_lg')
neuralcoref.add_to_pipe(sp)
doc2 = sp('Angela lives in Boston as she is working there. She is quite happy in that city.')
print(list(tok.text_with_ws for tok in doc2))

['Angela ', 'lives ', 'in ', 'Boston ', 'as ', 'she ', 'is ', 'working ', 'there', '. ', 'She ', 'is ', 'quite ', 'happy ', 'in ', 'that ', 'city', '.']


In [40]:
def get_resolved(doc, clusters):
    ''' Return a list of utterrances text where the coref are resolved to the most representative mention'''
    sentences = [sent.string.strip() for sent in doc.sents]
    
    token_labels = []
    for i in range(len(sentences)):
        print(sentences[i])
        for j in range(len(list(tok.text_with_ws for tok in sp(sentences[i])))):
            token_labels.append(i)
    print(token_labels)
    resolved = list(tok.text_with_ws for tok in doc)
    for cluster in clusters:
        seen = set()
        for coref in cluster:
            print(seen, coref, cluster)
            if coref != cluster.main and token_labels[coref.start] not in seen:
                resolved[coref.start] = cluster.main.text + doc[coref.end-1].whitespace_
                for i in range(coref.start+1, coref.end):
                    resolved[i] = ""
            seen.add(token_labels[coref.start])
    return ''.join(resolved)
print(doc2._.coref_clusters)
print(get_resolved(doc2, doc2._.coref_clusters))


[Angela: [Angela, she, She], Boston: [Boston, that city]]
Angela lives in Boston as she is working there.
She is quite happy in that city.
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
set() Angela Angela: [Angela, she, She]
{0} she Angela: [Angela, she, She]
{0} She Angela: [Angela, she, She]
set() Boston Boston: [Boston, that city]
{0} that city Boston: [Boston, that city]
Angela lives in Boston as she is working there. Angela is quite happy in Boston.


In [46]:
server.stop()

NameError: name 'server' is not defined