# Question Generation

Purpose: Given an article output a list of sentences
1. Parse Article into sentences
2. From each sentence, generate Stanford dependency parse tree
3. From each parse tree, use rule based method to generate question from sentence.
4. Refine the sentences using language models.

### Article -> Sentences

In [106]:
import nltk

In [107]:
content = []
for i in range(1, 10):
    with open(f'./Development_data/set1/a{i}.txt', 'r') as f:
        content.append(f.read())

In [108]:
sentences = []
for file in content:
    sentences.extend(nltk.sent_tokenize(file))


### Sentences -> Parse Trees

In [109]:
from nltk.parse.corenlp import CoreNLPServer
from nltk.parse.corenlp import CoreNLPParser
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.stem import WordNetLemmatizer 
# nltk.download('wordnet')
import os
import requests
import spacy

In [5]:
STANFORD = os.path.join("models", "stanford-corenlp-full-2018-10-05")

# Create the server
server = CoreNLPServer(
   os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
   os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"),    
)
server.start()

CoreNLPServerError: Could not connect to the server.

In [164]:
requests.post('http://[::]:9000/?properties={"annotators":"tokenize,ssplit,pos","outputFormat":"json"}', data = {'data': "tmp"}).text

'{\n  "sentences": [\n    {\n      "index": 0,\n      "tokens": [\n        {\n          "index": 1,\n          "word": "data",\n          "originalText": "data",\n          "characterOffsetBegin": 0,\n          "characterOffsetEnd": 4,\n          "pos": "NN",\n          "before": "",\n          "after": ""\n        },\n        {\n          "index": 2,\n          "word": "=",\n          "originalText": "=",\n          "characterOffsetBegin": 4,\n          "characterOffsetEnd": 5,\n          "pos": "JJ",\n          "before": "",\n          "after": ""\n        },\n        {\n          "index": 3,\n          "word": "tmp",\n          "originalText": "tmp",\n          "characterOffsetBegin": 5,\n          "characterOffsetEnd": 8,\n          "pos": "NN",\n          "before": "",\n          "after": ""\n        }\n      ]\n    }\n  ]\n}\n'

Download Stanford Parser: https://nlp.stanford.edu/software/lex-parser.shtml#Download Version 3.9.2

In [165]:
from nltk.tree import Tree
parser = CoreNLPParser()
lemmatizer = WordNetLemmatizer() 
sp = spacy.load('en_core_web_sm')

In [171]:
invertible_aux_verb = {'am', 'are', 'is', 'was', 'were', 'can', 'could', 'does', 'did', 'has', 'had', 'have', 'may', 'might',
                       'must', 'shall', 'should', 'will', 'would'}

purge_tree = {"PRN"}

#Do/Did for I only
#Does/Did for everything else

def is_invertible(s):
    if isinstance(s, str):
        return s.lower() in invertible_aux_verb
    return False

def list_to_string(word_list):
    return ' '.join(word_list)

def tree_to_string(parsed_tree, lower = False):
#     if isinstance(parsed_tree, str):
#         return parsed_tree
#     words = []
#     for subtree in parsed_tree:
#         words.append(tree_to_string(subtree))
    leaves = parsed_tree.leaves()
    if lower:
        leaves[0] = leaves[0].lower()
    return list_to_string(leaves)

def first(parsed_tree):
    if isinstance(parsed_tree[0], str):
        return parsed_tree
    return first(parsed_tree[0])

#purges tree based on the set purge_trees
def purge(parsed_tree):
    if isinstance(parsed_tree, str):
        return False
    length = len(parsed_tree)
    i = 0
    if parsed_tree.label() in purge_tree:
        return True
    while i < length:
        res = purge(parsed_tree[i])
        if res:
            del parsed_tree[i]
            length -= 1
        else:
            i += 1
    return False

def binary_question_from_tree(parsed_tree):
    sentence = parsed_tree[0]
    assert(sentence.label() == 'S')
    np = sentence[0]
    vp = sentence[1]
    noun_label = first(np).label()
    #print("NL", noun_label)
    assert(np.label() == 'NP')
    assert(vp.label() == 'VP')
    
    #print(parsed_tree)
    if noun_label in ["NNP", "NNPS"] or first(np)[0] in ["The", "A"]:
        subject = tree_to_string(np, True) if noun_label == "DT" else tree_to_string(np)
        #print(parsed_tree)
        purge(vp)
        remain = vp.leaves()[1:]
        if is_invertible(first(vp)[0]): #checks if is aux word
            return list_to_string([first(vp)[0].capitalize(), subject] + remain) + '?'
        else:
            #Add Does/Did/Do
            verb_label = first(vp).label()
            if isinstance(first(vp)[0], str):
                lemmas = sp(first(vp)[0])
                lemma = lemmas[0].lemma_
            if verb_label in ["VBP", "VBZ","VBG"]: #present tense
                return list_to_string(["Does", subject, lemma] + remain) + "?"
            elif verb_label in ["VBD", "VBN"]: #past tense
                return list_to_string(["Did", subject, lemma] + remain) + "?"
    return None

In [172]:
#Sentence Structure Tree
class SST():
    def __init__(self, label, children):
        self.label = label
        self.children = children

#Sentence Structure Leaf
class SSL():
    def __init__(self, label):
        self.label = label
        
simple_predicate = SST('ROOT', [SST('S', [SSL('NP'), SSL('VP'), SSL('.')])])

def satisfies_structure(parsed_tree, structure):
    if isinstance(structure, SSL):
        return parsed_tree.label() == structure.label
    else:
        if parsed_tree.label() != structure.label or len(parsed_tree) != len(structure.children): return False
        for i in range(len(parsed_tree)):
            if satisfies_structure(parsed_tree[i], structure.children[i]) == False:
                return False
        return True

In [173]:
#sentences = ["The company bought food for the homeless."]

In [174]:
parse_list = []
count = 10 
for sentence in sentences:
    if len(sentence) < 500:
        parse = next(parser.raw_parse(sentence))
        if satisfies_structure(parse, simple_predicate) and binary_question_from_tree(parse):
            count -= 1
            
            print("=========================== Sentence ======================")
            print("Sentence:", sentence)
            #print(parse)
#             print(parse.label())
            #print(sentence) 
            print("Question:", binary_question_from_tree(parse))
            parse_list.append(parse)
            if count == 0:
                break
            

print(count)  
    
#parse.draw()

Sentence: Egypt attained its first continuous peak of civilization – the first of three so-called "Kingdom" periods (followed by the Middle Kingdom and New Kingdom) which mark the high points of civilization in the lower Nile Valley.
Question: Did Egypt attain its first continuous peak of civilization -- the first of three so-called `` Kingdom '' periods which mark the high points of civilization in the lower Nile Valley?
Sentence: The term itself was coined by eighteenth-century historians and the distinction between the Old Kingdom and the Early Dynastic Period is not one which would have been recognized by Ancient Egyptians.
Question: Is the term itself was coined by eighteenth-century historians and the distinction between the Old Kingdom and the Early Dynastic Period not one which would have been recognized by Ancient Egyptians?
Sentence: The basic justification for a separation between the two periods is the revolutionary change in architecture accompanied by the effects on Egypt

In [157]:
server.stop()