In [50]:
import nltk
from nltk.tokenize import sent_tokenize
import spacy
from nltk.stem import WordNetLemmatizer 


# TIPS
1. Delete everything inside a bracket first

In [146]:
nlp = spacy.load('en_core_web_sm')
lemmatizer = WordNetLemmatizer()
textfile = "data/set4/a7.txt"

sentence1 = "Harry Potter and the Prisoner of Azkaban is a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros."
sentence2 = "Harry Potter has been spending another unhappy summer with the Dursleys."
sentence3 = "The film was produced by La Petite Reine and ARP Sélection for 13.47 million dollars."
sentence4 = "The film took them 13.47 million dollars."


doc1 = nlp(sentence1)
doc2 = nlp(sentence2)
doc3 = nlp(sentence3)
doc4 = nlp(sentence4)



In [145]:
textfile = "data/set4/a7.txt"

#put entire text file into a list of sentences
text = []
with open(textfile, "r") as f:
    for line in f:
        line = line.split('. ')
        if len(line) != 0:
            temp = line[0].strip('\n')
            if len(temp) != 0:
                text.append(temp)

# POS tagging


In [4]:
def pos_tag_lst(text):
    #list of sentences
    POS_tag_dict = dict()
    for i,line in enumerate(text):
        tags = []
        doc = nlp(str(line))
        for token in doc:
            tags.append((token.text, token.pos_, token.tag_, token.dep_, token.is_stop))
        if len(tags) != 0:
            POS_tag_dict[i] = tags
    return POS_tag_dict

In [59]:
def pos_tag_sentence(sentence):
    #list of sentences
    POS_tag_dict = dict()
    text = sentence.split()
    for line in text:
        tags = []
        doc = nlp(str(line))
        for token in doc:
            tags.append((token.pos_, token.tag_, token.dep_, token.is_stop, ))
        if len(tags) != 0:
            POS_tag_dict[token.text] = tags[0]
    return POS_tag_dict

In [60]:
pos_tag_sentence(sentence3)

{'The': ('DET', 'DT', 'ROOT', True),
 'film': ('NOUN', 'NN', 'ROOT', False),
 'was': ('AUX', 'VBD', 'ROOT', True),
 'produced': ('VERB', 'VBD', 'ROOT', False),
 'by': ('ADP', 'IN', 'ROOT', True),
 'La': ('PROPN', 'NNP', 'ROOT', False),
 'Petite': ('ADJ', 'JJ', 'ROOT', False),
 'Reine': ('VERB', 'VB', 'ROOT', False),
 'and': ('CCONJ', 'CC', 'ROOT', True),
 'ARP': ('PROPN', 'NNP', 'ROOT', False),
 'Sélection': ('NOUN', 'NN', 'ROOT', False),
 'for': ('ADP', 'IN', 'ROOT', True),
 '13.47': ('NUM', 'CD', 'ROOT', False),
 'million': ('NUM', 'CD', 'ROOT', False),
 '.': ('NOUN', 'NNS', 'ROOT', False)}

In [144]:
pos_dict = pos_tag_lst(text)

# Dependency Tree

In [8]:
#Token dict 
def dependency_dict(doc):
    out = dict()
    root = ''
    for token in doc:
        out[token.text] = (token.dep_, token.head.text, token.head.pos_,[child for child in token.children])
        if token.dep_ == "ROOT":
            root = token.text
    return out, root

In [9]:
token_Dict1, root1 = dependency_dict(doc1)
token_Dict1, root1

({'Harry': ('compound', 'Potter', 'PROPN', []),
  'Potter': ('nsubj', 'is', 'AUX', [Harry, and, Prisoner]),
  'and': ('cc', 'directed', 'VERB', []),
  'the': ('det', 'Prisoner', 'PROPN', []),
  'Prisoner': ('conj', 'Potter', 'PROPN', [the, of]),
  'of': ('prep', 'Prisoner', 'PROPN', [Azkaban]),
  'Azkaban': ('pobj', 'of', 'ADP', []),
  'is': ('ROOT', 'is', 'AUX', [Potter, film]),
  'a': ('det', 'film', 'NOUN', []),
  '2004': ('nummod', 'film', 'NOUN', []),
  'fantasy': ('compound', 'film', 'NOUN', []),
  'film': ('attr', 'is', 'AUX', [a, 2004, fantasy, directed]),
  'directed': ('acl', 'film', 'NOUN', [by, and, distributed]),
  'by': ('agent', 'distributed', 'VERB', [Bros.]),
  'Alfonso': ('compound', 'Cuarón', 'PROPN', []),
  'Cuarón': ('pobj', 'by', 'ADP', [Alfonso]),
  'distributed': ('conj', 'directed', 'VERB', [by]),
  'Warner': ('compound', 'Bros.', 'PROPN', []),
  'Bros.': ('pobj', 'by', 'ADP', [Warner])},
 'is')

In [10]:
token_Dict2, root2 = dependency_dict(doc2)
token_Dict2, root2

({'Harry': ('compound', 'Potter', 'PROPN', []),
  'Potter': ('nsubj', 'spending', 'VERB', [Harry]),
  'has': ('aux', 'spending', 'VERB', []),
  'been': ('aux', 'spending', 'VERB', []),
  'spending': ('ROOT',
   'spending',
   'VERB',
   [Potter, has, been, summer, with, .]),
  'another': ('det', 'summer', 'NOUN', []),
  'unhappy': ('amod', 'summer', 'NOUN', []),
  'summer': ('dobj', 'spending', 'VERB', [another, unhappy]),
  'with': ('prep', 'spending', 'VERB', [Dursleys]),
  'the': ('det', 'Dursleys', 'PROPN', []),
  'Dursleys': ('pobj', 'with', 'ADP', [the]),
  '.': ('punct', 'spending', 'VERB', [])},
 'spending')

In [11]:
token_Dict3, root3 = dependency_dict(doc3)
token_Dict3, root3

({'The': ('det', 'film', 'NOUN', []),
  'film': ('nsubjpass', 'produced', 'VERB', [The]),
  'was': ('auxpass', 'produced', 'VERB', []),
  'produced': ('ROOT', 'produced', 'VERB', [film, was, by, for, .]),
  'by': ('agent', 'produced', 'VERB', [Reine]),
  'La': ('compound', 'Reine', 'PROPN', []),
  'Petite': ('compound', 'Reine', 'PROPN', []),
  'Reine': ('pobj', 'by', 'ADP', [La, Petite, and, Sélection]),
  'and': ('cc', 'Reine', 'PROPN', []),
  'ARP': ('compound', 'Sélection', 'PROPN', []),
  'Sélection': ('conj', 'Reine', 'PROPN', [ARP]),
  'for': ('prep', 'produced', 'VERB', [dollars]),
  '13.47': ('compound', 'million', 'NUM', []),
  'million': ('nummod', 'dollars', 'NOUN', [13.47]),
  'dollars': ('pobj', 'for', 'ADP', [million]),
  '.': ('punct', 'produced', 'VERB', [])},
 'produced')

# NER Tagging

In [12]:
def ner_tag(text):
    NER_tag_dict = dict()
    for i,line in enumerate(text):
        tags = []
        doc = nlp(str(line))

        for ent in doc.ents:
            # print(ent.text +'-' + ent.label_ + '\n')
            tags.append(ent.text +'-' + ent.label_)
        if len(tags) != 0:
            NER_tag_dict[i] = tags
    return NER_tag_dict

In [13]:
def ner_tag_sentence(sentence):
    doc = nlp(str(sentence))
    NER_tag_dict = dict()
    tags = []
    for ent in doc.ents:
        # print(ent.text +'-' + ent.label_ + '\n')
        NER_tag_dict[ent.text] = ent.label_
    return NER_tag_dict

# Binary Question

In [14]:
auxiliary_verbs = ["am", "is", "are", "was", "were", "shall", "do", "does", "did","can", "could", "have", "need", "should", "will", "would"]

In [15]:
#input: a single sentence, with its dependency dict and root word
def binaryQ(sentence, token_dict, root):
    output = ''
    if root in auxiliary_verbs:
        output += root.capitalize() + ' '
    for k in sentence.split():
        if k != root:
            output += k + ' '
    output = output[:-2]+ '?'
    return output
    

In [16]:
binaryQ(sentence1, token_Dict1, root1)

'Is Harry Potter and the Prisoner of Azkaban a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros?'

# Who Question

In [17]:
pos_tag_sentence(sentence2)

{'Harry': [('PROPN', 'NNP', 'ROOT', False)],
 'Potter': [('NOUN', 'NN', 'ROOT', False)],
 'has': [('VERB', 'VBZ', 'ROOT', True)],
 'been': [('VERB', 'VBN', 'ROOT', True)],
 'spending': [('VERB', 'VBG', 'ROOT', False)],
 'another': [('DET', 'DT', 'ROOT', True)],
 'unhappy': [('ADJ', 'JJ', 'ROOT', False)],
 'summer': [('NOUN', 'NN', 'ROOT', False)],
 'with': [('ADP', 'IN', 'ROOT', True)],
 'the': [('DET', 'DT', 'ROOT', True)],
 '.': [('PROPN', 'NNP', 'ROOT', False), ('PUNCT', '.', 'punct', False)]}

In [18]:
ner_tag_dict2 = ner_tag_sentence(sentence2)

In [19]:
dependency_dict2, root2 = dependency_dict(doc2)
dependency_dict2, root2

({'Harry': ('compound', 'Potter', 'PROPN', []),
  'Potter': ('nsubj', 'spending', 'VERB', [Harry]),
  'has': ('aux', 'spending', 'VERB', []),
  'been': ('aux', 'spending', 'VERB', []),
  'spending': ('ROOT',
   'spending',
   'VERB',
   [Potter, has, been, summer, with, .]),
  'another': ('det', 'summer', 'NOUN', []),
  'unhappy': ('amod', 'summer', 'NOUN', []),
  'summer': ('dobj', 'spending', 'VERB', [another, unhappy]),
  'with': ('prep', 'spending', 'VERB', [Dursleys]),
  'the': ('det', 'Dursleys', 'PROPN', []),
  'Dursleys': ('pobj', 'with', 'ADP', [the]),
  '.': ('punct', 'spending', 'VERB', [])},
 'spending')

In [20]:
#input: a single sentence, and its ner tag dict and dependency dict
#Who Question
def whoQ(sentence, ner_tag_dict, dependency_dict):
    #find PERSON tag
    theName = ''
    output = ''
    for k in ner_tag_dict.keys():
        if ner_tag_dict[k] == 'PERSON':
            #check if is a subject
            names = k.split()
            for n in names:
                print(dependency_dict[n])
                if dependency_dict[n][0] == 'nsubj':
                    theName = k
    print(theName)
    output = sentence.replace(theName, 'who')
    output = output[:-1] + "?"
    output = output[0].upper() + output[1:]
    return output
    

In [21]:
whoQ(sentence2, ner_tag_dict2, dependency_dict2)

('compound', 'Potter', 'PROPN', [])
('nsubj', 'spending', 'VERB', [Harry])
('pobj', 'with', 'ADP', [the])
Harry Potter


'Who has been spending another unhappy summer with the Dursleys?'

# How much Question


In [46]:
ner_tag_dict3 = ner_tag_sentence(sentence3)
dependency_dict3, root3 = dependency_dict(doc3)
pos_tag_dict3 = pos_tag_sentence(sentence3)
# "The film was produced by La Petite Reine and ARP Sélection for 13.47 million euros." don't identify as MONEY
# "The film was produced by La Petite Reine and ARP Sélection for 13.47 million dollars."
    # How much was the film produced by La Petite Reine and ARP Sélection?
# "The film costs 13.47 million dollars."
    # How much does the film costs?


In [61]:
ner_tag_dict4 = ner_tag_sentence(sentence4)
dependency_dict4, root4 = dependency_dict(doc4)
pos_tag_dict4 = pos_tag_sentence(sentence4)

In [136]:
ner_tag_dict3, dependency_dict3, root3, pos_tag_dict3

({'La Petite': 'PERSON', '13.47 million dollars': 'MONEY'},
 {'The': ('det', 'film', 'NOUN', []),
  'film': ('nsubjpass', 'produced', 'VERB', [The]),
  'was': ('auxpass', 'produced', 'VERB', []),
  'produced': ('ROOT', 'produced', 'VERB', [film, was, by, for, .]),
  'by': ('agent', 'produced', 'VERB', [Reine]),
  'La': ('compound', 'Reine', 'PROPN', []),
  'Petite': ('compound', 'Reine', 'PROPN', []),
  'Reine': ('pobj', 'by', 'ADP', [La, Petite, and, Sélection]),
  'and': ('cc', 'Reine', 'PROPN', []),
  'ARP': ('compound', 'Sélection', 'PROPN', []),
  'Sélection': ('conj', 'Reine', 'PROPN', [ARP]),
  'for': ('prep', 'produced', 'VERB', [dollars]),
  '13.47': ('compound', 'million', 'NUM', []),
  'million': ('nummod', 'dollars', 'NOUN', [13.47]),
  'dollars': ('pobj', 'for', 'ADP', [million]),
  '.': ('punct', 'produced', 'VERB', [])},
 'produced',
 {'The': [('DET', 'DT', 'ROOT', True)],
  'film': [('NOUN', 'NN', 'ROOT', False)],
  'was': [('AUX', 'VBD', 'ROOT', True)],
  'produced': [

In [63]:
ner_tag_dict4, dependency_dict4, root4, pos_tag_dict4

({'13.47 million dollars': 'MONEY'},
 {'The': ('det', 'film', 'NOUN', []),
  'film': ('nsubj', 'took', 'VERB', [The]),
  'took': ('ROOT', 'took', 'VERB', [film, them, dollars, .]),
  'them': ('dative', 'took', 'VERB', []),
  '13.47': ('compound', 'million', 'NUM', []),
  'million': ('nummod', 'dollars', 'NOUN', [13.47]),
  'dollars': ('dobj', 'took', 'VERB', [million]),
  '.': ('punct', 'took', 'VERB', [])},
 'took',
 {'The': ('DET', 'DT', 'ROOT', True),
  'film': ('NOUN', 'NN', 'ROOT', False),
  'took': ('VERB', 'VBD', 'ROOT', False),
  'them': ('PRON', 'PRP', 'ROOT', True),
  '13.47': ('NUM', 'CD', 'ROOT', False),
  'million': ('NUM', 'CD', 'ROOT', False),
  '.': ('NOUN', 'NNS', 'ROOT', False)})

VB verb, base form take
VBD verb, past tense took
VBG verb, gerund/present participle taking
VBN verb, past participle taken
VBP verb, sing. present, non-3d take
VBZ verb, 3rd person sing. present takes

In [49]:
#check tense of verb
def check_tense(root, pos_dict):
    tag = pos_dict[root][1]
    if tag == "VB":
        return "do"
    elif tag == "VBD":
        return "did"
    elif tag == "VBG":
        return "doing"
    elif tag == "VBN":
        return "done"
    elif tag == "VBP":
        return "do"
    elif tag == "VBZ":
        return "does"
    else:
        return None

In [139]:
#input: a single sentence, and its ner tag dict and dependency dict
#How much Question
#The film was produced by La Petite Reine and ARP Sélection for 13.47 million dollars.
#How much was the film produced by La Petite Reine and ARP Sélection?
def howMuchQ(sentence, doc, ner_tag_dict, dependency_dict, root, pos_dict):
    theMoney = ""
    output = ""
    theSubj = ""
    for k in ner_tag_dict.keys():
        if ner_tag_dict[k] == 'MONEY':
            theMoney = k
    #check passive tense 
    sentence_lst = sentence.split()
    root_ind = sentence_lst.index(root)
    root_token = doc[root_ind]
    if root_ind != 0:
        word_in_front_of_root = sentence_lst[root_ind -1] 
        #if it's passive tense
        if dependency_dict[word_in_front_of_root][0] == 'auxpass':
            root_aux = word_in_front_of_root
            output += 'How much '+ root_aux + ' '
            for n in dependency_dict:
                if dependency_dict[n][0] == 'nsubjpass':
                    theSubj = n    
            words_before_subj = dependency_dict[theSubj][-1]
            if len(words_before_subj) != 0:
                output += str(words_before_subj[0]).lower() + ' '
            output += theSubj + "?"
        else: #if it's not passive tense
            for n in dependency_dict:
                if dependency_dict[n][0] == 'nsubj':
                    theSubj = n  
            output += 'How much '
            #check tense
            tense = check_tense(root, pos_dict)
            if tense != None:
                output += tense + ' '
                #check subject
                for n in dependency_dict:
                    if dependency_dict[n][0] == 'nsubj':
                        theSubj = n    
                words_before_subj = dependency_dict[theSubj][-1]
                if len(words_before_subj) != 0:
                    output += str(words_before_subj[0]).lower() + ' '
                output += theSubj + ' ' + root_token.lemma_ + "?"
            else:
                return None
    return output

            
            
        
    

In [141]:
howMuchQ(sentence4, doc4, ner_tag_dict4, dependency_dict4, root4, pos_tag_dict4)

'How much did the film take?'

In [142]:
howMuchQ(sentence3, doc3, ner_tag_dict3, dependency_dict3, root3, pos_tag_dict3)

'How much was the film?'

# Why Question

In [None]:
# identify “because”, “since”, “for”, “due to”, “... result…”, “lead to”
# NP VP because (of) B