In [1]:
import nltk
from nltk.tokenize import sent_tokenize
import spacy
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import wordnet

# TIPS
1. Delete everything inside a bracket first

In [2]:
nlp = spacy.load('en_core_web_sm')
lemmatizer = WordNetLemmatizer()
textfile = "data/set4/a7.txt"

sentence1 = "Harry Potter and the Prisoner of Azkaban is a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros."
sentence2 = "Harry Potter has been spending another unhappy summer with the Dursleys."
sentence3 = "The Prisoner of Azkaban made a total of $796.7 million worldwide"
sentence4 = "The film took them 13.47 million dollars."


doc1 = nlp(sentence1)
doc2 = nlp(sentence2)
doc3 = nlp(sentence3)
doc4 = nlp(sentence4)



In [3]:
textfile = "data/set4/a7.txt"

def get_text(textfile):
    #put entire text file into a list of sentences
    text = []
    with open(textfile, "r") as f:
        for line in f:
            line = line.split('. ')
            if len(line) != 0:
                temp = line[0].strip('\n')
                if len(temp) != 0:
                    text.append(temp)
    return text

In [4]:
textfile1 = "data/set4/a1.txt"
textfile2 = "data/set4/a7.txt"

In [5]:
text_artist, text_hp = get_text(textfile1), get_text(textfile2)

# POS tagging


In [7]:
def pos_tag_lst(text):
    #list of sentences
    POS_tag_dict = dict()
    for i,line in enumerate(text):
        tags = []
        doc = nlp(str(line))
        for token in doc:
            tags.append((token.text, token.pos_, token.tag_, token.dep_, token.is_stop))
        if len(tags) != 0:
            POS_tag_dict[i] = tags
    return POS_tag_dict

In [8]:
def pos_tag_sentence(sentence):
    #list of sentences
    POS_tag_dict = dict()
    text = sentence.split()
    for line in text:
        tags = []
        doc = nlp(str(line))
        for token in doc:
            tags.append((token.pos_, token.tag_, token.dep_, token.is_stop, ))
        if len(tags) != 0:
            POS_tag_dict[token.text] = tags[0]
    return POS_tag_dict

In [9]:
pos_tag_sentence(sentence3)

{'The': ('DET', 'DT', 'ROOT', True),
 'Prisoner': ('PROPN', 'NNP', 'ROOT', False),
 'of': ('ADP', 'IN', 'ROOT', True),
 'Azkaban': ('ADJ', 'JJ', 'ROOT', False),
 'made': ('VERB', 'VBD', 'ROOT', True),
 'a': ('DET', 'DT', 'ROOT', True),
 'total': ('ADJ', 'JJ', 'ROOT', False),
 '796.7': ('SYM', '$', 'nmod', False),
 'million': ('NUM', 'CD', 'ROOT', False),
 'worldwide': ('ADV', 'RB', 'ROOT', False)}

# Dependency Tree

In [10]:
#Token dict 
def dependency_dict(doc):
    out = dict()
    root = ''
    for token in doc:
        out[token.text] = (token.dep_, token.head.text, token.head.pos_,[child for child in token.children])
        if token.dep_ == "ROOT":
            root = token.text
    return out, root

In [11]:
token_Dict1, root1 = dependency_dict(doc1)
token_Dict1, root1

({'Harry': ('compound', 'Potter', 'PROPN', []),
  'Potter': ('nsubj', 'is', 'AUX', [Harry, and, Prisoner]),
  'and': ('cc', 'directed', 'VERB', []),
  'the': ('det', 'Prisoner', 'PROPN', []),
  'Prisoner': ('conj', 'Potter', 'PROPN', [the, of]),
  'of': ('prep', 'Prisoner', 'PROPN', [Azkaban]),
  'Azkaban': ('pobj', 'of', 'ADP', []),
  'is': ('ROOT', 'is', 'AUX', [Potter, film]),
  'a': ('det', 'film', 'NOUN', []),
  '2004': ('nummod', 'film', 'NOUN', []),
  'fantasy': ('compound', 'film', 'NOUN', []),
  'film': ('attr', 'is', 'AUX', [a, 2004, fantasy, directed]),
  'directed': ('acl', 'film', 'NOUN', [by, and, distributed]),
  'by': ('agent', 'distributed', 'VERB', [Bros.]),
  'Alfonso': ('compound', 'Cuarón', 'PROPN', []),
  'Cuarón': ('pobj', 'by', 'ADP', [Alfonso]),
  'distributed': ('conj', 'directed', 'VERB', [by]),
  'Warner': ('compound', 'Bros.', 'PROPN', []),
  'Bros.': ('pobj', 'by', 'ADP', [Warner])},
 'is')

In [12]:
token_Dict2, root2 = dependency_dict(doc2)
token_Dict2, root2

({'Harry': ('compound', 'Potter', 'PROPN', []),
  'Potter': ('nsubj', 'spending', 'VERB', [Harry]),
  'has': ('aux', 'spending', 'VERB', []),
  'been': ('aux', 'spending', 'VERB', []),
  'spending': ('ROOT',
   'spending',
   'VERB',
   [Potter, has, been, summer, with, .]),
  'another': ('det', 'summer', 'NOUN', []),
  'unhappy': ('amod', 'summer', 'NOUN', []),
  'summer': ('dobj', 'spending', 'VERB', [another, unhappy]),
  'with': ('prep', 'spending', 'VERB', [Dursleys]),
  'the': ('det', 'Dursleys', 'PROPN', []),
  'Dursleys': ('pobj', 'with', 'ADP', [the]),
  '.': ('punct', 'spending', 'VERB', [])},
 'spending')

In [13]:
token_Dict3, root3 = dependency_dict(doc3)
token_Dict3, root3

({'The': ('det', 'Prisoner', 'PROPN', []),
  'Prisoner': ('nsubj', 'made', 'VERB', [The, of]),
  'of': ('prep', 'total', 'NOUN', [million]),
  'Azkaban': ('pobj', 'of', 'ADP', []),
  'made': ('ROOT', 'made', 'VERB', [Prisoner, total, worldwide]),
  'a': ('det', 'total', 'NOUN', []),
  'total': ('dobj', 'made', 'VERB', [a, of]),
  '$': ('quantmod', 'million', 'NUM', []),
  '796.7': ('compound', 'million', 'NUM', []),
  'million': ('pobj', 'of', 'ADP', [$, 796.7]),
  'worldwide': ('advmod', 'made', 'VERB', [])},
 'made')

# NER Tagging

In [14]:
def ner_tag(text):
    NER_tag_dict = dict()
    for i,line in enumerate(text):
        tags = []
        doc = nlp(str(line))

        for ent in doc.ents:
            # print(ent.text +'-' + ent.label_ + '\n')
            tags.append(ent.text +'-' + ent.label_)
        if len(tags) != 0:
            NER_tag_dict[i] = tags
    return NER_tag_dict

In [15]:
def ner_tag_sentence(sentence):
    doc = nlp(str(sentence))
    NER_tag_dict = dict()
    tags = []
    for ent in doc.ents:
        # print(ent.text +'-' + ent.label_ + '\n')
        NER_tag_dict[ent.text] = ent.label_
    return NER_tag_dict

# Binary Question

In [16]:
auxiliary_verbs = ["am", "is", "are", "was", "were", "shall", "do", "does", "did","can", "could", "have", "need", "should", "will", "would"]



In [17]:
#input: a single sentence, with its dependency dict and root word
def binaryQ(sentence, token_dict, root):
    output = ''
    if root in auxiliary_verbs:
        output += root.capitalize() + ' '
    for k in sentence.split():
        if k != root:
            output += k + ' '
    output = output[:-2]+ '?'
    return output
    

In [18]:
binaryQ(sentence1, token_Dict1, root1)

'Is Harry Potter and the Prisoner of Azkaban a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros?'

# Who Question

In [19]:
pos_tag_sentence(sentence2)

{'Harry': ('PROPN', 'NNP', 'ROOT', False),
 'Potter': ('NOUN', 'NN', 'ROOT', False),
 'has': ('VERB', 'VBZ', 'ROOT', True),
 'been': ('VERB', 'VBN', 'ROOT', True),
 'spending': ('VERB', 'VBG', 'ROOT', False),
 'another': ('DET', 'DT', 'ROOT', True),
 'unhappy': ('ADJ', 'JJ', 'ROOT', False),
 'summer': ('NOUN', 'NN', 'ROOT', False),
 'with': ('ADP', 'IN', 'ROOT', True),
 'the': ('DET', 'DT', 'ROOT', True),
 '.': ('PROPN', 'NNP', 'ROOT', False)}

In [20]:
ner_tag_dict2 = ner_tag_sentence(sentence2)

In [21]:
dependency_dict2, root2 = dependency_dict(doc2)
dependency_dict2, root2,ner_tag_dict2 

({'Harry': ('compound', 'Potter', 'PROPN', []),
  'Potter': ('nsubj', 'spending', 'VERB', [Harry]),
  'has': ('aux', 'spending', 'VERB', []),
  'been': ('aux', 'spending', 'VERB', []),
  'spending': ('ROOT',
   'spending',
   'VERB',
   [Potter, has, been, summer, with, .]),
  'another': ('det', 'summer', 'NOUN', []),
  'unhappy': ('amod', 'summer', 'NOUN', []),
  'summer': ('dobj', 'spending', 'VERB', [another, unhappy]),
  'with': ('prep', 'spending', 'VERB', [Dursleys]),
  'the': ('det', 'Dursleys', 'PROPN', []),
  'Dursleys': ('pobj', 'with', 'ADP', [the]),
  '.': ('punct', 'spending', 'VERB', [])},
 'spending',
 {'Harry Potter': 'PERSON', 'summer': 'DATE', 'Dursleys': 'PERSON'})

In [22]:
#input: a single sentence, and its ner tag dict and dependency dict
#Who Question
def whoQ(sentence, ner_tag_dict, dependency_dict):
    #find PERSON tag
    theName = ''
    output = ''
    for k in ner_tag_dict.keys():
        if ner_tag_dict[k] == 'PERSON':
            #check if is a subject
            names = k.split()
            for n in names:
                print(dependency_dict[n])
                if dependency_dict[n][0] == 'nsubj':
                    theName = k
    print(theName)
    output = sentence.replace(theName, 'who')
    output = output[:-1] + "?"
    output = output[0].upper() + output[1:]
    return output
    

In [23]:
whoQ(sentence2, ner_tag_dict2, dependency_dict2)

('compound', 'Potter', 'PROPN', [])
('nsubj', 'spending', 'VERB', [Harry])
('pobj', 'with', 'ADP', [the])
Harry Potter


'Who has been spending another unhappy summer with the Dursleys?'

# How much Question


In [24]:
ner_tag_dict3 = ner_tag_sentence(sentence3)
dependency_dict3, root3 = dependency_dict(doc3)
pos_tag_dict3 = pos_tag_sentence(sentence3)
# "The film was produced by La Petite Reine and ARP Sélection for 13.47 million euros." don't identify as MONEY
# "The film was produced by La Petite Reine and ARP Sélection for 13.47 million dollars."
    # How much was the film produced by La Petite Reine and ARP Sélection?
# "The film costs 13.47 million dollars."
    # How much does the film costs?


In [25]:
ner_tag_dict4 = ner_tag_sentence(sentence4)
dependency_dict4, root4 = dependency_dict(doc4)
pos_tag_dict4 = pos_tag_sentence(sentence4)

In [26]:
ner_tag_dict3, dependency_dict3, root3, pos_tag_dict3

({'$796.7 million': 'MONEY'},
 {'The': ('det', 'Prisoner', 'PROPN', []),
  'Prisoner': ('nsubj', 'made', 'VERB', [The, of]),
  'of': ('prep', 'total', 'NOUN', [million]),
  'Azkaban': ('pobj', 'of', 'ADP', []),
  'made': ('ROOT', 'made', 'VERB', [Prisoner, total, worldwide]),
  'a': ('det', 'total', 'NOUN', []),
  'total': ('dobj', 'made', 'VERB', [a, of]),
  '$': ('quantmod', 'million', 'NUM', []),
  '796.7': ('compound', 'million', 'NUM', []),
  'million': ('pobj', 'of', 'ADP', [$, 796.7]),
  'worldwide': ('advmod', 'made', 'VERB', [])},
 'made',
 {'The': ('DET', 'DT', 'ROOT', True),
  'Prisoner': ('PROPN', 'NNP', 'ROOT', False),
  'of': ('ADP', 'IN', 'ROOT', True),
  'Azkaban': ('ADJ', 'JJ', 'ROOT', False),
  'made': ('VERB', 'VBD', 'ROOT', True),
  'a': ('DET', 'DT', 'ROOT', True),
  'total': ('ADJ', 'JJ', 'ROOT', False),
  '796.7': ('SYM', '$', 'nmod', False),
  'million': ('NUM', 'CD', 'ROOT', False),
  'worldwide': ('ADV', 'RB', 'ROOT', False)})

In [27]:
ner_tag_dict4, dependency_dict4, root4, pos_tag_dict4

({'13.47 million dollars': 'MONEY'},
 {'The': ('det', 'film', 'NOUN', []),
  'film': ('nsubj', 'took', 'VERB', [The]),
  'took': ('ROOT', 'took', 'VERB', [film, them, dollars, .]),
  'them': ('dative', 'took', 'VERB', []),
  '13.47': ('compound', 'million', 'NUM', []),
  'million': ('nummod', 'dollars', 'NOUN', [13.47]),
  'dollars': ('dobj', 'took', 'VERB', [million]),
  '.': ('punct', 'took', 'VERB', [])},
 'took',
 {'The': ('DET', 'DT', 'ROOT', True),
  'film': ('NOUN', 'NN', 'ROOT', False),
  'took': ('VERB', 'VBD', 'ROOT', False),
  'them': ('PRON', 'PRP', 'ROOT', True),
  '13.47': ('NUM', 'CD', 'ROOT', False),
  'million': ('NUM', 'CD', 'ROOT', False),
  '.': ('NOUN', 'NNS', 'ROOT', False)})

VB verb, base form take
VBD verb, past tense took
VBG verb, gerund/present participle taking
VBN verb, past participle taken
VBP verb, sing. present, non-3d take
VBZ verb, 3rd person sing. present takes

In [28]:
#check tense of verb
def check_tense(root, pos_dict):
    tag = pos_dict[root][1]
    if tag == "VB":
        return "do"
    elif tag == "VBD":
        return "did"
    elif tag == "VBG":
        return "doing"
    elif tag == "VBN":
        return "done"
    elif tag == "VBP":
        return "do"
    elif tag == "VBZ":
        return "does"
    else:
        return None

In [29]:
#input: a single sentence, and its ner tag dict and dependency dict
#How much Question
#The film was produced by La Petite Reine and ARP Sélection for 13.47 million dollars.
#How much was the film produced by La Petite Reine and ARP Sélection?
def howMuchQ(sentence, doc, ner_tag_dict, dependency_dict, root, pos_dict):
    theMoney = ""
    output = ""
    theSubj = ""
    for k in ner_tag_dict.keys():
        if ner_tag_dict[k] == 'MONEY':
            theMoney = k
    #check passive tense 
    sentence_lst = sentence.split()
    root_ind = sentence_lst.index(root)
    root_token = doc[root_ind]
    if root_ind != 0:
        word_in_front_of_root = sentence_lst[root_ind -1] 
        #if it's passive tense
        if dependency_dict[word_in_front_of_root][0] == 'auxpass':
            root_aux = word_in_front_of_root
            output += 'How much '+ root_aux + ' '
            for n in dependency_dict:
                if dependency_dict[n][0] == 'nsubjpass':
                    theSubj = n    
            words_before_subj = dependency_dict[theSubj][-1]
            if len(words_before_subj) != 0:
                output += str(words_before_subj[0]).lower() + ' '
            output += theSubj + "?"
            
        else: #if it's not passive tense
            for n in dependency_dict:
                if dependency_dict[n][0] == 'nsubj':
                    theSubj = n  
            output += 'How much '
            #check tense
            tense = check_tense(root, pos_dict)
            if tense != None:
                output += tense + ' '
                #check subject
                for i, n in enumerate(dependency_dict):
                    if dependency_dict[n][0] == 'nsubj':
                        theSubj = str(n)    

                words_before_subj = dependency_dict[theSubj][-1]
                if len(words_before_subj) != 0:
                    for t in words_before_subj:
                        output += str(t).lower() + " "
                output += theSubj + ' ' + root_token.lemma_ + "?"
            else:
                return None
    return output

    

In [30]:
howMuchQ(sentence4, doc4, ner_tag_dict4, dependency_dict4, root4, pos_tag_dict4)

'How much did the film take?'

In [31]:
howMuchQ(sentence3, doc3, ner_tag_dict3, dependency_dict3, root3, pos_tag_dict3)

'How much did the of Prisoner make?'

# Why Question

In [32]:
# identify “because”, “since”, “for”, “due to”, “... result…”, “lead to”
# NP VP because (of) something
# Since ..., NP VP
# NP VP for ... (reason)
# NP result (in) NP
# NP lead to NP


In [33]:
sentence5 = "Oldman accepted the part because he needed the money"
doc5 = nlp(sentence5)
ner_tag_dict5 = ner_tag_sentence(sentence5)
dependency_dict5, root5 = dependency_dict(doc5)
pos_tag_dict5 = pos_tag_sentence(sentence5)


In [34]:
ner_tag_dict5, dependency_dict5, pos_tag_dict5, root5

({},
 {'Oldman': ('nsubj', 'accepted', 'VERB', []),
  'accepted': ('ROOT', 'accepted', 'VERB', [Oldman, part, needed]),
  'the': ('det', 'money', 'NOUN', []),
  'part': ('dobj', 'accepted', 'VERB', [the]),
  'because': ('mark', 'needed', 'VERB', []),
  'he': ('nsubj', 'needed', 'VERB', []),
  'needed': ('advcl', 'accepted', 'VERB', [because, he, money]),
  'money': ('dobj', 'needed', 'VERB', [the])},
 {'Oldman': ('PROPN', 'NNP', 'ROOT', False),
  'accepted': ('VERB', 'VBD', 'ROOT', False),
  'the': ('DET', 'DT', 'ROOT', True),
  'part': ('NOUN', 'NN', 'ROOT', True),
  'because': ('SCONJ', 'IN', 'ROOT', True),
  'he': ('PRON', 'PRP', 'ROOT', True),
  'needed': ('VERB', 'VBD', 'ROOT', False),
  'money': ('NOUN', 'NN', 'ROOT', False)},
 'accepted')

In [35]:
sentence6 = "Only the first Quidditch game was kept in the film, due to its importance to the storyline."
doc6 = nlp(sentence6)
ner_tag_dict6 = ner_tag_sentence(sentence6)
dependency_dict6, root6 = dependency_dict(doc6)
pos_tag_dict6 = pos_tag_sentence(sentence6)

In [36]:
ner_tag_dict6, dependency_dict6, pos_tag_dict6, root6

({'first': 'ORDINAL', 'Quidditch': 'NORP'},
 {'Only': ('advmod', 'game', 'NOUN', []),
  'the': ('det', 'storyline', 'NOUN', []),
  'first': ('amod', 'game', 'NOUN', []),
  'Quidditch': ('compound', 'game', 'NOUN', []),
  'game': ('nsubjpass', 'kept', 'VERB', [Only, the, first, Quidditch]),
  'was': ('auxpass', 'kept', 'VERB', []),
  'kept': ('ROOT', 'kept', 'VERB', [game, was, in, ,, due, .]),
  'in': ('prep', 'kept', 'VERB', [film]),
  'film': ('pobj', 'in', 'ADP', [the]),
  ',': ('punct', 'kept', 'VERB', []),
  'due': ('prep', 'kept', 'VERB', [to, importance]),
  'to': ('prep', 'importance', 'NOUN', [storyline]),
  'its': ('poss', 'importance', 'NOUN', []),
  'importance': ('pobj', 'due', 'ADP', [its, to]),
  'storyline': ('pobj', 'to', 'ADP', [the]),
  '.': ('punct', 'kept', 'VERB', [])},
 {'Only': ('ADV', 'RB', 'ROOT', True),
  'the': ('DET', 'DT', 'ROOT', True),
  'first': ('ADV', 'RB', 'ROOT', True),
  'Quidditch': ('PROPN', 'NNP', 'ROOT', False),
  'game': ('NOUN', 'NN', 'ROOT',

In [37]:
sentence7 = "Censors initially gave it adult ratings due to profanity"
doc7 = nlp(sentence7)
ner_tag_dict7 = ner_tag_sentence(sentence7)
dependency_dict7, root7 = dependency_dict(doc7)
pos_tag_dict7 = pos_tag_sentence(sentence7)

In [38]:
ner_tag_dict7, dependency_dict7, pos_tag_dict7, root7

({},
 {'Censors': ('nsubj', 'gave', 'VERB', []),
  'initially': ('advmod', 'gave', 'VERB', []),
  'gave': ('ROOT', 'gave', 'VERB', [Censors, initially, it, ratings, due]),
  'it': ('dative', 'gave', 'VERB', []),
  'adult': ('compound', 'ratings', 'NOUN', []),
  'ratings': ('dobj', 'gave', 'VERB', [adult]),
  'due': ('prep', 'gave', 'VERB', [to, profanity]),
  'to': ('pcomp', 'due', 'ADP', []),
  'profanity': ('pobj', 'due', 'ADP', [])},
 {'Censors': ('NOUN', 'NNS', 'ROOT', False),
  'initially': ('ADV', 'RB', 'ROOT', False),
  'gave': ('VERB', 'VBD', 'ROOT', False),
  'it': ('PRON', 'PRP', 'ROOT', True),
  'adult': ('NOUN', 'NN', 'ROOT', False),
  'ratings': ('NOUN', 'NNS', 'ROOT', False),
  'due': ('ADJ', 'JJ', 'ROOT', True),
  'to': ('PART', 'TO', 'ROOT', True),
  'profanity': ('NOUN', 'NN', 'ROOT', False)},
 'gave')

In [39]:
def WhyQ(sentence, doc, ner_tag_dict, dependency_dict, pos_tag_dict, root):
    #A do sth Because B
    theSubj = ""
    output = ""
    theObj = ""
    sentence_lst = sentence.split()
    root_ind = sentence_lst.index(root)
    #check tense
    tense = check_tense(root, pos_tag_dict)
    
    
    if "because" or "due to" or "Due to" in sentence:
        #check if passive tense:
        word_in_front_of_root = sentence_lst[root_ind-1]
        if dependency_dict[word_in_front_of_root][0] == 'auxpass':
            root_aux = word_in_front_of_root
            for n in dependency_dict:
                if dependency_dict[n][0] == 'nsubjpass': 
                    words_before_subj = dependency_dict[n][-1]
                    if len(words_before_subj) != 0:
                        for t in words_before_subj:
                            if str(t) not in ner_tag_dict.keys():
                                theSubj += str(t).lower() + ' '
                            else:
                                theSubj += str(t) + ' '
                    theSubj += n + " "
            output += "Why " + root_aux + " "+ theSubj + doc[root_ind].text + "?"

        else:
            #not passive tense:
            #find subject
            for n in dependency_dict:
                if dependency_dict[n][0] == 'nsubj':
                    #find determinant if there is one
                    words_before_subj = dependency_dict[n][-1]
                    if len(words_before_subj) != 0:
                        for t in words_before_subj:
                            theSubj += str(t) + ' '
                    theSubj += n  
                    break
            #find object
            for n in dependency_dict:
                if dependency_dict[n][0] == 'dobj':
                    #find determinant if there is one
                    words_before_obj = dependency_dict[n][-1]
                    if len(words_before_obj) != 0:
                        for t in words_before_obj:
                            theObj += str(t) + ' '
                    theObj += n
                    break

            #check tense
            tense = check_tense(root, pos_tag_dict)
            #Get rid of things after because

            #Why + do/does/did sb do sth?
            output += "Why " + tense + " " + theSubj +  " " + doc[root_ind].lemma_ + " " + theObj + "?"
    return output
        
    
    

In [40]:
WhyQ(sentence5, doc5, ner_tag_dict5, dependency_dict5, pos_tag_dict5, root5)

'Why did Oldman accept the part?'

In [41]:
WhyQ(sentence6, doc6, ner_tag_dict6, dependency_dict6, pos_tag_dict6, root6)

'Why was only the first Quidditch game kept?'

In [42]:
root6

'kept'

In [43]:
WhyQ(sentence7, doc7, ner_tag_dict7, dependency_dict7, pos_tag_dict7, root7)

'Why did Censors give adult ratings?'

# Question Answering

sentence1 = "Harry Potter and the Prisoner of Azkaban is a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros."
sentence2 = "Harry Potter has been spending another unhappy summer with the Dursleys."
sentence3 = "The film was produced by La Petite Reine and ARP Sélection for 13.47 million dollars."
sentence4 = "The film took them 13.47 million dollars."

question4 = "How much did the film take?"


In [44]:
#Bert sentence embedding
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [45]:

question1 = "How much did the film take?"
question2 = 'Is Harry Potter and the Prisoner of Azkaban a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros?'

question1_emb = np.array(sbert_model.encode(question1)).reshape(1, -1)
question2_emb = np.array(sbert_model.encode(question2)).reshape(1, -1)




In [46]:
#stop words (?), contains cardinal numbers
all_stopwords = nlp.Defaults.stop_words
len(all_stopwords)

326

## Turn the entire text into sentence embeddings

In [47]:
#input: text file 
#output: a dictionary of sentence embeddings {sentence: sentence embeddings}

def sentence_emb(text):
    result = dict()
    for sentence in text:
        sentence_emb = np.array(sbert_model.encode(sentence)).reshape(1, -1)
        result[sentence] = sentence_emb
    return result

In [48]:
text_artist_emb, text_hp_emb = sentence_emb(text_artist), sentence_emb(text_hp)


In [49]:
def find_best_sentence(question, text_emb_dict):
    question_emb = np.array(sbert_model.encode(question)).reshape(1, -1)
    sim_max = 0
    output = ""
    for sentence, sentence_emb in text_emb_dict.items():
        sim = cosine_similarity(sentence_emb, question_emb)
        if sim > sim_max:
            sim_max = sim
            output = sentence
    return output, sim_max

In [50]:
a = [1, 2, 5, 3]
sorted(a)[::-1]


[5, 3, 2, 1]

In [55]:
#Check Question type
question_types = ["Who", "When", "What" , "Where"," How many", "How long", "How much", "Why"]

In [53]:
def check_question_type(question):
    #Check Question type
    for q_type in question_types:
        if question.startswith(q_type):
            return q_type
    for a_verb in auxiliary_verbs:
        a_verb = a_verb[0].upper() + a_verb[1:]
        if question.startswith(a_verb):
            return a_verb
    return "No idea"
    

In [58]:
check_question_type(question2), check_question_type(question1)

('Is', 'How much')

In [51]:
#find sentence with question type as an argument
def find_best_k_sentence(question, text_emb_dict, k, question_type):
    question_emb = np.array(sbert_model.encode(question)).reshape(1, -1)
    sims_dict = dict()
    output = ""
    for sentence, sentence_emb in text_emb_dict.items():
        sim = cosine_similarity(sentence_emb, question_emb)
        if check_NER(sentence, question_type):
            sim += 1
        sims_dict[sentence] = sim
    sorted_sim = sorted(sims_dict.items(), key = lambda kv: kv[1])[::-1][:k]
    return sorted_sim

In [52]:
#check if the sentence contains certain NER tags
#When - date
#Who - Person
#Where",
#"How many"
#"How long"
#"How much"
#"Why"


def check_NER(sentence, question_type):
    output = False
    ner_tag_dict = ner_tag_sentence(sentence)
    if question_type == "when":
        return 

SyntaxError: invalid syntax (<ipython-input-52-554e8ad007e2>, line 9)

In [None]:
output1, sim_max1 = find_best_sentence(question1, text_artist_emb)

In [None]:
output1, sim_max1


In [None]:
output2, sim_max2 = find_best_sentence(question2, text_hp_emb)
output2, sim_max2

In [None]:
find_best_k_sentence(question1, text_artist_emb, 15)

In [57]:
output2, question2

NameError: name 'output2' is not defined

# Binary Answer

In [None]:
#Binary Questions
#Strip the punctuation at the end
#input: question and original sentence in text
#check : 1) negation words: no/not/'nt √ 
#        2) Adjectives -> check antonymn
#        3) Check Info matching?

def binary_answer(question, sentence):
    negate = False
    output = ""
    neg_words = {'no', 'not', "don't", "doesn't", "did't", "haven't", "hasn't", "wasn't", "weren't"}
    sentence_set = set([x.lower() for x in sentence.split()])
    question_set = set([x.lower() for x in question[:-1].split()])
    intersect_words = sentence_set.intersection(question_set)
    leftover_question = question_set - intersect_words
    leftover_sentence = sentence_set - intersect_words
    print("leftover words: ")
    print(leftover_question, leftover_question)
    
    negate = not check_negate(leftover_question, leftover_question)
    if negate:
        #No
        output += "No. " + sentence
    else:
        #Yes
        output += "Yes. " + sentence

    return output

In [None]:
#neg_words = ['no', 'not',"n't"]
#return true if same
def check_negate(set1, set2):
    print("hi")
    negate1, negate2 = True, True
    print("Negate", negate1, negate2)
    if len(set1) == 0 and len(set2) == 0:
        return negate1 and negate2
    for item1 in set1:
        if (item1 == 'no') or (item1 == 'not') or ("n't" in item1):
            negate1 = not negate1
#             print("1", item1)

    for item2 in set2:
        if (item2 == 'no') or (item2 == 'not') or ("n't" in item2):
            negate2 = not negate2
#             print("2", item2)
    print("Negate", negate1 and negate2)
            
    return negate1 and negate2
    

In [None]:
set1 = set(["nose", "not", "hi", "won't"])
set2 = set(["n", "hi", "won't"])
check_negate(set1, set2)

In [None]:
binary_answer(question2, output2)

# HOW MUCH Answer

In [None]:
question_howmuch = "How much did The Prisoner of Azkaban make worldwide?"

In [None]:
find_best_k_sentence(question_howmuch, text_hp_emb)

In [None]:
output, sim_max

In [None]:
for token in doc:
    ancestors = [t.text for t in token.ancestors]
    children = [t.text for t in token.children]
    print(token.text, "\t", token.i, "\t", 
          token.pos_, "\t", token.dep_, "\t", 
          ancestors, "\t", children)