In [3]:
import nltk
from nltk.tokenize import sent_tokenize
import spacy
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG

In [4]:
from pattern.en import conjugate, lemma, lexeme, PRESENT, SG
print (lemma('gave'))
print (lexeme('gave'))
print (lemma('took'))
print (lexeme('take'))
print (lexeme('had'))

give
['give', 'gives', 'giving', 'gave', 'given']
take
['take', 'takes', 'taking', 'took', 'taken']
['have', 'has', 'having', 'had', "haven't", "hasn't", "hadn't"]


# Question Answering

## Tools to Use

In [5]:
nlp = spacy.load('en_core_web_sm')
lemmatizer = WordNetLemmatizer()
sw_nltk = stopwords.words('english')
print(sw_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
def get_text(textfile):
    #put entire text file into a list of sentences
    text = []
    with open(textfile, "r") as f:
        for line in f:
            text = text + sent_tokenize(line)
    return text

In [7]:
textfile1 = "data/set4/a1.txt"
textfile2 = "data/set4/a7.txt"

In [8]:
text_artist, text_hp = get_text(textfile1), get_text(textfile2)

In [9]:
def pos_tag_sentence(sentence):
    #list of sentences
    POS_tag_dict = dict()
    text = sentence.split()
    for line in text:
        tags = []
        doc = nlp(str(line))
        for token in doc:
            tags.append((token.pos_, token.tag_, token.dep_, token.is_stop, ))
        if len(tags) != 0:
            POS_tag_dict[token.text] = tags[0]
    return POS_tag_dict

In [10]:
#denpendency dict 
def dependency_dict(doc):
    out = dict()
    root = ''
    for token in doc:
        out[token.text] = (token.dep_, token.head.text, token.head.pos_,[child for child in token.children])
        if token.dep_ == "ROOT":
            root = token.text
    return out, root

In [11]:
def ner_tag_sentence(sentence):
    doc = nlp(str(sentence))
    NER_tag_dict = dict()
    tags = []
    for ent in doc.ents:
        # print(ent.text +'-' + ent.label_ + '\n')
        NER_tag_dict[ent.text] = ent.label_
    return NER_tag_dict

# Questions

In [12]:
#Bert sentence embedding
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [13]:
question1 = "How much did the film take?"
question2 = 'Is Harry Potter and the Prisoner of Azkaban a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros?'

question1_emb = np.array(sbert_model.encode(question1)).reshape(1, -1)
question2_emb = np.array(sbert_model.encode(question2)).reshape(1, -1)

In [14]:
#stop words (?), contains cardinal numbers
all_stopwords = nlp.Defaults.stop_words
len(all_stopwords)

326

## Turn the entire text into sentence embeddings

In [15]:
#input: text file 
#output: a dictionary of sentence embeddings {sentence: sentence embeddings}

def sentence_emb(text):
    result = dict()
    for sentence in text:
        sentence_emb = np.array(sbert_model.encode(sentence)).reshape(1, -1)
        result[sentence] = sentence_emb
    return result

In [16]:
text_artist_emb, text_hp_emb = sentence_emb(text_artist), sentence_emb(text_hp)

# Identify the original sentence

In [17]:
#input: question, text embedding dict
def find_best_sentence(question, text_emb_dict):
    question_emb = np.array(sbert_model.encode(question)).reshape(1, -1)
    sim_max = 0
    output = ""
    for sentence, sentence_emb in text_emb_dict.items():
        sim = cosine_similarity(sentence_emb, question_emb)
        if sim > sim_max:
            sim_max = sim
            output = sentence
    return output, sim_max

In [18]:
#Check Question type
question_types = ["Who", "When", "What" , "Where"," How many", "How long", "How much", "Why"]
auxiliary_verbs = ["am", "is", "are", "was", "were", "shall", "do", "does", "did","can", "could", "have", "need", "should", "will", "would"]

In [19]:
def check_question_type(question):
    #Check Question type
    for q_type in question_types:
        if question.startswith(q_type):
            return q_type
    for a_verb in auxiliary_verbs:
        a_verb = a_verb[0].upper() + a_verb[1:]
        if question.startswith(a_verb):
            return a_verb
    return "No idea"

In [20]:
check_question_type(question2), check_question_type(question1)

('Is', 'How much')

In [21]:
ner_tag_sentence(text_artist[3]), text_artist[3]

({'Hollywood': 'GPE', 'between 1927 and 1932': 'DATE'},
 'The story takes place in Hollywood, between 1927 and 1932, and focuses on the relationship of an older silent film star and a rising young actress as silent cinema falls out of fashion and is replaced by the "talkies".')

In [22]:
#find sentence with question type as an argument
def find_best_k_sentence(question, text_emb_dict, k):
    question_type = check_question_type(question)
    question_emb = np.array(sbert_model.encode(question)).reshape(1, -1)
    sims_dict = dict()
    output = ""
    for sentence, sentence_emb in text_emb_dict.items():
        sim = cosine_similarity(sentence_emb, question_emb)
        #if question type and NER matches, +1
        if check_NER(sentence, question_type):
            sim += 1
        #check extra matching bonus
        extra = NER_match(question, sentence)
        # print(extra, sentence)
        sims_dict[sentence] = sim + extra
    sorted_sim = sorted(sims_dict.items(), key = lambda kv: kv[1])[::-1][:k]
    return sorted_sim

In [23]:
#find same NER keys 
#find same words after getting rid of stopwords
def NER_match(question, sentence):
    output = 0
    question_ner = ner_tag_sentence(question)
    sentence_ner = ner_tag_sentence(sentence)
    #find same NER key and add 0.2 for each
    for key in question_ner.keys():
        # print(key)
        all_keys = [key.lower() for key in sentence_ner.keys()]
        if key.lower() in all_keys:
            output += 0.2
    #find same words and plus 0.1 for each
    q_words = [word for word in question.split() if word.lower() not in sw_nltk]
    s_words = [word for word in sentence.split() if word.lower() not in sw_nltk]
    for word in q_words:
        if word in s_words:
            output += 0.1
    return output

In [24]:
#check if the sentence contains certain NER tags
#When - DATE
#Who - PERSON
#Where" - FAC, ORG, ORG, ORG
#"How many" - CARDINAL
#"How long" - DATE
#"How much" - MONEY
#"Why" - "because"

def check_NER(sentence, question_type):
    output = False
    ner_tag_dict = ner_tag_sentence(sentence)
    if question_type == "When":
        if ("TIME" in ner_tag_dict.values()) or ("DATE" in ner_tag_dict.values()):
            output = True
    elif question_type == "Who":
        if ("PERSON" in ner_tag_dict.values()):
            output = True
    elif question_type == "Where":  
        if ("FAC" in ner_tag_dict.values()) or ("ORG" in ner_tag_dict.values()) or ("ORG" in ner_tag_dict.values()) or ("LOC" in ner_tag_dict.values()):
            output = True
    elif question_type == "How much":
        if ("MONEY" in ner_tag_dict.values()):
            output = True
    elif question_type == "How long":
        if ("DATE" in ner_tag_dict.values()):
            output = True
    elif question_type == "How many":
        if ("CARDINAL" in ner_tag_dict.values()):
            output = True
    elif question_type == "How often":
        if ("CARDINAL" in ner_tag_dict.values()):
            output = True
    elif question_type == "Why":
        if ("because" in sentence) or ("due to" in sentence) or ("Due to" in sentence):
            output = True
    else:
        for aux in auxiliary_verbs:
            aux_cap = aux[0].upper() + aux[1:]
            if aux or aux_cap in sentence:
                output = True
    return output

In [25]:
check_NER(text_artist[3], 'How many')

False

In [26]:
output1, sim_max1 = find_best_sentence(question1, text_artist_emb)
output1, sim_max1

('The Artist (film)', array([[0.64464533]], dtype=float32))

In [27]:
output2, sim_max2 = find_best_sentence(question2, text_hp_emb)
output2, sim_max2

('Harry Potter and the Prisoner of Azkaban is a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros. Pictures.',
 array([[0.9836956]], dtype=float32))

In [28]:
find_best_k_sentence(question1, text_artist_emb, 10)

[('The Artist grossed $44,671,682 in North America, along with $88,761,174 in other territories for a worldwide total of $133,432,856.',
  array([[1.279254]], dtype=float32)),
 ('The Artist (film)', array([[0.64464533]], dtype=float32)),
 ('The film was produced by La Petite Reine and ARP Sélection for 13.47 million euro, including co-production support from Studio 37 and France 3 Cinéma, and pre-sales investment from Canal+ and CinéCinéma.',
  array([[0.58277726]], dtype=float32)),
 ('To recreate the slightly sped-up look of 1920s silent films, the film was shot at a slightly lower frame rate of 22 fps as opposed to the standard 24 fps.',
  array([[0.55374616]], dtype=float32)),
 ('In response, director Hazanavicius released a statement:',
  array([[0.5327408]], dtype=float32)),
 ('The film was initially given limited release in the United States on 23 November 2011.',
  array([[0.5283947]], dtype=float32)),
 ('Box officeEdit', array([[0.5197315]], dtype=float32)),
 ('ProductionEdit',

In [29]:
find_best_k_sentence(question2, text_hp_emb, 10)

[('Harry Potter and the Prisoner of Azkaban is a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros. Pictures.',
  array([[3.9836955]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban (film)',
  array([[2.5757165]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban at Box Office Mojo',
  array([[2.5321522]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban at AllMovie',
  array([[2.5223794]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban at the Internet Movie Database',
  array([[2.5186093]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban was nominated for Best Original Music Score (John Williams) and Best Visual Effects at the 77th Academy Awards held in 2005.',
  array([[2.5171807]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban at Metacritic',
  array([[2.4705005]], dtype=float32)),
 ('Prisoner of Azkaban earned notable critical acclaim, garnering a 91% "Certified Fresh" approval

# BINARY Answer

In [30]:
#Binary Questions
#Strip the punctuation at the end
#input: question and original sentence in text
#check : 1) negation words: no/not/'nt √ 
#        2) Adjectives -> check antonymn
#        3) Check Info matching?

def binary_answer(question, text_emb):
    negate = False
    output = ""
    neg_words = {'no', 'not', "don't", "doesn't", "did't", "haven't", "hasn't", "wasn't", "weren't"}
    sentence = find_best_k_sentence(question, text_emb, 1)[0][0]
    sentence_set = set([x.lower() for x in sentence.split()])
    question_set = set([x.lower() for x in question[:-1].split()])
    intersect_words = sentence_set.intersection(question_set)
    leftover_question = question_set - intersect_words
    leftover_sentence = sentence_set - intersect_words
    # print("leftover words: ")
    # print(leftover_question, leftover_question)
    
    negate = not check_negate(leftover_question, leftover_question)
    if negate:
        #No
        output += "No. " + sentence
    else:
        #Yes
        output += "Yes. " + sentence

    return output

In [31]:
#neg_words = ['no', 'not',"n't"]
#return true if same
def check_negate(set1, set2):
    # print("hi")
    negate1, negate2 = True, True
    # print("Negate", negate1, negate2)
    if len(set1) == 0 and len(set2) == 0:
        return negate1 and negate2
    for item1 in set1:
        if (item1 == 'no') or (item1 == 'not') or ("n't" in item1):
            negate1 = not negate1
            # print("1", item1)

    for item2 in set2:
        if (item2 == 'no') or (item2 == 'not') or ("n't" in item2):
            negate2 = not negate2
            # print("2", item2)
    # print("Negate", negate1 and negate2)
            
    return negate1 and negate2 

# Get Answer

In [32]:
def get_answer(question, text_emb):
    k = 1
    best_k_sentence = find_best_k_sentence(question, text_emb, k)
    top_sentence = best_k_sentence[0]
    return top_sentence[0]

# Testing

In [40]:
# Binary
question1 = "Is Harry Potter and the Prisoner of Azkaban a a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros?"
answer1 = binary_answer(question1, text_hp_emb)
print(answer1)
split_sentence(answer1, question1)

Yes. Harry Potter and the Prisoner of Azkaban is a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros. Pictures.


'Yes. Harry Potter and the Prisoner of Azkaban is a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros. Pictures.'

In [41]:
# Who
question2 = "Who has been spending another unhappy summer at Privet Drive?"
answer2 = get_answer(question2, text_hp_emb)
print(answer2)
split_sentence(answer2, question2)

Harry Potter, now aged 13, has been spending another dissatisfying summer at Privet Drive.


'Harry Potter, now aged 13, has been spending another dissatisfying summer at Privet Drive.'

In [42]:
# When
question3 = "When was the film released in United Kingdom?"
answer3 = get_answer(question3, text_hp_emb)
print(answer3)
split_sentence(answer3, question3)

The film was released on 31 May 2004 in the United Kingdom and on 4 June 2004 in North America, as the first Harry Potter film released into IMAX theatres and to be using IMAX Technology.


'The film was released on 31 May 2004 in the United Kingdom and on 4 June 2004 in North America.'

In [43]:
# What
question4 = "What is found ruined and empty?"
answer4 = get_answer(question4, text_hp_emb)
print(answer4)
split_sentence(answer4, question4)


The Fat Lady's portrait, which guards the Gryffindor quarters, is found ruined and empty.


"The Fat Lady's portrait, which guards the Gryffindor quarters, is found ruined and empty."

In [44]:
# Where
question5 = "Where is Harry forgiven by Minister of Magic Cornelius Fudge for using magic outside of Hogwarts?"
answer5 = get_answer(question5, text_hp_emb)
print(answer5)
split_sentence(answer5, question5)


The Knight Bus delivers Harry to the Leaky Cauldron, where he is forgiven by Minister of Magic Cornelius Fudge for using magic outside of Hogwarts.


'Harry is forgiven by Minister of Magic Cornelius Fudge for using magic outside of Hogwarts at the Leaky Cauldron.'

In [45]:
# Where
question5 = "Where was the film released on 31 May 2004?"
answer5 = get_answer(question5, text_hp_emb)
print(answer5)
split_sentence(answer5, question5)

The film was released on 31 May 2004 in the United Kingdom and on 4 June 2004 in North America, as the first Harry Potter film released into IMAX theatres and to be using IMAX Technology.


'The film was released on 31 May 2004 in the United Kingdom and on 4 June 2004 in North America.'

In [46]:
# Why
question6 = "Why did Oldman accept the part?"
answer6 = get_answer(question6, text_hp_emb)
print(answer6)
split_sentence(answer6, question6)

Oldman accepted the part because he needed the money, as he had not taken on any major work in several years, having decided to spend more time with his children.


'Oldman accepted the part because he needed the money.'

In [47]:
# How much
question7 = "How much did the Prisoner of Azkaban grossed a total of worldwide?"
get_answer(question7, text_hp_emb)
answer7 = get_answer(question7, text_hp_emb)
print(answer7)
split_sentence(answer7, question7)

Prisoner of Azkaban grossed a total of $796.7 million worldwide, with its box office performance ranking as the lowest-grossing in the series.


'Prisoner of Azkaban grossed a total of $796.7 million worldwide.'

In [48]:
# How many
question8 = "How many Academy Awards was the film nominated for?"
get_answer(question8, text_hp_emb)
answer8 = get_answer(question8, text_hp_emb)
print(answer8)
split_sentence(answer8, question8)

The film was nominated for two Academy Awards, Best Original Music Score and Best Visual Effects at the 77th Academy Awards in 2005.


'The film was nominated for two Academy Awards.'

In [49]:
# How long
question9 = "How long after Harris's death, did Cuaron choose Gambon as his replacement?"
answer9 = get_answer(question9, text_hp_emb)
print(answer9)
split_sentence(answer9, question9)

Four months after Harris's death, Cuarón chose Gambon as his replacement.


"Four months after Harris's death, Cuarón chose Gambon as his replacement."

# Split sentence and get the final answer

In [34]:
def relativeWhoClause(answer, question):
    ner_dict = ner_tag_sentence(nlp(answer.split(",")[0]+','))
    if ner_dict == {}: return answer
    output = list(ner_dict.keys())[-1]
    if ner_dict[output] not in ["PERSON"]:
        return answer
    else:
        question = question[:-1]
        question_list = question.split(" ")
        question_list.pop(0)
        question_list.insert(0,output)
        return " ".join(question_list) + "."

In [35]:
def relativeWhenClause(answer, question):
    ner_dict = ner_tag_sentence(nlp(answer.split(",")[0]+','))
    if ner_dict == {}: return answer
    output = list(ner_dict.keys())[-1]
    if ner_dict[output] not in ["DATE", "TIME"]: return answer
    else:
        question_list = question.split(" ")
        if question_list[1] in ["is", "are", "was", "were"]:
            curr = question_list[1]
            root = dependency_dict(nlp(question))[1]
            if root in ["is", "are", "was", "were"]: return answer
            question = question[:-1]
            question_list = question.split(" ")
            root_index = question_list.index(root)
            question_list.insert(root_index, curr)
            question_list.pop(0)
            question_list.pop(0)
            return " ".join(question_list) + " at " + output + "."
        elif question_list[1] in ["do", "does", "did"]:
            if question_list[1] == "does":
                root = dependency_dict(nlp(question))[1]
                root_index = question_list.index(root)
                new_root = lexeme(root)[1]
                question_list[root_index] = new_root
                question_list.pop(0)
                question_list.pop(0)
                return " ".join(question_list) + " at " + output + "."
            elif question_list[1] == "do":
                question_list.pop(0)
                question_list.pop(0)
                return " ".join(question_list) + " at " + output + "."
            else:
                root = dependency_dict(nlp(question))[1]
                root_index = question_list.index(root)
                new_root = lexeme(root)[3]
                question_list[root_index] = new_root
                question_list.pop(0)
                question_list.pop(0)
                return " ".join(question_list) + " at " + output + "."
        else:
            return answer

In [36]:
def relativeWhereClause(answer, question):
    ner_dict = ner_tag_sentence(nlp(answer.split(",")[0]+','))
    output = list(ner_dict.keys())[-1]
    if ner_dict[output] not in ["FAC", "ORG", "GPE", "LOC"]:
        return answer
    else:
        question = question[:-1]
        question_list = question.split(" ")
        if question_list[1] in ["is", "are", "was", "were"]:
            curr = question_list[1]
            root = dependency_dict(nlp(question))[1]
            root_index = question_list.index(root)
            question_list.insert(root_index, curr)
            question_list.pop(0)
            question_list.pop(0)
            return " ".join(question_list) + " at " + output + "."
        elif question_list[1] in ["do", "does", "did"]:
            if question_list[1] == "does":
                root = dependency_dict(nlp(question))[1]
                root_index = question_list.index(root)
                new_root = lexeme(root)[1]
                question_list[root_index] = new_root
                question_list.pop(0)
                question_list.pop(0)
                return " ".join(question_list) + " at " + output + "."
            elif question_list[1] == "do":
                question_list.pop(0)
                question_list.pop(0)
                return " ".join(question_list) + " at " + output + "."
            else:
                root = dependency_dict(nlp(question))[1]
                root_index = question_list.index(root)
                new_root = lexeme(root)[3]
                question_list[root_index] = new_root
                question_list.pop(0)
                question_list.pop(0)
                return " ".join(question_list) + " at " + output + "."
        else:
            return answer

In [37]:
def split_sentence(answer, question):
    question_emb = np.array(sbert_model.encode(question)).reshape(1, -1)
    answer_emb = np.array(sbert_model.encode(answer)).reshape(1, -1)
    overall_sim = cosine_similarity(answer_emb, question_emb)
    max_sim = 0
    max_part = None
    for sentence in answer.split(','):
#         print(sentence)
        if sentence.startswith(' '): 
            sentence = sentence[1:]
#         print(sentence)
        sentence_emb = np.array(sbert_model.encode(sentence)).reshape(1, -1)
        curr_sim = cosine_similarity(sentence_emb, question_emb)
#         print(curr_sim)
        if curr_sim > max_sim and check_complete_sentence(sentence):
            max_sim = curr_sim
            max_part = sentence
    if max_part == None or overall_sim > max_sim:
        max_part = answer
    if max_part[-1] != '.':
        max_part += '.'
    if max_part.startswith("who"):
        max_part = relativeWhoClause(answer, question)
    elif max_part.startswith("where"):
        max_part = relativeWhereClause(answer, question)
    elif max_part.startswith("when"):
        max_part = relativeWhenClause(answer, question)
    return max_part[0].upper() + max_part[1:]

In [38]:
def check_complete_sentence(sentence):
    complete = False
    dependence_dict, root = dependency_dict(nlp(sentence))
#     print(dependence_dict)
    for item in dependence_dict.values():
        if (item[0] == 'nsubj' or item[0] == 'nsubjpass'):
            complete = True
    return complete