# Question Answering

In [70]:
import nltk
from nltk.tokenize import sent_tokenize
import spacy
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import wordnet
from nltk.corpus import stopwords


## Tools to Use

In [72]:
nlp = spacy.load('en_core_web_sm')
lemmatizer = WordNetLemmatizer()
sw_nltk = stopwords.words('english')
print(sw_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
def get_text(textfile):
    #put entire text file into a list of sentences
    text = []
    with open(textfile, "r") as f:
        for line in f:
            line = line.split('. ')
            if len(line) != 0:
                temp = line[0].strip('\n')
                if len(temp) != 0:
                    text.append(temp)
    return text

In [6]:
textfile1 = "data/set4/a1.txt"
textfile2 = "data/set4/a7.txt"

In [7]:
text_artist, text_hp = get_text(textfile1), get_text(textfile2)

In [9]:
def pos_tag_sentence(sentence):
    #list of sentences
    POS_tag_dict = dict()
    text = sentence.split()
    for line in text:
        tags = []
        doc = nlp(str(line))
        for token in doc:
            tags.append((token.pos_, token.tag_, token.dep_, token.is_stop, ))
        if len(tags) != 0:
            POS_tag_dict[token.text] = tags[0]
    return POS_tag_dict

In [10]:
#denpendency dict 
def dependency_dict(doc):
    out = dict()
    root = ''
    for token in doc:
        out[token.text] = (token.dep_, token.head.text, token.head.pos_,[child for child in token.children])
        if token.dep_ == "ROOT":
            root = token.text
    return out, root

In [11]:
def ner_tag_sentence(sentence):
    doc = nlp(str(sentence))
    NER_tag_dict = dict()
    tags = []
    for ent in doc.ents:
        # print(ent.text +'-' + ent.label_ + '\n')
        NER_tag_dict[ent.text] = ent.label_
    return NER_tag_dict

# Questions

In [12]:
#Bert sentence embedding
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [13]:
question1 = "How much did the film take?"
question2 = 'Is Harry Potter and the Prisoner of Azkaban a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros?'

question1_emb = np.array(sbert_model.encode(question1)).reshape(1, -1)
question2_emb = np.array(sbert_model.encode(question2)).reshape(1, -1)


In [14]:
#stop words (?), contains cardinal numbers
all_stopwords = nlp.Defaults.stop_words
len(all_stopwords)

326

## Turn the entire text into sentence embeddings

In [15]:
#input: text file 
#output: a dictionary of sentence embeddings {sentence: sentence embeddings}

def sentence_emb(text):
    result = dict()
    for sentence in text:
        sentence_emb = np.array(sbert_model.encode(sentence)).reshape(1, -1)
        result[sentence] = sentence_emb
    return result

In [16]:
text_artist_emb, text_hp_emb = sentence_emb(text_artist), sentence_emb(text_hp)


# Identify the original sentence

In [17]:
#input: question, text embedding dict
def find_best_sentence(question, text_emb_dict):
    question_emb = np.array(sbert_model.encode(question)).reshape(1, -1)
    sim_max = 0
    output = ""
    for sentence, sentence_emb in text_emb_dict.items():
        sim = cosine_similarity(sentence_emb, question_emb)
        if sim > sim_max:
            sim_max = sim
            output = sentence
    return output, sim_max

In [21]:
#Check Question type
question_types = ["Who", "When", "What" , "Where"," How many", "How long", "How much", "Why"]
auxiliary_verbs = ["am", "is", "are", "was", "were", "shall", "do", "does", "did","can", "could", "have", "need", "should", "will", "would"]




In [19]:
def check_question_type(question):
    #Check Question type
    for q_type in question_types:
        if question.startswith(q_type):
            return q_type
    for a_verb in auxiliary_verbs:
        a_verb = a_verb[0].upper() + a_verb[1:]
        if question.startswith(a_verb):
            return a_verb
    return "No idea"
    

In [22]:
check_question_type(question2), check_question_type(question1)

('Is', 'How much')

In [35]:
ner_tag_sentence(text_artist[3]), text_artist[3]


({'ten': 'CARDINAL',
  'Academy Awards': 'ORG',
  'five': 'CARDINAL',
  'Best Picture for Langmann': 'ORG',
  'Hazanavicius': 'ORG',
  'Dujardin': 'PERSON',
  'first': 'ORDINAL',
  'French': 'NORP'},
 'It was nominated for ten Academy Awards and won five, including Best Picture for Langmann, Best Director for Hazanavicius, and Best Actor for Dujardin, making him the first French actor ever to win for Best Actor')

In [59]:
#find sentence with question type as an argument
def find_best_k_sentence(question, text_emb_dict, k):
    question_type = check_question_type(question)
    question_emb = np.array(sbert_model.encode(question)).reshape(1, -1)
    sims_dict = dict()
    output = ""
    for sentence, sentence_emb in text_emb_dict.items():
        sim = cosine_similarity(sentence_emb, question_emb)
        #if question type and NER matches, +1
        if check_NER(sentence, question_type):
            sim += 1
        #check extra matching bonus
        extra = NER_match(question, sentence)
        print(extra, sentence)
        sims_dict[sentence] = sim + extra
    sorted_sim = sorted(sims_dict.items(), key = lambda kv: kv[1])[::-1][:k]
    return sorted_sim

In [74]:
#find same NER keys 
#find same words after getting rid of stopwords
def NER_match(question, sentence):
    output = 0
    question_ner = ner_tag_sentence(question)
    sentence_ner = ner_tag_sentence(sentence)
    #find same NER key and add 0.2 for each
    for key in question_ner.keys():
        print(key)
        all_keys = [key.lower() for key in sentence_ner.keys()]
        if key.lower() in all_keys:
            output += 0.2
    #find same words and plus 0.1 for each
    q_words = [word for word in question.split() if word.lower() not in sw_nltk]
    s_words = [word for word in sentence.split() if word.lower() not in sw_nltk]
    for word in q_words:
        if word in s_words:
            output += 0.1
    return output

In [65]:
#check if the sentence contains certain NER tags
#When - DATE
#Who - PERSON
#Where" - FAC, ORG, ORG, ORG
#"How many" - CARDINAL
#"How long" - DATE
#"How much" - MONEY
#"Why" - "because"

def check_NER(sentence, question_type):
    output = False
    ner_tag_dict = ner_tag_sentence(sentence)
    if question_type == "When":
        if ("TIME" in ner_tag_dict.values()) or ("DATE" in ner_tag_dict.values()):
            output = True
    elif question_type == "Who":
        if ("PERSON" in ner_tag_dict.values()):
            output = True
    elif question_type == "Where":  
        if ("FAC" in ner_tag_dict.values()) or ("ORG" in ner_tag_dict.values()) or ("ORG" in ner_tag_dict.values()) or ("LOC" in ner_tag_dict.values()):
            output = True
    elif question_type == "How much":
        if ("MONEY" in ner_tag_dict.values()):
            output = True
    elif question_type == "How long":
        if ("DATE" in ner_tag_dict.values()):
            output = True
    elif question_type == "How many":
        if ("CARDINAL" in ner_tag_dict.values()):
            output = True
    elif question_type == "How often":
        if ("CARDINAL" in ner_tag_dict.values()):
            output = True
    elif question_type == "Why":
        if ("because" in sentence) or ("due to" in sentence) or ("Due to" in sentence):
            output = True
    else:
        for aux in auxiliary_verbs:
            aux_cap = aux[0].upper() + aux[1:]
            if aux or aux_cap in sentence:
                output = True
    return output

In [36]:
check_NER(text_artist[3], 'How many')

True

In [40]:
output1, sim_max1 = find_best_sentence(question1, text_artist_emb)
output1, sim_max1

('The Artist (film)', array([[0.6446453]], dtype=float32))

In [41]:
output2, sim_max2 = find_best_sentence(question2, text_hp_emb)
output2, sim_max2

('Harry Potter and the Prisoner of Azkaban is a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros',
 array([[0.9850467]], dtype=float32))

In [46]:
find_best_k_sentence(question1, text_artist_emb, 10)

[('The Artist grossed $44,671,682 in North America, along with $88,761,174 in other territories for a worldwide total of $133,432,856',
  array([[1.2797168]], dtype=float32)),
 ('The Artist (film)', array([[0.6446453]], dtype=float32)),
 ('In response, director Hazanavicius released a statement:',
  array([[0.53274065]], dtype=float32)),
 ('Box officeEdit', array([[0.51973134]], dtype=float32)),
 ('ProductionEdit', array([[0.5167222]], dtype=float32)),
 ('Track listingEdit', array([[0.4924271]], dtype=float32)),
 ('ReleaseEdit', array([[0.49158168]], dtype=float32)),
 ('The film was produced by La Petite Reine and ARP Sélection for 13.47 million euro, including co-production support from Studio 37 and France 3 Cinéma, and pre-sales investment from Canal+ and CinéCinéma',
  array([[0.47808385]], dtype=float32)),
 ('Chief among the influences shaping the screenplay’s protagonist was Douglas Fairbanks',
  array([[0.4662738]], dtype=float32)),
 ('AccoladesEdit', array([[0.45877093]], dtype

In [47]:
find_best_k_sentence(question2, text_hp_emb, 10)

[('Harry Potter and the Prisoner of Azkaban is a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros',
  array([[1.9850466]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban (film)',
  array([[1.7757163]], dtype=float32)),
 ('The film was released on 31 May 2004 in the United Kingdom and on 4 June 2004 in North America, as the first Harry Potter film released into IMAX theatres and to be using IMAX Technology',
  array([[1.7456156]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban at Box Office Mojo',
  array([[1.732152]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban at AllMovie',
  array([[1.7223794]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban at the Internet Movie Database',
  array([[1.7186091]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban was nominated for Best Original Music Score (John Williams) and Best Visual Effects at the 77th Academy Awards held in 2005',
  array([[1.7139666

# Binary Answer

In [49]:
#Binary Questions
#Strip the punctuation at the end
#input: question and original sentence in text
#check : 1) negation words: no/not/'nt √ 
#        2) Adjectives -> check antonymn
#        3) Check Info matching?

def binary_answer(question, sentence):
    negate = False
    output = ""
    neg_words = {'no', 'not', "don't", "doesn't", "did't", "haven't", "hasn't", "wasn't", "weren't"}
    sentence_set = set([x.lower() for x in sentence.split()])
    question_set = set([x.lower() for x in question[:-1].split()])
    intersect_words = sentence_set.intersection(question_set)
    leftover_question = question_set - intersect_words
    leftover_sentence = sentence_set - intersect_words
    print("leftover words: ")
    print(leftover_question, leftover_question)
    
    negate = not check_negate(leftover_question, leftover_question)
    if negate:
        #No
        output += "No. " + sentence
    else:
        #Yes
        output += "Yes. " + sentence

    return output

In [50]:
#neg_words = ['no', 'not',"n't"]
#return true if same
def check_negate(set1, set2):
    print("hi")
    negate1, negate2 = True, True
    print("Negate", negate1, negate2)
    if len(set1) == 0 and len(set2) == 0:
        return negate1 and negate2
    for item1 in set1:
        if (item1 == 'no') or (item1 == 'not') or ("n't" in item1):
            negate1 = not negate1
#             print("1", item1)

    for item2 in set2:
        if (item2 == 'no') or (item2 == 'not') or ("n't" in item2):
            negate2 = not negate2
#             print("2", item2)
    print("Negate", negate1 and negate2)
            
    return negate1 and negate2
    

In [51]:
binary_answer(question2, output2)

leftover words: 
set() set()
hi
Negate True True


'Yes. Harry Potter and the Prisoner of Azkaban is a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros'

# HOW MUCH Answer

In [52]:
question_howmuch = "How much did The Prisoner of Azkaban make worldwide?"

In [75]:
find_best_k_sentence(question_howmuch, text_hp_emb, 5)

The Prisoner of Azkaban
0.4 Harry Potter and the Prisoner of Azkaban (film)
The Prisoner of Azkaban
0.4 Harry Potter and the Prisoner of Azkaban is a 2004 fantasy film directed by Alfonso Cuarón and distributed by Warner Bros
The Prisoner of Azkaban
0 The film stars Daniel Radcliffe as Harry Potter, alongside Rupert Grint and Emma Watson as Harry's best friends Ron Weasley and Hermione Granger
The Prisoner of Azkaban
0 The film was released on 31 May 2004 in the United Kingdom and on 4 June 2004 in North America, as the first Harry Potter film released into IMAX theatres and to be using IMAX Technology
The Prisoner of Azkaban
0.2 Prisoner of Azkaban grossed a total of $796.7 million worldwide, with its box office performance ranking as the lowest-grossing in the series
The Prisoner of Azkaban
0 PlotEdit
The Prisoner of Azkaban
0 Harry Potter, now aged 13, has been spending another dissatisfying summer at Privet Drive
The Prisoner of Azkaban
0 The trio are returning to Hogwarts for the 

[("The Prisoner of Azkaban made a total of $796.7 million worldwide, which made it 2004's second-highest-grossing film worldwide behind Shrek 2",
  array([[1.5841146]], dtype=float32)),
 ('Prisoner of Azkaban grossed a total of $796.7 million worldwide, with its box office performance ranking as the lowest-grossing in the series',
  array([[1.5567474]], dtype=float32)),
 ("Upon release, the film broke the record for biggest single day in the United Kingdom's box office history making £5.3 million on a Monday",
  array([[1.3981736]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban at AllMovie',
  array([[0.9762496]], dtype=float32)),
 ('Harry Potter and the Prisoner of Azkaban at Box Office Mojo',
  array([[0.9610052]], dtype=float32))]

In [64]:
ner_tag_sentence(question_howmuch)

{'The Prisoner of Azkaban': 'PERSON'}

In [68]:
q = "The Prisoner of Azkaban made a total of $796.7 million worldwide, which made it 2004's second-highest-grossing film worldwide behind Shrek 2"

def how_much_answer(question, text_emb):
    k = 1
    best_k_sentence = find_best_k_sentence(question_howmuch, text_emb, k)
    top_sentence = best_k_sentence[0]
    


In [69]:
ner_tag_sentence(q)

{'$796.7 million': 'MONEY', '2004': 'DATE', 'second': 'ORDINAL'}