In [20]:
import json
import ast
import numpy as np
import pandas as pd
import nltk
import spacy
from spacy import displacy
from collections import Counter
import random;random.seed(1)
import en_core_web_sm
import textacy
import re
import unidecode
from text2digits import text2digits
t2d = text2digits.Text2Digits()
nlp = en_core_web_sm.load()


In [4]:
with open('squad-dev-v1.1.json', 'r') as f:
    df = json.load(f)

In [24]:
#build dataframe for squad to get random sample of 100
topics = []
paragraphs = []
questions = []
answers = []
squad = pd.DataFrame()
for title in range(len(df['data'])):
    for p in range(len(df['data'][title]['paragraphs'])):
        for qa in range(len(df['data'][title]['paragraphs'][p]['qas'])):
            topics.append(df['data'][title]['title'])
            paragraphs.append(df['data'][title]['paragraphs'][p]['context'])
            questions.append(df['data'][title]['paragraphs'][p]['qas'][qa]['question'])
            answers.append(df['data'][title]['paragraphs'][p]['qas'][qa]['answers'][0]['text'])
squad['topics'] = topics
squad['paragraphs'] = paragraphs
squad['questions'] = questions
squad['answers'] = answers
sample100 = squad.sample(100, random_state = 1)
sample100.head()

Unnamed: 0,topics,paragraphs,questions,answers
3280,Steam_engine,"In the 1840s and 50s, there were attempts to o...",How is lap provided by overlapping the admissi...,lengthening rubbing surfaces of the valve
220,Super_Bowl_50,"The Panthers offense, which led the NFL in sco...",Who started at tight end for the Panthers?,Greg Olsen
9004,Prime_number,"Hence, 6 is not prime. The image at the right ...","Besides 1,3 and 7, what other number must all ...",9
7093,Private_school,"In the final years of the apartheid era, paren...",How do academic results in former Model C scho...,better
1790,Computational_complexity_theory,Other important complexity classes include BPP...,IP and AM are most commonly defined by what ty...,Interactive


In [48]:
def preprocess(answer):
    prep_answer = answer.lstrip('\'')
    prep_answer = prep_answer.rstrip('.')
    prep_answer = prep_answer.replace('\""','')
    prep_answer = prep_answer.replace('\"','')
    prep_answer = prep_answer.replace('\"','')
    prep_answer = unidecode.unidecode(prep_answer)
    prep_answer = prep_answer.lower()
    return prep_answer

In [25]:
def check_answer_type(paragraph, prep_answer):
    answer_type = []
    article = nlp(paragraph)

    # get all named entities in paragraph
    ent_list = [t2d.convert(str(i).lower()).replace(' ','') for i in list(article.ents)]
    # check whether correct answer starts with 'the'
    with_the = 'the ' + prep_answer 
    in_ent_list = [1 for i in ent_list if (prep_answer.replace(' ','') in i) | \
                   (with_the.replace(' ','') in i) | (i in prep_answer.replace(' ',''))]
    if in_ent_list!=[]:
        #print('answer is a named entity')
        answer_type.append('NE')
    # get numbers in paragraph
    number_list = [token.orth_ for token in article if token.pos_=='NUM']
    if prep_answer in number_list:
        answer_type.append('number')
    # get all noun phrases and adjective phrases in paragraph
    if len(prep_answer) < 2:
        np_list = [token.orth_.lower() for token in article if (token.pos_=='PROPN') | (token.pos_=='NOUN')]
        adj_list = [token.orth_.lower() for token in article if token.pos_=='ADJ']
        hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),paragraph)
        adj_list.extend(hyphen_sep_words)
    else:
        np_list = [str(i) for i in list(article.noun_chunks)]
        single_noun = [token.orth_.lower() for token in article if (token.pos_=='PROPN') | (token.pos_=='NOUN')]
        np_list.extend(single_noun)
        adj_pattern = r'<ADV>*<ADJ>+<PART>*<DET>*<NOUN>*'
        doc = textacy.Doc(paragraph, lang='en_core_web_sm')
        adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, adj_pattern)]
        single_adj = [token.orth_.lower() for token in article if token.pos_=='ADJ']
        hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),paragraph)
        adj_list.extend(hyphen_sep_words)
    in_np_list = [1 for i in np_list if (prep_answer in i) | (with_the in i) | (i in prep_answer)]
    in_adj_list = [1 for i in adj_list if (prep_answer in i) | (with_the in i) | (i in prep_answer)]
    if in_np_list!=[]:
        #print('answer is a noun phrase')
        answer_type.append('NP')
    elif in_adj_list!=[]:
        #print('answer is an adj/adj-phrase')
        answer_type.append('ADJ')

    # get verb phrases
    vb_pattern = r'<VERB>+<ADV>*<PART>*<DET>*<NOUN>+'
    doc = textacy.Doc(paragraph, lang='en_core_web_sm')
    vb_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, vb_pattern)]
    if prep_answer in vb_list:
        #print('answer is a verb phrase')
        answer_type.append('VB')
    address_pattern = r'^([^,]+),\s([A-Z]{2})'
    doc = textacy.Doc(paragraph, lang='en_core_web_sm')
    add_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, address_pattern)]
    if answer_type == []:
        print('No type found')
        answer_type.append('Unknown type')
    return answer_type
    

In [44]:
def generate_distractor(topic, paragraph, question, correct_answer, answer_type):
    # get length of answer
    correct_answer_len = len(correct_answer)
#     print('correct answer: ' + str(correct_answer))
#     print('length: ' + str(correct_answer_len))

    # tokenize paragraph
    article = nlp(unidecode.unidecode(paragraph))
    doc = textacy.Doc(paragraph, lang='en_core_web_sm')
        
    # Preprocessing for same sentence distractor generation
    # get all sentences in paragraph
    sent_list = [str(i).lower() for i in list(article.sents)]
    # find sentence that has answer
    for s in sent_list:
        if correct_answer in s:
            ans_sent = s
    # tokenize sentence
    sentence = nlp(ans_sent)
    sent = textacy.Doc(sentence, lang='en_core_web_sm')
        
    # Preprocessing for same topic distractor generation
    # choose random paragraph from same topic
    topic_index = next((index for (index, d) in enumerate(df['data']) if d["title"] == topic), None)
    index_list = list(range(len(df['data'][topic_index]['paragraphs'])))
    # make sure isn't the same paragraph as current paragraph
    p_index = next((index for (index, d) in enumerate(df['data'][topic_index]['paragraphs'])\
                    if d["context"] == paragraph), None)
    index_list.remove(p_index)
    alt_p_index = random.choice(index_list)
    alt_p = df['data'][topic_index]['paragraphs'][alt_p_index]['context']
    alt_article = nlp(alt_p)
    alt_paragraph = textacy.Doc(alt_p, lang='en_core_web_sm')
        
    # Preprocessing for different topic distractor generation
    # choose random topic
    topic_list = list(range(len(df['data'])))
    # make sure isn't the same topic as current topic
    topic_list.remove(topic_index)
    alt_topic_index = random.choice(topic_list)
    alt_topic = df['data'][alt_topic_index]['title']
    alt_topic_paragraph = df['data'][alt_topic_index]['paragraphs'][0]['context']
    # select first paragraph from topic - option to randomize selection as well
    alt_topic_article = nlp(alt_topic_paragraph)
    alt_topic = textacy.Doc(alt_topic_paragraph, lang='en_core_web_sm')
    
    # Generate distractors
    # If answer is named entity
    if 'NE' in answer_type:
        # get all named entities in paragraph
        ent_list = [str(i) for i in list(article.ents)]
        ent_labels = [x.label_ for x in article.ents]
        # get all named entities in sentence
        sent_ent_list = [str(i) for i in list(sentence.ents)]
        sent_labels = [x.label_ for x in sentence.ents]
        alt_topic_ent_list = [str(i) for i in list(alt_topic_article.ents)]
        alt_topic_labels = [x.label_ for x in alt_topic_article.ents]
        alt_article_ent_list = [str(i) for i in list(alt_article.ents)]
        alt_article_labels = [x.label_ for x in alt_article.ents]
        
        # create table of named entities
        max_length = max(len(ent_list), len(sent_ent_list), len(alt_article_ent_list),len(alt_topic_ent_list))
#         print(max_length)
        ne_pd = pd.DataFrame()
        ne_pd['entity'] = ent_list + (['NA'] * (max_length - len(ent_list)))
        ne_pd['label'] = list(ent_labels) + (['NA'] * (max_length - len(ent_list)))
        ne_pd['sent_entity'] = sent_ent_list + (['NA'] * (max_length - len(sent_ent_list)))
        ne_pd['sent_label'] = list(sent_labels) + (['NA'] * (max_length - len(sent_ent_list)))
        ne_pd['altp_entity'] = alt_article_ent_list + (['NA'] * (max_length - len(alt_article_ent_list)))
        ne_pd['altp_label'] = list(alt_article_labels) + (['NA'] * (max_length - len(alt_article_ent_list)))
        ne_pd['altt_entity'] = alt_topic_ent_list + (['NA'] * (max_length - len(alt_topic_ent_list)))
        ne_pd['altt_label'] = list(alt_topic_labels) + (['NA'] * (max_length - len(alt_topic_ent_list)))
        
        ans_label = [ne_pd[ne_pd['entity']==e]['label'].values[0] for e in ne_pd['entity'] if e in prep_answer]
#         print(ans_label)
        alt_ans_list = list(ne_pd[(ne_pd['label'].isin(ans_label))]['entity'])
        alt_ans_list.extend(list(ne_pd[(ne_pd['sent_label'].isin(ans_label))]['sent_entity']))
        alt_ans_list.extend(list(ne_pd[(ne_pd['altp_label'].isin(ans_label))]['altp_entity']))
        alt_ans_list.extend(list(ne_pd[(ne_pd['altt_label'].isin(ans_label))]['altt_entity']))
        wrong_answers = set(alt_ans_list) - set([str(correct_answer)])
    
    
    elif 'NP' in answer_type:
        np_list = [str(i) for i in list(article.noun_chunks)]
        sent_np_list = [s.orth_ for s in list(sentence.noun_chunks)]
        alt_article_np_list = [str(i) for i in list(alt_article.noun_chunks)]
        alt_topic_np_list = [str(i) for i in list(alt_topic_article.noun_chunks)]
        
        single_noun = [token.orth_.lower() for token in article if (token.pos_=='PROPN') | (token.pos_=='NOUN')]
        single_noun_sent = [token.orth_.lower() for token in sentence if (token.pos_=='PROPN') | (token.pos_=='NOUN')]
        alt_single_noun = [token.orth_.lower() for token in alt_article if (token.pos_=='PROPN') | (token.pos_=='NOUN')]
        alt_topic_single_noun = [token.orth_.lower() for token in alt_topic_article if (token.pos_=='PROPN') | (token.pos_=='NOUN')]

        np_list.extend(single_noun)
        sent_np_list.extend(single_noun_sent)
        alt_article_np_list.extend(alt_single_noun)
        alt_topic_np_list.extend(alt_topic_single_noun)
        wrong_answers = set(np_list+sent_np_list+alt_topic_np_list+alt_article_np_list)-set([correct_answer])

    
    elif 'ADJ' in answer_type:
        adj_pattern = r'<ADV>*<ADJ>+<PART>*<DET>*<NOUN>*'
        
        adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, adj_pattern)]
        sent_adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(sent, adj_pattern)]
        alt_paragraph_adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_paragraph, adj_pattern)]
        alt_topic_adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_topic, adj_pattern)]
        
        single_adj = [token.orth_.lower() for token in article if token.pos_=='ADJ']
        single_adj_sent = [token.orth_.lower() for token in sentence if token.pos_=='ADJ']
        alt_paragraph_single_adj = [token.orth_.lower() for token in alt_p if token.pos_=='ADJ']
        alt_topic_single_adj = [token.orth_.lower() for token in alt_topic_paragraph if token.pos_=='ADJ']
        
        hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),paragraph)
        hyphen_sep_words_sent = re.findall(r'\w+(?:-\w+)+'.lower(),sentence)
        alt_p_hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),alt_p)
        alt_topic_hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),alt_topic_paragraph)
        
        adj_list.extend(hyphen_sep_words).extend(single_adj)
        sent_adj_list.extend(hyphen_sep_words_sent).extend(single_adj_sent)
        alt_paragraph_adj_list.extend(alt_p_hyphen_sep_words).extend(alt_paragraph_single_adj)
        alt_topic_adj_list.extend(alt_topic_hyphen_sep_words).extend(alt_topic_single_adj)
        wrong_answers = set(adj_list+sent_adj_list+alt_paragraph_adj_list+alt_topic_adj_list)-set([correct_answer])
        
    elif 'number' in answer_type:
        number_list = [token.orth_ for token in article if token.pos_=='NUM']
        number_list_sent = [token.orth_ for token in sentence if token.pos_=='NUM']
        alt_paragraph_number_list = [token.orth_ for token in alt_p if token.pos_=='NUM']
        alt_topic_number_list = [token.orth_ for token in alt_topic_paragraph if token.pos_=='NUM']
        wrong_answers = set(number_list+number_list_sent+alt_paragraph_number_list+alt_topic_number_list)-set([correct_answer])
    
    
    elif 'VB' in answer_type:
        vb_pattern = r'<VERB>+<ADV>*<PART>*<DET>*<NOUN>+'
        vb_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, vb_pattern)]
        sent_vb_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(sent, vb_pattern)]
        alt_paragraph_vb_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_paragraph, vb_pattern)]
        alt_topic_vb_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_topic, vb_pattern)]  
        wrong_answers = set(vb_list+sent_vb_list+alt_paragraph_vb_list+alt_topic_vb_list)-set([correct_answer])
    else:
        print('no distractors found')
        wrong_answers = [0,0,0]
    if len(wrong_answers) > 3:
        wrong_answers = random.sample(wrong_answers,3)
    return (topic, paragraph, question, correct_answer, answer_type, wrong_answers)

In [22]:
#### Test Example for numerical answer
test_num_topic = df['data'][1]['title']
test_num = df['data'][1]['paragraphs'][5]['context']
test_num_q = df['data'][1]['paragraphs'][5]['qas'][0]['question']
test_num_a = df['data'][1]['paragraphs'][5]['qas'][0]['answers'][0]['text']
print('Correct Answer: ' + str(df['data'][1]['paragraphs'][5]['qas'][0]['answers'][0]['text']))
print('Preprocessed answer: ' + str(preprocess(test_num_a)))
prep_answer = preprocess(test_num_a)
print('Answer type: ' + str(check_answer_type(test_num,prep_answer)))
answer_type = check_answer_type(test_num,prep_answer)
print(generate_distractor(test_num_topic, test_num, test_num_q, prep_answer, answer_type))

Correct Answer: 1816
Preprocessed answer: 1816
Answer type: ['NE', 'number']
('Warsaw', 'Another important library – the University Library, founded in 1816, is home to over two million items. The building was designed by architects Marek Budzyński and Zbigniew Badowski and opened on 15 December 1999. It is surrounded by green. The University Library garden, designed by Irena Bajerska, was opened on 12 June 2002. It is one of the largest and most beautiful roof gardens in Europe with an area of more than 10,000 m2 (107,639.10 sq ft), and plants covering 5,111 m2 (55,014.35 sq ft). As the university garden it is open to the public every day.', 'When was the University Library founded?', '1816', ['NE', 'number'], ['the first half of the 10th century', 'centuries', '15 December 1999'])


In [14]:

ent_list = [str(i) for i in list(nlp(test_np2).ents)]
ent_labels = [x.label_ for x in nlp(test_np2).ents]
ne_pd = pd.DataFrame()
ne_pd['entity'] = ent_list
ne_pd['label'] = list(ent_labels)
print(ne_pd)
get_entity = [ne_pd[ne_pd['entity']==e]['label'].values[0] for e in ne_pd['entity'] if e in test_np2_a]
print(get_entity)
list(ne_pd[(ne_pd['label'].isin(get_entity))]['entity'])


                                               entity    label
0                                           July 1888     DATE
1                                               Brown   PERSON
2                                                Peck   PERSON
3                                 George Westinghouse   PERSON
4                                               Tesla   PERSON
5                                              60,000    MONEY
6                                                2.50    MONEY
7                                                  AC      ORG
8                                        Westinghouse      ORG
9                                               Tesla  PRODUCT
10                                           one year     DATE
11                                              2,000    MONEY
12                                             52,700    MONEY
13                                              today     DATE
14  the Westinghouse Electric & Manufacturing Comp...  

['60,000',
 '2.50',
 'AC',
 'Westinghouse',
 '2,000',
 '52,700',
 "the Westinghouse Electric & Manufacturing Company's"]

In [23]:
test_np2_topic = df['data'][3]['title']
test_np2 = df['data'][3]['paragraphs'][21]['context']
test_np2_q = df['data'][3]['paragraphs'][21]['qas'][1]['question']
test_np2_a = df['data'][3]['paragraphs'][21]['qas'][1]['answers'][0]['text']
print('Correct Answer: ' + str(test_np2_a))
print('Preprocessed answer: ' + str(preprocess(test_np2_a)))
prep_answer = preprocess(test_np2_a)
print('Answer type: ' + str(check_answer_type(test_np2,prep_answer)))
answer_type = check_answer_type(test_np2,prep_answer)
print(generate_distractor(test_np2_topic, test_np2, test_np2_q, prep_answer, answer_type))

Correct Answer: $60,000 in cash and stock and a royalty of $2.50 per AC horsepower produced by each motor
Preprocessed answer: $60,000 in cash and stock and a royalty of $2.50 per ac horsepower produced by each motor
Answer type: ['NE', 'NP']
('Nikola_Tesla', "In July 1888, Brown and Peck negotiated a licensing deal with George Westinghouse for Tesla's polyphase induction motor and transformer designs for $60,000 in cash and stock and a royalty of $2.50 per AC horsepower produced by each motor. Westinghouse also hired Tesla for one year for the large fee of $2,000 ($52,700 in today's dollars) per month to be a consultant at the Westinghouse Electric & Manufacturing Company's Pittsburgh labs.", "How much did Westinghouse pay to license Tesla's designs?", '$60,000 in cash and stock and a royalty of $2.50 per ac horsepower produced by each motor', ['NE', 'NP'], ['60,000', '52,700', '2,000'])


In [24]:
test_vp_topic = df['data'][25]['title']
test_vp = df['data'][25]['paragraphs'][25]['context']
test_vp_q = df['data'][25]['paragraphs'][25]['qas'][0]['question']
test_vp_a = df['data'][25]['paragraphs'][25]['qas'][0]['answers'][0]['text']
print('Correct Answer: ' + str(test_vp_a))
print('Preprocessed answer: ' + str(preprocess(test_vp_a)))
prep_answer = preprocess(test_vp_a)
print('Answer type: ' + str(check_answer_type(test_vp,prep_answer)))
answer_type = check_answer_type(test_vp,prep_answer)
print(generate_distractor(test_vp_topic, test_vp, test_vp_q, prep_answer, answer_type))

Correct Answer: opened the gates
Preprocessed answer: opened the gates
Answer type: ['NP', 'VB']
('Genghis_Khan', "The city of Bukhara was not heavily fortified, with a moat and a single wall, and the citadel typical of Khwarezmi cities. The city leaders opened the gates to the Mongols, though a unit of Turkish defenders held the city's citadel for another twelve days. Survivors from the citadel were executed, artisans and craftsmen were sent back to Mongolia, young men who had not fought were drafted into the Mongolian army and the rest of the population was sent into slavery. As the Mongol soldiers looted the city, a fire broke out, razing most of the city to the ground. Genghis Khan had the city's surviving population assemble in the main mosque of the town, where he declared that he was the flail of God, sent to punish them for their sins.", 'How did the leaders of the city of Bukhara respond to the Mongol attack?', 'opened the gates', ['NP', 'VB'], ['army', 'christian', 'Luther'])

### Run on Sample of 100

In [31]:
s = pd.Series(list(range(0,100)))
sample100.set_index([s], inplace=True)
sample100.head()

Unnamed: 0,topics,paragraphs,questions,answers
0,Steam_engine,"In the 1840s and 50s, there were attempts to o...",How is lap provided by overlapping the admissi...,lengthening rubbing surfaces of the valve
1,Super_Bowl_50,"The Panthers offense, which led the NFL in sco...",Who started at tight end for the Panthers?,Greg Olsen
2,Prime_number,"Hence, 6 is not prime. The image at the right ...","Besides 1,3 and 7, what other number must all ...",9
3,Private_school,"In the final years of the apartheid era, paren...",How do academic results in former Model C scho...,better
4,Computational_complexity_theory,Other important complexity classes include BPP...,IP and AM are most commonly defined by what ty...,Interactive


In [56]:
for i in range(len(sample100)):
    try:
        print('Sample: ' + str(i))
        print('Raw Answer: ' + str(sample100.loc[i]['answers']))
        prep_answer = preprocess(sample100.loc[i]['answers'])
        answer_type = check_answer_type(sample100.loc[i]['paragraphs'],prep_answer)
        print('Answer type: ' + str(answer_type))
        topic, paragraph, question, correct_answer, answer_type, wrong_answers = generate_distractor(sample100.loc[i]['topics'], sample100.loc[i]['paragraphs'], \
                                  sample100.loc[i]['questions'], prep_answer, answer_type)
        print('---------------------------------------------------------')
        print(paragraph)
        print('Question: ' + str(question))
        print('Preprocessed answer: ' + str(correct_answer))
        print('Wrong answers: ' + str(wrong_answers))
        print('\n')
    except:
        print("error occurred for sample#: " + str(i))

Sample: 0
Raw Answer: lengthening rubbing surfaces of the valve
Answer type: ['NP']
---------------------------------------------------------
In the 1840s and 50s, there were attempts to overcome this problem by means of various patent valve gears with a separate, variable cutoff expansion valve riding on the back of the main slide valve; the latter usually had fixed or limited cutoff. The combined setup gave a fair approximation of the ideal events, at the expense of increased friction and wear, and the mechanism tended to be complicated. The usual compromise solution has been to provide lap by lengthening rubbing surfaces of the valve in such a way as to overlap the port on the admission side, with the effect that the exhaust side remains open for a longer period after cut-off on the admission side has occurred. This expedient has since been generally considered satisfactory for most purposes and makes possible the use of the simpler Stephenson, Joy and Walschaerts motions. Corliss, 

---------------------------------------------------------
In 1993, for the franchise's 30th anniversary, another charity special, titled Dimensions in Time was produced for Children in Need, featuring all of the surviving actors who played the Doctor and a number of previous companions. It also featured a crossover with the soap opera EastEnders, the action taking place in the latter's Albert Square location and around Greenwich. The special was one of several special 3D programmes the BBC produced at the time, using a 3D system that made use of the Pulfrich effect requiring glasses with one darkened lens; the picture would look normal to those viewers who watched without the glasses.
Question: What special was created for the show's 30th anniversary?
Preprocessed answer: dimensions in time
Wrong answers: set()


Sample: 9
Raw Answer: 2006
Answer type: ['NE', 'number', 'NP']
---------------------------------------------------------
The show has received recognition as one of Britain's 

---------------------------------------------------------
All of the forces in the universe are based on four fundamental interactions. The strong and weak forces are nuclear forces that act only at very short distances, and are responsible for the interactions between subatomic particles, including nucleons and compound nuclei. The electromagnetic force acts between electric charges, and the gravitational force acts between masses. All other forces in nature derive from these four fundamental interactions. For example, friction is a manifestation of the electromagnetic force acting between the atoms of two surfaces, and the Pauli exclusion principle, which does not permit atoms to pass through each other. Similarly, the forces in springs, modeled by Hooke's law, are the result of electromagnetic forces and the Exclusion Principle acting together to return an object to its equilibrium position. Centrifugal forces are acceleration forces that arise simply from the acceleration of rotati

Answer type: ['NP']
---------------------------------------------------------
There are also many places commemorating the heroic history of Warsaw. Pawiak, an infamous German Gestapo prison now occupied by a Mausoleum of Memory of Martyrdom and the museum, is only the beginning of a walk in the traces of Heroic City. The Warsaw Citadel, an impressive 19th-century fortification built after the defeat of the November Uprising, was a place of martyr for the Poles. Another important monument, the statue of Little Insurgent located at the ramparts of the Old Town, commemorates the children who served as messengers and frontline troops in the Warsaw Uprising, while the impressive Warsaw Uprising Monument by Wincenty Kućma was erected in memory of the largest insurrection of World War II.
Question: What was the name of the infamous German Gestapo prison?
Preprocessed answer: pawiak
Wrong answers: ['water', 'Little Insurgent', 'century']


Sample: 24
Raw Answer: catch more sunlight in deep wa

---------------------------------------------------------
Many locals and tourists frequent the southern California coast for its popular beaches, and the desert city of Palm Springs is popular for its resort feel and nearby open spaces.
Question: Other than for its resort feel, what is Palm Springs popular for?
Preprocessed answer: open spaces
Wrong answers: ['a course', 'a descriptor', 'moniker']


Sample: 31
Raw Answer: an international data communications network headquartered in San Jose, CA
Answer type: ['NE', 'NP']
---------------------------------------------------------
Tymnet was an international data communications network headquartered in San Jose, CA that utilized virtual call packet switched technology and used X.25, SNA/SDLC, BSC and ASCII interfaces to connect host computers (servers)at thousands of large companies, educational institutions, and government agencies. Users typically connected via dial-up connections or dedicated async connections. The business consisted 

---------------------------------------------------------
The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tra

---------------------------------------------------------
The first recorded travels by Europeans to China and back date from this time. The most famous traveler of the period was the Venetian Marco Polo, whose account of his trip to "Cambaluc," the capital of the Great Khan, and of life there astounded the people of Europe. The account of his travels, Il milione (or, The Million, known in English as the Travels of Marco Polo), appeared about the year 1299. Some argue over the accuracy of Marco Polo's accounts due to the lack of mentioning the Great Wall of China, tea houses, which would have been a prominent sight since Europeans had yet to adopt a tea culture, as well the practice of foot binding by the women in capital of the Great Khan. Some suggest that Marco Polo acquired much of his knowledge through contact with Persian traders since many of the places he named were in Persian.
Question: What was the Italian title of Polo's book?
Preprocessed answer: il milione
Wrong answers: s

---------------------------------------------------------
Tesla gained experience in telephony and electrical engineering before emigrating to the United States in 1884 to work for Thomas Edison in New York City. He soon struck out on his own with financial backers, setting up laboratories and companies to develop a range of electrical devices. His patented AC induction motor and transformer were licensed by George Westinghouse, who also hired Tesla for a short time as a consultant. His work in the formative years of electric power development was involved in a corporate alternating current/direct current "War of Currents" as well as various patent battles.
Question: What other invention of Tesla's did Westinghouse license?
Preprocessed answer: transformer
Wrong answers: ['His patented AC induction motor', 'supply', 'the company name']


Sample: 56
Raw Answer: popularly based authority
Answer type: ['NP']
---------------------------------------------------------
Sudbury model democrati

---------------------------------------------------------
By far the most famous work of Norman art is the Bayeux Tapestry, which is not a tapestry but a work of embroidery. It was commissioned by Odo, the Bishop of Bayeux and first Earl of Kent, employing natives from Kent who were learned in the Nordic traditions imported in the previous half century by the Danish Vikings.
Question: What kind of needlework was used in the creation of the Bayeux Tapestry?
Preprocessed answer: embroidery
Wrong answers: ['raiders', 'king', 'force']


Sample: 63
Raw Answer: Ancient Egypt
Answer type: ['NE', 'NP']
---------------------------------------------------------
The glass collection covers 4000 years of glass making, and has over 6000 items from Africa, Britain, Europe, America and Asia. The earliest glassware on display comes from Ancient Egypt and continues through the Ancient Roman, Medieval, Renaissance covering areas such as Venetian glass and Bohemian glass and more recent periods, includin

---------------------------------------------------------
For many native populations, the elimination of French power in North America meant the disappearance of a strong ally and counterweight to British expansion, leading to their ultimate dispossession. The Ohio Country was particularly vulnerable to legal and illegal settlement due to the construction of military roads to the area by Braddock and Forbes. Although the Spanish takeover of the Louisiana territory (which was not completed until 1769) had modest repercussions, the British takeover of Spanish Florida resulted in the westward migration of tribes that did not want to do business with the British, and a rise in tensions between the Choctaw and the Creek, historic enemies whose divisions the British at times exploited. The change of control in Florida also prompted most of its Spanish Catholic population to leave. Most went to Cuba, including the entire governmental records from St. Augustine, although some Christianized Ya

---------------------------------------------------------
Normans came into Scotland, building castles and founding noble families who would provide some future kings, such as Robert the Bruce, as well as founding a considerable number of the Scottish clans. King David I of Scotland, whose elder brother Alexander I had married Sybilla of Normandy, was instrumental in introducing Normans and Norman culture to Scotland, part of the process some scholars call the "Davidian Revolution". Having spent time at the court of Henry I of England (married to David's sister Maud of Scotland), and needing them to wrestle the kingdom from his half-brother Máel Coluim mac Alaxandair, David had to reward many with lands. The process was continued under David's successors, most intensely of all under William the Lion. The Norman-derived feudal system was applied in varying degrees to most of Scotland. Scottish families of the names Bruce, Gray, Ramsay, Fraser, Ogilvie, Montgomery, Sinclair, Pollock, Bur

---------------------------------------------------------
Between 1991 and 2000, the total area of forest lost in the Amazon rose from 415,000 to 587,000 square kilometres (160,000 to 227,000 sq mi), with most of the lost forest becoming pasture for cattle. Seventy percent of formerly forested land in the Amazon, and 91% of land deforested since 1970, is used for livestock pasture. Currently, Brazil is the second-largest global producer of soybeans after the United States. New research however, conducted by Leydimere Oliveira et al., has shown that the more rainforest is logged in the Amazon, the less precipitation reaches the area and so the lower the yield per hectare becomes. So despite the popular perception, there has been no economical advantage for Brazil from logging rainforest zones and converting these to pastoral fields.
Question: What percentage of the land cleared in the Amazon is used for growing livestock?
Preprocessed answer: 91%
Wrong answers: {'seventy percent', 'Seve

---------------------------------------------------------
The phrase "Hiding behind (or 'watching from behind') the sofa" entered British pop culture, signifying in humour the stereotypical early-series behaviour of children who wanted to avoid seeing frightening parts of a television programme while remaining in the room to watch the remainder of it. The phrase retains this association with Doctor Who, to the point that in 1991 the Museum of the Moving Image in London named their exhibition celebrating the programme "Behind the Sofa". The electronic theme music too was perceived as eerie, novel, and frightening, at the time. A 2012 article placed this childhood juxtaposition of fear and thrill "at the center of many people's relationship with the show", and a 2011 online vote at Digital Spy deemed the series the "scariest TV show of all time".
Question: What did people vote the Doctor Who series as in a 2011 online vote?
Preprocessed answer: scariest tv show of all time
Wrong answers:

In [53]:
sample100.loc[21]['paragraphs']

'New Rochelle, located in the county of Westchester on the north shore of Long Island Sound, seemed to be the great location of the Huguenots in New York. It is said that they landed on the coastline peninsula of Davenports Neck called "Bauffet\'s Point" after traveling from England where they had previously taken refuge on account of religious persecution, four years before the revocation of the Edict of Nantes. They purchased from John Pell, Lord of Pelham Manor, a tract of land consisting of six thousand one hundred acres with the help of Jacob Leisler. It was named New Rochelle after La Rochelle, their former strong-hold in France. A small wooden church was first erected in the community, followed by a second church that built of stone. Previous to the erection of it, the strong men would often walk twenty-three miles on Saturday evening, the distance by the road from New Rochelle to New York, to attend the Sunday service. The church was eventually replaced by a third, Trinity-St. 

In [54]:
article = nlp(unidecode.unidecode(sample100.loc[21]['paragraphs']))
correct_answer = preprocess(sample100.loc[21]['answers'])
print(correct_answer)
sent_list = [str(i).lower() for i in list(article.sents)]
    # find sentence that has answer
for s in sent_list:
    print(unidecode.unidecode(s))
    print('original: ' + str(s))
    if correct_answer in s:
        ans_sent = s
        print('answer found')

trinity-st. paul's episcopal church
new rochelle, located in the county of westchester on the north shore of long island sound, seemed to be the great location of the huguenots in new york.
original: new rochelle, located in the county of westchester on the north shore of long island sound, seemed to be the great location of the huguenots in new york.
it is said that they landed on the coastline peninsula of davenports neck called "bauffet's point" after traveling from england where they had previously taken refuge on account of religious persecution, four years before the revocation of the edict of nantes.
original: it is said that they landed on the coastline peninsula of davenports neck called "bauffet's point" after traveling from england where they had previously taken refuge on account of religious persecution, four years before the revocation of the edict of nantes.
they purchased from john pell, lord of pelham manor, a tract of land consisting of six thousand one hundred acres 