In [1]:
import json
import ast
import numpy as np
import pandas as pd
import nltk
import spacy
from spacy import displacy
from collections import Counter
import random;random.seed(1)
import en_core_web_sm
import textacy
import re
import unidecode
from text2digits import text2digits
import time
t2d = text2digits.Text2Digits()
nlp = en_core_web_sm.load()


In [4]:
def preprocess(answer):
    # Remove extra punctuations and lowercase answer
    prep_answer = answer.lstrip('\'')
    prep_answer = prep_answer.rstrip('.')
    prep_answer = prep_answer.replace('\""','')
    prep_answer = prep_answer.replace('\"','')
    prep_answer = prep_answer.replace('\"','')
    prep_answer = prep_answer.lower()
    return prep_answer

In [5]:
def check_answer_type(raw_answer):
    answer_type = []
    tokenized_answer = nlp(preprocess(raw_answer))
    # get part of speech for answer
    pos_answer = [token.pos_ for token in tokenized_answer]
    # create month list to identify date answers
    month_list = ['january', 'jan', 'february', 'feb', 'march', 'april', \
                                   'may','june', 'july', 'august', 'aug', 'september', 'sept',\
                                   'october', 'oct', 'nov', 'dec','november', 'december']
    if raw_answer[0].isupper():
        # Check if it's a date
        if str(tokenized_answer[0]).lower() in month_list:
            answer_type.append('DATE')
        else:
            answer_type.append('PROPN')
    elif '°C' in raw_answer:
        answer_type.append('MEASUREMENT')
    elif (pos_answer[0]=='VERB') and (len(raw_answer)<2):
        answer_type.append('VERB')
    elif (pos_answer[0]=='VERB') and (len(raw_answer)>=2):
        answer_type.append('VP')
    # conditional for "to be" verbs
    elif (pos_answer[0]=='PART') and (pos_answer[1]=='VERB'):
        answer_type.append('VP')
    elif (pos_answer[0]=='ADV'):
        if len(tokenized_answer)>1:
            if (pos_answer[1]=='ADJ'):
                answer_type.append('ADJ')
            elif (pos_answer[1]=='PUNCT') and (pos_answer[2]=='ADJ'):
                answer_type.append('ADJ')
            elif (pos_answer[1]=='VERB'):
                answer_type.append('VP')
            else:
                answer_type.append('ADV')
        else:
            answer_type.append('ADV')
    elif (pos_answer[0]=='NUM') |  (pos_answer[0]=='PUNCT'):
        month_present = [1 for i in month_list if i in raw_answer.lower()]
        BC_AD = [1 for i in ['AD','BC'] if i in raw_answer.lower()]
        measurement_present = [1 for i in ['minutes', 'hours', 'seconds', 'days','%','°C','°F'] if i in raw_answer]
        if month_present!=[]:
            answer_type.append('DATE')
        elif BC_AD!=[]:
            answer_type.append('YEAR')
        elif measurement_present!=[]:
            answer_type.append('MEASUREMENT')
        else:
            answer_type.append('NUM')
    elif pos_answer[0]=='ADJ':
        answer_type.append('ADJ')
    elif (pos_answer[0]=='NOUN') | (pos_answer[0]=='X'):
        month_present = [1 for i in month_list if i in raw_answer.lower()]
        # check if it's a hyphen-separated adjective
        if re.findall(r'\w+(?:-\w+)+'.lower(),raw_answer):
            answer_type.append('ADJ')
        # check if there's a digit in the answer
        elif bool(re.search(r'\d', raw_answer)):
             # check for currency symbols
            if bool(re.search(r'([£\$€])', raw_answer)):
                answer_type.append('MONEY')
            elif str(tokenized_answer[1])=='-':
                answer_type.append('SCORE')
            elif month_present!=[]:
                answer_type.append('DATE')
            else:
                answer_type.append('YEAR')
        else:
            answer_type.append('NOUN')
    elif pos_answer[0]=='ADP':
        answer_type.append('ADP')
    elif pos_answer[0]=='DET':
        if pos_answer[1]=='NOUN':
            answer_type.append('NOUN')
        elif pos_answer[1]=='PROPN':
            answer_type.append('PROPN')
    elif pos_answer[0]=='SYM':
        answer_type.append('MONEY')
    else:
        answer_type.append('Unknown type')
    return answer_type
    

In [44]:
def generate_distractor(topic, paragraph, qid, question, answer, answer_type):
    wrong_answers=[]
    #preprocess answer
    correct_answer = nlp(unidecode.unidecode(preprocess(answer)))
    # get answer pos
    ans_length = len(correct_answer)
    ans_tag = [token.tag_ for token in correct_answer]
    ans_pos = [token.pos_ for token in correct_answer]
    # tokenize paragraph
    article = nlp(unidecode.unidecode(paragraph))
    doc = textacy.Doc(paragraph, lang='en_core_web_sm')
        
    # Preprocessing for same sentence distractor generation
    # get all sentences in paragraph
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sent_list = sent_detector.tokenize(paragraph.strip())

    # find sentence that has answer
    for s in sent_list:
        if answer in s:
            ans_sent = s
    # tokenize sentence
    sentence = nlp(ans_sent)
    sent = textacy.Doc(sentence, lang='en_core_web_sm')
        
    # Preprocessing for same topic distractor generation
    # choose random paragraph from same topic
    topic_index = next((index for (index, d) in enumerate(df['data']) if d["title"] == topic), None)
    index_list = list(range(len(df['data'][topic_index]['paragraphs'])))
    # make sure isn't the same paragraph as current paragraph
    p_index = next((index for (index, d) in enumerate(df['data'][topic_index]['paragraphs'])\
                    if d["context"] == paragraph), None)
    index_list.remove(p_index)
    # Choose 5 random paragraphs from the same article
    alt_p_index = random.choices(index_list,k=5)
    alt_p_list = [df['data'][topic_index]['paragraphs'][i]['context'] for i in alt_p_index]
    alt_article_list = [nlp(alt_p) for alt_p in alt_p_list]
    alt_paragraph_list = [textacy.Doc(alt_p, lang='en_core_web_sm') for alt_p in alt_p_list]
    
    ent_list = [str(i).lower() for i in list(article.ents)]
#     print('ent_list: ' + str(ent_list))
#     print('correct answer: ' + str(correct_answer))
    if str(correct_answer) in ent_list:
        answer_type.append('ENTITY')    
#         print('in entity list')
        ent_labels = [x.label_ for x in article.ents]
        # get all named entities in sentence
        sent_ent_list = [str(i).lower() for i in list(sentence.ents)]
        sent_labels = [x.label_ for x in sentence.ents]
        alt_article_ent_list = []
        alt_article_labels = []
        for p in alt_article_list:
            p_ent_list = [str(i).lower() for i in list(p.ents)]
            p_labels = [x.label_ for x in p.ents]
            alt_article_ent_list.extend(p_ent_list)
            alt_article_labels.extend(p_labels)
#         print('alt_article_ent: ' + str(alt_article_ent_list))
#         print('alt_article_labels: ' + str(alt_article_labels))
        merged=set(ent_labels+sent_ent_list+alt_article_ent_list)
        max_length = max(len(ent_list), len(sent_ent_list), len(alt_article_ent_list))
        # create table of named entities

        ne_pd = pd.DataFrame()
        ne_pd['entity'] = ent_list + (['NA'] * (max_length - len(ent_list)))
        ne_pd['label'] = list(ent_labels) + (['NA'] * (max_length - len(ent_list)))
        ne_pd['sent_entity'] = sent_ent_list + (['NA'] * (max_length - len(sent_ent_list)))
        ne_pd['sent_label'] = list(sent_labels) + (['NA'] * (max_length - len(sent_ent_list)))
        ne_pd['altp_entity'] = alt_article_ent_list + (['NA'] * (max_length - len(alt_article_ent_list)))
        ne_pd['altp_label'] = list(alt_article_labels) + (['NA'] * (max_length - len(alt_article_ent_list)))


        ans_label = [ne_pd[ne_pd['entity']==e]['label'].values[0] for e in ne_pd['entity'] if e in str(correct_answer)]
#         print('answer label: ' + str(ans_label))
        # This filters scores such as 49-15 that are labeled as "DATE"
        
                    
        alt_ans_list = list(ne_pd[(ne_pd['label'].isin(ans_label))]['entity'])
        alt_ans_list.extend(list(ne_pd[(ne_pd['sent_label'].isin(ans_label))]['sent_entity']))
        alt_ans_list.extend(list(ne_pd[(ne_pd['altp_label'].isin(ans_label))]['altp_entity']))
#         print('alt_ans_list: ' + str(alt_ans_list))
        if (['DATE'] in ans_label) | (ans_label==['DATE']):
            score_list=[]
            if 'SCORE' in answer_type:
                num_list = re.findall(r'(\d+-?){1,2}',paragraph)
                # Iterate through same topic articles to find all numbers
                for p in alt_p_list:
                    num_list.extend(re.findall(r'(\d+-?){1,2}',p))
                    for n in range(len(num_list)-1):
                        score = num_list[n][:2]+'-'+num_list[n+1][:2]
                        score_list.append(score)
                wrong_answers = score_list
            else:

                filtered_month = set(['January', 'Jan', 'February', 'March', 'April', \
                                           'May','June', 'July', 'August', 'September', \
                                           'October','November','December'])-set([(str(correct_answer[0]).capitalize())])
                month_list=random.sample(filtered_month,5)
                day_list = random.sample(range(1, 30), 5)
                year_list = random.sample(range(1300,2050), 5)
                random_date = [str(m)+' '+str(d)+','+str(y) for m in month_list for d in day_list for y in year_list]
                alt_ans_list.extend(random_date)
        if len(alt_ans_list)<=3:
            if (['TIME'] in ans_label) | (ans_label==['TIME']) | (['PERCENT'] in ans_label) | (ans_label==['PERCENT']):
                random_time = [str(num)+' ' +str(correct_answer[-1]) for num in random.sample(range(1, 60), 5)]
                alt_ans_list.extend(random_time)
            elif (['MONEY'] in ans_label) | (ans_label==['MONEY']):
                currency = answer[0]
                random_money = [answer[0]+str(num)+' ' +str(correct_answer[-1]) for num in random.sample(range(1, 60), 5)]
                alt_ans_list.extend(random_money)
            elif (['DATE'] in ans_label) | (ans_label==['DATE']):
                filtered_month = set(['January', 'Jan', 'February', 'March', 'April', \
                                           'May','June', 'July', 'August', 'September', \
                                           'October','November','December'])-set([(str(correct_answer[0]).capitalize())])
                month_list=random.sample(filtered_month,5)
                day_list = random.sample(range(1, 30), 5)
                year_list = random.sample(range(1300,2050), 5)
                random_date = [str(m)+' '+str(d)+','+str(y) for m in month_list for d in day_list for y in year_list]
                alt_ans_list.extend(random_date)
            elif (['ORDINAL'] in ans_label) | (ans_label==['ORDINAL']) | (['CARDINAL'] in ans_label) | (ans_label==['CARDINAL']):
#                 print('answer is ORDINAL')
                correct_ans_pos = str(['r'+str(token.pos_)+'l' for token in \
       correct_answer])[1:-1].replace("'r","<").replace("l'",">").replace(',','+',1).replace(',','*').replace(' ','')+'+'
#                 print(correct_ans_pos)
                alt_ans_list=[]
                for p in alt_paragraph_list:
                    p_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(p, correct_ans_pos)]
                    alt_p_list.extend(p_list)
                alt_ans_list.extend(alt_p_list)
            else:
                correct_ans_pos = str(['r'+'PROPN'+'l' for token in \
       correct_answer])[1:-1].replace("'r","<").replace("l'",">").replace(',','+',1).replace(',','*').replace(' ','')+'+'
                alt_ans_list=[]
                for p in alt_paragraph_list:
                    p_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(p, correct_ans_pos)]
                    alt_p_list.extend(p_list)
                alt_ans_list.extend(alt_p_list)
        wrong_answers = alt_ans_list

        
    elif 'PROPN' in answer_type:
        doc_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, r'<PROPN>+')]
        sent_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(sent, r'<PROPN>+')]
        alt_p_list=[]
        for p in alt_paragraph_list:
            p_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(p, r'<PROPN>+')]
            alt_p_list.extend(p_list)
        merged = set(doc_list+sent_list+alt_p_list)
        wrong_answers=merged    
    elif 'NUM' in answer_type:
        
        doc_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, r'<NUM>+')]
        sent_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(sent, r'<NUM>+')]
        alt_p_list=[]
        for p in alt_paragraph_list:
            p_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(p, r'<NUM>+')]
            alt_p_list.extend(p_list)
        merged = set(doc_list+sent_list+alt_p_list)
        wrong_answers = [t2d.convert(str(i)).lstrip() for i in merged]
        if len(str(correct_answer))==4:
            wrong_answers = [i if len(i)==4 else str(correct_answer)[:2]+str(random.sample(range(0, 99),1)[0]) for i in wrong_answers]
    
    elif 'DATE' in answer_type:
        filtered_month = set(['January', 'Jan', 'February', 'March', 'April', \
                                           'May','June', 'July', 'August', 'September', \
                                           'October','November','December'])-set([(str(correct_answer[0]).capitalize())])
        month_list=random.sample(filtered_month,5)
        if ans_length==1:
            wrong_answers=month_list
        else:
            day_list = random.sample(range(1, 30), 5)
            year_list = random.sample(range(1300,2050), 5)
            random_date = [str(m)+' '+str(d)+','+str(y) for m in month_list for d in day_list for y in year_list]
            wrong_answers=random_date
    else:
        merged=[]
        
        correct_ans_pos = str(['r'+str(token.pos_)+'l' for token in \
           correct_answer])[1:-1].replace("'r","<").replace("l'",">").replace(',','+',1).replace(',','*').replace(' ','')+'+'
#         print('correct_ans_pos: ' + str(correct_ans_pos))
        doc_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, correct_ans_pos)]
        sent_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(sent, correct_ans_pos)]
        alt_p_list=[]
        for p in alt_paragraph_list:
            p_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(p, correct_ans_pos)]
            alt_p_list.extend(p_list)
        merged = set(doc_list+sent_list+alt_p_list)
        wrong_answers=list(merged)

        if len(wrong_answers)<=3:
            if '<NOUN>' in correct_ans_pos:
                for p in alt_paragraph_list:
                    p_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(p,r'<NOUN>+')]
                    wrong_answers.extend(p_list)
                    
            elif '<VERB>' in correct_ans_pos:
                for p in alt_paragraph_list:
                    p_list = [l.text.lower()for l in textacy.extract.pos_regex_matches(p,r'<VERB>+')]
                    wrong_answers.extend(p_list)
                    
        if 'VP' in answer_type:
            if len(wrong_answers) >= 3:
                for i in wrong_answers:
                    if isinstance(i, str):
                        item = nlp(i)
                        if (item[0].tag_ == ans_tag[0]) and (item[-1].pos_ == ans_pos[-1]):
                            wrong_answers.append(item)
        
        if 'VERB' in answer_type:
            if len(wrong_answers) >= 3:
                for i in wrong_answers:
                    if isinstance(i, str):
                        item = nlp(i)
                        if (item[0].tag_ == ans_tag[0]):
                            wrong_answers.append(item.orth_)
        
        if 'ADJ' in answer_type:
            hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),paragraph)
            hyphen_sep_words_sent = re.findall(r'\w+(?:-\w+)+'.lower(),ans_sent)
            alt_p_hyphen_sep_words=[]
            for p in alt_p_list:
                p_hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),p)
                alt_p_hyphen_sep_words.extend(p_hyphen_sep_words)
#             alt_topic_hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),alt_topic_paragraph)

            hyphen_words = hyphen_sep_words+hyphen_sep_words_sent+alt_p_hyphen_sep_words
#             print('hyphen_words: ' + str(hyphen_words))
            for w in hyphen_words:
                wrong_answers.append(w)

        if 'YEAR' in answer_type:
            wrong_answer = [i for i in wrong_answers if len(i)==4]
        if 'MEASUREMENT' in answer_type:            
            wrong_answers = [t2d.convert(str(nlp(i)[0])).lstrip()+' ' + str(correct_answer[-1]) for i in merged]
        if 'MONEY' in answer_type:
            currency = answer[0]
            random_money = [answer[0]+str(num)+' ' +str(correct_answer[-1]) for num in random.sample(range(1, 60), 5)]
            wrong_answers.extend(random_money)
        if 'ADP' in answer_type:
            wrong_answers = [i for i in wrong_answers if i[0]==correct_answer[0].orth_]
    #Remove distractors with part of answer in it
    answer_found=False
    filtered_wrong_answers = []
    
    if ('MEASUREMENT' not in answer_type) and ('NUM' not in answer_type) and ('MONEY' not in answer_type)\
    and ('SCORE' not in answer_type) and ('DATE' not in answer_type):
        print('not measurement, num, date, money')
        for i in wrong_answers:
            answer_found=False
            for word in range(len(correct_answer)):
                if (correct_answer[word].orth_ in ['the','of','a','an','that','to','between', 'and']):
                    answer_found=False
                elif (correct_answer[word].orth_ in str(i)):
                    answer_found=True


            if answer_found==False:
                filtered_wrong_answers.append(i)
                filtered_wrong_answers=[item for item in filtered_wrong_answers if item not in ['a','an','the','that','it','who','what', '',""]]
    else:
        filtered_wrong_answers = wrong_answers
    # if not enough filtered answers, choose first tokens of paragraph up to length of answer
    if len(set(filtered_wrong_answers))<3:

        for article in alt_article_list:
            if len(article)>ans_length:
                filtered_wrong_answers.append(article[:ans_length])
    return (topic, paragraph, qid, question, correct_answer, answer_type, random.sample(list(set(filtered_wrong_answers)),3))
       
                    

In [51]:
# with open('squad-dev-v1.1.json', 'r') as f:
#     df = json.load(f)
with open('/Users/joannahuang/Desktop/w210-capstone-qg-master/AA_squad_w_ans/AA_squad_w_ans_wiki_01', 'r') as f:
    df = json.load(f)

In [52]:
#build dataframe for squad to get random sample of 100
topics = []
paragraphs = []
questions = []
qids = []
answers = []
autoq = pd.DataFrame()
for title in range(len(df['data'])):
    for p in range(len(df['data'][title]['paragraphs'])):
        if df['data'][title]['paragraphs'][p]['qas'] == []:
            topics.append(df['data'][title]['title'])
            paragraphs.append(df['data'][title]['paragraphs'][p]['context'])
            questions.append('')
            answers.append('')
            qids.append('')
        else:
            for qa in range(len(df['data'][title]['paragraphs'][p]['qas'])):
                topics.append(df['data'][title]['title'])
                paragraphs.append(df['data'][title]['paragraphs'][p]['context'])
                questions.append(df['data'][title]['paragraphs'][p]['qas'][qa]['question'])
                qids.append(df['data'][title]['paragraphs'][p]['qas'][qa]['id'])
                answers.append(df['data'][title]['paragraphs'][p]['qas'][qa]['answers'][0]['text'])
autoq['topics'] = topics
autoq['paragraphs'] = paragraphs
autoq['questions'] = questions
autoq['id'] = qids
autoq['answers'] = answers
autoq.head()
#sample100 = squad.sample(100, random_state = 5)
#new_index = [pd.Index(list(range(100)))]
#sample100.set_index(new_index, inplace=True)
#sample100.head()

Unnamed: 0,topics,paragraphs,questions,id,answers
0,Amphibian,"Amphibians are ectothermic, tetrapod vertebrat...",,,
1,Amphibian,The earliest amphibians evolved in the Devonia...,,,
2,Amphibian,The three modern orders of amphibians are Anur...,what are the two modern orders of mammals ?,"(0, 2, 0)",Anura (the frogs and toads
3,Amphibian,The three modern orders of amphibians are Anur...,what is the study of mammals called ?,"(0, 2, 4)",batrachology
4,Amphibian,"The word ""amphibian"" is derived from the Ancie...",what are the numbers of the species of species ?,"(0, 3, 5)","over 7,000"


In [53]:
start = time.time()
distractors = []
for i in range(len(autoq)):
    try:
        print('Sample: ' + str(i))
        print('Raw Answer: ' + str(autoq.loc[i]['answers']))
        answer_type = check_answer_type(autoq.loc[i]['answers'])
        print('Answer type: ' + str(answer_type))
        topic, paragraph, qid, question, correct_answer, answer_type, wrong_answers = generate_distractor(autoq.loc[i]['topics'], autoq.loc[i]['paragraphs'], \
                                  autoq.loc[i]['id'],autoq.loc[i]['questions'], autoq.loc[i]['answers'], answer_type)
        distractors.append(wrong_answers)
        print('---------------------------------------------------------')
        print('QID: ' + str(qid))
        print('Question: ' + str(question))
        print('Wrong answers: ' + str(wrong_answers))
        print('\n')
    except:
        print("error occurred for sample#: " + str(i))
        distractors.append(['NA'])
autoq['distractors']=distractors
autoq['distractor_length'] = [1 if (len(autoq.loc[i]['distractors'])==1) else 0 for i in list(autoq.index.values) ]

print("Progress: processed {} records in {:.2f} minutes".format(len(autoq),(time.time()-start)/60))

Sample: 0
Raw Answer: 
error occurred for sample#: 0
Sample: 1
Raw Answer: 
error occurred for sample#: 1
Sample: 2
Raw Answer: Anura (the frogs and toads
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (0, 2, 0)
Question: what are the two modern orders of mammals ?
Wrong answers: ['ceratophrys', 'caecilians', 'apoda']


Sample: 3
Raw Answer: batrachology
Answer type: ['NOUN']
not measurement, num, date, money
---------------------------------------------------------
QID: (0, 2, 4)
Question: what is the study of mammals called ?
Wrong answers: ['skin', 'toe pads', 'walkers']


Sample: 4
Raw Answer: over 7,000
Answer type: ['ADP']
not measurement, num, date, money
---------------------------------------------------------
QID: (0, 3, 5)
Question: what are the numbers of the species of species ?
Wrong answers: ['three', 'two', 'six']


Sample: 5
Raw Answer: 
error occurred for sample#: 5
Sample: 6
Raw Answer: 
error o

not measurement, num, date, money
---------------------------------------------------------
QID: (0, 45, 0)
Question: what are the eggs of the eggs of mammals typically laid in ?
Wrong answers: ['cloaca while in others', 'salamanders and frogs lack lungs', 'packet of sperm']


Sample: 49
Raw Answer: 
error occurred for sample#: 49
Sample: 50
Raw Answer: 
error occurred for sample#: 50
Sample: 51
Raw Answer: 
error occurred for sample#: 51
Sample: 52
Raw Answer: 
error occurred for sample#: 52
Sample: 53
Raw Answer: 
error occurred for sample#: 53
Sample: 54
Raw Answer: 
error occurred for sample#: 54
Sample: 55
Raw Answer: eggs do so in burrows or moist places on land near bodies of water
Answer type: ['NOUN']
not measurement, num, date, money
---------------------------------------------------------
QID: (0, 52, 0)
Question: where do most terrestrial caecilians lay ?
Wrong answers: ['sorties onto land', 'adult air', 'salamander species']


Sample: 56
Raw Answer: 
error occurred for sa

not measurement, num, date, money
---------------------------------------------------------
QID: (1, 33, 0)
Question: who was the only resident inhabited by americans ?
Wrong answers: ['united states', 'the united states', 'anchorage']


Sample: 110
Raw Answer: 
error occurred for sample#: 110
Sample: 111
Raw Answer: 
error occurred for sample#: 111
Sample: 112
Raw Answer: 
error occurred for sample#: 112
Sample: 113
Raw Answer: January 3, 1959
Answer type: ['DATE']
---------------------------------------------------------
QID: (1, 37, 5)
Question: when was alaska officially named a state ?
Wrong answers: ['September 21,1940', 'October 13,1499', 'November 12,1526']


Sample: 114
Raw Answer: 77.2% White
Answer type: ['MEASUREMENT']
---------------------------------------------------------
QID: (1, 38, 0)
Question: what is the population of the marshall islands ?
Wrong answers: [Alaska is not, When "Eskimo, The 2007 gross]


Sample: 115
Raw Answer: 
error occurred for sample#: 115
Sample

error occurred for sample#: 214
Sample: 215
Raw Answer: 
error occurred for sample#: 215
Sample: 216
Raw Answer: 
error occurred for sample#: 216
Sample: 217
Raw Answer: 
error occurred for sample#: 217
Sample: 218
Raw Answer: planted
Answer type: ['VP']
not measurement, num, date, money
---------------------------------------------------------
QID: (2, 4, 16)
Question: what happened to the animal that had already been harvested ?
Wrong answers: ['encompasses', 'covers', 'left']


Sample: 219
Raw Answer: Eurasia
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (2, 5, 0)
Question: where did the sumerians live in ?
Wrong answers: ['mehrgarh', 'india', 'rome']


Sample: 220
Raw Answer: squash, beans, and cocoa
Answer type: ['NOUN']
not measurement, num, date, money
---------------------------------------------------------
QID: (2, 6, 0)
Question: what crops are used in the americas ?
Wrong answers: ['agriculture. studi

not measurement, num, date, money
---------------------------------------------------------
QID: (3, 19, 0)
Question: in what two types of work did huxley and maria order for president ?
Wrong answers: ['novel', 'injustice', 'matter']


Sample: 298
Raw Answer: 
error occurred for sample#: 298
Sample: 299
Raw Answer: 
error occurred for sample#: 299
Sample: 300
Raw Answer: 48
Answer type: ['NUM']
---------------------------------------------------------
QID: (3, 22, 0)
Question: how many articles were published by huxley ?
Wrong answers: ['one', 'seven', '48']


Sample: 301
Raw Answer: Vedanta
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (3, 23, 0)
Question: what type of temples were notably engaged at the hollywood and santa barbara temples ?
Wrong answers: ['happy valley school', 'godalming', 'hollywood']


Sample: 302
Raw Answer: 1953
Answer type: ['NUM']
-------------------------------------------------------

not measurement, num, date, money
---------------------------------------------------------
QID: (5, 5, 0)
Question: what is the name of the `` least athletic team '' ?
Wrong answers: ['other methods', 'concrete example suppose', 'squared distribution']


Sample: 397
Raw Answer: breed is likely to produce a very good fit
Answer type: ['NOUN']
not measurement, num, date, money
---------------------------------------------------------
QID: (5, 6, 0)
Question: what is an attempt to explain weight by humans ?
Wrong answers: ['response variable values', 'method has some advantages', 'population survey']


Sample: 398
Raw Answer: 
error occurred for sample#: 398
Sample: 399
Raw Answer: the null hypothesis is that all groups are random samples from the same population
Answer type: []
not measurement, num, date, money
---------------------------------------------------------
QID: (5, 8, 0)
Question: what is a typical application of anova ?
Wrong answers: ['the evidence', 'the results', 'some a

error occurred for sample#: 510
Sample: 511
Raw Answer: 
error occurred for sample#: 511
Sample: 512
Raw Answer: 
error occurred for sample#: 512
Sample: 513
Raw Answer: 
error occurred for sample#: 513
Sample: 514
Raw Answer: 
error occurred for sample#: 514
Sample: 515
Raw Answer: 
error occurred for sample#: 515
Sample: 516
Raw Answer: 
error occurred for sample#: 516
Sample: 517
Raw Answer: 
error occurred for sample#: 517
Sample: 518
Raw Answer: 
error occurred for sample#: 518
Sample: 519
Raw Answer: 
error occurred for sample#: 519
Sample: 520
Raw Answer: 
error occurred for sample#: 520
Sample: 521
Raw Answer: 
error occurred for sample#: 521
Sample: 522
Raw Answer: 
error occurred for sample#: 522
Sample: 523
Raw Answer: 
error occurred for sample#: 523
Sample: 524
Raw Answer: 
error occurred for sample#: 524
Sample: 525
Raw Answer: 
error occurred for sample#: 525
Sample: 526
Raw Answer: 
error occurred for sample#: 526
Sample: 527
Raw Answer: 
error occurred for sample#: 527

not measurement, num, date, money
---------------------------------------------------------
QID: (7, 30, 0)
Question: what is the case of a lower court ?
Wrong answers: ['the plaintiff', 'the application of law', 'the evidence']


Sample: 603
Raw Answer: appellate court
Answer type: ['ADJ']
not measurement, num, date, money
---------------------------------------------------------
QID: (7, 30, 2)
Question: what is the name of the record that can be eligible by lower court ?
Wrong answers: ['different jurisdictions', 'collateral appeal', 'former occurs']


Sample: 604
Raw Answer: 
error occurred for sample#: 604
Sample: 605
Raw Answer: adversarial
Answer type: ['ADJ']
not measurement, num, date, money
---------------------------------------------------------
QID: (7, 32, 0)
Question: what type of system do courts not have ?
Wrong answers: ['only', 'violent', 'actual']


Sample: 606
Raw Answer: 
error occurred for sample#: 606
Sample: 607
Raw Answer: 
error occurred for sample#: 607
Samp

not measurement, num, date, money
---------------------------------------------------------
QID: (11, 4, 8)
Question: what was the name of the poem by bates's music ?
Wrong answers: [This song was, For Super Bowl, 'o mother dear, jerusalem']


Sample: 667
Raw Answer: Ward
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (11, 5, 0)
Question: who was the first to apply the song in 1904. ?
Wrong answers: ['toby keith', 'billy dean', 'lorrie morgan']


Sample: 668
Raw Answer: 
error occurred for sample#: 668
Sample: 669
Raw Answer: 
error occurred for sample#: 669
Sample: 670
Raw Answer: 
error occurred for sample#: 670
Sample: 671
Raw Answer: Bing Crosby
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (11, 9, 0)
Question: who was the song `` b songs '' ?
Wrong answers: ['david letterman', 'harry martin', 'bates']


Sample: 672
Raw Answer: 
error oc

not measurement, num, date, money
---------------------------------------------------------
QID: (13, 9, 0)
Question: the earliest evidence for the use of greek abacus dates back to what century ?
Wrong answers: ['mental calculation', 'upper deck', 'upper bead']


Sample: 737
Raw Answer: 
error occurred for sample#: 737
Sample: 738
Raw Answer: 
error occurred for sample#: 738
Sample: 739
Raw Answer: 
error occurred for sample#: 739
Sample: 740
Raw Answer: 
error occurred for sample#: 740
Sample: 741
Raw Answer: 
error occurred for sample#: 741
Sample: 742
Raw Answer: 
error occurred for sample#: 742
Sample: 743
Raw Answer: Suanpan
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (13, 16, 0)
Question: what can be used for functions other than count ?
Wrong answers: ['old', 'china', 'amc']


Sample: 744
Raw Answer: There are currently schools
Answer type: ['PROPN']
not measurement, num, date, money
-------------------

not measurement, num, date, money
---------------------------------------------------------
QID: (15, 3, 0)
Question: what term is sometimes specified by the term bitumen ?
Wrong answers: ['bitumen', 'excellence', 'scale']


Sample: 815
Raw Answer: 
error occurred for sample#: 815
Sample: 816
Raw Answer: 
error occurred for sample#: 816
Sample: 817
Raw Answer: concrete
Answer type: ['ADJ']
not measurement, num, date, money
---------------------------------------------------------
QID: (15, 6, 0)
Question: what is bitumen used instead of asphalt ?
Wrong answers: ['dry', 'viable', 'canadian']


Sample: 818
Raw Answer: 
error occurred for sample#: 818
Sample: 819
Raw Answer: bitumen
Answer type: ['NOUN']
not measurement, num, date, money
---------------------------------------------------------
QID: (15, 8, 0)
Question: what is asphalt called in american english ?
Wrong answers: ['people', 'pavements', 'form']


Sample: 820
Raw Answer: asphalt concrete
Answer type: ['VP']
not measurement,

not measurement, num, date, money
---------------------------------------------------------
QID: (17, 9, 3)
Question: who was expected to command apollo 11. ?
Wrong answers: ['lovell', 'moon', 'charles duke']


Sample: 920
Raw Answer: Neil Armstrong
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (17, 9, 0)
Question: who was the commander of commanding on the disk crew ?
Wrong answers: ['moon', 'buzz aldrin', 'aldrin']


Sample: 921
Raw Answer: 
error occurred for sample#: 921
Sample: 922
Raw Answer: Lovell
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (17, 11, 2)
Question: who did von neumann 's mission take place ?
Wrong answers: ['collins', 'jack garman', 'charles stark draper laboratory']


Sample: 923
Raw Answer: camaraderie
Answer type: ['ADJ']
not measurement, num, date, money
---------------------------------------------------------
Q

not measurement, num, date, money
---------------------------------------------------------
QID: (18, 13, 2)
Question: who was named `` columbia '' ?
Wrong answers: ['sirius', 'fred haise', 'neil armstrong']


Sample: 1020
Raw Answer: NASA
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (18, 14, 0)
Question: who adopted the plan for apollo missions ?
Wrong answers: ['the apollo spacecraft program office', 'apollo 5', 'msfc']


Sample: 1021
Raw Answer: C-Prime
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (18, 15, 6)
Question: what was the name of apollo 's mission ?
Wrong answers: ['glynn lunney', 'lunar orbit insertion', 'milton windler']


Sample: 1022
Raw Answer: McDivitt
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (18, 16, 0)
Question: who did the change in missio

not measurement, num, date, money
---------------------------------------------------------
QID: (19, 25, 0)
Question: who was the first woman in space ?
Wrong answers: ['november', 'mike melvill', 'walker']


Sample: 1104
Raw Answer: 
error occurred for sample#: 1104
Sample: 1105
Raw Answer: 
error occurred for sample#: 1105
Sample: 1106
Raw Answer: Seven
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (19, 28, 1)
Question: how many russian space agency members have been paid ?
Wrong answers: [On, 'two', '15-minute']


Sample: 1107
Raw Answer: training
Answer type: ['NOUN']
not measurement, num, date, money
---------------------------------------------------------
QID: (19, 29, 0)
Question: where were the first nasa astronauts selected for training in 1959. ?
Wrong answers: ['time', 'person', 'day']


Sample: 1108
Raw Answer: 
error occurred for sample#: 1108
Sample: 1109
Raw Answer: 
error occurred for sample#: 1

not measurement, num, date, money
---------------------------------------------------------
QID: (21, 12, 2)
Question: why are rubidium and caesium different elements ?
Wrong answers: ['other atoms', 'most other atoms', 'most covalent compounds']


Sample: 1174
Raw Answer: 
error occurred for sample#: 1174
Sample: 1175
Raw Answer: 
error occurred for sample#: 1175
Sample: 1176
Raw Answer: 
error occurred for sample#: 1176
Sample: 1177
Raw Answer: 
error occurred for sample#: 1177
Sample: 1178
Raw Answer: 
error occurred for sample#: 1178
Sample: 1179
Raw Answer: alkali metals
Answer type: ['ADJ']
not measurement, num, date, money
---------------------------------------------------------
QID: (21, 18, 0)
Question: what type of metal is more similar to each other ?
Wrong answers: ['odd nuclei', 'their similar ionic radii', 'different way']


Sample: 1180
Raw Answer: three
Answer type: ['NUM']
---------------------------------------------------------
QID: (21, 19, 0)
Question: how many me

not measurement, num, date, money
---------------------------------------------------------
QID: (21, 66, 0)
Question: what is the tendency to form monopositive ?
Wrong answers: ['small size', 'its extreme reactivity', 'single stable isotope']


Sample: 1229
Raw Answer: one electron per neutral atom
Answer type: ['NUM']
---------------------------------------------------------
QID: (21, 67, 0)
Question: what is the symbol of hydrogen ?
Wrong answers: ['150', '10', '1']


Sample: 1230
Raw Answer: small size of a bare proton H
Answer type: ['ADJ']
not measurement, num, date, money
---------------------------------------------------------
QID: (21, 68, 0)
Question: why is hydrogen different than the halogens metal ?
Wrong answers: ['alkali metal cations', 'fractional crystallisation of a rubidium', 'pure rubidium alum']


Sample: 1231
Raw Answer: 1312.0 kJ/mol
Answer type: ['NUM']
---------------------------------------------------------
QID: (21, 69, 0)
Question: what is the first renewa

not measurement, num, date, money
---------------------------------------------------------
QID: (22, 18, 0)
Question: what was the korean alphabet created by ?
Wrong answers: ['old', 'welsh', 'hangul']


Sample: 1278
Raw Answer: 
error occurred for sample#: 1278
Sample: 1279
Raw Answer: 
error occurred for sample#: 1279
Sample: 1280
Raw Answer: Wadi el-Hol script
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (22, 21, 5)
Question: what is the earliest known alphabet ?
Wrong answers: ['middle english', 'mandarin chinese', 'latin']


Sample: 1281
Raw Answer: 
error occurred for sample#: 1281
Sample: 1282
Raw Answer: 
error occurred for sample#: 1282
Sample: 1283
Raw Answer: 
error occurred for sample#: 1283
Sample: 1284
Raw Answer: 
error occurred for sample#: 1284
Sample: 1285
Raw Answer: twelve
Answer type: ['NUM']
---------------------------------------------------------
QID: (22, 26, 0)
Question: what is the nu

not measurement, num, date, money
---------------------------------------------------------
QID: (24, 14, 9)
Question: what are the two length of the sides line system ?
Wrong answers: ['receptor neurons', 'rediscovery of human anatomy', 'plant anatomy']


Sample: 1347
Raw Answer: dorso-ventrally
Answer type: ['ADJ']
not measurement, num, date, money
---------------------------------------------------------
QID: (24, 15, 0)
Question: what creatures are depicted in the triassic ?
Wrong answers: ['vision. there', 'pouch where', 'degree far']


Sample: 1348
Raw Answer: bony fish lineage
Answer type: ['NOUN']
not measurement, num, date, money
---------------------------------------------------------
QID: (24, 16, 0)
Question: what species show more than origin changes ?
Wrong answers: ['motor nerves', 'hypoglossal nerves', 'anterior end']


Sample: 1349
Raw Answer: 
error occurred for sample#: 1349
Sample: 1350
Raw Answer: 
error occurred for sample#: 1350
Sample: 1351
Raw Answer: 
error o

not measurement, num, date, money
---------------------------------------------------------
QID: (26, 5, 4)
Question: who decided to study film ?
Wrong answers: ['carl theodor dreyer', 'claudio abbado', 'leonid kozlov']


Sample: 1416
Raw Answer: 
error occurred for sample#: 1416
Sample: 1417
Raw Answer: 
error occurred for sample#: 1417
Sample: 1418
Raw Answer: 
error occurred for sample#: 1418
Sample: 1419
Raw Answer: 
error occurred for sample#: 1419
Sample: 1420
Raw Answer: 
error occurred for sample#: 1420
Sample: 1421
Raw Answer: Ivan's Childhood
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (26, 11, 0)
Question: what was the first feature film called ?
Wrong answers: ['vera nikolaevna vishnyakova', 'kozelsk', 'iași']


Sample: 1422
Raw Answer: 1971
Answer type: ['NUM']
---------------------------------------------------------
QID: (26, 12, 3)
Question: in what year was film released in the soviet union ?
W

not measurement, num, date, money
---------------------------------------------------------
QID: (27, 21, 0)
Question: what may be ambiguous: at the conceptual image ?
Wrong answers: ['prefixes and suffixes', 'language improvement', 'ambiguity and inconsistency']


Sample: 1492
Raw Answer: 
error occurred for sample#: 1492
Sample: 1493
Raw Answer: the SI prefixes kilo-, mega- and giga
Answer type: ['NOUN']
not measurement, num, date, money
---------------------------------------------------------
QID: (27, 23, 0)
Question: what are the names of the three powers that were used in computer science ?
Wrong answers: ['a word or phrase pertains', 'some polytonality, polymeter', 'this usage']


Sample: 1494
Raw Answer: 
error occurred for sample#: 1494
Sample: 1495
Raw Answer: 
error occurred for sample#: 1495
Sample: 1496
Raw Answer: 
error occurred for sample#: 1496
Sample: 1497
Raw Answer: 
error occurred for sample#: 1497
Sample: 1498
Raw Answer: 
error occurred for sample#: 1498
Sample:

not measurement, num, date, money
---------------------------------------------------------
QID: (29, 14, 17)
Question: what is the per hour of the aardwolf ?
Wrong answers: [The territory, 'about per hour', The aardwolf]


Sample: 1565
Raw Answer: 
error occurred for sample#: 1565
Sample: 1566
Raw Answer: aardwolf
Answer type: ['NOUN']
not measurement, num, date, money
---------------------------------------------------------
QID: (29, 16, 5)
Question: what are dogs also known as ?
Wrong answers: [There, The, The]


Sample: 1567
Raw Answer: decreasing numbers
Answer type: ['VP']
not measurement, num, date, money
---------------------------------------------------------
QID: (29, 16, 0)
Question: the aardwolf has not seen what ?
Wrong answers: ['eat carrion', 'takes place', 'eating larvae']


Sample: 1568
Raw Answer: Aardwolfs
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (29, 17, 0)
Question: what are common tou

not measurement, num, date, money
---------------------------------------------------------
QID: (32, 17, 0)
Question: what is the same word for latin ?
Wrong answers: ['japanese', 'muslim', 'east']


Sample: 1624
Raw Answer: 
error occurred for sample#: 1624
Sample: 1625
Raw Answer: Asia
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (32, 19, 2)
Question: what was the greek goddess of lydia ?
Wrong answers: ['the malay archipelago', 'europe', 'east']


Sample: 1626
Raw Answer: Lydians
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (32, 19, 0)
Question: what was the name of the group that said that the greeks was named after ?
Wrong answers: ['christians', 'jews', 'greek']


Sample: 1627
Raw Answer: 
error occurred for sample#: 1627
Sample: 1628
Raw Answer: 
error occurred for sample#: 1628
Sample: 1629
Raw Answer: East Asia
Answer type: ['PR

error occurred for sample#: 1700
Sample: 1701
Raw Answer: 
error occurred for sample#: 1701
Sample: 1702
Raw Answer: 
error occurred for sample#: 1702
Sample: 1703
Raw Answer: 
error occurred for sample#: 1703
Sample: 1704
Raw Answer: 
error occurred for sample#: 1704
Sample: 1705
Raw Answer: 
error occurred for sample#: 1705
Sample: 1706
Raw Answer: 
error occurred for sample#: 1706
Sample: 1707
Raw Answer: 
error occurred for sample#: 1707
Sample: 1708
Raw Answer: 
error occurred for sample#: 1708
Sample: 1709
Raw Answer: 
error occurred for sample#: 1709
Sample: 1710
Raw Answer: 
error occurred for sample#: 1710
Sample: 1711
Raw Answer: 
error occurred for sample#: 1711
Sample: 1712
Raw Answer: 
error occurred for sample#: 1712
Sample: 1713
Raw Answer: 
error occurred for sample#: 1713
Sample: 1714
Raw Answer: 
error occurred for sample#: 1714
Sample: 1715
Raw Answer: 
error occurred for sample#: 1715
Sample: 1716
Raw Answer: 
error occurred for sample#: 1716
Sample: 1717
Raw Answer

not measurement, num, date, money
---------------------------------------------------------
QID: (35, 5, 1)
Question: along with the amount and number of oceans and seas , what is the amount of amount of sea and seas ?
Wrong answers: ['john cabot', 'north atlantic', 'eastern europe']


Sample: 1814
Raw Answer: Arctic Ocean
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (35, 6, 0)
Question: what ocean is on the west side of the atlantic ocean ?
Wrong answers: ['june', 'south atlantic', 'strait']


Sample: 1815
Raw Answer: Indian Ocean
Answer type: ['PROPN']
not measurement, num, date, money
---------------------------------------------------------
QID: (35, 7, 0)
Question: what ocean did the atlantic border in the northwest ?
Wrong answers: ['north atlantic', 'n.', 'mar']


Sample: 1816
Raw Answer: Southern Ocean
Answer type: ['PROPN']
not measurement, num, date, money
----------------------------------------------

not measurement, num, date, money
---------------------------------------------------------
QID: (35, 43, 0)
Question: south america and africa -rrb- break up in what period ?
Wrong answers: ['atlas mountains', 'pangaea', 'pacific']


Sample: 1855
Raw Answer: 
error occurred for sample#: 1855
Sample: 1856
Raw Answer: 
error occurred for sample#: 1856
Sample: 1857
Raw Answer: 
error occurred for sample#: 1857
Sample: 1858
Raw Answer: 
error occurred for sample#: 1858
Sample: 1859
Raw Answer: 
error occurred for sample#: 1859
Sample: 1860
Raw Answer: 
error occurred for sample#: 1860
Sample: 1861
Raw Answer: 
error occurred for sample#: 1861
Sample: 1862
Raw Answer: 
error occurred for sample#: 1862
Sample: 1863
Raw Answer: 9–8.5 thousand years ago
Answer type: ['NUM']
---------------------------------------------------------
QID: (35, 52, 5)
Question: how long ago did the european isles lose the south atlantic ?
Wrong answers: ['5– 4', '1', '34– 30']


Sample: 1864
Raw Answer: 
error oc

In [54]:
new_distractors = []
for i in range(len(autoq)):
    processed_list = []
    for item in autoq.loc[i]['distractors']:
        if not isinstance(item,str):
            text_version = item.text
            processed_list.append(text_version)
        else:
            processed_list.append(item)
    new_distractors.append(processed_list)
autoq['new_distractors']=new_distractors

In [55]:
autoq.head(20)

Unnamed: 0,topics,paragraphs,questions,id,answers,distractors,distractor_length,new_distractors
0,Amphibian,"Amphibians are ectothermic, tetrapod vertebrat...",,,,[NA],1,[NA]
1,Amphibian,The earliest amphibians evolved in the Devonia...,,,,[NA],1,[NA]
2,Amphibian,The three modern orders of amphibians are Anur...,what are the two modern orders of mammals ?,"(0, 2, 0)",Anura (the frogs and toads,"[ceratophrys, caecilians, apoda]",0,"[ceratophrys, caecilians, apoda]"
3,Amphibian,The three modern orders of amphibians are Anur...,what is the study of mammals called ?,"(0, 2, 4)",batrachology,"[skin, toe pads, walkers]",0,"[skin, toe pads, walkers]"
4,Amphibian,"The word ""amphibian"" is derived from the Ancie...",what are the numbers of the species of species ?,"(0, 3, 5)","over 7,000","[three, two, six]",0,"[three, two, six]"
5,Amphibian,"With the phylogenetic classification, the taxo...",,,,[NA],1,[NA]
6,Amphibian,All modern amphibians are included in the subc...,,,,[NA],1,[NA]
7,Amphibian,Authorities disagree as to whether Salientia i...,,,,[NA],1,[NA]
8,Amphibian,The first major groups of amphibians developed...,what was the first major group of animals deve...,"(0, 7, 0)",amphibians,"[water surface, degree, horny beak]",0,"[water surface, degree, horny beak]"
9,Amphibian,Many examples of species showing transitional ...,,,,[NA],1,[NA]


In [56]:
wiki_dict = df['data']

for record in range(len(autoq)):
    qid = autoq.loc[record]['id']
    for title in range(len(wiki_dict)):
        for p in range(len(wiki_dict[title]['paragraphs'])):
            if wiki_dict[title]['paragraphs'][p]['qas'] == []:
                continue
            else:
                for qa in range(len(wiki_dict[title]['paragraphs'][p]['qas'])):
                    if qid==wiki_dict[title]['paragraphs'][p]['qas'][qa]['id']:
                        wiki_dict[title]['paragraphs'][p]['qas'][qa]['distractors'] = autoq.loc[record]['new_distractors']
                    
df['data']=wiki_dict

with open('AA_squad_w_mc_wiki_01.json', 'w') as outfile:  
    json.dump(df, outfile)

    

### Old Function

In [43]:
# For Debugging
num=1220
print(check_answer_type(autoq.loc[num]['answers']))
print(nlp(unidecode.unidecode(preprocess(autoq.loc[num]['answers']))))
topic, paragraph, qid, question, correct_answer, answer_type, wrong_answers = generate_distractor(autoq.loc[num]['topics'], autoq.loc[num]['paragraphs'], \
                                  autoq.loc[num]['id'],autoq.loc[num]['questions'], autoq.loc[num]['answers'], check_answer_type(autoq.loc[num]['answers']))
print(topic, paragraph, qid, question, correct_answer, answer_type, wrong_answers)
# article = nlp(unidecode.unidecode(autoq.loc[num]['paragraphs']))
# ent_list = [str(i).lower() for i in list(article.ents)]
# ent_labels = [x.label_ for x in article.ents]
# print(ent_list)
# print(ent_labels)

['NOUN']
reconcile platonic and aristotelian thinking
not measurement, num, date, money
Alchemy Meanwhile, theologian contemporaries of the translators made strides towards the reconciliation of faith and experimental rationalism, thereby priming Europe for the influx of alchemical thought. The 11th-century St Anselm put forth the opinion that faith and rationalism were compatible and encouraged rationalism in a Christian context. In the early 12th century, Peter Abelard followed Anselm's work, laying down the foundation for acceptance of Aristotelian thought before the first works of Aristotle had reached the West. In the early 13th century, Robert Grosseteste used Abelard's methods of analysis and added the use of observation, experimentation, and conclusions when conducting scientific investigations. Grosseteste also did much work to reconcile Platonic and Aristotelian thinking. (20, 36, 4) what did grosseteste do with aristotelian ? reconcile platonic and aristotelian thinking ['NO

In [149]:
wikipedia_data = {"data": [], "version" : 1.0}
print(wikipedia_data)
for topic in autoq.topics.unique():
    print('topic: ' + str(topic))
    unique_paragraphs = autoq[autoq['topics']==topic].paragraphs.unique()
    print('number of paragraphs: ' + str(len(unique_paragraphs)))
    agg = autoq[autoq['topics']==topic].groupby('paragraphs').agg({'questions': lambda x: '\n\n'.join(x), \
                                                                   'answers': lambda x: '\n'.join(x), \
                                                                   'id':lambda x: '\n'.join(x), \
                                                                  'distractors': lambda a: list(a)})
    context = []
    for item in range(len(unique_paragraphs)):
        para = unique_paragraphs[item]
        qas = []
        questions = agg.iloc[item]['questions'].split('\n\n')
        print('question')
        print(agg.iloc[item]['questions'])
        print('distractors')
        print(agg.iloc[item]['distractors'])
        for q in range(len(questions)):
            q_dict = {'question': '', 'id': '', 'answers': [], 'distractors': []}
            q_dict['question'] = questions[q]
            q_dict['id'] = agg.iloc[item]['id'].split('\n')[q]
            ans_dict = {'answer_start': 0, 'text': ''}
            ans_dict['text']= agg.iloc[item]['answers'].split('\n')[q]
            q_dict['answers'].append(ans_dict)
            qas.append(q_dict)
            q_dict['distractors'] = agg.iloc[item]['distractors'][q]
        context_dict = {'context': para, 'qas' : qas}
        context.append(context_dict)
    wikipedia_data['data'].append({'title' : topic, 'paragraphs' : context})

with open('AA_squad_w_mc_wiki_01.json', 'w') as outfile:  
    json.dump(wikipedia_data, outfile)

{'data': [], 'version': 1.0}
topic: Amphibian
number of paragraphs: 71
question



distractors
[['NA'], ['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what is permeable to water ?
distractors
[['triassic period', 'late carboniferous', 'early permian']]
question
what does gas exchange prevent ?
distractors
[['superclass', 'alveoli', 'loss']]
question

distractors
[['NA']]
question
what are the two main organs of mammals ?
distractors
[['adult air', 'wood frog', 'layer liquefies']]
question
where does the water move from side to side ?
distractors
[['NA']]
question

distractors
[['NA']]
question
what do some tree cats with limited access to ?
distractors
[['breeding sites before females', 'ovulation in females', 'body cavity']]
question
what type of food is usually ?
distractors
[['ambystoma', 'müllerian', 'allobates']]
question

distractors
[['NA']]
question
what changed to become more protective and prevented water loss ?



topic: Agriculture
number of paragraphs: 64
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what sector was the largest employer in 2007. ?
distractors
[['political', 'greater', 'higher']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what

question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what are ch ?
distractors
[['saturated carbon atoms', 'saturated hydrocarbon', 'trivial names']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what is the size of carbon atoms ?

how many sigma bonds does each hydrogen atom have ?
distractors
[['NA'], ['4', '\xa01', '6-ethyl-2-methyl-5-(1-methylethyl)octane']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
how many group are in the carbon strand ?
distractors
[[

question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what do computer users use ?
distractors
[['NA']]
question
what is a screen magnifier ?
distractors
[['prompt in the kitchen', 'reminders utilize motion sensors', 'puff control']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what are desktop devices ?
distractors
[['the keys', 'the text', 'the pen']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what are examples of videoconferencing technology ?
distractors
[

question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
topic: American National Standards Institute
number of paragraphs: 19
question



distractors
[['NA'], ['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
who founded the ansi ?
distractors
[['adam stanton', 'new york city', 'd.c

question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what type of forms are highly ?
distractors
[['soft', 'speculative', 'periodic']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what is the most common name for zinc ?

what chemical solution is produced by alkali ?
distractors
[['pollucite', 'precursor', 'trace amounts'], ['like structures', 'good fit in terms', 'its electrons']]
question
what type of metals do gases react with ?
distractors
[['other electrons', 'outermost electron', 'higher than caesium']]
question
what type of metal is francium ?
distractors
[['previous', 'bright', 'its atomic']]
question
what are the trivial composed of ?
distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what is the only alkali metal ?
distractors
[['metal cations', 'metal nitride', 'crystal structure']]
question
what is 

question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what was the atomic number of neptune 's nuclei ?
distractors
[['earth', 'james chadwick', 'march']]
question
what is the same atomic number ?
distractors
[['thomas royds', 'prout', 'n']]
question
what is the term for the method that sets the elements by hydrogen number ?
distractors
[['march', 'thomas royds', 'moseley']]
question
what does each element have ?
distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what is the atomic number of hydrogen ?

what is the atomic number equal to ?
distractors
[['evidence', 'target', 'z'], ['quarters', 'element', 'method']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what does p mean ?
distractors
[['prout', 'z', 'k']]
question

distractors
[['NA']]
topic: An

question

distractors
[['NA']]
question
what may play a role in resolving ambiguity ?

what may be ambiguous in one context ?
distractors
[['simone de beauvoir', 'paul sartre', 'martin heidegger'], ['algorithmic methods', 'which case', 'lexical ambiguity']]
question

distractors
[['NA']]
question
what are the names of the three powers that were used in computer science ?
distractors
[['goal', 'mondegreen', 'politician']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
what is common to define the coherent states in quantum physics with formula_37 and states states with formula_38. ?
distractors
[['assumed that multiplication', 'should mean formula_35', 'intended meaning']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distra

[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
where did some of the events lead in the world war ?
distractors
[['flood. ancient chinese mythology', 'nation: which today', 'information technology industry']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
where did the bahá'í faith originate ?
distractors
[['europe', 'the suez canal', 'noah—']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question

distractors
[['NA']]
question
when did the islamic caliphate's defeat the byzantine empire ?
distractors
[['second highest achiever', 'phoenician word', 'various proposals']]
question

distractors
[['NA']]
question
when did the russian empire expand into asia ?
distractors
[['various mycenaean states', 'mesopotamian mythology', 'main east']]
question
who defined the border between asia and europe 

In [16]:
answer =autoq.loc[110]['answers']
article = nlp(unidecode.unidecode(autoq.loc[110]['paragraphs']))

topic_index = next((index for (index, d) in enumerate(df['data']) if d["title"] == 'Autism'), None)
print(topic_index)
index_list = list(range(len(df['data'][topic_index]['paragraphs'])))
print(len(index_list))
# make sure isn't the same paragraph as current paragraph
p_index = next((index for (index, d) in enumerate(df['data'][topic_index]['paragraphs'])\
                if d["context"] == autoq.loc[110]['paragraphs']), None)
index_list.remove(p_index)
print('p_index removed: ' + str(len(index_list)))
# Choose 5 random paragraphs from the same article
alt_p_index = random.choices(index_list,k=5)
print('random indices: ' + str(alt_p_index))
alt_p_list = [df['data'][topic_index]['paragraphs'][i]['context'] for i in alt_p_index]
print(alt_p_list)
alt_article_list = [nlp(alt_p) for alt_p in alt_p_list]

ent_list = [str(i).lower() for i in list(article.ents)]
print('ent_list: ' + str(ent_list))

if answer in ent_list:
    print('in entity list')
    ent_labels = [x.label_ for x in article.ents]
    alt_article_ent_list = []
    alt_article_labels = []
    for p in alt_article_list:
        p_ent_list = [str(i).lower() for i in list(p.ents)]
        p_labels = [x.label_ for x in p.ents]
        alt_article_ent_list.extend(p_ent_list)
        alt_article_labels.extend(p_labels)
    print('alt_article_ent: ' + str(alt_article_ent_list))
    print('alt_article_labels: ' + str(alt_article_labels))
    merged=set(ent_labels+alt_article_ent_list)
#         merged=set(ent_labels+sent_ent_list+alt_article_ent_list+alt_article_ent_list)
    max_length = max(len(ent_list), len(alt_article_ent_list))
    # create table of named entities
#         max_length = max(len(ent_list), len(sent_ent_list), len(alt_article_ent_list),len(alt_topic_ent_list))
#         print(max_length)
    ne_pd = pd.DataFrame()
    ne_pd['entity'] = ent_list + (['NA'] * (max_length - len(ent_list)))
    ne_pd['label'] = list(ent_labels) + (['NA'] * (max_length - len(ent_list)))
    ne_pd['altp_entity'] = alt_article_ent_list + (['NA'] * (max_length - len(alt_article_ent_list)))
    ne_pd['altp_label'] = list(alt_article_labels) + (['NA'] * (max_length - len(alt_article_ent_list)))


    ans_label = [ne_pd[ne_pd['entity']==e]['label'].values[0] for e in ne_pd['entity'] if e in answer]
    print('answer label: ' + str(ans_label))
    alt_ans_list = list(ne_pd[(ne_pd['label'].isin(ans_label))]['entity'])
    alt_ans_list.extend(list(ne_pd[(ne_pd['altp_label'].isin(ans_label))]['altp_entity']))
#         alt_ans_list.extend(list(ne_pd[(ne_pd['altt_label'].isin(ans_label))]['altt_entity']))
print(alt_ans_list)

1
70
p_index removed: 69
random indices: [15, 62, 39, 63, 59]
['Several lines of evidence point to synaptic dysfunction as a cause of autism. Some rare mutations may lead to autism by disrupting some synaptic pathways, such as those involved with cell adhesion. Gene replacement studies in mice suggest that autistic symptoms are closely related to later developmental steps that depend on activity in synapses and on activity-dependent changes. All known teratogens (agents that cause birth defects) related to the risk of autism appear to act during the first eight weeks from conception, and though this does not exclude the possibility that autism can be initiated or affected later, there is strong evidence that autism arises very early in development.', 'The New Latin word "autismus" (English translation "autism") was coined by the Swiss psychiatrist Eugen Bleuler in 1910 as he was defining symptoms of schizophrenia. He derived it from the Greek word "autós" (αὐτός, meaning "self"), and u

In [13]:
topic_index = next((index for (index, d) in enumerate(df['data']) if d["title"] == 'Autism'), None)
print(topic_index)
index_list = list(range(len(df['data'][topic_index]['paragraphs'])))
print(len(index_list))
# make sure isn't the same paragraph as current paragraph
p_index = next((index for (index, d) in enumerate(df['data'][topic_index]['paragraphs'])\
                if d["context"] == autoq.loc[110]['paragraphs']), None)
index_list.remove(p_index)
print('p_index removed: ' + str(len(index_list)))
# Choose 5 random paragraphs from the same article
alt_p_index = random.choices(index_list,k=5)
print('random indices: ' + str(alt_p_index))
alt_p_list = [df['data'][topic_index]['paragraphs'][i]['context'] for i in alt_p_index]
print(alt_p_list)
alt_paragraph_list = [textacy.Doc(alt_p, lang='en_core_web_sm') for alt_p in alt_p_list]

answer = autoq.loc[110]['answers']
correct_ans_pos = str(['r'+str(token.pos_)+'l' for token in \
           nlp(answer)])[1:-1].replace("'r","<").replace("l'",">").replace(',','+',1).replace(',','*').replace(' ','')+'+'
print('correct_ans_pos: ' + str(correct_ans_pos))
alt_p_list=[]
for p in alt_paragraph_list:
    p_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(p, correct_ans_pos)]
    alt_p_list.extend(p_list)
print('alt_p_list: ' + str(alt_p_list))

1
70
p_index removed: 69
random indices: [49, 14, 31, 46, 42]
["Medications may be used to treat ASD symptoms that interfere with integrating a child into home or school when behavioral treatment fails. They may also be used for associated health problems, such as ADHD or anxiety. More than half of US children diagnosed with ASD are prescribed psychoactive drugs or anticonvulsants, with the most common drug classes being antidepressants, stimulants, and antipsychotics. The atypical antipsychotic drugs risperidone and aripiprazole are FDA-approved for treating associated aggressive and self-injurious behaviors. However, their side effects must be weighed against their potential benefits, and people with autism may respond atypically. Side effects, for example, may include weight gain, tiredness, drooling, and aggression. SSRI antidepressants, such as fluoxetine and fluvoxamine, have been shown to be effective in reducing repetitive and ritualistic behaviors, while the stimulant medicati

In [134]:
for i in range(len(sample100)):
    try:
        print('Sample: ' + str(i))
        print('Raw Answer: ' + str(sample100.loc[i]['answers']))
        answer_type = check_answer_type(sample100.loc[i]['answers'])
        print('Answer type: ' + str(answer_type))
        topic, paragraph, question, correct_answer, answer_type, wrong_answers = generate_distractor(sample100.loc[i]['topics'], sample100.loc[i]['paragraphs'], \
                                  sample100.loc[i]['questions'], sample100.loc[i]['answers'], answer_type)
        print('---------------------------------------------------------')
        print('Question: ' + str(question))
        print('Wrong answers: ' + str(wrong_answers))
        print('\n')
    except:
        print("error occurred for sample#: " + str(i))

Sample: 0
Raw Answer: North Carolina and New Mexico
Answer type: ['PROPN']
---------------------------------------------------------
Question: In what states are pharmacist clinicians given prescriptive and diagnostic authority?
Wrong answers: ['indian health service', 'north carolina', 'new mexico']


Sample: 1
Raw Answer: non-specific
Answer type: ['ADJ']
---------------------------------------------------------
Question: The innate immune system responds in a generic way, meaning it is what?
Wrong answers: ['its great', 'innate immune', 'long-lasting']


Sample: 2
Raw Answer: charter
Answer type: ['NOUN']
---------------------------------------------------------
Question: Granting what status would allow private non-religious schools in the US to receive public funds?
Wrong answers: ['list', 'constitution', 'economy']


Sample: 3
Raw Answer: pigment-filled plastids responsible for the bright colors seen in flowers and ripe fruit
Answer type: ['ADJ']
---------------------------------

Answer type: ['ADJ']
---------------------------------------------------------
Question: What health problem did Tesla have in 1879?
Wrong answers: ['subsequent claims', 'same time', 'his family']


Sample: 36
Raw Answer: four
Answer type: ['NUM']
---------------------------------------------------------
Question: How many auricles do most species have?
Wrong answers: ['2', '8', 'two']


Sample: 37
Raw Answer: Supreme Court of the United States
Answer type: ['PROPN']
---------------------------------------------------------
Question: A decision made by what entity restored Tesla's patents?
Wrong answers: ['united states', 'oliver lodge', 'the united states']


Sample: 38
Raw Answer: Charles F. Peck
Answer type: ['PROPN']
---------------------------------------------------------
Question: Who else did Tesla make the acquaintance of in 1886?
Wrong answers: ['hotel new yorker', 'tesla electric company', 'petroleum exporting countries']


Sample: 39
Raw Answer: second most commonly
Answer 

error occurred for sample#: 66
Sample: 67
Raw Answer: Kenya's various ethnic groups typically speak their mother tongues within their own communities
Answer type: ['PROPN']
---------------------------------------------------------
Question: What language is spoken in Kenya?
Wrong answers: ['kenya', 'their mother tongues', 'armed forces']


Sample: 68
Raw Answer: flattened circular
Answer type: ['VP']
---------------------------------------------------------
Question: What shape are granal thylakoids?
Wrong answers: ['are continuous', 'retained fundamental', 'was due']


Sample: 69
Raw Answer: artifact
Answer type: ['ADJ']
---------------------------------------------------------
Question: What is the force called rgarding a potential field between two locations?
Wrong answers: ['british', 'earliest', 'civil']


Sample: 70
Raw Answer: force
Answer type: ['NOUN']
---------------------------------------------------------
Question: What notion keeps it's meaning through both Netonian and S

---------------------------------------------------------
Question: Los Angeles is in the lower part of what?
Wrong answers: ['serbian cyrillic', 'southern california', 'никола']


Sample: 99
Raw Answer: Veni redemptor gentium
Answer type: ['NOUN']
---------------------------------------------------------
Question: What was the hymn based on?
Wrong answers: ['oil shock', 'term effects', 'redemptor gentium']




In [27]:
index_list = list(range(len(df['data'][0]['paragraphs'])))
random.choices(index_list, k=5)

[72, 65, 21, 42, 38]

In [9]:
# For Debugging
topic, paragraph, question, correct_answer, answer_type, wrong_answers = generate_distractor(autoq.loc[5]['topics'], autoq.loc[5]['paragraphs'], \
                                  autoq.loc[5]['questions'], autoq.loc[5]['answers'], check_answer_type(autoq.loc[5]['answers']))
print(topic, paragraph, question, correct_answer, answer_type, wrong_answers)

not an entity
correct_ans_pos: <DET>+<NOUN>+
doc_list: ['the works', 'a ruler', 'a nation', 'the founder', 'the apostles', 'some manifestations', 'the imamate', 'a right', 'each individual', 'some historians', 'the term', 'a term', 'the time', 'the term', 'the turn']
sent_list: ['the apostles']
alt_p_list: ['the suppression', 'the amount', 'the consequences', 'the rulers', 'the anarchist', 'the abandonment', 'these kinds', 'a role', 'the dismemberment', 'the suppression', 'the execution', 'the anarchist', 'the concentration camps', 'the concentration camp', 'the night', 'the guards', 'a latrine']
merged: {'a right', 'the founder', 'a nation', 'the turn', 'the imamate', 'the anarchist', 'the concentration camps', 'the night', 'the guards', 'the concentration camp', 'the dismemberment', 'the amount', 'some historians', 'a latrine', 'a term', 'the term', 'the works', 'the execution', 'the consequences', 'the rulers', 'a role', 'each individual', 'a ruler', 'the apostles', 'some manifestat

In [70]:
def long_generate_distractor(topic, paragraph, question, answer, answer_type):
    wrong_answers=[]
    #preprocess answer
    correct_answer = nlp(preprocess(answer))
    # get answer pos
    ans_length = len(correct_answer)
    ans_tag = [token.tag_ for token in correct_answer]
    ans_pos = [token.pos_ for token in correct_answer]
    # tokenize paragraph
    article = nlp(unidecode.unidecode(paragraph))
    doc = textacy.Doc(paragraph, lang='en_core_web_sm')
        
    # Preprocessing for same sentence distractor generation
    # get all sentences in paragraph
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sent_list = sent_detector.tokenize(paragraph.strip())

    # find sentence that has answer
    for s in sent_list:
        if answer in s:
            ans_sent = s
    # tokenize sentence
    sentence = nlp(ans_sent)
    sent = textacy.Doc(sentence, lang='en_core_web_sm')
        
    # Preprocessing for same topic distractor generation
    # choose random paragraph from same topic
    topic_index = next((index for (index, d) in enumerate(df['data']) if d["title"] == topic), None)
    index_list = list(range(len(df['data'][topic_index]['paragraphs'])))
    # make sure isn't the same paragraph as current paragraph
    p_index = next((index for (index, d) in enumerate(df['data'][topic_index]['paragraphs'])\
                    if d["context"] == paragraph), None)
    index_list.remove(p_index)
    alt_p_index = random.choice(index_list)
    alt_p = df['data'][topic_index]['paragraphs'][alt_p_index]['context']
    alt_article = nlp(alt_p)
    alt_paragraph = textacy.Doc(alt_p, lang='en_core_web_sm')
        
    # Preprocessing for different topic distractor generation
    # choose random topic
    topic_list = list(range(len(df['data'])))
    # make sure isn't the same topic as current topic
    topic_list.remove(topic_index)
    alt_topic_index = random.choice(topic_list)
    alt_topic = df['data'][alt_topic_index]['title']
    alt_topic_paragraph = df['data'][alt_topic_index]['paragraphs'][0]['context']
    # select first paragraph from topic - option to randomize selection as well
    alt_topic_article = nlp(alt_topic_paragraph)
    alt_topic = textacy.Doc(alt_topic_paragraph, lang='en_core_web_sm')
    
    if 'VP' in answer_type:
        vb_pattern = r'<VERB>+<ADJ>*<ADV>*<PART>*<DET>*<NOUN>+<ADP>*<DET>*<NOUN>*'
        vb_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, vb_pattern)]
#         print('vb_list: ' + str(vb_list))
        sent_vb_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(sent, vb_pattern)]
#         print('sent_vb_list: ' + str(sent_vb_list))
        alt_paragraph_vb_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_paragraph, vb_pattern)]
#         print('alt_paragraph_vb_list: ' + str(alt_paragraph_vb_list))
        alt_topic_vb_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_topic, vb_pattern)]  
#         print('alt_topic_vb_list: ' + str(alt_topic_vb_list))
        merged = set(vb_list+sent_vb_list+alt_paragraph_vb_list+alt_topic_vb_list)
        for i in merged:
#             print('i: ' + str(i))
            item = nlp(i)
#             print('item[0].tag_ '+str(item[0].tag_))
#             print('ans_tag[0] '+str(ans_tag[0]))
#             print('item[-1].pos_ '+str(item[-1].pos_))
#             print('ans_pos[-1] '+str(ans_pos[-1]))
            if (item[0].tag_ == ans_tag[0]) and (item[-1].pos_ == ans_pos[-1]):
                wrong_answers.append(item)
        if len(wrong_answers) < 3:
                wrong_answers = merged
    elif 'VERB' in answer_type:
        verbs = [token.orth_.lower() for token in article if (token.pos_=='VERB')]
        sent_verbs = verbs = [token.orth_.lower() for token in sentence if (token.pos_=='VERB')]
        alt_paragraph_verbs = [token.orth_.lower() for token in alt_article if (token.pos_=='VERB')]
        alt_topic_verbs = [token.orth_.lower() for token in alt_topic_article if (token.pos_=='VERB')]
        merged = set(verbs+sent_verbs+alt_paragraph_verbs+alt_topic_verbs)
        for i in merged:
            item = nlp(i)
            if (item[0].tag_ == ans_pos_tag[0]):
                wrong_answers.append(item)
    elif 'ADJ' in answer_type:
        if ans_length==1:
            single_adj = [token.orth_.lower() for token in article if token.pos_=='ADJ']
            single_adj_sent = [token.orth_.lower() for token in sentence if token.pos_=='ADJ']
            alt_paragraph_single_adj = [token.orth_.lower() for token in alt_article if token.pos_=='ADJ']
            alt_topic_single_adj = [token.orth_.lower() for token in alt_topic_article if token.pos_=='ADJ']
            wrong_answers = set(single_adj+single_adj_sent+alt_paragraph_single_adj+alt_topic_single_adj)
        else:
            ent_list = [str(i) for i in list(article.ents)]
#             print(ent_list)
            if answer in ent_list:
                ent_labels = [x.label_ for x in article.ents]
                # get all named entities in sentence
                sent_ent_list = [str(i) for i in list(sentence.ents)]
                sent_labels = [x.label_ for x in sentence.ents]
                alt_topic_ent_list = [str(i) for i in list(alt_topic_article.ents)]
                alt_topic_labels = [x.label_ for x in alt_topic_article.ents]
                alt_article_ent_list = [str(i) for i in list(alt_article.ents)]
                alt_article_labels = [x.label_ for x in alt_article.ents]

                # create table of named entities
                max_length = max(len(ent_list), len(sent_ent_list), len(alt_article_ent_list),len(alt_topic_ent_list))
        #         print(max_length)
                ne_pd = pd.DataFrame()
                ne_pd['entity'] = ent_list + (['NA'] * (max_length - len(ent_list)))
                ne_pd['label'] = list(ent_labels) + (['NA'] * (max_length - len(ent_list)))
                ne_pd['sent_entity'] = sent_ent_list + (['NA'] * (max_length - len(sent_ent_list)))
                ne_pd['sent_label'] = list(sent_labels) + (['NA'] * (max_length - len(sent_ent_list)))
                ne_pd['altp_entity'] = alt_article_ent_list + (['NA'] * (max_length - len(alt_article_ent_list)))
                ne_pd['altp_label'] = list(alt_article_labels) + (['NA'] * (max_length - len(alt_article_ent_list)))
                ne_pd['altt_entity'] = alt_topic_ent_list + (['NA'] * (max_length - len(alt_topic_ent_list)))
                ne_pd['altt_label'] = list(alt_topic_labels) + (['NA'] * (max_length - len(alt_topic_ent_list)))

                ans_label = [ne_pd[ne_pd['entity']==e]['label'].values[0] for e in ne_pd['entity'] if e in answer]
        #         print(ans_label)
                alt_ans_list = list(ne_pd[(ne_pd['label'].isin(ans_label))]['entity'])
                alt_ans_list.extend(list(ne_pd[(ne_pd['sent_label'].isin(ans_label))]['sent_entity']))
                alt_ans_list.extend(list(ne_pd[(ne_pd['altp_label'].isin(ans_label))]['altp_entity']))
                alt_ans_list.extend(list(ne_pd[(ne_pd['altt_label'].isin(ans_label))]['altt_entity']))
                wrong_answers = set(alt_ans_list)
            else:
            #adj_pattern = r'<ADV>*<ADJ>+<PART>*<DET>*<NOUN>*'
                correct_ans_pos = str(['r'+str(token.pos_)+'l' for token in \
               nlp(answer)])[1:-1].replace("'r","<").replace("l'",">").replace(',','+',1).replace(',','*').replace(' ','')
                adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, correct_ans_pos)]
                sent_adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(sent, correct_ans_pos)]
                alt_paragraph_adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_paragraph, correct_ans_pos)]
                alt_topic_adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_topic, correct_ans_pos)]

                hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),paragraph)
                hyphen_sep_words_sent = re.findall(r'\w+(?:-\w+)+'.lower(),ans_sent)
                alt_p_hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),alt_p)
                alt_topic_hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),alt_topic_paragraph)

                adj_list.extend(hyphen_sep_words)
                sent_adj_list.extend(hyphen_sep_words_sent)
                alt_paragraph_adj_list.extend(alt_p_hyphen_sep_words)
                alt_topic_adj_list.extend(alt_topic_hyphen_sep_words)
                wrong_answers = set(adj_list+sent_adj_list+alt_paragraph_adj_list+alt_topic_adj_list)
    elif 'ADV' in answer_type:
        if ans_length==1:
            single_adj = [token.orth_.lower() for token in article if token.pos_=='ADV']
            single_adj_sent = [token.orth_.lower() for token in sentence if token.pos_=='ADV']
            alt_paragraph_single_adj = [token.orth_.lower() for token in alt_article if token.pos_=='ADV']
            alt_topic_single_adj = [token.orth_.lower() for token in alt_topic_article if token.pos_=='ADV']
            wrong_answers = set(single_adj+single_adj_sent+alt_paragraph_single_adj+alt_topic_single_adj)
        else:
            ent_list = [str(i) for i in list(article.ents)]
#             print(ent_list)
            if answer in ent_list:
                ent_labels = [x.label_ for x in article.ents]
                # get all named entities in sentence
                sent_ent_list = [str(i) for i in list(sentence.ents)]
                sent_labels = [x.label_ for x in sentence.ents]
                alt_topic_ent_list = [str(i) for i in list(alt_topic_article.ents)]
                alt_topic_labels = [x.label_ for x in alt_topic_article.ents]
                alt_article_ent_list = [str(i) for i in list(alt_article.ents)]
                alt_article_labels = [x.label_ for x in alt_article.ents]

                # create table of named entities
                max_length = max(len(ent_list), len(sent_ent_list), len(alt_article_ent_list),len(alt_topic_ent_list))
        #         print(max_length)
                ne_pd = pd.DataFrame()
                ne_pd['entity'] = ent_list + (['NA'] * (max_length - len(ent_list)))
                ne_pd['label'] = list(ent_labels) + (['NA'] * (max_length - len(ent_list)))
                ne_pd['sent_entity'] = sent_ent_list + (['NA'] * (max_length - len(sent_ent_list)))
                ne_pd['sent_label'] = list(sent_labels) + (['NA'] * (max_length - len(sent_ent_list)))
                ne_pd['altp_entity'] = alt_article_ent_list + (['NA'] * (max_length - len(alt_article_ent_list)))
                ne_pd['altp_label'] = list(alt_article_labels) + (['NA'] * (max_length - len(alt_article_ent_list)))
                ne_pd['altt_entity'] = alt_topic_ent_list + (['NA'] * (max_length - len(alt_topic_ent_list)))
                ne_pd['altt_label'] = list(alt_topic_labels) + (['NA'] * (max_length - len(alt_topic_ent_list)))

                ans_label = [ne_pd[ne_pd['entity']==e]['label'].values[0] for e in ne_pd['entity'] if e in answer]
        #         print(ans_label)
                alt_ans_list = list(ne_pd[(ne_pd['label'].isin(ans_label))]['entity'])
                alt_ans_list.extend(list(ne_pd[(ne_pd['sent_label'].isin(ans_label))]['sent_entity']))
                alt_ans_list.extend(list(ne_pd[(ne_pd['altp_label'].isin(ans_label))]['altp_entity']))
                alt_ans_list.extend(list(ne_pd[(ne_pd['altt_label'].isin(ans_label))]['altt_entity']))
                wrong_answers = set(alt_ans_list)
            else:
            #adj_pattern = r'<ADV>*<ADJ>+<PART>*<DET>*<NOUN>*'
                correct_ans_pos = str(['r'+str(token.pos_)+'l' for token in \
               nlp(answer)])[1:-1].replace("'r","<").replace("l'",">").replace(',','+',1).replace(',','*').replace(' ','')
                adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, correct_ans_pos)]
                sent_adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(sent, correct_ans_pos)]
                alt_paragraph_adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_paragraph, correct_ans_pos)]
                alt_topic_adj_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_topic, correct_ans_pos)]

                hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),paragraph)
                hyphen_sep_words_sent = re.findall(r'\w+(?:-\w+)+'.lower(),ans_sent)
                alt_p_hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),alt_p)
                alt_topic_hyphen_sep_words = re.findall(r'\w+(?:-\w+)+'.lower(),alt_topic_paragraph)

                adj_list.extend(hyphen_sep_words)
                sent_adj_list.extend(hyphen_sep_words_sent)
                alt_paragraph_adj_list.extend(alt_p_hyphen_sep_words)
                alt_topic_adj_list.extend(alt_topic_hyphen_sep_words)
                wrong_answers = set(adj_list+sent_adj_list+alt_paragraph_adj_list+alt_topic_adj_list)
    elif 'NOUN' in answer_type:
        if ans_length==1:
            single_noun = [token.orth_.lower() for token in article if (token.pos_=='NOUN')]
            single_noun_sent = [token.orth_.lower() for token in sentence if (token.pos_=='NOUN')]
            alt_single_noun = [token.orth_.lower() for token in alt_article if (token.pos_=='NOUN')]
            alt_topic_single_noun = [token.orth_.lower() for token in alt_topic_article if (token.pos_=='NOUN')]
            wrong_answers = set(single_noun+single_noun_sent+alt_single_noun+alt_topic_single_noun)
        else:
            ent_list = [str(i) for i in list(article.ents)]
#             print(ent_list)
            if answer in ent_list:
                ent_labels = [x.label_ for x in article.ents]
                # get all named entities in sentence
                sent_ent_list = [str(i) for i in list(sentence.ents)]
                sent_labels = [x.label_ for x in sentence.ents]
                alt_topic_ent_list = [str(i) for i in list(alt_topic_article.ents)]
                alt_topic_labels = [x.label_ for x in alt_topic_article.ents]
                alt_article_ent_list = [str(i) for i in list(alt_article.ents)]
                alt_article_labels = [x.label_ for x in alt_article.ents]

                # create table of named entities
                max_length = max(len(ent_list), len(sent_ent_list), len(alt_article_ent_list),len(alt_topic_ent_list))
        #         print(max_length)
                ne_pd = pd.DataFrame()
                ne_pd['entity'] = ent_list + (['NA'] * (max_length - len(ent_list)))
                ne_pd['label'] = list(ent_labels) + (['NA'] * (max_length - len(ent_list)))
                ne_pd['sent_entity'] = sent_ent_list + (['NA'] * (max_length - len(sent_ent_list)))
                ne_pd['sent_label'] = list(sent_labels) + (['NA'] * (max_length - len(sent_ent_list)))
                ne_pd['altp_entity'] = alt_article_ent_list + (['NA'] * (max_length - len(alt_article_ent_list)))
                ne_pd['altp_label'] = list(alt_article_labels) + (['NA'] * (max_length - len(alt_article_ent_list)))
                ne_pd['altt_entity'] = alt_topic_ent_list + (['NA'] * (max_length - len(alt_topic_ent_list)))
                ne_pd['altt_label'] = list(alt_topic_labels) + (['NA'] * (max_length - len(alt_topic_ent_list)))

                ans_label = [ne_pd[ne_pd['entity']==e]['label'].values[0] for e in ne_pd['entity'] if e in answer]
        #         print(ans_label)
                alt_ans_list = list(ne_pd[(ne_pd['label'].isin(ans_label))]['entity'])
                alt_ans_list.extend(list(ne_pd[(ne_pd['sent_label'].isin(ans_label))]['sent_entity']))
                alt_ans_list.extend(list(ne_pd[(ne_pd['altp_label'].isin(ans_label))]['altp_entity']))
                alt_ans_list.extend(list(ne_pd[(ne_pd['altt_label'].isin(ans_label))]['altt_entity']))
                wrong_answers = set(alt_ans_list)
            else:
                correct_ans_pos = str(['r'+str(token.pos_)+'l' for token in \
               nlp(answer)])[1:-1].replace("'r","<").replace("l'",">").replace(',','+',1).replace(',','*').replace(' ','')
                np_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, correct_ans_pos)]
                sent_np_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(sent, correct_ans_pos)]
                alt_paragraph_np_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_paragraph, correct_ans_pos)]
                alt_topic_np_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_topic, correct_ans_pos)]
                wrong_answers = set(np_list+sent_np_list+alt_paragraph_np_list+alt_topic_np_list)
    elif 'YEAR' in answer_type:
        number_list = [token.orth_ for token in article if token.pos_=='NUM']
        number_list_sent = [token.orth_ for token in sentence if token.pos_=='NUM']
        alt_paragraph_number_list = [token.orth_ for token in alt_article if token.pos_=='NUM']
        alt_topic_number_list = [token.orth_ for token in alt_topic_article if token.pos_=='NUM']
        merged = set(number_list+number_list_sent+alt_paragraph_number_list+alt_topic_number_list)
        for i in merged:
            if len(i)==4:
                wrong_answers.append(i)
    elif 'NUM' in answer_type:
        number_list = [token.orth_ for token in article if token.pos_=='NUM']
        number_list_sent = [token.orth_ for token in sentence if token.pos_=='NUM']
        alt_paragraph_number_list = [token.orth_ for token in alt_article if token.pos_=='NUM']
        alt_topic_number_list = [token.orth_ for token in alt_topic_article if token.pos_=='NUM']
        merged = set(number_list+number_list_sent+alt_paragraph_number_list+alt_topic_number_list)
        wrong_answers = [t2d.convert(str(i)).lstrip() for i in merged]
    elif 'TIME' in answer_type:
        time_list = [token.orth_ for token in article if token.pos_=='NUM']
        time_list_sent = [token.orth_ for token in sentence if token.pos_=='NUM']
        alt_paragraph_time_list = [token.orth_ for token in alt_article if token.pos_=='NUM']
        alt_topic_time_list = [token.orth_ for token in alt_topic_article if token.pos_=='NUM']
        merged = set(time_list+time_list_sent+alt_paragraph_time_list+alt_topic_time_list)
        print('merged: ' + str(merged))
        wrong_answers = [t2d.convert(str(i)).lstrip()+' ' + str(correct_answer[-1]) for i in merged]
    elif 'MONEY' in answer_type:
        currency = answer[0]
        number_list = [str(currency) + token.orth_ for token in article if token.pos_=='NUM']
        number_list_sent = [str(currency) + token.orth_ for token in sentence if token.pos_=='NUM']
        alt_paragraph_number_list = [str(currency) + token.orth_ for token in alt_article if token.pos_=='NUM']
        alt_topic_number_list = [str(currency) + token.orth_ for token in alt_topic_article if token.pos_=='NUM']
        merged = set(number_list+number_list_sent+alt_paragraph_number_list+alt_topic_number_list)
        wrong_answers = [t2d.convert(str(i)).replace(' ','') for i in merged]
    elif 'DATE' in answer_type:
        if ans_length==1:
            month_list=random.sample(['January', 'Jan', 'February', 'March', 'April', \
                                   'May','June', 'July', 'August', 'September', \
                                   'October','November', \
                                      'December'].remove(answer.capitalize()),3)
        else:
            date_regex = r'<NUM>*<PROPN>+<NUM>*<PUNCT>*<NUM>+'
            date_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, date_regex)]
            sent_date_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(sent, date_regex)]
            alt_paragraph_date_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_paragraph, date_regex)]
            alt_topic_date_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_topic, date_regex)]
            wrong_answers = set(date_list+sent_date_list+alt_paragraph_date_list+alt_topic_date_list)
    elif 'PROPN' in answer_type:
        ent_list = [str(i) for i in list(article.ents)]
#             print(ent_list)
        if answer in ent_list:
            ent_labels = [x.label_ for x in article.ents]
            # get all named entities in sentence
            sent_ent_list = [str(i) for i in list(sentence.ents)]
            sent_labels = [x.label_ for x in sentence.ents]
            alt_topic_ent_list = [str(i) for i in list(alt_topic_article.ents)]
            alt_topic_labels = [x.label_ for x in alt_topic_article.ents]
            alt_article_ent_list = [str(i) for i in list(alt_article.ents)]
            alt_article_labels = [x.label_ for x in alt_article.ents]

            # create table of named entities
            max_length = max(len(ent_list), len(sent_ent_list), len(alt_article_ent_list),len(alt_topic_ent_list))
    #         print(max_length)
            ne_pd = pd.DataFrame()
            ne_pd['entity'] = ent_list + (['NA'] * (max_length - len(ent_list)))
            ne_pd['label'] = list(ent_labels) + (['NA'] * (max_length - len(ent_list)))
            ne_pd['sent_entity'] = sent_ent_list + (['NA'] * (max_length - len(sent_ent_list)))
            ne_pd['sent_label'] = list(sent_labels) + (['NA'] * (max_length - len(sent_ent_list)))
            ne_pd['altp_entity'] = alt_article_ent_list + (['NA'] * (max_length - len(alt_article_ent_list)))
            ne_pd['altp_label'] = list(alt_article_labels) + (['NA'] * (max_length - len(alt_article_ent_list)))
            ne_pd['altt_entity'] = alt_topic_ent_list + (['NA'] * (max_length - len(alt_topic_ent_list)))
            ne_pd['altt_label'] = list(alt_topic_labels) + (['NA'] * (max_length - len(alt_topic_ent_list)))

            ans_label = [ne_pd[ne_pd['entity']==e]['label'].values[0] for e in ne_pd['entity'] if e in answer]
    #         print(ans_label)
            alt_ans_list = list(ne_pd[(ne_pd['label'].isin(ans_label))]['entity'])
            alt_ans_list.extend(list(ne_pd[(ne_pd['sent_label'].isin(ans_label))]['sent_entity']))
            alt_ans_list.extend(list(ne_pd[(ne_pd['altp_label'].isin(ans_label))]['altp_entity']))
            alt_ans_list.extend(list(ne_pd[(ne_pd['altt_label'].isin(ans_label))]['altt_entity']))
            wrong_answers = set(alt_ans_list) - set([str(correct_answer)])
        else:
            correct_ans_pos = str(['r'+str(token.pos_)+'l' for token in \
           nlp(answer)])[1:-1].replace("'r","<").replace("l'",">").replace(',','+',1).replace(',','*').replace(' ','')
            np_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, correct_ans_pos)]
            sent_np_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(sent, correct_ans_pos)]
            alt_paragraph_np_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_paragraph, correct_ans_pos)]
            alt_topic_np_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_topic, correct_ans_pos)]
            wrong_answers = set(np_list+sent_np_list+alt_paragraph_np_list+alt_topic_np_list)
    else:
        correct_ans_pos = str(['r'+str(token.pos_)+'l' for token in \
           nlp(answer)])[1:-1].replace("'r","<").replace("l'",">").replace(',','+',1).replace(',','*').replace(' ','')
        propn_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(doc, correct_ans_pos)]
        sent_propn_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(sent, correct_ans_pos)]
        alt_paragraph_propn_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_paragraph, correct_ans_pos)]
        alt_topic_propn_list = [l.text.lower() for l in textacy.extract.pos_regex_matches(alt_topic, correct_ans_pos)]
        wrong_answers = set(propn_list+sent_propn_list+alt_paragraph_propn_list+alt_topic_propn_list)
        if 'ADP' in answer_type:
            wrong_answers = [i for i in wrong_answers if i[0]==correct_answer[0].orth_]
    wrong_answers = [i for i in wrong_answers for word in range(len(correct_answer)) if correct_answer[word].orth_ not in i]
                
    return (topic, paragraph, question, correct_answer, answer_type, set(wrong_answers))
        