In [28]:
import csv
import os
import os.path as op
import re

import numpy as np
import pandas as pd
import spacy

import ner_pipeline

def csv_to_list(csv_file_path):
    """Read a CSV file and return its rows as a list of lists of strings.

    The header row, if any, is returned like every other row.
    """
    # newline="" lets the csv module handle embedded line breaks itself.
    with open(csv_file_path, newline="") as handle:
        rows = list(csv.reader(handle))
    return rows

def make_id_name_dict():
    """Build a lookup from column 0 to column 6 of the Gutenberg document rows.

    Reads ``data/documents.csv``, skips its first (header) row, and for every
    row whose third column equals ``"gutenberg"`` stores
    ``row[0] -> row[6]``.  Judging by how the result is used, column 0 is
    presumably the document id and column 6 the title -- verify against the
    CSV header.

    Rows with fewer than seven columns are skipped, since they cannot supply
    the value column.

    Returns:
        dict mapping ``row[0]`` to ``row[6]`` for the selected rows.
    """
    docs_csv = op.join("data", "documents.csv")
    id_name_dict = {}
    for line in csv_to_list(docs_csv)[1:]:
        # Explicit length guard instead of a bare ``except: pass``: short or
        # blank rows are still skipped, but real errors are no longer hidden.
        if len(line) < 7:
            continue
        if line[2] == "gutenberg":
            id_name_dict[line[0]] = line[6]
    return id_name_dict


def load_np_pickle(path):
    """Load a NumPy ``.npy`` file, appending the extension when it is missing.

    NOTE: ``allow_pickle=True`` means loading can execute code stored in the
    file, so only point this at files from a trusted source.
    """
    npy_path = path if path.endswith(".npy") else path + ".npy"
    return np.load(npy_path, allow_pickle=True)

# Forward lookup built from the documents CSV.
id2name = make_id_name_dict()
# Inverse lookup; if two keys share a value, the key seen last wins.
name2id = dict(zip(id2name.values(), id2name.keys()))

def map_words_to_named_entities(obj_dict, classes = ["ORG","LOC","PERSON"]):
    """Map every name variant to the named-entity object it belongs to.

    Args:
        obj_dict: mapping from an entity class string to a list of entity
            objects.  Each object must expose an iterable ``name_variants``.
        classes: entity classes to include, in order of increasing
            precedence.  Classes absent from ``obj_dict`` are ignored.  The
            default list is only read, never mutated.

    Returns:
        dict mapping each name variant to an entity object.  When the same
        variant occurs on several entities the last one visited wins: later
        entries of ``classes`` override earlier ones, and within a class later
        list entries override earlier ones.
    """
    word2entity = {}

    # Walk ``classes`` in the order given instead of iterating a set
    # intersection: the iteration order of a set of strings changes with the
    # interpreter's hash seed, which made the winner for a shared name variant
    # (and the dict's insertion order) differ from run to run.
    for ne_class in classes:
        if ne_class not in obj_dict:
            continue
        for obj in obj_dict[ne_class]:
            for name in obj.name_variants:
                word2entity[name] = obj

    return word2entity

def get_ne_indices(string, word2entity):
    """Returns a dictionary of type { (char_idx_start, char_idx_end) : ne_obj }

    Args:
        string: text to search.  Matching is case-insensitive and the returned
            indices refer to this string as passed in.
        word2entity: mapping from an entity name variant to its entity object.

    Returns:
        dict keyed by ``(start, end)`` character offsets (``end`` exclusive,
        so ``string[start:end]`` is the matched text) whose values are the
        matching objects from ``word2entity``.  Names are tried longest first
        and a match overlapping an already claimed span is dropped, so
        "Anna Karenina" is not additionally reported as "Anna".  Empty names
        and names ending in ``'s`` are ignored.
    """
    ne_spans = {}
    if not string:
        return ne_spans

    # Longest names first, so multi-word names claim their span before any
    # shorter name contained in them.
    nes_longest_first = sorted(word2entity.keys(), key=len, reverse=True)

    # Possessive variants are skipped; the empty string is skipped as well
    # because it would match at every position.
    nes_longest_first = [ne for ne in nes_longest_first if ne and not ne.endswith("'s")]

    # One flag per character of ``string``: True once some entity owns it.
    claimed = [False] * len(string)

    for ne in nes_longest_first:
        # re.escape keeps names such as "U.S." literal.  Lookarounds replace
        # \b so that names starting or ending in punctuation still match on
        # whole-word boundaries.
        pattern = r"(?<!\w)" + re.escape(ne) + r"(?!\w)"
        for match in re.finditer(pattern, string, flags=re.IGNORECASE):
            start, end = match.span()
            if any(claimed[start:end]):
                continue
            claimed[start:end] = [True] * (end - start)
            ne_spans[(start, end)] = word2entity[ne]

    return ne_spans


In [29]:
# Directory that later cells join "sent_tokenized", "bows" and "ne_obj_lists" onto.
ft_dir = "./preprocessed_data/full_texts"
# Last expression of the cell: displays the entries found in that directory.
os.listdir(ft_dir)

['bows', 'ne_obj_lists', 'sent_tokenized']

In [30]:
# Load the two single-answer QA-pair files.
qaps_1 = pd.read_csv(op.join("data",'narrativeqa_qaps_single_answer_1.csv'))
qaps_2 = pd.read_csv(op.join("data",'narrativeqa_qaps_single_answer_2.csv'))

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and, like append, keeps each frame's original index labels.
qaps = pd.concat([qaps_1, qaps_2])

# Remove data where the answer is not an exact span of the text (for training and validation purposes)
qaps = qaps[qaps['ans_start']!=-1]

In [31]:
# Build two-way lookups between column 0 and column 3 of the summaries CSV
# (first row skipped as the header).  Presumably column 0 is the document id
# and column 3 the summary text -- confirm against the CSV header.
summaries = csv_to_list(op.join("data","summaries.csv"))
id2summary = dict((fields[0], fields[3]) for fields in summaries[1:])
summary2id = dict(zip(id2summary.values(), id2summary.keys()))
# The raw rows are not needed once both lookups exist.
del summaries

In [32]:
# Preview the first rows of the QA-pairs frame.
qaps.head()

Unnamed: 0.1,Unnamed: 0,set,question_tokenized,summary,answer,ans_start
1,1,test,Where does this radio station take place ?,"Mark Hunter ( Slater ) , a high school student...","Phoenix , Arizona",69
3,3,test,Who commits suicide ?,"Mark Hunter ( Slater ) , a high school student...",Malcolm,1114
7,7,test,Who gets arrested ?,"Mark Hunter ( Slater ) , a high school student...",Mark and Nora,2846
11,11,test,What is Mark s Pirate Station s theme song ?,"Mark Hunter ( Slater ) , a high school student...",Everybody Knows,340
12,12,test,What is Nora Diniro to Mark ?,"Mark Hunter ( Slater ) , a high school student...",a fellow student,1031


In [27]:
# Load the question/answer table and preview its first rows.
# NOTE(review): get_summary_and_questions reads the notebook-level `nqa_qas`,
# so this cell has to run before that function is called.
nqa_qas = pd.read_csv(op.join("data",'narrativeqa_qas.csv'))
nqa_qas.head()

Unnamed: 0,document_id,set,question,answer1,answer2,question_tokenized,answer1_tokenized,answer2_tokenized
0,0025577043f5090cd603c6aea60f26e236195594,test,Who is Mark Hunter?,He is a high school student in Phoenix.,A loner and outsider student with a radio stat...,Who is Mark Hunter ?,He is a high school student in Phoenix .,A loner and outsider student with a radio stat...
1,0025577043f5090cd603c6aea60f26e236195594,test,Where does this radio station take place?,It takes place in Mark's parents basement.,"Phoenix, Arizona",Where does this radio station take place ?,It takes place in Mark s parents basement .,"Phoenix , Arizona"
2,0025577043f5090cd603c6aea60f26e236195594,test,Why do more students tune into Mark's show?,Mark talks about what goes on at school and in...,Because he has a thing to say about what is ha...,Why do more students tune into Mark s show ?,Mark talks about what goes on at school and in...,Because he has a thing to say about what is ha...
3,0025577043f5090cd603c6aea60f26e236195594,test,Who commits suicide?,Malcolm.,Malcolm.,Who commits suicide ?,Malcolm .,Malcolm .
4,0025577043f5090cd603c6aea60f26e236195594,test,What does Paige jam into her microwave?,She jams her medals and accolades.,Her award medals,What does Paige jam into her microwave ?,She jams her medals and accolades .,Her award medals


In [18]:
# Titles to process; each is looked up in name2id and used as the entity-list file name.
books = ["Anna Karenina"]

def get_word2entity(dset, title):
    """Load the stored entity objects for one text and index them by name variant.

    Args:
        dset: sub-directory of ``ne_obj_lists`` to read from (e.g. "train").
        title: file name of the pickled entity list, with or without the
            ``.npy`` extension.

    Returns:
        The result of ``map_words_to_named_entities`` applied to the loaded
        entities grouped by their ``class_string``.
    """
    entities = load_np_pickle(op.join(ft_dir, "ne_obj_lists", dset, title))

    # Group the entities by class in a single pass, keeping their stored order.
    by_class = {}
    for entity in entities:
        by_class.setdefault(entity.class_string, []).append(entity)

    return map_words_to_named_entities(by_class)
    
# Name-variant -> entity lookup for a single title, read from the "train" directory.
w2e = get_word2entity("train", "Anna Karenina")

def get_summary_and_questions(book):
    """Return the summary and the tokenized questions for one book title.

    Relies on the notebook-level ``name2id``, ``id2summary`` and ``nqa_qas``
    objects being defined before it is called.

    Args:
        book: title used as a key of ``name2id``.

    Returns:
        A ``(book_summary, questions)`` tuple: the ``id2summary`` entry for the
        book's id, and a list of the ``question_tokenized`` values from the
        rows of ``nqa_qas`` whose ``document_id`` equals that id.

    Raises:
        KeyError: if ``book`` is not in ``name2id`` or its id is not in
            ``id2summary``.
    """
    book_id = name2id[book]
    book_summary = id2summary[book_id]
    rows = nqa_qas.loc[nqa_qas["document_id"] == book_id]
    questions = rows.question_tokenized.to_list()
    return book_summary, questions


for book in books:
    # Name-variant -> entity lookup for this book, read from the "train" directory.
    w2e = get_word2entity("train", book)
    book_summary, questions = get_summary_and_questions(book)
    # Every pass overwrites these names, so only the last book's values remain after the loop.
    ne_inds_dict = get_ne_indices(book_summary, w2e)




Anna Karenina
Anna Karenina is the tragic story of a married aristocrat/socialite and her affair with the affluent Count Vronsky . The story opens when she arrives in the midst of a family broken up by her brother s unbridled womanizing—something that prefigures her own later situation , though she would experience less tolerance by others . A bachelor , Vronsky is eager to marry her if she will agree to leave her husband Karenin , a senior government official , but she is vulnerable to the pressures of Russian social norms , the moral laws of the Russian Orthodox Church , her own insecurities , and Karenin s indecision . Although Vronsky and Anna go to Italy , where they can be together , they have trouble making friends . Back in Russia , she is shunned , becoming further isolated and anxious , while Vronsky pursues his social life . Despite Vronsky s reassurances , she grows increasingly possessive and paranoid about his imagined infidelity , fearing loss of control . A parallel sto

In [None]:
# Dump the name-variant -> entity mapping currently held in `w2e`.
print(w2e)
#from qa.question_answering.models.additional_features import *

In [None]:
           
for dset in ["train","test","valid"]:
    save_dir_ft_sents = op.join(ft_dir, "sent_tokenized", dset)
    save_dir_ft_bows  = op.join(ft_dir, "bows", dset)
    save_dir_ft_obj   = op.join(ft_dir, "ne_obj_lists", dset)
    save_dir_sm_sents = op.join(sm_dir, "sent_tokenized", dset)
    save_dir_sm_bows  = op.join(sm_dir, "bows", dset)
    save_dir_dm_obj   = op.join(sm_dir, "ne_obj_lists", dset)
    from_dir_ft       = op.join(full_texts_dir, dset)
    from_dir_sm       = op.join(summary_dir, dset)
    texts = os.listdir(from_dir_sm)
    for i, text in enumerate(texts):
