Imports

In [1]:
import os
import os.path as op
import numpy as np
# Project root comes from the environment; a missing MIMIR_DIR raises KeyError here.
mimir_dir = os.environ["MIMIR_DIR"]
# Raw inputs live under <MIMIR_DIR>/data.
data_dir = op.join(mimir_dir, "data")
# Root under which this notebook writes its "full_texts" and "summaries" outputs.
pre_dir = op.join(mimir_dir, "preprocessed_data")
# Full book texts and their summaries; both are read per split (train/test/valid).
full_texts_dir = op.join(data_dir, "nqa_gutenberg_corpus")
summary_dir = op.join(data_dir, "nqa_summary_text_files")
from sentence_tokenize import file_to_sentence_tokens
from preprocessing_pipeline import pipeline, s2v
from ner_pipeline import *
from utils import spacy_single_line, make_name_url_dict

Basic preprocessing example:

In [2]:
# One entry per book is expected: a later cell asserts that the number of keys
# equals the number of summary files across all splits.
url_dict = make_name_url_dict() 
    

In [3]:
def get_sent_tokens(file_path):
    """Sentence-tokenize the file at ``file_path`` and discard empty entries.

    Args:
        file_path: Path handed unchanged to ``file_to_sentence_tokens``.

    Returns:
        A list with every tokenizer result whose length is non-zero,
        in the order the tokenizer produced them.
    """
    non_empty = []
    for sentence in file_to_sentence_tokens(file_path):
        if len(sentence) > 0:
            non_empty.append(sentence)
    return non_empty


def preprocess(sent_tokens, verbose = False):
    """Run the word-level pipeline on sentences and vectorize the result.

    Args:
        sent_tokens: Sentence tokens, passed as-is to ``pipeline``.
        verbose: When equal to ``True``, print a short trace of each step.

    Returns:
        A ``(vecs, word2idx)`` tuple: ``vecs`` is the first value returned by
        ``s2v``; ``word2idx`` is the inverse of the dictionary returned by
        ``s2v`` (its values become keys and its keys become values).
    """
    show = verbose == True

    if show:
        print("Step 1: sentence tokenization")
        print(sent_tokens[:3])
        print("\n\nStep 2: Word tokenization, stemming")

    # NOTE(review): dropping empty pipeline results may shift positions
    # relative to sent_tokens if s2v keys its vectors by position - confirm.
    tokenized = [entry for entry in pipeline(sent_tokens) if len(entry) > 0]

    if show:
        print("\n\nStep 3: Vectorize")
    vecs, dictionary = s2v(tokenized)
    if show:
        print("Vectors")
        print(list(vecs.items())[:3])
        print("Idx2word dictionary")
        print(list(dictionary.items())[:3], "...")

    # Invert the idx -> word mapping handed back by s2v.
    word2idx = {}
    for idx, word in dictionary.items():
        word2idx[word] = idx

    return vecs, word2idx
    

In [4]:
def sents_list_to_ne_obj_list(sents_list, sent_tokens=None):
    """Build the list of named-entity objects for a list of sentences.

    Runs a two-pass NER procedure: the first pass collects candidate entity
    tokens and keeps the high-confidence types, which are combined into
    entity objects; the second pass re-runs over the sentences using the
    filtered type dictionary derived from those objects.

    Args:
        sents_list: Sentences to analyse.
        sent_tokens: Optional. When given it replaces ``sents_list`` (the
            behaviour of the original two-argument signature). It is now
            optional so the function can be called with a single argument.

    Returns:
        The object list returned by ``pass_2``.
    """
    ner_function = spacy_single_line

    type_threshold = 0.999  # Threshold below which we reject
                            # a candidate Named Entity
    token_threshold = 0.9   # Threshold below which we reject
                            # a candidate NE in second pass

    # Previously this assignment was unconditional, which made the first
    # parameter dead and forced callers to pass two arguments.
    if sent_tokens is not None:
        sents_list = sent_tokens

    # First pass: candidate tokens -> high-confidence types -> entity objects.
    ne_token_list = first_pass(ner_function, sents_list)
    high_confidence_types = get_high_confidence_types(ne_token_list, type_threshold)
    obj_list = combine_types_to_entities(high_confidence_types)

    # Second pass, driven by the filtered types found in the first pass.
    obj_dict = obj_list_to_types_dict(obj_list)
    filtered_type_dict = get_filtered_type_dict(obj_dict)
    obj_list = pass_2(ner_function, sents_list, filtered_type_dict, obj_list, token_threshold)

    return obj_list

In [5]:
ft_dir = op.join(pre_dir, "full_texts")
sm_dir = op.join(pre_dir, "summaries")

# Create the output tree <text type>/<feature type>/<split>.
# os.makedirs creates missing parents (including pre_dir itself) and, with
# exist_ok=True, is a no-op for directories that already exist, so this cell
# can be re-run safely.
for txt_type in [ft_dir, sm_dir]:
    for feat_name in ["ne_obj_lists", "bows", "sent_tokenized"]:
        for dset in ["train","test","valid"]:
            os.makedirs(op.join(txt_type, feat_name, dset), exist_ok=True)
                
                
# Gather the summary file names of every split into one flat list.
all_names = []
for dset in ("train", "test", "valid"):
    all_names.extend(os.listdir(op.join(summary_dir, dset)))

# check we have all books in our dataset (count comparison only)
assert len(all_names) == len(url_dict.keys())

           
# For every book in every split: sentence-tokenize the full text and its
# summary, vectorize both together (so they share one vocabulary), run NER on
# the full text, and write the results into the preprocessed-data tree.
for dset in ["train","test","valid"]:
    save_dir_ft_sents = op.join(ft_dir, "sent_tokenized", dset)
    save_dir_ft_bows  = op.join(ft_dir, "bows", dset)
    save_dir_ft_obj   = op.join(ft_dir, "ne_obj_lists", dset)
    save_dir_sm_sents = op.join(sm_dir, "sent_tokenized", dset)
    save_dir_sm_bows  = op.join(sm_dir, "bows", dset)
    from_dir_ft       = op.join(full_texts_dir, dset)
    from_dir_sm       = op.join(summary_dir, dset)

    # The summary folder drives the iteration; the full text is looked up
    # under the same file name.
    texts = os.listdir(from_dir_sm)
    for i, text in enumerate(texts):
        print("Doing text no. {}, {} from {} set".format(i, text, dset))
        text_path_ft = op.join(from_dir_ft, text)
        sent_tokens_ft = get_sent_tokens(text_path_ft)
        text_path_sm = op.join(from_dir_sm, text)
        sent_tokens_sm = get_sent_tokens(text_path_sm)

        # Full-text sentences come first, summary sentences after them.
        sent_tokens = sent_tokens_ft + sent_tokens_sm
        vecs, word2idx = preprocess(sent_tokens)

        # There is only one "obj list": entities are extracted from the full
        # text only. The sentences are passed for both parameters so the call
        # satisfies the function's two-argument signature.
        obj_list = sents_list_to_ne_obj_list(sent_tokens_ft, sent_tokens_ft)

        # Split the joint vectors back into full-text / summary parts by key.
        # NOTE(review): assumes the keys of vecs are positions in sent_tokens - confirm.
        n_ft = len(sent_tokens_ft)
        vecs_ft = {x:y for x,y in vecs.items() if x < n_ft}
        vecs_sm = {x:y for x,y in vecs.items() if x >= n_ft}

        # These arrays hold Python objects, so np.save pickles them; loading
        # them back requires np.load(..., allow_pickle=True).
        np.save(op.join(save_dir_ft_bows, text + ".npy"), np.array([vecs_ft, word2idx], dtype=object))
        np.save(op.join(save_dir_sm_bows, text + ".npy"), np.array([vecs_sm, word2idx], dtype=object))
        np.save(op.join(save_dir_ft_obj, text + ".npy"), np.array(obj_list, dtype=object))

        # One sentence per line.
        with open(op.join(save_dir_ft_sents, text), "w") as sents_file:
            sents_file.writelines(st + "\n" for st in sent_tokens_ft)
        with open(op.join(save_dir_sm_sents, text), "w") as sents_file:
            sents_file.writelines(st + "\n" for st in sent_tokens_sm)
                
                

Doing text no. 0, Bartleby, the Scrivener from train set
Doing text no. 1, Kipps from train set
Doing text no. 2, The Gift of the Magi from train set
Doing text no. 3, Manalive from train set
Doing text no. 4, The Rose and the Ring from train set
Doing text no. 5, How He Lied to Her Husband from train set
Doing text no. 6, A Bride of the Plains from train set


KeyboardInterrupt: 

In [None]:
# Scratch check: path of one full text in the train split (displayed as cell output).
op.join(full_texts_dir, "train", "White Fang")

In [None]:
# Distinct entity classes in the most recently computed obj_list.
print({obj.class_string for obj in obj_list})

In [None]:
# Dump every PERSON entity together with its sentences.
for obj in obj_list:
    if obj.class_string != "PERSON":
        continue
    print(obj.sents)
    print(obj)
    