In [10]:
import os
import io
import json
from zipfile import ZipFile
import unicodedata

In [2]:
import re
import xapian
import pandas as pd
import numpy as np

In [3]:
# Allow up to 100 characters per cell in DataFrame previews so sentence text
# is not cut off too early.
pd.options.display.max_colwidth = 100

In [4]:
import spacy
import en_core_web_lg

# Load the `en_core_web_lg` pipeline once; `parse_with_spacy` reuses this
# shared `nlp` object for every call.
nlp = en_core_web_lg.load()
def parse_with_spacy(text):
    """Run the shared spaCy pipeline on ``text`` and collect entities and noun chunks.

    Returns a dict with two keys:
      * ``named_entities`` - one ``{'entity', 'label', 'root'}`` dict per span
        in ``doc.ents``
      * ``noun_phrases``   - one ``{'noun_phrase', 'root'}`` dict per span in
        ``doc.noun_chunks``
    """
    doc = nlp(text)

    named_entities = []
    for ent in doc.ents:
        named_entities.append({'entity': ent.text, 'label': ent.label_, 'root': ent.root.text})

    noun_phrases = []
    for chunk in doc.noun_chunks:
        noun_phrases.append({'noun_phrase': chunk.text, 'root': chunk.root.text})

    return {'named_entities': named_entities, 'noun_phrases': noun_phrases}

def preprocess_ner(s):
    """Normalise an entity / noun-phrase string into one underscore-joined token.

    A leading article ("The", "the", "A", "a", "An", "an") followed by a single
    whitespace character is removed, then every remaining whitespace character
    is replaced with "_".

    Example: "The Kingdom of Solomon" -> "Kingdom_of_Solomon"
    """
    # Raw strings: "\s" inside a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning since Python 3.12).
    s = re.sub(r"^(The|the|A|a|An|an)\s", "", s)
    return re.sub(r"\s", "_", s)

def obtain_nouns(v):
    """Collect the normalised entity and noun-phrase strings of one parsed record.

    ``v`` must provide ``v['named_entities']`` (items with an ``'entity'`` key)
    and ``v['noun_phrases']`` (items with a ``'noun_phrase'`` key). Each string
    is passed through ``preprocess_ner`` and the results are returned as a
    single de-duplicated ``set``.
    """
    nouns = set()
    for ent in v['named_entities']:
        nouns.add(preprocess_ner(ent['entity']))
    for phrase in v['noun_phrases']:
        nouns.add(preprocess_ner(phrase['noun_phrase']))
    return nouns

# Extract Named Entities from WIKI corpus

In [5]:
# Open the zipped corpus. The handle is intentionally left open: later cells
# pass `zf` to `read_shard` to pull individual members out on demand.
zf = ZipFile("../wiki-pages-text.zip")

In [6]:
# Member names of the archive, in the order the archive lists them.
files = list(map(lambda member: member.filename, zf.filelist))
# Quick sanity check: how many members there are and what the second one is called.
(len(files), files[1])

(110, 'wiki-pages-text/wiki-009.txt')

In [7]:
def read_shard(zf, path):
    """Read one member of the archive and split every line into its three fields.

    Each line is expected to look like ``<doc_id> <sentence_no> <text>``, with
    single whitespace separators. Lines are NFD-normalised before matching and
    lines that do not fit the pattern are skipped.

    Args:
        zf: an open ``zipfile.ZipFile``.
        path: name of the member inside ``zf`` to read.

    Returns:
        A list of ``(doc_id, sentence_no, text)`` tuples in file order. All
        three fields are ``str``; the sentence number is the matched digit
        string, not an ``int``.
    """
    # Raw string: "\S", "\s" and "\d" in a plain literal are invalid escape
    # sequences (SyntaxWarning on Python 3.12+). Compiled once per shard.
    line_pattern = re.compile(r"(\S+)\s(\d+)\s(.*)")

    items = []
    # Context managers close both handles even if decoding fails part-way.
    # The encoding is pinned so the result does not depend on the machine's
    # locale. NOTE(review): assumes the shards are UTF-8 - confirm.
    with zf.open(path, mode='r') as fp, io.TextIOWrapper(fp, encoding='utf-8') as tfp:
        # Iterate lazily instead of materialising the whole shard with readlines().
        for line in tfp:
            line = unicodedata.normalize('NFD', line)
            # Drop the line terminator before matching so that a final line
            # without a trailing newline is parsed instead of silently skipped.
            match = line_pattern.match(line.rstrip('\n'))
            if match:
                items.append(match.groups())
    return items

In [8]:
def read_shard_as_df(zf, path):
    """Load one shard and return the concatenated text of every document.

    The rows produced by ``read_shard`` are grouped by ``doc_id`` and the
    ``text`` values of each group are joined with single spaces.

    Returns the aggregated ``text`` column of the group-by, indexed by
    ``doc_id``.
    """
    def join_sentences(parts):
        return " ".join(parts)

    sentences = pd.DataFrame(
        data=read_shard(zf, path),
        columns=['doc_id', 'sentence', 'text'],
    )
    grouped_text = sentences.groupby('doc_id')['text']
    return grouped_text.agg(join_sentences)

In [9]:
# Scratch selection of a single shard for ad-hoc inspection. `path` is rebound
# by the `for path in ...` loop in the following cell.
path = files[1]

In [41]:
%%time
# Token clean-up rules applied, in this exact order, to every first sentence
# before it is handed to spaCy (bracket placeholders such as -LRB-/-RRB- become
# real brackets, stray spaces around punctuation are removed).
# Raw strings are used because "\s" in a plain literal is an invalid escape
# sequence (SyntaxWarning on Python 3.12+).
DETOKENIZE_RULES = [
    (re.compile(r"-LRB-\s"), "("),
    (re.compile(r"-LSB-\s"), "["),
    (re.compile(r"\s-RRB-"), ")"),
    (re.compile(r"\s-RSB-"), "]"),
    (re.compile(r"\s,"), ","),
    (re.compile(r"\s--\s"), "-"),
    (re.compile(r'"\s'), '"'),
    (re.compile(r'\s"'), '"'),
    (re.compile(r"\s'"), "'"),
]
CORPUS_DIR = 'data/corpus'


def _detokenize(text):
    """Apply every rule in DETOKENIZE_RULES to ``text``, in order."""
    for pattern, replacement in DETOKENIZE_RULES:
        text = pattern.sub(replacement, text)
    return text


# Create the output folder up front so a slow spaCy pass over the first shard
# is not wasted on `to_json` failing because the folder is missing.
os.makedirs(CORPUS_DIR, exist_ok=True)

# NOTE(review): files[0] is skipped - presumably the archive's directory
# entry; confirm against the zip listing.
for path in sorted(files[1:]):
    print(path)

    items = read_shard(zf, path)

    raw_df = pd.DataFrame(data=items, columns=['doc_id', 'sentence', 'text'])
    raw_df['sentence'] = raw_df.sentence.astype(int)
    # Keep only the opening sentence (sentence number 0) of each document.
    first_sentences_df = raw_df[raw_df.sentence == 0].set_index('doc_id')

    first_sentences_df['text'] = first_sentences_df.text.apply(_detokenize)
    first_sentences_df['parsed_text'] = first_sentences_df.text.apply(parse_with_spacy)

    # Shard number taken from the member name, e.g. ".../wiki-009.txt" -> "009".
    # Falls back to "" when the name carries no number.
    shard = ""
    m = re.match(r".*wiki-(\d+).*", path)
    if m: shard = m[1]

    first_sentences_df.to_json('{}/{}.json'.format(CORPUS_DIR, shard), orient='split')

wiki-pages-text/wiki-001.txt
wiki-pages-text/wiki-002.txt
wiki-pages-text/wiki-003.txt
wiki-pages-text/wiki-004.txt
wiki-pages-text/wiki-005.txt
wiki-pages-text/wiki-006.txt
wiki-pages-text/wiki-007.txt
wiki-pages-text/wiki-008.txt
wiki-pages-text/wiki-009.txt
wiki-pages-text/wiki-010.txt
wiki-pages-text/wiki-011.txt
wiki-pages-text/wiki-012.txt
wiki-pages-text/wiki-013.txt
wiki-pages-text/wiki-014.txt
wiki-pages-text/wiki-015.txt
wiki-pages-text/wiki-016.txt
wiki-pages-text/wiki-017.txt
wiki-pages-text/wiki-018.txt
wiki-pages-text/wiki-019.txt
wiki-pages-text/wiki-020.txt
wiki-pages-text/wiki-021.txt
wiki-pages-text/wiki-022.txt
wiki-pages-text/wiki-023.txt
wiki-pages-text/wiki-024.txt
wiki-pages-text/wiki-025.txt
wiki-pages-text/wiki-026.txt
wiki-pages-text/wiki-027.txt
wiki-pages-text/wiki-028.txt
wiki-pages-text/wiki-029.txt
wiki-pages-text/wiki-030.txt
wiki-pages-text/wiki-031.txt
wiki-pages-text/wiki-032.txt
wiki-pages-text/wiki-033.txt
wiki-pages-text/wiki-034.txt
wiki-pages-tex

In [29]:
%%time
# NOTE(review): `first_sentences_df` is rebound on every iteration of the shard
# loop, so when run after it this adds `entities` only to the last shard
# processed - confirm that is the intent.
first_sentences_df['entities'] = first_sentences_df.parsed_text.apply(obtain_nouns)

CPU times: user 1.37 s, sys: 23.7 ms, total: 1.39 s
Wall time: 1.39 s


In [30]:
# Fixed seed so the preview is the same on every "Restart & Run All". The size
# is capped because DataFrame.sample raises ValueError when n exceeds the
# number of rows (sampling without replacement).
first_sentences_df.sample(n=min(100, len(first_sentences_df)), random_state=0)

Unnamed: 0_level_0,sentence,text,parsed_text,entities
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Amin_Zendegani,0,Amin Zendegani is an Iranian actor who starred as king Solomon in the film The Kingdom of Solomon .,"{'named_entities': [{'entity': 'Amin Zendegani', 'label': 'PERSON', 'root': 'Zendegani'}, {'enti...","{who, film, Kingdom_of_Solomon, Amin_Zendegani, king_Solomon, Iranian, Iranian_actor, Solomon, K..."
Albert_Lippert,0,"Albert Lippert (1901-1978) was a German stage, television and film actor .","{'named_entities': [{'entity': 'Albert Lippert', 'label': 'PERSON', 'root': 'Lippert'}, {'entity...","{television_and_film_actor, 1901-1978, Albert_Lippert, German_stage, German}"
Anna_Akhmatova_Literary_and_Memorial_Museum,0,"The Anna Akhmatova Literary and Memorial Museum is a literary museum in St Petersburg, Russia, d...","{'named_entities': [{'entity': 'The Anna Akhmatova Literary', 'label': 'PERSON', 'root': 'The'},...","{1889-1966, literary_museum, Anna_Akhmatova, Memorial_Museum, Russia, Anna_Akhmatova_Literary_an..."
Aleksi_Heponiemi,0,"Aleksi Heponiemi (born January 9, 1999) is an Finnish major junior ice hockey player .","{'named_entities': [{'entity': 'Aleksi Heponiemi', 'label': 'PERSON', 'root': 'Heponiemi'}, {'en...","{January_9,_1999, Aleksi_Heponiemi, Finnish_major_junior_ice_hockey_player, Finnish}"
Aleksandr_Antonov,0,Aleksandr Antonov may refer to :,"{'named_entities': [{'entity': 'Aleksandr Antonov', 'label': 'PERSON', 'root': 'Antonov'}], 'nou...",{Aleksandr_Antonov}
Amos_Rusie,0,"Amos Wilson Rusie (May 30, 1871-December 6, 1942), nicknamed `` The Hoosier Thunderbolt'', was a...","{'named_entities': [{'entity': 'Amos Wilson Rusie', 'label': 'PERSON', 'root': 'Rusie'}, {'entit...","{`_The_Hoosier_Thunderbolt'', Major_League_Baseball, late_19th_century, American, American_right..."
Aiquin,0,"Aiquin (also spelled Aquin or Acquin), subtitled La conqueste de la Bretaigne par le roy Charlem...","{'named_entities': [{'entity': 'Aquin', 'label': 'PERSON', 'root': 'Aquin'}, {'entity': 'Acquin'...","{rivalry, La, King_Charlemagne, Saracen_king, Acquin, La_conqueste_de_la_Bretaigne_par_le_roy_Ch..."
Armand_Bayou_Nature_Center,0,Armand Bayou Nature Center is an urban preserve located in Pasadena and southeast Houston betwee...,"{'named_entities': [{'entity': 'Armand Bayou Nature Center', 'label': 'FAC', 'root': 'Center'}, ...","{Bayport_Industrial_District, Armand_Bayou_Nature_Center, Johnson_Space_Center, Pasadena, southe..."
Anne-Lie_Rydé,0,Anne-Lie Rydé (born 17 October 1956 in Stockholm) is a Swedish pop and rock singer .,"{'named_entities': [{'entity': 'Anne-Lie Rydé', 'label': 'PERSON', 'root': 'Rydé'}, {'entity': '...","{Swedish_pop_and_rock_singer, Stockholm, 17_October_1956, Anne-Lie_Rydé, Swedish}"
Andrew_Droz_Palermo,0,"Andrew Droz Palermo is an American cinematographer, director, and screenwriter .","{'named_entities': [{'entity': 'Andrew Droz Palermo', 'label': 'PERSON', 'root': 'Palermo'}, {'e...","{screenwriter, American, Andrew_Droz_Palermo, director, American_cinematographer}"
