In [1]:
import os
import io
import json
from zipfile import ZipFile
import unicodedata

In [2]:
import re
import xapian
import pandas as pd
import numpy as np

In [3]:
pd.options.display.max_colwidth = 100

In [4]:
import spacy
import en_core_web_lg

nlp = en_core_web_lg.load()


def parse_with_spacy(text):
    """Run the loaded spaCy pipeline over ``text`` and collect its spans.

    Returns a dict with two lists:
      * ``named_entities`` - one dict per span in ``doc.ents`` with the keys
        ``entity`` (span text), ``label`` (``span.label_``) and ``root``
        (text of the span's root token).
      * ``noun_phrases`` - one dict per span in ``doc.noun_chunks`` with the
        keys ``noun_phrase`` (span text) and ``root``.
    """
    parsed = nlp(text)

    entities = []
    for ent in parsed.ents:
        entities.append({'entity': ent.text, 'label': ent.label_, 'root': ent.root.text})

    chunks = []
    for chunk in parsed.noun_chunks:
        chunks.append({'noun_phrase': chunk.text, 'root': chunk.root.text})

    return {'named_entities': entities, 'noun_phrases': chunks}

# Extract named entities and noun phrases from the wiki corpus

In [5]:
# Page ids to keep; stored as a set because each shard is filtered by
# membership in it. JSON text is UTF-8, so decode explicitly instead of relying
# on the platform's default encoding (which would mangle non-ASCII page ids).
# NOTE(review): shard lines are NFD-normalised when read - confirm the ids in
# this file use the same normal form, otherwise accented titles never match.
with open('data/top_00-10_docs.json', 'r', encoding='utf-8') as fp:
    target_pages = set(json.load(fp))

In [6]:
len(target_pages)#, target_pages

364894

In [7]:
# Archive with the wiki text shards, opened read-only; the handle is reused by
# the following cells and is not closed in this part of the notebook.
zf = ZipFile("../wiki-pages-text.zip", mode="r")

In [8]:
# Names of all archive members, in the order the archive lists them.
# NOTE(review): later cells skip files[0] - presumably the directory entry; confirm.
files = zf.namelist()
len(files), files[1]

(110, 'wiki-pages-text/wiki-009.txt')

In [9]:
def read_shard(zf, path):
    """Read one text shard from an open zip archive and split it into fields.

    Each line is NFD-normalised and is expected to look like
    ``<page_id> <sentence_number> <text>``: a run of non-whitespace, one
    whitespace character, a run of digits, one whitespace character, then the
    rest of the line. Lines that do not have this shape are skipped.

    Parameters
    ----------
    zf : zipfile.ZipFile
        Archive opened for reading.
    path : str
        Name of the archive member to read.

    Returns
    -------
    list of tuple of str
        ``(page_id, sentence_number, text)`` for every well-formed line, in
        file order. The sentence number is returned as a string.
    """
    # The line terminator is stripped before matching, so the pattern does not
    # need a trailing "\n" and a final line without one is no longer dropped.
    line_pattern = re.compile(r"(\S+)\s(\d+)\s(.*)")
    items = []
    # Context managers close both handles even if decoding fails part-way;
    # UTF-8 is requested explicitly rather than taken from the locale.
    with zf.open(path, mode='r') as fp, io.TextIOWrapper(fp, encoding='utf-8') as tfp:
        # Iterate lazily instead of loading the whole shard with readlines().
        for line in tfp:
            line = unicodedata.normalize('NFD', line)
            match = line_pattern.fullmatch(line.rstrip('\n'))
            if match:
                items.append(match.groups())
    return items

In [10]:
#items, nlines = read_shard(zf, files[1])
#len(items), nlines, items[0]
# NOTE(review): leftover scratch cell. The commented-out call above unpacks an
# (items, nlines) pair, but read_shard returns only `items`. The `path` bound
# here is rebound by the shard loop in the following cell.
path = files[1]

In [14]:
%%time
# Ordered (pattern, replacement) rules that re-attach punctuation and bracket
# placeholder tokens (-LRB-, -RRB-, -LSB-, -RSB-) to the neighbouring words.
# Raw strings keep the regex escapes intact.
_DETOKENIZE_RULES = [
    (re.compile(r"-LRB-\s"), "("),
    (re.compile(r"-LSB-\s"), "["),
    (re.compile(r"\s-RRB-"), ")"),
    (re.compile(r"\s-RSB-"), "]"),
    (re.compile(r"\s,"), ","),
    # The dot must be escaped: a bare "." matches any character, which turned
    # every "<space><char>" pair into "." and destroyed the sentence.
    (re.compile(r"\s\."), "."),
    (re.compile(r"\s;"), ";"),
    (re.compile(r"\s--\s"), "-"),
    # Free-standing quote tokens collapse to a single space. "\s" is not a
    # valid replacement escape: Python >= 3.7 raises re.error for it and older
    # versions insert the two literal characters backslash + "s".
    (re.compile(r"\s\"\s"), " "),
    (re.compile(r"\s'\s"), " "),
    (re.compile(r"\s`\s"), " "),
]


def _detokenize(text):
    """Apply every rule in ``_DETOKENIZE_RULES`` to ``text``, in order."""
    for pattern, replacement in _DETOKENIZE_RULES:
        text = pattern.sub(replacement, text)
    return text


# to_json does not create missing directories.
os.makedirs('data/corpus_sentences', exist_ok=True)

# NOTE(review): the [9:10] slice processes a single shard per run, and files[0]
# is skipped - presumably the archive's directory entry; confirm both.
for path in sorted(files[1:])[9:10]:
    print(path)
    items = read_shard(zf, path)

    raw_df = pd.DataFrame(data=items, columns=['page_id', 'sentence', 'text'])
    # Keep only sentences of the target pages. set_index returns a new frame,
    # so the column assignments below do not write into a slice of the
    # unfiltered frame (the SettingWithCopyWarning pattern).
    raw_df = raw_df[raw_df.page_id.isin(target_pages)].set_index('page_id')

    raw_df['sentence'] = raw_df.sentence.astype(int)
    raw_df['text'] = raw_df.text.apply(_detokenize)
    raw_df['parsed_text'] = raw_df.text.apply(parse_with_spacy)

    # Shard number taken from the member name, e.g. "wiki-010.txt" -> "010";
    # falls back to an empty string when the name has no such number.
    m = re.match(r".*wiki-(\d+).*", path)
    shard = m[1] if m else ""

    raw_df.to_json('data/corpus_sentences/{}.json'.format(shard), orient='split')

wiki-pages-text/wiki-010.txt
CPU times: user 2min 53s, sys: 6.24 s, total: 2min 59s
Wall time: 3min 7s
