In [1]:
import os
import io
import json
from zipfile import ZipFile
import unicodedata

In [2]:
import re
import xapian
import pandas as pd
from pandarallel import pandarallel
import numpy as np

In [3]:
pd.options.display.max_colwidth = 200

In [4]:
# Folder that receives the per-shard JSON files written further down.
OUTPUT_FOLDER = 'data/corpus'
# makedirs also creates the missing 'data' parent and tolerates an existing
# folder, so the cell can be re-run safely on a fresh checkout.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [5]:
import spacy
import en_core_web_lg

# Load the spaCy pipeline shipped in the en_core_web_lg package once;
# parse_with_spacy reuses this module-level object for every call.
nlp = en_core_web_lg.load()
def parse_with_spacy(text):
    """Run the shared ``nlp`` pipeline on ``text`` and collect entity and noun-chunk spans.

    Returns a dict with two lists:

    * ``named_entities`` - one dict per span in ``doc.ents`` with the keys
      ``entity`` (span text), ``label`` (span label) and ``root`` (text of the
      span's root token).
    * ``noun_phrases`` - one dict per span in ``doc.noun_chunks`` with the keys
      ``noun_phrase`` (span text) and ``root`` (text of the span's root token).
    """
    doc = nlp(text)

    named_entities = []
    for ent in doc.ents:
        named_entities.append({'entity': ent.text, 'label': ent.label_, 'root': ent.root.text})

    noun_phrases = []
    for chunk in doc.noun_chunks:
        noun_phrases.append({'noun_phrase': chunk.text, 'root': chunk.root.text})

    return {'named_entities': named_entities, 'noun_phrases': noun_phrases}

def preprocess_ner(s):
    """Turn an entity or noun-phrase string into an underscore-joined token.

    One leading article ("The", "the", "A", "a", "An", "an") followed by a
    whitespace character is removed, then every remaining whitespace character
    is replaced by "_".

    Returns the transformed string.
    """
    # Raw strings: "\s" inside a plain literal is an invalid escape sequence.
    s = re.sub(r"^(?:The|the|A|a|An|an)\s", "", s)
    return re.sub(r"\s", "_", s)

def obtain_nouns(v):
    """Collect the normalised named entities and noun phrases of one parsed record.

    ``v`` is a dict shaped like the output of ``parse_with_spacy``:
    ``v['named_entities']`` is a list of dicts with an ``'entity'`` key and
    ``v['noun_phrases']`` is a list of dicts with a ``'noun_phrase'`` key.
    Every string is passed through ``preprocess_ner`` exactly once.

    Returns the set union of both groups.
    """
    ents = {preprocess_ner(item['entity']) for item in v['named_entities']}
    nps = {preprocess_ner(item['noun_phrase']) for item in v['noun_phrases']}
    return ents | nps

# Extract Named Entities from WIKI corpus

In [6]:
# Open the archive once; the handle stays open because later cells read
# individual members from it through read_shard.
zf = ZipFile("wiki-pages-text.zip")

In [7]:
# Names of every member stored in the archive, in archive order.
files = zf.namelist()
len(files), files[1]

(110, 'wiki-pages-text/wiki-009.txt')

In [8]:
def read_shard(zf, path, encoding='utf-8'):
    """Read one text member of an open zip archive and split it into sentence records.

    Every line is NFD-normalised and is expected to look like
    ``<doc_id> <sentence_number> <text>``: a run of non-whitespace characters,
    one whitespace character, a run of digits, one whitespace character, then
    the rest of the line. Lines that do not have this shape are skipped.

    Parameters
    ----------
    zf : zipfile.ZipFile
        Open archive that contains the shard.
    path : str
        Name of the member inside ``zf``.
    encoding : str, optional
        Encoding used to decode the member. Defaults to UTF-8 instead of the
        platform locale so results do not depend on the machine; this assumes
        the archive text is UTF-8 - pass another encoding if it is not.

    Returns
    -------
    list of tuple of str
        ``(doc_id, sentence_number, text)`` triples in file order. The sentence
        number is returned as a string.
    """
    line_pattern = re.compile(r"(\S+)\s(\d+)\s(.*)")
    items = []
    # The context managers close the decoder and the zip member even when
    # decoding or normalisation raises.
    with zf.open(path, mode='r') as fp, io.TextIOWrapper(fp, encoding=encoding) as tfp:
        # Iterate lazily instead of materialising every line with readlines().
        for line in tfp:
            line = unicodedata.normalize('NFD', line)
            # Strip the line terminator before matching so that a final line
            # without a trailing newline is still recognised.
            match = line_pattern.match(line.rstrip('\n'))
            if match:
                items.append(match.groups())
    return items

In [9]:
def read_shard_as_df(zf, path):
    """Load a shard and merge the sentences of each document into one string.

    The records returned by ``read_shard(zf, path)`` are grouped by ``doc_id``
    and the ``text`` values of every group are joined with single spaces.

    Returns a pandas Series named ``text`` and indexed by ``doc_id``.
    """
    def _join_with_spaces(parts):
        return " ".join(parts)

    sentences = pd.DataFrame(data=read_shard(zf, path), columns=['doc_id', 'sentence', 'text'])
    texts_by_doc = sentences.groupby('doc_id')['text']
    return texts_by_doc.agg(_join_with_spaces)

In [10]:
# Leftover debugging from when read_shard also returned a line count
# (that second return value is commented out in read_shard as well):
#items, nlines = read_shard(zf, files[1])
#len(items), nlines, items[0]
# Bind `path` to the second archive member - presumably for ad-hoc testing;
# the per-shard processing loop rebinds `path` on every iteration.
path = files[1]

In [11]:
# Configure pandarallel (6 workers, no progress bar); this must run before any
# DataFrame/Series `parallel_apply` call made further down.
pandarallel.initialize(progress_bar=False, nb_workers=6)

New pandarallel memory created - Size: 2000 MB
Pandarallel will run on 6 workers


In [None]:
%%time
# Bracket placeholder tokens found in the shard text and the characters they
# are replaced with, applied in this order.
BRACKET_TOKENS = (("-LRB-", "("), ("-LSB-", "["), ("-RRB-", ")"), ("-RSB-", "]"))

# Ordered (pattern, replacement) pairs that tighten the whitespace around
# brackets, commas, double dashes and quote characters in the joined text.
DETOKENIZE_RULES = [
    (re.compile(r"\(\s"), "("),
    (re.compile(r"\[\s"), "["),
    (re.compile(r"\s\)"), ")"),
    (re.compile(r"\s\]"), "]"),
    (re.compile(r"\s,"), ","),
    (re.compile(r"\s--\s"), "-"),
    # The three quote rules substitute a plain space. "\s" is only meaningful
    # inside a pattern; used as a replacement, re.sub rejects it with
    # "bad escape \s" on Python 3.7+.
    (re.compile(r"\s`\s"), " "),
    (re.compile(r'\s"\s'), " "),
    (re.compile(r"\s'\s"), " "),
    (re.compile(r"\s'"), "'"),
]

# Number of space-separated tokens kept as the page header that gets parsed.
HEADER_WORDS = 50

# Archive members that are shards end in wiki-<digits>.txt; the digits name the output file.
SHARD_NAME_RE = re.compile(r"wiki-(\d+)\.txt$")


def restore_brackets(s):
    """Replace the -LRB-/-LSB-/-RRB-/-RSB- placeholder tokens in ``s`` with bracket characters."""
    for token, char in BRACKET_TOKENS:
        s = s.replace(token, char)
    return s


def detokenize(s):
    """Apply every rule in DETOKENIZE_RULES to ``s``, in order, and return the result."""
    for pattern, replacement in DETOKENIZE_RULES:
        s = pattern.sub(replacement, s)
    return s


def get_page_header(s, n_words=HEADER_WORDS):
    """Return the first ``n_words`` space-separated tokens of ``s`` joined by single spaces."""
    return " ".join(s.split(" ")[:n_words])


# Select shards by name instead of relying on the first archive entry being
# the one non-shard member; anything that is not wiki-<digits>.txt is ignored.
shard_paths = sorted(name for name in files if SHARD_NAME_RE.search(name))

for path in shard_paths:
    print(path)
    shard = SHARD_NAME_RE.search(path).group(1)

    items = read_shard(zf, path)

    raw_df = pd.DataFrame(data=items, columns=['doc_id', 'sentence', 'text'])
    raw_df['sentence'] = raw_df.sentence.astype(int)
    raw_df['text'] = raw_df.text.apply(restore_brackets)

    # One row per document: its sentences joined with single spaces.
    doc_df = raw_df.groupby('doc_id')[['text']].agg(lambda x: " ".join(x))
    doc_df['text'] = doc_df.text.apply(detokenize)

    # Only the header of each page is parsed with spaCy, spread over the pandarallel workers.
    doc_df['header'] = doc_df.text.apply(get_page_header)
    doc_df['parsed_text'] = doc_df.header.parallel_apply(parse_with_spacy)

    doc_df[['parsed_text']].to_json('{}/{}.json'.format(OUTPUT_FOLDER, shard), orient='split')

wiki-pages-text/wiki-001.txt
wiki-pages-text/wiki-002.txt
wiki-pages-text/wiki-003.txt
wiki-pages-text/wiki-004.txt
wiki-pages-text/wiki-005.txt
wiki-pages-text/wiki-006.txt
wiki-pages-text/wiki-007.txt
wiki-pages-text/wiki-008.txt


In [None]:
def _count_alpha_tokens(sentence):
    """Count the space-separated tokens of ``sentence`` that consist only of alphabetic characters."""
    return len([token for token in sentence.split(" ") if token.isalpha()])


# Character length and alphabetic-token count of every sentence in raw_df.
raw_df['text_length'] = raw_df['text'].apply(len)
raw_df['words_count'] = raw_df['text'].apply(_count_alpha_tokens)

In [None]:
# A sentence is dropped only when it is both shorter than 40 characters and
# has a words_count below 3.
is_short = raw_df['text_length'] < 40
has_few_words = raw_df['words_count'] < 3
bad_sentences_mask = is_short & has_few_words
raw_df = raw_df.loc[~bad_sentences_mask]

In [None]:
# Peek at the first few sentences whose words_count is exactly 3.
raw_df[raw_df.words_count == 3].head()

In [None]:
    
    # Keep only the rows with sentence number 0, indexed by document id.
    first_sentences_df = raw_df[raw_df.sentence == 0].set_index('doc_id')

    # Ordered (pattern, replacement) pairs that tighten the whitespace around
    # brackets, commas, double dashes and quote characters. The brackets must
    # be escaped: "(\s", "[\s" and "\s)" are not valid regular expressions.
    first_sentence_rules = [
        (re.compile(r"\(\s"), "("),
        (re.compile(r"\[\s"), "["),
        (re.compile(r"\s\)"), ")"),
        (re.compile(r"\s\]"), "]"),
        (re.compile(r"\s,"), ","),
        (re.compile(r"\s--\s"), "-"),
        # The three quote rules substitute a plain space; "\s" is rejected by
        # re.sub as a replacement ("bad escape \s") on Python 3.7+.
        (re.compile(r"\s`\s"), " "),
        (re.compile(r'\s"\s'), " "),
        (re.compile(r"\s'\s"), " "),
        (re.compile(r"\s'"), "'"),
    ]

    def tidy_first_sentence(s):
        """Apply every rule in first_sentence_rules to ``s``, in order, and return the result."""
        for pattern, replacement in first_sentence_rules:
            s = pattern.sub(replacement, s)
        return s

    first_sentences_df['text'] = first_sentences_df.text.apply(tidy_first_sentence)

    first_sentences_df['parsed_text'] = first_sentences_df.text.apply(parse_with_spacy)

    # `path` is a notebook-level variable set by earlier cells; the shard
    # number is taken from it rather than from raw_df itself.
    m = re.match(r".*wiki-(\d+).*", path)
    if m is None:
        # Fail loudly instead of silently writing to a file named ".json".
        raise ValueError("cannot derive a shard number from path {!r}".format(path))
    shard = m[1]

    # NOTE(review): this is the same file name the per-shard loop writes for
    # this shard, so running this cell replaces that file - confirm intended.
    first_sentences_df.to_json('{}/{}.json'.format(OUTPUT_FOLDER, shard), orient='split')

In [None]:
%%time
# Reduce every parsed record to the set of strings returned by obtain_nouns
# (its named entities and noun phrases after preprocess_ner).
first_sentences_df['entities'] = first_sentences_df.parsed_text.apply(obtain_nouns)

In [None]:
# Spot-check 100 randomly drawn rows; no random_state is passed, so the
# selection differs from run to run.
first_sentences_df.sample(100)