In [1]:
import os
import io
import json
from zipfile import ZipFile

In [2]:
import re
import xapian
import pandas as pd
import numpy as np

In [3]:
# Widen pandas' notebook rendering: up to 150 characters per cell value
# and up to 100 rows per displayed frame.
pd.set_option("display.max_colwidth", 150)
pd.set_option("display.max_rows", 100)

In [4]:
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
# Fetch every NLTK resource this notebook relies on, so it also runs on a
# machine with an empty nltk_data folder:
#   - averaged_perceptron_tagger: model used by nltk.pos_tag
#   - stopwords: corpus read by stopwords.words('english')
#   - punkt: sentence/word tokenizer model used by word_tokenize
# NOTE(review): recent NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead -- confirm against the installed version.
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')

# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Penn Treebank POS tags whose tokens are discarded when extracting keywords:
# determiners, verb forms, modals, prepositions, "to", pronouns, conjunctions,
# adverbs, wh-words, and punctuation / bracket / quote tags.
stop_tags = ['DT', 'VB', 'VBP', 'VBD', 'VBN', 'VBZ',  'MD', 'IN', 'TO', 'PRP', 'CC',
             "(", ')', '.', ',', 'PRP$', 'VBG', 'RB', "''", "``", 'WDT', ':', 'WRB']

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vitaly/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
# Folder that receives one JSON keyword dump per wiki shard.
DUMP_FOLDER = 'data/parsed_corpus'

In [6]:
# Create the dump folder together with any missing parent directories.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(DUMP_FOLDER, exist_ok=True)

# Extract keywords (named-entity candidates) from the Wiki corpus

In [7]:
# Open the zipped wiki dump read-only; this handle is reused by the cells below.
zf = ZipFile(file="../wiki-pages-text.zip", mode="r")

In [8]:
# Names of all archive members, in archive order; display the count and one sample entry.
files = zf.namelist()
len(files), files[1]

(110, 'wiki-pages-text/wiki-009.txt')

In [9]:
def read_shard(zf, path):
    """Parse one text shard stored inside an open zip archive.

    Each line is expected to look like ``<id> <number> <text>``: a run of
    non-whitespace characters, one whitespace character, a run of digits,
    one whitespace character, and the remaining text. Lines that do not fit
    this layout are skipped.

    Parameters
    ----------
    zf : zipfile.ZipFile
        Archive opened for reading.
    path : str
        Name of the member inside the archive.

    Returns
    -------
    list of tuple of str
        One ``(id, number, text)`` tuple per well-formed line, in file order.
        All three fields are strings.
    """
    # Raw string: "\S", "\s" and "\d" are regex escapes, not Python string escapes.
    line_pattern = re.compile(r"(\S+)\s(\d+)\s(.*)")
    items = []
    # The context managers close both handles even if decoding fails midway.
    # The encoding is pinned to UTF-8 so the result does not depend on the locale.
    with zf.open(path, mode='r') as fp, io.TextIOWrapper(fp, encoding='utf-8') as tfp:
        for line in tfp:
            # Strip the terminator before matching so a final line without a
            # trailing newline is kept rather than silently dropped.
            match = line_pattern.match(line.rstrip('\n'))
            if match:
                items.append(match.groups())
    return items

In [10]:
def read_shard_as_df(zf, path):
    """Load one shard and collapse it to a single text string per document.

    The shard is parsed with ``read_shard``; the text fields of all rows
    sharing a ``doc_id`` are joined with single spaces.

    Returns a pandas Series named ``text`` and indexed by ``doc_id``.
    """
    def _join_sentences(parts):
        # Concatenate the sentence texts of one document.
        return " ".join(parts)

    sentences = pd.DataFrame(data=read_shard(zf, path),
                             columns=['doc_id', 'sentence', 'text'])
    per_document = sentences.groupby('doc_id')['text']
    return per_document.agg(_join_sentences)

In [21]:
%%time
# Hoisted out of the loop: compiled shard-name pattern (raw string, so "\d" is a
# regex escape) and a set copy of the stop tags for O(1) membership tests.
shard_pattern = re.compile(r".*wiki-(\d+).*")
stop_tag_set = set(stop_tags)

for path in sorted(files):
    # Determine the shard number. Archive members that are not wiki shards
    # (presumably just the directory entry -- confirm) are skipped, instead of
    # relying on their position in the listing or writing to an unnamed ".json".
    m = shard_pattern.match(path)
    if m is None:
        continue
    shard = m.group(1)
    print(path)

    # Read the shard into a dataframe indexed by (page, sentence number).
    items = read_shard(zf, path)
    raw_df = pd.DataFrame(data=items, columns=['page_id', 'sentence', 'text'])
    raw_df['sentence'] = raw_df.sentence.astype(int)
    raw_df = raw_df.set_index(['page_id', 'sentence'])

    # Tokenise and remove stop words.
    # NOTE(review): the membership test is case-sensitive, so capitalised
    # stop words survive here unless their POS tag is filtered below -- confirm
    # this is intended.
    raw_df['tokens'] = raw_df.text.apply(word_tokenize)
    raw_df['tokens'] = raw_df.tokens.apply(
        lambda tokens: [w for w in tokens if w not in stop_words])

    # Determine parts of speech and drop tokens carrying a stop tag.
    raw_df['tagged'] = raw_df.tokens.apply(nltk.pos_tag)
    raw_df['tagged'] = raw_df.tagged.apply(
        lambda pairs: [(w, t) for w, t in pairs if t not in stop_tag_set])

    # Extract keywords: 'tagged' is already filtered, so just keep the words.
    raw_df['keywords'] = raw_df.tagged.apply(lambda pairs: [w for w, _ in pairs])

    # Dump keywords, one JSON file per shard.
    dump_path = "{}/{}.json".format(DUMP_FOLDER, shard)
    raw_df.keywords.reset_index().to_json(dump_path, orient="split", index=False)

wiki-pages-text/wiki-001.txt
wiki-pages-text/wiki-002.txt
wiki-pages-text/wiki-003.txt
wiki-pages-text/wiki-004.txt
wiki-pages-text/wiki-005.txt
wiki-pages-text/wiki-006.txt
wiki-pages-text/wiki-007.txt
wiki-pages-text/wiki-008.txt
wiki-pages-text/wiki-009.txt
wiki-pages-text/wiki-010.txt
wiki-pages-text/wiki-011.txt
wiki-pages-text/wiki-012.txt
wiki-pages-text/wiki-013.txt
wiki-pages-text/wiki-014.txt
wiki-pages-text/wiki-015.txt
wiki-pages-text/wiki-016.txt
wiki-pages-text/wiki-017.txt
wiki-pages-text/wiki-018.txt
wiki-pages-text/wiki-019.txt
wiki-pages-text/wiki-020.txt
wiki-pages-text/wiki-021.txt
wiki-pages-text/wiki-022.txt
wiki-pages-text/wiki-023.txt
wiki-pages-text/wiki-024.txt
wiki-pages-text/wiki-025.txt
wiki-pages-text/wiki-026.txt
wiki-pages-text/wiki-027.txt
wiki-pages-text/wiki-028.txt
wiki-pages-text/wiki-029.txt
wiki-pages-text/wiki-030.txt
wiki-pages-text/wiki-031.txt
wiki-pages-text/wiki-032.txt
wiki-pages-text/wiki-033.txt
wiki-pages-text/wiki-034.txt
wiki-pages-tex

In [22]:
# Round-trip check: reload the shard dump written last by the loop and show its first rows.
pd.read_json(path_or_buf=dump_path, orient="split").head(n=5)

Unnamed: 0,page_id,sentence,keywords
0,Year_book,0,"[Year, book]"
1,Year_book,2,"[Yearbook, book, record, highlight, commemorate, year, school]"
2,Year_book,4,"[Year, Books, earliest, law, reports, England]"
3,Yuneslu,0,"[Yuneslu, -LRB-, يونسلو, Romanized, Yūneslū, -RRB-, village, Solduz, Rural, District, Central, District, Naqadeh, County, West, Azerbaijan, Provin..."
4,Yuneslu,1,"[2006, census, population, 258, 47, families]"


In [97]:
# Preview keyword trigrams: slide a window of three over each row's keyword list.
raw_df["keywords"].apply(lambda words: [*nltk.trigrams(words)])

0                                [(following, football, soccer), (football, soccer, events), (soccer, events, year), (events, year, 1928), (year, 1928, world)]
1         [(1986, NBA, Finals), (NBA, Finals, championship), (Finals, championship, round), (championship, round, 1985-86), (round, 1985-86, NBA), (1985-86,...
2         [(Eastern, Conference, champion), (Conference, champion, Boston), (champion, Boston, Celtics), (Boston, Celtics, Western), (Celtics, Western, Conf...
3                       [(Celtics, Rockets, four), (Rockets, four, games), (four, games, two), (games, two, 16th), (two, 16th, NBA), (16th, NBA, championship)]
4                                                     [(championship, Celtics, '), (Celtics, ', last), (', last, 2008), (last, 2008, NBA), (2008, NBA, Finals)]
5                                                                                                                  [(Larry, Bird, Finals), (Bird, Finals, MVP)]
6         [(note, series, first), (serie