In [1]:
import os
import io
import json
from zipfile import ZipFile
import unicodedata
import nltk
from nltk.corpus import stopwords 
# English stop-word set built from NLTK's stopwords corpus.
# NOTE(review): not referenced by any of the cells visible here - confirm it is still needed.
stop_words = set(stopwords.words('english'))

In [2]:
import re
import xapian
import pandas as pd
import numpy as np

In [3]:
# Show up to 100 characters per cell when DataFrames are displayed.
pd.set_option("display.max_colwidth", 100)

In [4]:
# Directory (relative to the notebook) that holds the Xapian database
# created and inspected by the cells below.
DBPATH = "ner_index"

In [5]:
# Make sure the index directory exists. `exist_ok=True` makes the cell safe to
# re-run and avoids the race between checking for the directory and creating it.
os.makedirs(DBPATH, exist_ok=True)

# Build Index

In [6]:
# Archive with the wiki page text shards; kept open because later cells read
# individual members from it via `zf.open(...)`.
zf = ZipFile("../wiki-pages-text.zip")

In [7]:
# Names of all archive members, in archive order.
files = zf.namelist()
len(files), files[1]

(110, 'wiki-pages-text/wiki-009.txt')

In [8]:
def read_shard(zf, path):
    """Parse one text shard of the archive into (doc_id, sentence, text) tuples.

    Each line is NFD-normalised and is expected to look like
    ``<doc_id> <sentence number> <text>``. Lines that do not fit this shape
    are skipped.

    Parameters
    ----------
    zf : zipfile.ZipFile
        Open archive containing the shard.
    path : str
        Name of the member inside the archive.

    Returns
    -------
    list of tuple of str
        One ``(doc_id, sentence, text)`` tuple per matching line, in file order.
    """
    # The trailing newline is stripped before matching, so the last line of a
    # shard is kept even when the file does not end with a newline.
    line_re = re.compile(r"(\S+)\s(\d+)\s(.*)")
    items = []
    # Decode explicitly as UTF-8 instead of relying on the locale default, and
    # let the context managers close both handles even if parsing fails.
    with zf.open(path, mode='r') as fp, io.TextIOWrapper(fp, encoding='utf-8') as tfp:
        for line in tfp:
            line = unicodedata.normalize('NFD', line)
            if line.endswith("\n"):
                line = line[:-1]
            match = line_re.fullmatch(line)
            if match:
                items.append(match.groups())
    return items

In [9]:
def read_shard_as_df(zf, path):
    """Return one text string per document for the given shard.

    The sentences parsed by ``read_shard`` are grouped by ``doc_id`` and the
    texts of each group are joined with single spaces.

    Returns a pandas Series indexed by ``doc_id`` holding the joined text.
    """
    rows = read_shard(zf, path)
    sentences = pd.DataFrame(rows, columns=['doc_id', 'sentence', 'text'])
    texts_by_doc = sentences.groupby('doc_id')['text']
    return texts_by_doc.agg(lambda parts: " ".join(parts))

In [10]:
def preprocess_ner(s):
    """Return ``s`` in Unicode NFD (canonical decomposition) form.

    Earlier experiments that stripped a leading article and replaced
    whitespace with underscores are intentionally left disabled.
    """
    normalized = unicodedata.normalize('NFD', s)
    return normalized

def obtain_nouns(v):
    """Collect every entity, noun phrase and root string of a parsed page.

    ``v`` is a mapping with two lists of dicts: ``'named_entities'`` (each
    with ``'entity'`` and ``'root'``) and ``'noun_phrases'`` (each with
    ``'noun_phrase'`` and ``'root'``). All strings are NFD-normalised via
    ``preprocess_ner`` and returned as a single set.
    """
    nouns = set()
    for entity in v['named_entities']:
        nouns.add(preprocess_ner(entity['entity']))
        nouns.add(preprocess_ner(entity['root']))
    for phrase in v['noun_phrases']:
        nouns.add(preprocess_ner(phrase['noun_phrase']))
        nouns.add(preprocess_ner(phrase['root']))
    return nouns

def read_ner_data(shard):
    """Load the NER output of one shard and map each page id to its nouns.

    Reads ``data/corpus/<shard>.json`` (pandas "split" orientation), NFD-
    normalises the index values, and applies ``obtain_nouns`` to the
    ``parsed_text`` column.

    Returns a dict ``{page_id: set of entity / noun-phrase strings}``.
    """
    df = pd.read_json('data/corpus/{}.json'.format(shard), orient="split")
    # Move the index into a regular column so it can be normalised, then
    # restore it as the index.
    df = df.rename_axis('page_id').reset_index()
    df['page_id'] = df['page_id'].apply(
        lambda page_id: unicodedata.normalize('NFD', page_id)
    )
    df = df.set_index('page_id')
    df['entities'] = df.parsed_text.apply(obtain_nouns)
    return df.entities.to_dict()

In [11]:
%%time
# Create or open the database we're going to be writing to.
db = xapian.WritableDatabase(DBPATH, xapian.DB_CREATE_OR_OPEN)

# Set up a TermGenerator that we'll use in indexing.
termgenerator = xapian.TermGenerator()
termgenerator.set_stemmer(xapian.Stem("en"))

for path in sorted(files[1:]):
    shard = ""
    m = re.match(".*wiki-(\d+).*", path)
    if m: shard = m[1]
    print(path)
    
    documents_df = read_shard_as_df(zf, path)
    keywords_dict = read_ner_data(shard)
    for doc_id, text in documents_df.items():
        try:
            # We make a document and tell the term generator to use this.
            doc = xapian.Document()
            termgenerator.set_document(doc)

            # Index fields without prefixes for general search.
            termgenerator.index_text(text)
            termgenerator.increase_termpos()

            # We use the identifier to ensure each object ends up in the
            # database only once no matter how many times we run the
            # indexer.
            idterm = u"Q" + doc_id
            doc.add_boolean_term(idterm)

            # Indexing keywords
            keywords = []
            if doc_id in keywords_dict:
                keywords = list(keywords_dict[doc_id])
            else:
                print("No keywords found for page_id={}".format(doc_id))
                
            for item in keywords_dict[doc_id]:
                doc.add_term(u"K" + item.lower())
                #doc.add_term(u"K:" + item)

            # save additional data
            data = dict(
                doc_id = doc_id,
                shard = shard,
                text = text,
                keywords = keywords,
            )
            doc.set_data(json.dumps(data))

            db.replace_document(idterm, doc)
        except Exception as e:
            print(doc_id, text, e)
db.commit()
db.close()

wiki-pages-text/wiki-001.txt
wiki-pages-text/wiki-002.txt
wiki-pages-text/wiki-003.txt
wiki-pages-text/wiki-004.txt
wiki-pages-text/wiki-005.txt
wiki-pages-text/wiki-006.txt
wiki-pages-text/wiki-007.txt
wiki-pages-text/wiki-008.txt
wiki-pages-text/wiki-009.txt
wiki-pages-text/wiki-010.txt
wiki-pages-text/wiki-011.txt
wiki-pages-text/wiki-012.txt
wiki-pages-text/wiki-013.txt
wiki-pages-text/wiki-014.txt
wiki-pages-text/wiki-015.txt
No keywords found for page_id=Benjamin_Biaggini
Benjamin_Biaggini Benjamin Franklin Biaggini -LRB- April 15 , 1916 -- May 28 , 2005 -RRB- was president of the Southern Pacific Company , parent company of Southern Pacific Railroad , from 1964 to 1976 and chairman of the Board of Directors from 1976 to 1983 . 'Benjamin_Biaggini'
wiki-pages-text/wiki-016.txt
wiki-pages-text/wiki-017.txt
wiki-pages-text/wiki-018.txt
wiki-pages-text/wiki-019.txt
wiki-pages-text/wiki-020.txt
wiki-pages-text/wiki-021.txt
wiki-pages-text/wiki-022.txt
wiki-pages-text/wiki-023.txt
wiki

wiki-pages-text/wiki-079.txt
wiki-pages-text/wiki-080.txt
wiki-pages-text/wiki-081.txt
wiki-pages-text/wiki-082.txt
wiki-pages-text/wiki-083.txt
wiki-pages-text/wiki-084.txt
Rotenberg_Law The Rotenberg Law - the draft of Federal law of Russian Federation # 607554-6 On amendments to the Federal law `` On compensation for violation of the right to trial within a reasonable time or the right to execution of a judicial act within a reasonable time '' -LRB- clarification of certain provisions of the Federal law in regard to obtaining compensation for the violation of the right to execution of a judicial act within a reasonable time -RRB- . The law is informally named after a Russian businessman Arkady Rotenberg , after Italy has frozen nearly $ 40 million in assets held The bill proposes seizing the Russia-based assets of foreign countries that have sanctioned Russian citizens . Term too long (> 245): Kthe federal law `` on compensation for violation of the right to trial within a reasonabl

In [12]:
# Print summary statistics of the index (document count, lengths, revision).
!xapian-delve $DBPATH

UUID = 264d9bea-55c9-4caf-9979-abe630f0809a
number of documents = 5365408
average document length = 178.465
document length lower bound = 1
document length upper bound = 68701
highest document id ever used = 5365408
has positional information = true
revision = 537
currently open for writing = false


In [13]:
# Spot-check one record: show the stored data and the term list of document 3129.
!xapian-delve -r 3129 -d $DBPATH

Data for record #3129:
{"doc_id": "11th_Gemini_Awards", "shard": "001", "text": "The 11th Gemini Awards was held on June 6 , 1997 , to honour achievements in Canadian television . It was hosted by Albert Schultz , and was broadcast on CBC .", "keywords": ["Awards", "television", "June 6, 1997", "Gemini Awards", "Canadian", "11th", "Canadian television", "The 11th Gemini Awards", "achievements", "June"]}
Term List for record #3129: 11th 1997 6 K11th Kachievements Kawards Kcanadian Kcanadian television Kgemini awards Kjune Kjune 6, 1997 Ktelevision Kthe 11th gemini awards Q11th_Gemini_Awards Zachiev Zalbert Zand Zaward Zbroadcast Zby Zcanadian Zcbc Zgemini Zheld Zhonour Zhost Zin Zit Zjune Zon Zschultz Ztelevis Zthe Zto Zwas achievements albert and awards broadcast by canadian cbc gemini held honour hosted in it june on schultz television the to was
