In [1]:
import os
import io
import json
from zipfile import ZipFile

In [2]:
import re
import xapian
import pandas as pd
import numpy as np

In [3]:
# Let displayed frames show up to 200 characters per cell before truncating.
pd.set_option("display.max_colwidth", 200)

In [4]:
# Filesystem path of the Xapian database directory; the indexing cell opens it
# for writing and the xapian-delve cells inspect it.
DBPATH = "ner_sents_index"

In [5]:
# Make sure the index directory exists. exist_ok=True removes the
# check-then-create race of exists()/mkdir() and also creates any missing
# parent directories if DBPATH is ever changed to a nested path.
os.makedirs(DBPATH, exist_ok=True)

In [6]:
from glob import glob

# Build Index

In [7]:
def preprocess_ner(s):
    """Normalise an entity / noun-phrase string before it is indexed.

    Currently a pass-through: the argument is returned unchanged. The
    normalisation steps that were tried earlier are kept below, disabled.
    """
    # Disabled steps (strip a leading article; map whitespace and "-" to "_"):
    #   s = re.sub("^(The|the|A|a|An|an)\s", "", s)
    #   s = re.sub("\s", "_", s)
    #   s = re.sub("-", "_", s)
    normalised = s
    return normalised

def obtain_entities(v):
    """Collect the distinct entity strings found in one parsed sentence.

    ``v`` is read as a mapping with a ``'named_entities'`` list (items carrying
    ``'entity'`` and ``'root'``) and a ``'noun_phrases'`` list (items carrying
    ``'noun_phrase'`` and ``'root'``). Each of those strings is passed through
    ``preprocess_ner``; the result is the set of processed strings that are
    shorter than 100 characters.
    """
    candidates = set()
    for entry in v['named_entities']:
        candidates.add(preprocess_ner(entry['entity']))
        candidates.add(preprocess_ner(entry['root']))
    for entry in v['noun_phrases']:
        candidates.add(preprocess_ner(entry['noun_phrase']))
        candidates.add(preprocess_ner(entry['root']))
    # Drop over-long strings (100 characters or more).
    return {name for name in candidates if len(name) < 100}

def read_ner_data(path):
    """Load one sentence shard and attach the per-sentence entity sets.

    The file at ``path`` is parsed with ``pd.read_json(..., orient="split")``,
    the index is labelled ``page_id``, and an ``entities`` column is added by
    running ``obtain_entities`` over each value of ``parsed_text``.
    """
    frame = pd.read_json(path, orient="split")
    frame.index.name = 'page_id'
    entity_sets = frame['parsed_text'].apply(obtain_entities)
    frame['entities'] = entity_sets
    return frame

# Exploratory pass: load every shard (except the ones filtered out below) and
# re-key its rows by "<page_id>_<sentence>".
# NOTE(review): only the frame of the last shard processed survives the loop,
# so the lookup at the end depends on that shard containing the id — confirm
# this is what is intended.
for path in sorted(glob('data/corpus_sentences/*.json')):
    # Raw strings keep the regex escapes (\/, \d, \.) away from Python's own
    # string-escape processing, which warns about them on recent versions.
    if re.match(r".*\/2.\d+\.json", path):
        continue

    # Shard label = the all-digit file stem; the dot before "json" is escaped
    # so it only matches a literal ".".
    shard = ""
    m = re.match(r".*\/(\d+)\.json", path)
    if m:
        shard = m[1]

    # Rebuild the frame with plain assignments instead of inplace mutation.
    df = read_ner_data(path).reset_index()
    df['id'] = df.page_id + '_' + df.sentence.astype(str)
    df = df.set_index('id')
df.loc['Islam_in_Australia_12'].to_frame().values

In [8]:
%%time
# Create or open the database we're going to be writing to.
db = xapian.WritableDatabase(DBPATH, xapian.DB_CREATE_OR_OPEN)

# Everything that touches the open database runs inside try/finally so the
# handle (and its write lock) is released even if a shard fails to load.
try:
    # Set up a TermGenerator that we'll use in indexing.
    termgenerator = xapian.TermGenerator()
    termgenerator.set_stemmer(xapian.Stem("en"))

    for path in sorted(glob('data/corpus_sentences/*.json')):
        # Raw strings keep the regex escapes away from Python's own
        # string-escape processing.
        if re.match(r".*\/3.\d+\.json", path):
            continue
        print(path)

        # Shard label = the all-digit file stem (dot before "json" escaped).
        shard = ""
        m = re.match(r".*\/(\d+)\.json", path)
        if m:
            shard = m[1]

        df = read_ner_data(path)
        df.reset_index(inplace=True)
        df['id'] = df.page_id + '_' + df.sentence.astype(str)
        df.set_index('id', inplace=True)
        for doc_id, data in df.iterrows():
            # Reset per row so the error report below never shows the text of
            # a previous row (or raises NameError) when unpacking fails.
            text = None
            try:
                # Positional unpack of the row.
                # NOTE(review): assumes the column order is page_id, sentence,
                # text, parsed_text, entities — confirm against the shard files.
                page_id, sent_id, text, _unused, keywords = data

                # We make a document and tell the term generator to use this.
                doc = xapian.Document()
                termgenerator.set_document(doc)

                # Index fields without prefixes for general search.
                termgenerator.index_text(text)
                termgenerator.increase_termpos()

                # We use the identifier to ensure each object ends up in the
                # database only once no matter how many times we run the
                # indexer.
                idterm = u"Q" + doc_id
                doc.add_boolean_term(idterm)
                # Page-level filter term: lower-cased page id, "-" mapped to "_".
                doc.add_boolean_term(u"S" + re.sub('-', '_', page_id.lower()))

                # One "K"-prefixed term per extracted entity / noun phrase.
                for item in keywords:
                    doc.add_term(u"K" + item.lower())

                # save additional data
                data = dict(
                    shard = shard,
                    page_id = page_id,
                    sentence_id = sent_id,
                    text = text,
                    keywords = list(keywords),
                )
                doc.set_data(json.dumps(data))

                db.replace_document(idterm, doc)
            except Exception as e:
                # Best effort: report the failing row and keep indexing.
                print(doc_id, text, e)

    db.commit()
finally:
    db.close()

data/corpus_sentences/001.json
data/corpus_sentences/002.json
data/corpus_sentences/003.json
data/corpus_sentences/004.json
data/corpus_sentences/005.json
data/corpus_sentences/006.json
data/corpus_sentences/007.json
data/corpus_sentences/008.json
data/corpus_sentences/009.json
data/corpus_sentences/010.json
data/corpus_sentences/011.json
data/corpus_sentences/012.json
data/corpus_sentences/013.json
data/corpus_sentences/014.json
data/corpus_sentences/015.json
data/corpus_sentences/016.json
data/corpus_sentences/017.json
data/corpus_sentences/018.json
data/corpus_sentences/019.json
data/corpus_sentences/020.json
data/corpus_sentences/021.json
data/corpus_sentences/022.json
data/corpus_sentences/023.json
data/corpus_sentences/024.json
data/corpus_sentences/025.json
data/corpus_sentences/026.json
data/corpus_sentences/027.json
data/corpus_sentences/028.json
data/corpus_sentences/029.json
data/corpus_sentences/030.json
data/corpus_sentences/031.json
data/corpus_sentences/032.json
data/cor

In [9]:
# Print summary statistics of the index with the xapian-delve command-line tool.
!xapian-delve $DBPATH

UUID = 74b38802-8662-47ba-b908-18f8945fc71e
number of documents = 5342243
average document length = 41.2869
document length lower bound = 1
document length upper bound = 1110
highest document id ever used = 5342243
has positional information = true
revision = 1146
currently open for writing = false


In [10]:
# Spot-check one indexed sentence: show the stored data and term list of record 112.
!xapian-delve -r 112 -d $DBPATH

Data for record #112:
{"shard": "001", "page_id": "17_Again_-LRB-film-RRB-", "sentence_id": 0, "text": "17 Again is a 2009 American comedy film directed by Burr Steers .", "keywords": ["2009", "2009_American_comedy_film", "Burr_Steers", "17", "Steers", "American", "film"]}
Term List for record #112: 17 2009 K17 K2009 K2009_american_comedy_film Kamerican Kburr_steers Kfilm Ksteers Q17_Again_-LRB-film-RRB-_0 S17_again__lrb_film_rrb_ Za Zagain Zamerican Zburr Zby Zcomedi Zdirect Zfilm Zis Zsteer a again american burr by comedy directed film is steers
