In [1]:
import os
import io
import json
from zipfile import ZipFile
import unicodedata

In [2]:
import re
import xapian
import pandas as pd
import numpy as np

In [3]:
from tqdm import tqdm

In [4]:
# Let pandas show up to 200 characters per cell so whole sentences stay
# readable when a DataFrame is displayed.
pd.options.display.max_colwidth = 200

In [5]:
# Directory of the Xapian sentence index that this notebook creates/opens.
DBPATH = "sentence_index"

In [6]:
# Make sure the index directory exists. `exist_ok=True` makes the cell safe to
# re-run and avoids the check-then-create race of `exists()` + `mkdir()`.
os.makedirs(DBPATH, exist_ok=True)

# Build Index

In [7]:
# Open the archive of wiki page text. The handle is kept open on purpose:
# the indexing cell further down reads every shard through `zf`.
zf = ZipFile("wiki-pages-text.zip")

In [8]:
# Names of all archive members, in archive order.
# NOTE(review): later code uses `files[1:]`, so entry 0 is presumably the
# directory entry rather than a shard -- confirm against the archive.
files = zf.namelist()
len(files), files[1]

(110, 'wiki-pages-text/wiki-009.txt')

In [9]:
def read_shard(zf, path, encoding="utf-8"):
    """Parse one wiki shard stored inside an open zip archive.

    Every useful line has the form ``<page_id> <sentence_number> <text>``.
    Lines that do not match that shape (blank lines, lines without a
    sentence number or without text after it) are skipped silently.

    Parameters
    ----------
    zf : zipfile.ZipFile
        Open archive that contains the shard.
    path : str
        Name of the archive member to read.
    encoding : str, optional
        Text encoding of the member. Passed explicitly so decoding does not
        depend on the locale of the machine running the notebook.

    Returns
    -------
    list of list of str
        One ``[page_id, sentence_number, text]`` entry per matched line.
        ``page_id`` is NFD-normalised; ``sentence_number`` is left as a
        string of digits.
    """
    line_pattern = re.compile(r"(\S+)\s(\d+)\s(.*)")
    items = []
    # Context managers guarantee both handles are closed even if decoding or
    # parsing raises part-way through the shard.
    with zf.open(path, mode='r') as raw_fp:
        with io.TextIOWrapper(raw_fp, encoding=encoding) as text_fp:
            # Iterate lazily instead of materialising the shard via readlines().
            for line in text_fp:
                # Strip the line terminator before matching so that a final
                # line without a trailing newline is not dropped.
                match = line_pattern.match(line.rstrip("\n"))
                if not match:
                    continue
                item = list(match.groups())
                item[0] = unicodedata.normalize('NFD', item[0])
                items.append(item)
    return items

In [10]:
def page_id_to_text(page_id):
    """Turn a wiki page identifier into plain title text.

    The bracket placeholder tokens ``-LRB-``, ``-LSB-``, ``-RRB-`` and
    ``-RSB-`` are removed and every underscore becomes a space.
    """
    without_brackets = re.sub("(-LRB-|-LSB-|-RRB-|-RSB-)", "", page_id)
    return without_brackets.replace("_", " ")
    
def read_shard_as_df(zf, path):
    """Load one shard from the archive as a DataFrame with per-sentence stats.

    Columns, in order: ``page_id``, ``sentence`` (int), ``text``, ``topic``
    (page id converted by ``page_id_to_text``), ``sentence_tokens_count``
    (number of pieces when ``text`` is split on single spaces) and
    ``sentence_words_count`` (how many of those pieces are alphanumeric).
    """
    def count_tokens(sentence_text):
        return len(sentence_text.split(' '))

    def count_alnum_tokens(sentence_text):
        return sum(1 for token in sentence_text.split(" ") if token.isalnum())

    shard_df = pd.DataFrame(
        data=read_shard(zf, path),
        columns=['page_id', 'sentence', 'text'],
    )
    shard_df['sentence'] = shard_df['sentence'].astype(int)
    shard_df['topic'] = shard_df['page_id'].apply(page_id_to_text)
    shard_df['sentence_tokens_count'] = shard_df['text'].apply(count_tokens)
    shard_df['sentence_words_count'] = shard_df['text'].apply(count_alnum_tokens)
    return shard_df

In [15]:
%%time
# Create or open the database we're going to be writing to.
db = xapian.WritableDatabase(DBPATH, xapian.DB_CREATE_OR_OPEN)

# Set up a TermGenerator that we'll use in indexing.
termgenerator = xapian.TermGenerator()
termgenerator.set_stemmer(xapian.Stem("en"))



for path in tqdm(sorted(files[1:])[10:]):
    shard = ""
    m = re.match(".*wiki-(\d+).*", path)
    if m: shard = m.groups()[0]
    # print(path)
    
    sentences_df = read_shard_as_df(zf, path)
    mask = (sentences_df.sentence_tokens_count < 5) | \
            (sentences_df.sentence_tokens_count > 110) | \
            (sentences_df.sentence_words_count < 4)  | \
            ((sentences_df.sentence_words_count < 10) & \
                (1. * sentences_df.sentence_words_count / sentences_df.sentence_tokens_count < 0.3) )
    mask.sum()
    sentences_df = sentences_df[~mask]

    for page_id, sent_id, text, topic, _, _ in sentences_df.values:
        try:
            # We make a document and tell the term generator to use this.
            doc = xapian.Document()
            termgenerator.set_document(doc)

            # Index fields without prefixes for general search.
            termgenerator.index_text("{} {}".format(topic, text))
            termgenerator.increase_termpos()

            # We use the identifier to ensure each object ends up in the
            # database only once no matter how many times we run the
            # indexer.
            idterm = u"Q{}_{}".format(page_id, sent_id)
            doc.add_boolean_term(idterm)
            
            # Index each field with a suitable prefix.
            #termgenerator.index_text(topic, 1, 'S')

            # save additional data
            data = dict(
                page_id = page_id,
                sentence_id = sent_id,
                shard = shard,
                text = text,
                topic = topic
            )
            doc.set_data(json.dumps(data))

            db.replace_document(idterm, doc)
        except Exception as e:
            print(page_id, text, e)
db.commit()
db.close()

100%|██████████| 99/99 [8:31:19<00:00, 342.26s/it]  


CPU times: user 1h 32min 30s, sys: 5h 11min 10s, total: 6h 43min 40s
Wall time: 8h 31min 22s


In [16]:
# Print summary statistics of the index (document count, average length, ...).
!xapian-delve $DBPATH

UUID = 2e417945-ee51-4683-9012-a84c3edeb1a1
number of documents = 23931434
average document length = 43.0649
document length lower bound = 6
document length upper bound = 297
highest document id ever used = 23931434
has positional information = true
revision = 2412
currently open for writing = false


In [17]:
# Spot-check one record: show the stored JSON data and the term list of
# document 31029.
!xapian-delve -r 31029 -d $DBPATH

Data for record #31029:
{"page_id": "1892_Princeton_Tigers_football_team", "sentence_id": 0, "shard": "001", "text": "The 1892 Princeton Tigers football team represented Princeton University in the 1892 college football season .", "topic": "1892 Princeton Tigers football team"}
Term List for record #31029: 1892 Q1892_Princeton_Tigers_football_team_0 Zcolleg Zfootbal Zin Zprinceton Zrepres Zseason Zteam Zthe Ztiger Zunivers college football in princeton represented season team the tigers university
