In [1]:
import os
import io
import json
from zipfile import ZipFile

In [2]:
from tqdm import tqdm

In [3]:
import re
import xapian
import pandas as pd
import numpy as np
import nltk

In [4]:
# Limit displayed column contents to 100 characters per cell.
pd.set_option("display.max_colwidth", 100)

In [5]:
# Directory of the Xapian database that the indexing cell writes to.
DBPATH = "bigrams_index"
# Directory holding one `<shard>.json` keyword file per wiki shard.
PARSED_CORPUS = "data/parsed_corpus"

In [6]:
# Make sure the index directory exists. `exist_ok=True` avoids the
# check-then-create race and also creates any missing parent directories.
os.makedirs(DBPATH, exist_ok=True)

# Build Index

In [7]:
# Open the zipped wiki corpus read-only; the handle is reused by later cells.
zf = ZipFile("../wiki-pages-text.zip", mode="r")

In [8]:
# Names of all archive members, in archive order.
files = zf.namelist()
len(files), files[1]

(110, 'wiki-pages-text/wiki-009.txt')

In [9]:
def read_shard(zf, path):
    """Parse one wiki shard inside an open zip archive into sentence tuples.

    Every line is expected to have the shape ``<page_id> <sentence_no> <text>``
    (single whitespace separators). Lines that do not fit are skipped silently.

    Args:
        zf: An open ``zipfile.ZipFile`` that contains the shard.
        path: Name of the archive member to read.

    Returns:
        A list of ``(page_id, sentence_no, text)`` string tuples in file
        order. ``sentence_no`` is the matched digit string, not an ``int``.
    """
    line_pattern = re.compile(r"(\S+)\s(\d+)\s(.*)")
    items = []
    # Decode explicitly instead of relying on the platform locale
    # (assumes the corpus is UTF-8 -- TODO confirm). The context managers
    # close both handles even if parsing raises.
    with zf.open(path, mode='r') as fp, io.TextIOWrapper(fp, encoding='utf-8') as tfp:
        # Iterate lazily rather than materialising the shard with readlines().
        for line in tfp:
            # Drop the terminator first so that a final line without a
            # trailing newline is parsed like every other line.
            match = line_pattern.match(line.rstrip("\n"))
            if match:
                items.append(match.groups())
    return items

In [10]:
# Lexicographically first archive member after skipping files[0].
# NOTE(review): files[0] is presumably the directory entry of the archive -- confirm.
# The indexing loop below rebinds `path` on every iteration, so this value is
# only useful for ad-hoc inspection.
path = sorted(files[1:])[0]

In [11]:
%%time
# Build the Xapian index: one document per wiki page, holding
#   * stemmed free-text terms of the whole page,
#   * a boolean "Q<page_id>" identifier term,
#   * lowercase keyword bigram terms such as "gemini_awards",
#   * a JSON payload with page_id, shard and the page's sentences.

# Create or open the database we're going to be writing to.
db = xapian.WritableDatabase(DBPATH, xapian.DB_CREATE_OR_OPEN)

# Set up a TermGenerator that we'll use in indexing.
termgenerator = xapian.TermGenerator()
termgenerator.set_stemmer(xapian.Stem("en"))

# Raw string: "\d" inside a normal string literal is an invalid escape sequence.
shard_pattern = re.compile(r".*wiki-(\d+).*")

try:
    for path in sorted(files[1:]):
        m = shard_pattern.match(path)
        if not m:
            # Without a shard number there is no keyword file to pair with,
            # so do not fall through to reading "<PARSED_CORPUS>/.json".
            print("Skipping {}: no shard number in file name".format(path))
            continue
        shard = m[1]
        print(path)

        items = read_shard(zf, path)
        raw_df = pd.DataFrame(data=items, columns=['page_id', 'sentence', 'text'])

        # Full page text: all sentences of a page joined by single spaces.
        docs_df = raw_df.groupby('page_id')['text'].agg(lambda parts: " ".join(parts))

        # Keywords of the shard, turned into lowercase "a_b" bigram strings.
        keywords_df = pd.read_json('{}/{}.json'.format(PARSED_CORPUS, shard), orient='split')
        keywords_df['ngrams'] = keywords_df.keywords.apply(
            lambda kws: ["_".join(pair).lower() for pair in nltk.bigrams(kws)]
        )

        # Distinct bigrams per page. Built with plain sets instead of
        # `groupby(...).agg(sum)`, which depended on pandas turning the
        # builtin `sum` into list concatenation.
        ngrams_by_page = {}
        for kw_page_id, grams in zip(keywords_df['page_id'], keywords_df['ngrams']):
            ngrams_by_page.setdefault(kw_page_id, set()).update(grams)

        # Per-page list of {"sent_id", "text"} records, in file order.
        # A zip over the two columns replaces the slow row-wise DataFrame.apply;
        # .tolist() yields plain Python ints, which json.dumps can serialise.
        raw_df['sentence'] = raw_df['sentence'].astype(int)
        raw_df['record'] = [
            {'sent_id': sent_id, 'text': sent_text}
            for sent_id, sent_text in zip(raw_df['sentence'].tolist(), raw_df['text'].tolist())
        ]
        sentences_series = raw_df.groupby('page_id')['record'].agg(list)

        for page_id, text in docs_df.items():
            try:
                # We make a document and tell the term generator to use this.
                doc = xapian.Document()
                termgenerator.set_document(doc)

                # Index fields without prefixes for general search.
                termgenerator.index_text(text)
                termgenerator.increase_termpos()

                # We use the identifier to ensure each object ends up in the
                # database only once no matter how many times we run the
                # indexer.
                idterm = u"Q" + page_id
                doc.add_boolean_term(idterm)

                # Index keyword bigrams as unprefixed terms (already lowercase).
                page_ngrams = ngrams_by_page.get(page_id)
                if page_ngrams is None:
                    print("No keywords found for page_id={}".format(page_id))
                else:
                    for item in page_ngrams:
                        doc.add_term(item)

                # Save additional data alongside the document.
                data = dict(
                    page_id=page_id,
                    shard=shard,
                    text=sentences_series.loc[page_id],
                )
                doc.set_data(json.dumps(data))

                db.replace_document(idterm, doc)
            except Exception as e:
                # Best effort: report the failing page and carry on with the rest.
                print(page_id, text, e)
    db.commit()
finally:
    # Always close the database handle, even when a shard fails part-way through.
    db.close()

wiki-pages-text/wiki-001.txt
wiki-pages-text/wiki-002.txt
wiki-pages-text/wiki-003.txt
wiki-pages-text/wiki-004.txt
wiki-pages-text/wiki-005.txt
wiki-pages-text/wiki-006.txt
wiki-pages-text/wiki-007.txt
wiki-pages-text/wiki-008.txt
wiki-pages-text/wiki-009.txt
wiki-pages-text/wiki-010.txt
wiki-pages-text/wiki-011.txt
wiki-pages-text/wiki-012.txt
wiki-pages-text/wiki-013.txt
wiki-pages-text/wiki-014.txt
wiki-pages-text/wiki-015.txt
wiki-pages-text/wiki-016.txt
wiki-pages-text/wiki-017.txt
wiki-pages-text/wiki-018.txt
wiki-pages-text/wiki-019.txt
wiki-pages-text/wiki-020.txt
wiki-pages-text/wiki-021.txt
wiki-pages-text/wiki-022.txt
wiki-pages-text/wiki-023.txt
wiki-pages-text/wiki-024.txt
wiki-pages-text/wiki-025.txt
wiki-pages-text/wiki-026.txt
wiki-pages-text/wiki-027.txt
wiki-pages-text/wiki-028.txt
wiki-pages-text/wiki-029.txt
wiki-pages-text/wiki-030.txt
wiki-pages-text/wiki-031.txt
wiki-pages-text/wiki-032.txt
wiki-pages-text/wiki-033.txt
wiki-pages-text/wiki-034.txt
wiki-pages-tex

wiki-pages-text/wiki-079.txt
wiki-pages-text/wiki-080.txt
wiki-pages-text/wiki-081.txt
wiki-pages-text/wiki-082.txt
wiki-pages-text/wiki-083.txt
wiki-pages-text/wiki-084.txt
wiki-pages-text/wiki-085.txt
wiki-pages-text/wiki-086.txt
wiki-pages-text/wiki-087.txt
wiki-pages-text/wiki-088.txt
wiki-pages-text/wiki-089.txt
wiki-pages-text/wiki-090.txt
wiki-pages-text/wiki-091.txt
wiki-pages-text/wiki-092.txt
wiki-pages-text/wiki-093.txt
wiki-pages-text/wiki-094.txt
wiki-pages-text/wiki-095.txt
wiki-pages-text/wiki-096.txt
wiki-pages-text/wiki-097.txt
wiki-pages-text/wiki-098.txt
wiki-pages-text/wiki-099.txt
wiki-pages-text/wiki-100.txt
wiki-pages-text/wiki-101.txt
wiki-pages-text/wiki-102.txt
wiki-pages-text/wiki-103.txt
wiki-pages-text/wiki-104.txt
wiki-pages-text/wiki-105.txt
wiki-pages-text/wiki-106.txt
wiki-pages-text/wiki-107.txt
wiki-pages-text/wiki-108.txt
wiki-pages-text/wiki-109.txt
CPU times: user 2h 15min 53s, sys: 5h 43min 25s, total: 7h 59min 19s
Wall time: 10h 7min 37s


In [12]:
# Print database-level statistics (document count, lengths, revision) for the index at DBPATH.
!xapian-delve $DBPATH

UUID = f8fab680-224e-426c-9efe-cd5abea8d234
number of documents = 5378626
average document length = 205.718
document length lower bound = 1
document length upper bound = 69924
highest document id ever used = 5378626
has positional information = true
revision = 538
currently open for writing = false


In [13]:
# Show the stored JSON data and the term list of record 3129 as a spot check.
!xapian-delve -r 3129 -d $DBPATH

Data for record #3129:
{"page_id": "11th_Gemini_Awards", "shard": "001", "text": [{"sent_id": 0, "text": "The 11th Gemini Awards was held on June 6 , 1997 , to honour achievements in Canadian television ."}, {"sent_id": 1, "text": "It was hosted by Albert Schultz , and was broadcast on CBC ."}]}
Term List for record #3129: 11th 11th_gemini 1997 1997_honour 6 6_1997 Q11th_Gemini_Awards Zachiev Zalbert Zand Zaward Zbroadcast Zby Zcanadian Zcbc Zgemini Zheld Zhonour Zhost Zin Zit Zjune Zon Zschultz Ztelevis Zthe Zto Zwas achievements achievements_canadian albert albert_schultz and awards awards_june broadcast broadcast_cbc by canadian canadian_television cbc gemini gemini_awards held honour honour_achievements hosted in it june june_6 on schultz schultz_broadcast television the to was
