In [16]:
import io
import json
import os
import re
import shutil
import time

import nltk
import pandas as pd
import spacy
import xapian
from tqdm import tqdm as tqdm

In [15]:
# Path of the Xapian index that the cells below open for writing and searching.
dbpath = 'index/xapIndex_SenID_Text'
# Directory whose files (the wiki text shards) are listed and indexed below.
datapath = 'data/wiki-pages-text/'

# Shard Paths

In [4]:
# Every file under `datapath`, as a path string, in ascending filename order.
# (The original sorted descending and then immediately re-sorted ascending;
# a single ascending sort yields the same list.)
filesInDataPath = sorted(datapath + fileName for fileName in os.listdir(datapath))

In [5]:
# Sanity check: after sorting, the first shard must be wiki-001.
assert filesInDataPath[0] == 'data/wiki-pages-text/wiki-001.txt'

# Helper function

In [None]:
def shardToDF(shardPath):
    """Load one wiki shard file into a DataFrame, one row per input line.

    Each line is expected to look like ``<pageTitle> <sentenceNo> <pageText>``.
    It is split on the first two spaces only, so ``pageText`` keeps its own
    spaces and its trailing newline.

    Args:
        shardPath: Path of the shard text file to read (decoded as UTF-8).

    Returns:
        pandas.DataFrame with columns ``pageTitle``, ``sentenceNo`` and
        ``pageText``. ``sentenceNo`` is an ``int`` when the field parses as
        one; otherwise the raw string is kept and the offending line is
        printed. Blank lines are skipped silently; other lines with fewer
        than three fields are printed and skipped.
    """
    records = []
    with open(shardPath, 'r', encoding='utf-8') as openedFile:
        for line in openedFile:
            if not line.strip():
                continue
            fields = line.split(' ', 2)
            if len(fields) != 3:
                # Malformed line: report it instead of crashing on unpacking.
                print(line)
                continue
            pageTitle, sentenceNo, pageText = fields
            try:
                sentenceNo = int(sentenceNo)
            except ValueError:
                # Best effort: keep the raw field but surface the bad line.
                print(line)
            records.append([pageTitle, sentenceNo, pageText])
    return pd.DataFrame.from_records(
        records, columns=['pageTitle', 'sentenceNo', 'pageText'])

# XAPIAN

Build the Xapian index

In [6]:
# Open the index at `dbpath` for writing; DB_CREATE_OR_OPEN creates it if it
# does not exist yet and otherwise opens the existing database.
x_db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

Create the term generator used for indexing

In [7]:
# Term generator used by the indexing loop to turn each sentence's text into
# index terms (see `index.index_text(...)` below).
index = xapian.TermGenerator()

Set Stemmer

In [8]:
# Attach Xapian's English ('en') stemmer to the term generator.
index.set_stemmer(xapian.Stem('en'))

Build Index

In [9]:
# Index every sentence of the selected shards as its own Xapian document.
#
# NOTE(review): only the first shard is indexed (`[:1]`), which looks like a
# leftover from a trial run; widen the slice to index all shards.
shardsToIndex = filesInDataPath[:1]

# `normalize` is not defined in any cell visible here. The original wrapped the
# call in `except Exception: pass`, so the resulting NameError was swallowed and,
# as a side effect, `int(sentenceNo)` was never reached. Resolve the helper once
# and fall back to indexing the raw text, but say so explicitly.
normalizeText = globals().get('normalize')
if normalizeText is None:
    print('WARNING: normalize() is not defined; indexing un-normalized text')

    def normalizeText(text):
        return text

skippedLines = 0        # lines without the three expected fields
normalizeFailures = 0   # lines on which normalize() raised

try:
    with tqdm(total=len(shardsToIndex)) as pbar:
        for shardFile in shardsToIndex:
            shardNumber = shardFile.split('text/wiki-')[1].split('.')[0]
            with open(shardFile, 'r', encoding='utf-8') as openedFile:
                for lineNo, line in enumerate(openedFile):
                    fields = line.split(' ', 2)
                    if len(fields) != 3:
                        # Malformed line: skip it rather than abort the run.
                        skippedLines += 1
                        continue
                    pageTitle, sentenceNo, pageText = fields

                    # Best effort: keep the raw field if it is not an integer.
                    try:
                        sentenceNo = int(sentenceNo)
                    except ValueError:
                        pass

                    # Best effort: index the raw text if normalization fails.
                    try:
                        pageTitle = normalizeText(pageTitle)
                        pageText = normalizeText(pageText)
                    except Exception:
                        normalizeFailures += 1

                    # Payload stored with the document and returned by searches.
                    xapianDoc = xapian.Document()
                    dataDict = {'shard': shardNumber,
                                'pageTitle': pageTitle,
                                'sentenceNo': sentenceNo,
                                'pageText': pageText}
                    xapianDoc.set_data(json.dumps(dataDict))

                    # Generate the searchable terms from the sentence text.
                    index.set_document(xapianDoc)
                    index.index_text(pageText)
                    index.increase_termpos()

                    # Unique ID term per sentence (shard + line number: short,
                    # so it stays within Xapian's term-length limit). It must be
                    # added to the document itself, otherwise replace_document()
                    # never finds an earlier copy and every re-run appends
                    # duplicates. The original used "Q" + pageTitle without
                    # adding it, and that term was shared by all sentences of
                    # a page.
                    # NOTE(review): documents written by the old code carry no
                    # ID term, so rebuild the index from scratch once.
                    idTerm = u"Q" + shardNumber + u":" + str(lineNo)
                    xapianDoc.add_boolean_term(idTerm)
                    x_db.replace_document(idTerm, xapianDoc)
            # Commit once per shard.
            x_db.commit()
            pbar.update(1)
finally:
    # Close the database even when indexing fails part-way.
    x_db.close()

if skippedLines or normalizeFailures:
    print('skipped malformed lines:', skippedLines,
          '| normalize failures:', normalizeFailures)

100%|██████████| 1/1 [00:37<00:00, 37.87s/it]


Check DB statistics

In [10]:
# Shell out to the xapian-delve command-line tool to print statistics for the
# database at `dbpath` (IPython substitutes $dbpath with the Python variable).
!xapian-delve $dbpath

UUID = d9c0b167-f7a9-4da8-8093-4fcdb4e0a341
number of documents = 341096
average document length = 39.5802
document length lower bound = 2
document length upper bound = 412
highest document id ever used = 341096
has positional information = true
revision = 36
currently open for writing = false


Search DB

In [11]:
TOP_RESULTS_LIMIT = 10

def search(dbpath, querystring, offset=0, pagesize=TOP_RESULTS_LIMIT):
    """Run a free-text query against a Xapian index and print/return the hits.

    The query is parsed with an English stemmer using the STEM_SOME strategy.

    Args:
        dbpath: Path of the Xapian database to search.
        querystring: Free-text query to parse.
        offset: Zero-based rank of the first match to return.
        pagesize: Maximum number of matches to return
            (defaults to ``TOP_RESULTS_LIMIT``).

    Returns:
        A list with one dict per match, in rank order, with the keys
        ``rank`` (1-based), ``percent``, ``docid`` and ``data`` (the stored
        document payload decoded as UTF-8 text). Each match is also printed.
    """
    database = xapian.Database(dbpath)
    try:
        queryParser = xapian.QueryParser()
        queryParser.set_stemmer(xapian.Stem("english"))
        queryParser.set_database(database)
        queryParser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)

        enquire = xapian.Enquire(database)
        enquire.set_query(queryParser.parse_query(querystring))

        results = []
        # Honour the paging arguments; the original ignored them and always
        # fetched the first TOP_RESULTS_LIMIT matches.
        for match in enquire.get_mset(offset, pagesize):
            # get_data() yields bytes; decode so text, not a b'...' repr, is shown.
            text = match.document.get_data().decode('utf-8', errors='replace')
            results.append({'rank': match.rank + 1,
                            'percent': match.percent,
                            'docid': match.docid,
                            'data': text})
            print('RANK:', match.rank + 1)
            print('PERCENTAGE MATCH:', match.percent)
            print('DOC ID:', match.docid)
            print('DOC TXT:', text)
        return results
    finally:
        # Everything needed was copied into `results`; release the handle.
        database.close()

In [19]:
# Example: run a sample claim as a free-text query against the index at `dbpath`.
query = "The Boston Celtics play their home games at TD Garden"
matches = search(dbpath, query)

RANK: 1
PERCENTAGE MATCH: 100
DOC ID: 3627901
DOC TXT: b"The Celtics play their home games at the TD Garden , which they share with the National Hockey League -LRB- NHL -RRB- 's Boston Bruins .\n,,,Boston_Celtics,,,3"
RANK: 2
PERCENTAGE MATCH: 98
DOC ID: 11943886
DOC TXT: b"The Boston Celtics of the NBA currently use `` Circles '' as the intro music for their home games at TD Garden .\n,,,KDrew,,,2"
RANK: 3
PERCENTAGE MATCH: 83
DOC ID: 275093
DOC TXT: b'The 1952 NBA All-Star Game was an exhibition basketball game played on February 11 , 1952 , at Boston Garden in Boston , Massachusetts , home of the Boston Celtics .\n,,,1952_NBA_All-Star_Game,,,0'
RANK: 4
PERCENTAGE MATCH: 83
DOC ID: 301416
DOC TXT: b'The 1951 NBA All-Star Game was an exhibition basketball game played on March 2 , 1951 , at Boston Garden in Boston , Massachusetts , home of the Boston Celtics .\n,,,1951_NBA_All-Star_Game,,,0'
RANK: 5
PERCENTAGE MATCH: 81
DOC ID: 385991
DOC TXT: b"Quarterfinal games were played at home t