In [None]:
import os
import io
from zipfile import ZipFile

In [23]:
import shutil

In [25]:
import re
import xapian
import pandas as pd

# Create the Schema and Build the sckeleton of index

In [72]:
#folder_path = "index"
#if os.path.exists(folder_path):
#    shutil.rmtree(folder_path, ignore_errors=False, onerror=None)
#os.mkdir(folder_path)

In [73]:
dbpath = "full_index"

# Build Index

In [7]:
zf = ZipFile("../wiki-pages-text.zip")

In [8]:
files = [item.filename for item in zf.filelist]
len(files)

110

In [67]:
def read_shard(zf, path):
    items = []
    fp = zf.open(path, mode='r')
    tfp = io.TextIOWrapper(fp)
    for line in tfp.readlines():
        match = re.match("(\w+)\s(\d+)\s(.*)\n", line)
        if match:
            items.append(match.groups())
    fp.close()
    tfp.close()
    raw_df = pd.DataFrame(data=items, columns=['doc_id', 'sentence', 'text'])
    func = lambda x: " ".join(x)
    return raw_df.groupby('doc_id')['text'].agg(func)

In [69]:
%%time
# Create or open the database we're going to be writing to.
db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

# Set up a TermGenerator that we'll use in indexing.
termgenerator = xapian.TermGenerator()
termgenerator.set_stemmer(xapian.Stem("en"))

for path in sorted(files[1:]):
    print(path)
    documents_df = read_shard(zf, path)
    for doc_id, text in documents_df.items():
        # We make a document and tell the term generator to use this.
        doc = xapian.Document()
        termgenerator.set_document(doc)

        # Index each field with a suitable prefix.
        termgenerator.index_text(doc_id, 1, 'S')
        #termgenerator.index_text(description, 1, 'XD')

        # Index fields without prefixes for general search.
        termgenerator.index_text(text)
        termgenerator.increase_termpos()
        #termgenerator.index_text(description)

        # We use the identifier to ensure each object ends up in the
        # database only once no matter how many times we run the
        # indexer.
        idterm = u"Q" + doc_id
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
db.commit()

wiki-pages-text/wiki-009.txt
wiki-pages-text/wiki-021.txt
wiki-pages-text/wiki-035.txt
wiki-pages-text/wiki-034.txt
wiki-pages-text/wiki-020.txt
wiki-pages-text/wiki-008.txt
wiki-pages-text/wiki-036.txt
wiki-pages-text/wiki-022.txt
wiki-pages-text/wiki-023.txt
wiki-pages-text/wiki-037.txt
wiki-pages-text/wiki-033.txt
wiki-pages-text/wiki-027.txt
wiki-pages-text/wiki-026.txt
wiki-pages-text/wiki-032.txt
wiki-pages-text/wiki-024.txt
wiki-pages-text/wiki-030.txt
wiki-pages-text/wiki-018.txt
wiki-pages-text/wiki-019.txt
wiki-pages-text/wiki-031.txt
wiki-pages-text/wiki-025.txt
wiki-pages-text/wiki-042.txt
wiki-pages-text/wiki-056.txt
wiki-pages-text/wiki-081.txt
wiki-pages-text/wiki-095.txt
wiki-pages-text/wiki-094.txt
wiki-pages-text/wiki-080.txt
wiki-pages-text/wiki-057.txt
wiki-pages-text/wiki-043.txt
wiki-pages-text/wiki-069.txt
wiki-pages-text/wiki-055.txt
wiki-pages-text/wiki-041.txt
wiki-pages-text/wiki-096.txt
wiki-pages-text/wiki-082.txt
wiki-pages-text/wiki-109.txt
wiki-pages-tex

In [70]:
db.close()

In [71]:
!xapian-delve $dbpath

UUID = b940719a-3fb5-4d57-8b01-3186790b1ade
number of documents = 3833466
average document length = 170.939
document length lower bound = 1
document length upper bound = 31154
highest document id ever used = 3833466
has positional information = true
revision = 392
currently open for writing = false


# Query Index

In [109]:
def search(dbpath, querystring, offset=0, pagesize=10):
    # offset - defines starting point within result set
    # pagesize - defines number of records to retrieve

    # Open the database we're going to search.
    db = xapian.Database(dbpath)

    # Set up a QueryParser with a stemmer and suitable prefixes
    queryparser = xapian.QueryParser()
    queryparser.set_stemmer(xapian.Stem("en"))
    queryparser.set_stemming_strategy(queryparser.STEM_SOME)
    # Start of prefix configuration.
    queryparser.add_prefix("title", "S")
    #queryparser.add_prefix("description", "XD")
    # End of prefix configuration.

    # And parse the query
    query = queryparser.parse_query(querystring)

    # Use an Enquire object on the database to run the query
    enquire = xapian.Enquire(db)
    enquire.set_query(query)

    # And print out something about each match
    matches = []
    for match in enquire.get_mset(offset, pagesize):
        #fields = json.loads(match.document.get_data())
        print(u"{rank}:  #{docid} {weight}".format(rank=match.rank + 1,
            docid=match.docid,
            weight=match.weight
            #'title': "", # fields.get('TITLE', u''),
            ))
        matches.append(match)
    return matches

In [110]:
query = "Chris Hemsworth appeared in A Perfect Getaway"
matches = search(dbpath, query)

1:  #290771 36.334455075257004
2:  #1833129 22.554101782575767
3:  #2694195 22.07625491199836
4:  #2213441 21.626374102687926
5:  #1884464 18.547556188185897
6:  #752650 18.040562388801664
7:  #1636705 18.025709976962716
8:  #218024 17.94392412974872
9:  #3525993 17.707174950928362
10:  #520098 17.525034614551657


In [128]:
match = matches[-1]
for term in match.document.termlist():
    #print(term.term)
    pass

In [127]:
for item in match.document:
    pass
    #print(item.term)