In [1]:
import spacy
import os
import xapian
import re
import io
import shutil
import datetime
from tqdm import tqdm as tqdm

In [2]:
# Directory of the Xapian database that the indexing cell writes and search() reads.
dbpath = 'data/xapIndex_SenID_Text'
# Directory holding the wiki shard text files that get indexed.
datapath = 'data/wiki-pages-text/'

# Shard Paths

In [3]:
# Paths of all shard files under datapath, in ascending name order
# (wiki-001.txt, wiki-002.txt, ...). One ascending sort is sufficient.
filesInDataPath = sorted(
    os.path.join(datapath, fileName) for fileName in os.listdir(datapath)
)

In [4]:
# Sanity check: the sorted shard list must begin with the first wiki shard.
assert filesInDataPath[0] == 'data/wiki-pages-text/wiki-001.txt'

['data/wiki-pages-text/wiki-001.txt',
 'data/wiki-pages-text/wiki-002.txt',
 'data/wiki-pages-text/wiki-003.txt',
 'data/wiki-pages-text/wiki-004.txt',
 'data/wiki-pages-text/wiki-005.txt',
 'data/wiki-pages-text/wiki-006.txt',
 'data/wiki-pages-text/wiki-007.txt',
 'data/wiki-pages-text/wiki-008.txt',
 'data/wiki-pages-text/wiki-009.txt',
 'data/wiki-pages-text/wiki-010.txt',
 'data/wiki-pages-text/wiki-011.txt',
 'data/wiki-pages-text/wiki-012.txt',
 'data/wiki-pages-text/wiki-013.txt',
 'data/wiki-pages-text/wiki-014.txt',
 'data/wiki-pages-text/wiki-015.txt',
 'data/wiki-pages-text/wiki-016.txt',
 'data/wiki-pages-text/wiki-017.txt',
 'data/wiki-pages-text/wiki-018.txt',
 'data/wiki-pages-text/wiki-019.txt',
 'data/wiki-pages-text/wiki-020.txt',
 'data/wiki-pages-text/wiki-021.txt',
 'data/wiki-pages-text/wiki-022.txt',
 'data/wiki-pages-text/wiki-023.txt',
 'data/wiki-pages-text/wiki-024.txt',
 'data/wiki-pages-text/wiki-025.txt',
 'data/wiki-pages-text/wiki-026.txt',
 'data/wiki-

Build the Xapian index

In [5]:
# Open the Xapian database at dbpath for writing; the DB_CREATE_OR_OPEN flag
# creates it if it does not exist yet and otherwise opens the existing one.
x_db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

Create the TermGenerator used for indexing

In [6]:
# TermGenerator used by the indexing loop to turn each sentence's text into index terms.
index = xapian.TermGenerator()

Set Stemmer

In [7]:
# Use Xapian's English stemmer ('en') when generating index terms.
index.set_stemmer(xapian.Stem('en'))

Build Index

In [9]:
# Build the index: every line of every shard becomes one Xapian document.
# Each line is expected to look like "<docID> <sentenceID> <text>".
indexCounter = 0   # Xapian document id; advances once per input line across all shards
skippedLines = 0   # lines that did not split into the three expected fields
print('Time started at', datetime.datetime.now().time())
try:
    for shardFile in tqdm(filesInDataPath):
        # tqdm.write prints the message without corrupting the progress bar.
        tqdm.write('Now processing: ' + shardFile)
        # Explicit encoding so decoding does not depend on the platform default
        # (assumes the shards are UTF-8 -- TODO confirm).
        with open(shardFile, 'r', encoding='utf-8') as openedFile:
            for line in openedFile:
                indexCounter += 1
                fields = line.split(' ', 2)
                if len(fields) != 3:
                    # Blank or truncated line: skip it instead of aborting a
                    # multi-hour run on an unpacking error.
                    skippedLines += 1
                    continue
                docID, sentenceID, text = fields
                try:
                    # Normalise numeric sentence ids (e.g. "03" -> 3).
                    sentenceID = int(sentenceID)
                except ValueError:
                    # Non-numeric ids are stored verbatim.
                    pass
                xapianDoc = xapian.Document()
                # NOTE: text still carries the line's trailing newline, so the
                # stored record is "<text>\n,,,<docID>,,,<sentenceID>". The format
                # is kept as-is so existing readers of the index are unaffected.
                xapianDoc.set_data(text + ',,,' + docID + ',,,' + str(sentenceID))
                index.set_document(xapianDoc)
                index.index_text(text)
                index.increase_termpos()
                # replace_document with an explicit id makes re-runs overwrite
                # rather than duplicate documents.
                x_db.replace_document(indexCounter, xapianDoc)
        # Flush once per shard.
        x_db.commit()
finally:
    # Always release the database, even when indexing fails part-way through.
    x_db.close()
if skippedLines:
    print('Skipped malformed lines:', skippedLines)

  0%|          | 0/109 [00:00<?, ?it/s]

Time started at 14:48:27.741458
Now processing: data/wiki-pages-text/wiki-001.txt


  1%|          | 1/109 [00:28<51:34, 28.66s/it]

Now processing: data/wiki-pages-text/wiki-002.txt


  2%|▏         | 2/109 [00:58<51:59, 29.15s/it]

Now processing: data/wiki-pages-text/wiki-003.txt


  3%|▎         | 3/109 [01:38<57:10, 32.36s/it]

Now processing: data/wiki-pages-text/wiki-004.txt


  4%|▎         | 4/109 [02:20<1:01:26, 35.11s/it]

Now processing: data/wiki-pages-text/wiki-005.txt


  5%|▍         | 5/109 [03:07<1:07:04, 38.70s/it]

Now processing: data/wiki-pages-text/wiki-006.txt


  6%|▌         | 6/109 [03:57<1:12:14, 42.08s/it]

Now processing: data/wiki-pages-text/wiki-007.txt


  6%|▋         | 7/109 [04:52<1:18:16, 46.04s/it]

Now processing: data/wiki-pages-text/wiki-008.txt


  7%|▋         | 8/109 [05:52<1:24:25, 50.15s/it]

Now processing: data/wiki-pages-text/wiki-009.txt


  8%|▊         | 9/109 [06:56<1:30:27, 54.28s/it]

Now processing: data/wiki-pages-text/wiki-010.txt


  9%|▉         | 10/109 [08:04<1:36:17, 58.36s/it]

Now processing: data/wiki-pages-text/wiki-011.txt


 10%|█         | 11/109 [09:16<1:42:03, 62.49s/it]

Now processing: data/wiki-pages-text/wiki-012.txt


 11%|█         | 12/109 [10:30<1:46:55, 66.14s/it]

Now processing: data/wiki-pages-text/wiki-013.txt


 12%|█▏        | 13/109 [11:45<1:50:01, 68.76s/it]

Now processing: data/wiki-pages-text/wiki-014.txt


 13%|█▎        | 14/109 [13:07<1:55:08, 72.72s/it]

Now processing: data/wiki-pages-text/wiki-015.txt


 14%|█▍        | 15/109 [14:35<2:01:01, 77.25s/it]

Now processing: data/wiki-pages-text/wiki-016.txt


 15%|█▍        | 16/109 [16:01<2:03:57, 79.97s/it]

Now processing: data/wiki-pages-text/wiki-017.txt


 16%|█▌        | 17/109 [17:30<2:06:33, 82.54s/it]

Now processing: data/wiki-pages-text/wiki-018.txt


 17%|█▋        | 18/109 [19:01<2:09:06, 85.12s/it]

Now processing: data/wiki-pages-text/wiki-019.txt


 17%|█▋        | 19/109 [20:34<2:11:19, 87.55s/it]

Now processing: data/wiki-pages-text/wiki-020.txt


 18%|█▊        | 20/109 [22:13<2:14:40, 90.79s/it]

Now processing: data/wiki-pages-text/wiki-021.txt


 19%|█▉        | 21/109 [23:58<2:19:23, 95.04s/it]

Now processing: data/wiki-pages-text/wiki-022.txt


 20%|██        | 22/109 [25:43<2:22:23, 98.20s/it]

Now processing: data/wiki-pages-text/wiki-023.txt


 21%|██        | 23/109 [27:32<2:25:29, 101.50s/it]

Now processing: data/wiki-pages-text/wiki-024.txt


 22%|██▏       | 24/109 [29:20<2:26:11, 103.20s/it]

Now processing: data/wiki-pages-text/wiki-025.txt


 23%|██▎       | 25/109 [31:18<2:30:57, 107.82s/it]

Now processing: data/wiki-pages-text/wiki-026.txt


 24%|██▍       | 26/109 [33:26<2:37:23, 113.78s/it]

Now processing: data/wiki-pages-text/wiki-027.txt


 25%|██▍       | 27/109 [35:49<2:47:29, 122.56s/it]

Now processing: data/wiki-pages-text/wiki-028.txt


 26%|██▌       | 28/109 [37:59<2:48:18, 124.67s/it]

Now processing: data/wiki-pages-text/wiki-029.txt


 27%|██▋       | 29/109 [40:21<2:53:22, 130.04s/it]

Now processing: data/wiki-pages-text/wiki-030.txt


 28%|██▊       | 30/109 [42:30<2:50:38, 129.61s/it]

Now processing: data/wiki-pages-text/wiki-031.txt


 28%|██▊       | 31/109 [44:30<2:44:43, 126.71s/it]

Now processing: data/wiki-pages-text/wiki-032.txt


 29%|██▉       | 32/109 [46:30<2:40:08, 124.78s/it]

Now processing: data/wiki-pages-text/wiki-033.txt


 30%|███       | 33/109 [48:42<2:40:50, 126.97s/it]

Now processing: data/wiki-pages-text/wiki-034.txt


 31%|███       | 34/109 [50:40<2:35:17, 124.24s/it]

Now processing: data/wiki-pages-text/wiki-035.txt


 32%|███▏      | 35/109 [52:42<2:32:17, 123.48s/it]

Now processing: data/wiki-pages-text/wiki-036.txt


 33%|███▎      | 36/109 [54:43<2:29:34, 122.94s/it]

Now processing: data/wiki-pages-text/wiki-037.txt


 34%|███▍      | 37/109 [56:44<2:26:40, 122.23s/it]

Now processing: data/wiki-pages-text/wiki-038.txt


 35%|███▍      | 38/109 [58:57<2:28:26, 125.44s/it]

Now processing: data/wiki-pages-text/wiki-039.txt


 36%|███▌      | 39/109 [1:01:02<2:26:25, 125.50s/it]

Now processing: data/wiki-pages-text/wiki-040.txt


 37%|███▋      | 40/109 [1:03:01<2:22:03, 123.53s/it]

Now processing: data/wiki-pages-text/wiki-041.txt


 38%|███▊      | 41/109 [1:05:02<2:18:51, 122.52s/it]

Now processing: data/wiki-pages-text/wiki-042.txt


 39%|███▊      | 42/109 [1:07:03<2:16:19, 122.09s/it]

Now processing: data/wiki-pages-text/wiki-043.txt


 39%|███▉      | 43/109 [1:09:07<2:14:58, 122.70s/it]

Now processing: data/wiki-pages-text/wiki-044.txt


 40%|████      | 44/109 [1:11:25<2:17:56, 127.33s/it]

Now processing: data/wiki-pages-text/wiki-045.txt


 41%|████▏     | 45/109 [1:14:03<2:25:50, 136.72s/it]

Now processing: data/wiki-pages-text/wiki-046.txt


 42%|████▏     | 46/109 [1:16:37<2:28:52, 141.78s/it]

Now processing: data/wiki-pages-text/wiki-047.txt


 43%|████▎     | 47/109 [1:19:00<2:26:58, 142.23s/it]

Now processing: data/wiki-pages-text/wiki-048.txt


 44%|████▍     | 48/109 [1:21:24<2:24:53, 142.51s/it]

Now processing: data/wiki-pages-text/wiki-049.txt


 45%|████▍     | 49/109 [1:23:42<2:21:20, 141.34s/it]

Now processing: data/wiki-pages-text/wiki-050.txt


 46%|████▌     | 50/109 [1:26:05<2:19:17, 141.65s/it]

Now processing: data/wiki-pages-text/wiki-051.txt


 47%|████▋     | 51/109 [1:28:21<2:15:26, 140.11s/it]

Now processing: data/wiki-pages-text/wiki-052.txt


 48%|████▊     | 52/109 [1:30:42<2:13:18, 140.32s/it]

Now processing: data/wiki-pages-text/wiki-053.txt


 49%|████▊     | 53/109 [1:33:00<2:10:23, 139.70s/it]

Now processing: data/wiki-pages-text/wiki-054.txt


 50%|████▉     | 54/109 [1:35:13<2:06:16, 137.76s/it]

Now processing: data/wiki-pages-text/wiki-055.txt


 50%|█████     | 55/109 [1:37:20<2:00:53, 134.32s/it]

Now processing: data/wiki-pages-text/wiki-056.txt


 51%|█████▏    | 56/109 [1:39:29<1:57:27, 132.98s/it]

Now processing: data/wiki-pages-text/wiki-057.txt


 52%|█████▏    | 57/109 [1:41:59<1:59:32, 137.92s/it]

Now processing: data/wiki-pages-text/wiki-058.txt


 53%|█████▎    | 58/109 [1:44:25<1:59:23, 140.47s/it]

Now processing: data/wiki-pages-text/wiki-059.txt


 54%|█████▍    | 59/109 [1:47:03<2:01:28, 145.78s/it]

Now processing: data/wiki-pages-text/wiki-060.txt


 55%|█████▌    | 60/109 [1:49:48<2:03:40, 151.44s/it]

Now processing: data/wiki-pages-text/wiki-061.txt


 56%|█████▌    | 61/109 [1:52:33<2:04:23, 155.48s/it]

Now processing: data/wiki-pages-text/wiki-062.txt


 57%|█████▋    | 62/109 [1:55:23<2:05:06, 159.72s/it]

Now processing: data/wiki-pages-text/wiki-063.txt


 58%|█████▊    | 63/109 [1:58:01<2:02:05, 159.24s/it]

Now processing: data/wiki-pages-text/wiki-064.txt


 59%|█████▊    | 64/109 [2:00:36<1:58:25, 157.89s/it]

Now processing: data/wiki-pages-text/wiki-065.txt


 60%|█████▉    | 65/109 [2:03:13<1:55:40, 157.73s/it]

Now processing: data/wiki-pages-text/wiki-066.txt


 61%|██████    | 66/109 [2:10:56<2:58:40, 249.31s/it]

Now processing: data/wiki-pages-text/wiki-067.txt


 61%|██████▏   | 67/109 [2:13:49<2:38:26, 226.35s/it]

Now processing: data/wiki-pages-text/wiki-068.txt


 62%|██████▏   | 68/109 [2:16:39<2:23:14, 209.62s/it]

Now processing: data/wiki-pages-text/wiki-069.txt


 63%|██████▎   | 69/109 [2:19:29<2:11:44, 197.61s/it]

Now processing: data/wiki-pages-text/wiki-070.txt


 64%|██████▍   | 70/109 [2:22:23<2:03:49, 190.50s/it]

Now processing: data/wiki-pages-text/wiki-071.txt


 65%|██████▌   | 71/109 [2:25:20<1:58:03, 186.41s/it]

Now processing: data/wiki-pages-text/wiki-072.txt


 66%|██████▌   | 72/109 [2:28:14<1:52:46, 182.88s/it]

Now processing: data/wiki-pages-text/wiki-073.txt


 67%|██████▋   | 73/109 [2:31:29<1:51:49, 186.36s/it]

Now processing: data/wiki-pages-text/wiki-074.txt


 68%|██████▊   | 74/109 [2:34:43<1:50:08, 188.80s/it]

Now processing: data/wiki-pages-text/wiki-075.txt


 69%|██████▉   | 75/109 [2:37:54<1:47:20, 189.44s/it]

Now processing: data/wiki-pages-text/wiki-076.txt


 70%|██████▉   | 76/109 [2:41:30<1:48:35, 197.43s/it]

Now processing: data/wiki-pages-text/wiki-077.txt


 71%|███████   | 77/109 [2:44:49<1:45:32, 197.90s/it]

Now processing: data/wiki-pages-text/wiki-078.txt


 72%|███████▏  | 78/109 [2:48:16<1:43:41, 200.68s/it]

Now processing: data/wiki-pages-text/wiki-079.txt


 72%|███████▏  | 79/109 [2:52:03<1:44:16, 208.55s/it]

Now processing: data/wiki-pages-text/wiki-080.txt


 73%|███████▎  | 80/109 [2:55:53<1:43:55, 215.01s/it]

Now processing: data/wiki-pages-text/wiki-081.txt


 74%|███████▍  | 81/109 [2:59:33<1:40:56, 216.32s/it]

Now processing: data/wiki-pages-text/wiki-082.txt


 75%|███████▌  | 82/109 [3:03:17<1:38:21, 218.56s/it]

Now processing: data/wiki-pages-text/wiki-083.txt


 76%|███████▌  | 83/109 [3:06:47<1:33:35, 215.98s/it]

Now processing: data/wiki-pages-text/wiki-084.txt


 77%|███████▋  | 84/109 [3:10:08<1:28:07, 211.48s/it]

Now processing: data/wiki-pages-text/wiki-085.txt


 78%|███████▊  | 85/109 [3:13:46<1:25:28, 213.69s/it]

Now processing: data/wiki-pages-text/wiki-086.txt


 79%|███████▉  | 86/109 [3:17:25<1:22:30, 215.24s/it]

Now processing: data/wiki-pages-text/wiki-087.txt


 80%|███████▉  | 87/109 [3:21:10<1:19:55, 217.96s/it]

Now processing: data/wiki-pages-text/wiki-088.txt


 81%|████████  | 88/109 [3:25:19<1:19:34, 227.34s/it]

Now processing: data/wiki-pages-text/wiki-089.txt


 82%|████████▏ | 89/109 [3:29:42<1:19:24, 238.25s/it]

Now processing: data/wiki-pages-text/wiki-090.txt


 83%|████████▎ | 90/109 [3:33:48<1:16:10, 240.57s/it]

Now processing: data/wiki-pages-text/wiki-091.txt


 83%|████████▎ | 91/109 [3:37:56<1:12:46, 242.59s/it]

Now processing: data/wiki-pages-text/wiki-092.txt


 84%|████████▍ | 92/109 [3:41:55<1:08:29, 241.74s/it]

Now processing: data/wiki-pages-text/wiki-093.txt


 85%|████████▌ | 93/109 [3:46:12<1:05:37, 246.10s/it]

Now processing: data/wiki-pages-text/wiki-094.txt


 86%|████████▌ | 94/109 [3:50:09<1:00:53, 243.58s/it]

Now processing: data/wiki-pages-text/wiki-095.txt


 87%|████████▋ | 95/109 [3:54:16<57:03, 244.57s/it]  

Now processing: data/wiki-pages-text/wiki-096.txt


 88%|████████▊ | 96/109 [3:58:21<52:58, 244.52s/it]

Now processing: data/wiki-pages-text/wiki-097.txt


 89%|████████▉ | 97/109 [4:02:20<48:34, 242.89s/it]

Now processing: data/wiki-pages-text/wiki-098.txt


 90%|████████▉ | 98/109 [4:06:31<44:58, 245.32s/it]

Now processing: data/wiki-pages-text/wiki-099.txt


 91%|█████████ | 99/109 [4:11:06<42:21, 254.14s/it]

Now processing: data/wiki-pages-text/wiki-100.txt


 92%|█████████▏| 100/109 [4:15:25<38:20, 255.65s/it]

Now processing: data/wiki-pages-text/wiki-101.txt


 93%|█████████▎| 101/109 [4:19:55<34:39, 259.91s/it]

Now processing: data/wiki-pages-text/wiki-102.txt


 94%|█████████▎| 102/109 [4:24:19<30:27, 261.14s/it]

Now processing: data/wiki-pages-text/wiki-103.txt


 94%|█████████▍| 103/109 [4:28:36<26:00, 260.16s/it]

Now processing: data/wiki-pages-text/wiki-104.txt


 95%|█████████▌| 104/109 [4:33:15<22:08, 265.77s/it]

Now processing: data/wiki-pages-text/wiki-105.txt


 96%|█████████▋| 105/109 [4:37:42<17:44, 266.01s/it]

Now processing: data/wiki-pages-text/wiki-106.txt


 97%|█████████▋| 106/109 [4:42:12<13:21, 267.27s/it]

Now processing: data/wiki-pages-text/wiki-107.txt


 98%|█████████▊| 107/109 [4:46:48<08:59, 269.82s/it]

Now processing: data/wiki-pages-text/wiki-108.txt


 99%|█████████▉| 108/109 [4:51:15<04:29, 269.01s/it]

Now processing: data/wiki-pages-text/wiki-109.txt


100%|██████████| 109/109 [4:52:36<00:00, 212.49s/it]


Check DB statistics

In [10]:
# Shell out to the xapian-delve command-line tool to print summary statistics for the database at dbpath.
!xapian-delve $dbpath

UUID = aec569e4-d627-4bcd-91f6-b56749d28c15
number of documents = 25248397
average document length = 35.7647
document length lower bound = 1
document length upper bound = 7800
highest document id ever used = 25248397
has positional information = true
revision = 2580
currently open for writing = false


Search DB

In [11]:
# Maximum number of matches search() requests from Xapian for a query.
TOP_RESULTS_LIMIT = 15

In [12]:
def search(dbpath, querystring, offset=0, pagesize=10):
    """Query the Xapian index, print the top hits and return them.

    Args:
        dbpath: Path of the Xapian database to open (read-only).
        querystring: Query text; parsed with xapian.QueryParser using the
            English stemmer and the STEM_SOME strategy.
        offset: Zero-based position of the first match to retrieve.
        pagesize: Accepted for backward compatibility but not used; the number
            of matches requested is the module-level TOP_RESULTS_LIMIT.
            NOTE(review): decide whether pagesize should replace that constant.

    Returns:
        The xapian.MSet with the retrieved matches, so callers can inspect the
        hits programmatically in addition to the printed listing.
    """
    database = xapian.Database(dbpath)

    # Configure the parser against the same database.
    qp = xapian.QueryParser()
    qp.set_stemmer(xapian.Stem("english"))
    qp.set_database(database)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = qp.parse_query(querystring)

    # Retrieve the best matches for the query, starting at `offset`.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)
    matches = enquire.get_mset(offset, TOP_RESULTS_LIMIT)

    # Display the results; m.rank is zero-based, hence the +1.
    for m in matches:
        print('RANK:', m.rank + 1)
        print('PERCENTAGE MATCH:', m.percent)
        print('DOC ID:', m.docid)
        print('DOC TXT:', m.document.get_data())

    return matches

In [22]:
# Example query against the index; search() prints the rank, percentage match,
# document id and stored data of each hit.
query = "When in rome do as romans do"
matches = search(dbpath, query)

RANK: 1
PERCENTAGE MATCH: 100
DOC ID: 24496184
DOC TXT: b"`` When in Rome , do as the Romans do '' , a saying attributed to Ambrose .\n,,,When_in_Rome,,,3"
RANK: 2
PERCENTAGE MATCH: 94
DOC ID: 24431417
DOC TXT: b"That reply is said to have brought about the saying `` When in Rome , do as the Romans do . ''\n,,,When_in_Rome,_do_as_the_Romans_do,,,6"
RANK: 3
PERCENTAGE MATCH: 92
DOC ID: 24431413
DOC TXT: b'When in Rome , do as the Romans do -LRB- often shortened to when in Rome ... -RRB- or a later version when in Rome , do as the Pope does , a proverb attributed to Saint Ambrose , means that it is advisable to follow the conventions of the area in which you are residing or visiting .\n,,,When_in_Rome,_do_as_the_Romans_do,,,0'
RANK: 4
PERCENTAGE MATCH: 85
DOC ID: 24714837
DOC TXT: b"When in Rome Do as The Vandals is the first album by the Huntington Beach punk rock band The Vandals , released in 1984 by National Trust Records , Its title is a play on the phrase `` When in Rome , do as th