In [1]:
import os
from whoosh.index import create_in
from whoosh.fields import *
import sys
from tqdm import tqdm_notebook as tqdm
from whoosh.analysis import StemmingAnalyzer
from whoosh.qparser import QueryParser
import time
import glob

# Create the schema and build the skeleton of the index

In [2]:
# One indexed document corresponds to one sentence line of a wiki shard file.
schema = Schema(
    # First token of the line; stored so it can be shown with each hit.
    page_id=TEXT(stored=True),
    # Second token of the line when numeric, otherwise the string "UNKNOWN".
    sentence_num=TEXT(stored=True),
    # Shard identifier taken from the shard file's path.
    shard_num=NUMERIC(stored=True),
    # Sentence text that queries are parsed against; indexed but not stored.
    content=TEXT,
    # Same sentence text again, stored so it can be printed with each hit.
    textdata=TEXT(stored=True),
)

In [3]:
# Make sure the index directory exists. makedirs(exist_ok=True) replaces the
# exists()/mkdir() pair: it has no check-then-create race and also works if
# the directory name is later changed to a nested path.
os.makedirs("indexdir", exist_ok=True)

# NOTE: per the Whoosh documentation, create_in() on a directory that already
# holds an index clears that index, so re-running this cell starts from an
# empty index and the documents have to be written again.
ix = create_in("indexdir", schema=schema)

In [4]:
# Obtain a writer for the index created above; the indexing cell further down
# adds every document through this object and then calls writer.commit().
writer = ix.writer()

# Read the files and Write the documents to the Schema

In [5]:
# Directory that holds the wiki shard files (*.txt). It can be overridden with
# the WIKI_PAGES_DIR environment variable so the notebook is not tied to one
# machine; without the variable the original location is used.
path = os.environ.get("WIKI_PAGES_DIR", "/Users/hima95/Downloads/wiki-pages-text/")

# Recursively collect every .txt file below `path`.
# - os.path.join() builds the pattern correctly whether or not `path` ends
#   with a separator.
# - sorted() fixes the order, because glob returns files in arbitrary order
#   and the order decides the sequence in which documents are indexed.
files = sorted(glob.glob(os.path.join(path, "**", "*.txt"), recursive=True))

In [6]:
def _parse_shard_line(line):
    """Split one shard line into ``(page_id, sentence_id, sentence)``.

    The first space-separated token is the page id. If the next token is
    numeric it is used as the sentence id; otherwise the sentence id is the
    string "UNKNOWN" and that token stays part of the sentence text.

    Returns None for blank lines so the caller can skip them. A line that
    holds only a page id yields an "UNKNOWN" sentence id and empty text
    instead of raising IndexError.
    """
    line = line.rstrip("\n")
    if not line.strip():
        return None
    page_id, _, rest = line.partition(" ")
    candidate, _, remainder = rest.partition(" ")
    if candidate.isnumeric():
        return page_id, candidate, remainder
    return page_id, "UNKNOWN", rest


def _shard_from_path(shard_path):
    """Return the part of the file name after its last "-", without extension.

    Only the file name is inspected, so hyphens in directory names cannot
    shift the result (e.g. ".../wiki-pages-text/wiki-001.txt" -> "001").
    """
    stem = os.path.splitext(os.path.basename(shard_path))[0]
    return stem.rsplit("-", 1)[-1]


start = time.time()
try:
    for shard_path in files:
        shard = _shard_from_path(shard_path)
        # `with` closes the file even if indexing fails part-way; iterating
        # the handle streams the shard instead of loading it all at once.
        # UTF-8 is requested explicitly so decoding does not depend on the
        # platform default (assumes the shards are UTF-8 -- TODO confirm).
        with open(shard_path, "r", encoding="utf-8") as fp:
            for line in fp:
                parsed = _parse_shard_line(line)
                if parsed is None:
                    continue
                page_id, sentence_id, sentence = parsed

                # The sentence goes into both text fields: `content` is the
                # one queried, `textdata` is the stored copy for display.
                writer.add_document(page_id=page_id, sentence_num=sentence_id,
                                    shard_num=shard, content=sentence,
                                    textdata=sentence)
except BaseException:
    # Also reached on KeyboardInterrupt: discard the partial batch and close
    # the writer so it does not stay open holding the index. A new writer
    # must be created before indexing is attempted again.
    writer.cancel()
    raise
else:
    writer.commit()
print("TIME:", time.time() - start)

KeyboardInterrupt: 

# Load Stored Index and Run the Query

In [7]:
# Creating Search Index from stored index

In [7]:
from whoosh.qparser import QueryParser
from whoosh import scoring
from whoosh.index import open_dir

In [8]:
# Re-open the on-disk index from "indexdir" (the directory the index was
# created in earlier) under a separate name, ix2, which the search cell uses.
ix2 = open_dir("indexdir")

In [9]:
# Free-text query that will be parsed against the "content" field.
query_str = "Fiesta Bowl was played in Sun Devil Stadium "
# Maximum number of hits to retrieve and print.
topN = 5

In [10]:
# Parse the query string against the "content" field of the stored index.
query = QueryParser("content", ix2.schema).parse(query_str)

# Search with the Frequency weighting and print at most topN hits. Iterating
# over the results yields the scored hits in rank order, so no index
# bookkeeping or explicit length check is needed.
with ix2.searcher(weighting=scoring.Frequency) as searcher:
    results = searcher.search(query, limit=topN)
    for hit in results:
        print("Page_ID:", hit['page_id'],
              "Score:", str(hit.score),
              "Sentence ID:", str(hit["sentence_num"]), "\n", hit['textdata'], "\n")

Page_ID: 1986_Fiesta_Bowl Score: 8.0 Sentence ID: 0 
 The 1986 Sunkist Fiesta Bowl was a college football bowl game played on January 1 , 1986 , at Sun Devil Stadium in Tempe , Arizona . 

Page_ID: 1999_Fiesta_Bowl Score: 7.0 Sentence ID: 0 
 The 1999 Fiesta Bowl , the designated BCS National Championship Game for the 1998 season , was played on January 4 , 1999 , in Tempe , Arizona at Sun Devil Stadium . 

Page_ID: 1987_Fiesta_Bowl Score: 7.0 Sentence ID: 1 
 The game was the 16th edition of the Fiesta Bowl , played annually since 1971 at Sun Devil Stadium in Tempe , Arizona . 

