Whoosh is a fast pure python search engine library. The primary design impetus of whoosh is that it is pure python. Whoosh isn't a search engine. Rather, it is a developer library for creating a search engineer. Let's create some search capability within Python. 

In [731]:
import sys, os, os.path
from os.path import join
from whoosh import index
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer

In [732]:
booksource = "datasets/books-lite"
print(os.path.exists(booksource))

True


In [733]:
bookSourceDirectory = os.listdir(booksource)

## Clean up files in the book directory

In [734]:
def cleanUp(bookSourceDirectory):
    for file in bookSourceDirectory:
        if not file.endswith(".txt"):
            filePath = os.path.join(booksource, file)
            print("Deleting:", filePath)        
            os.remove(filePath)

In [735]:
try:
    try:
        cleanUp(bookSourceDirectory)
    except:
        writer.cancel()
        cleanUp(bookSourceDirectory)
except:
    print("ERROR:\n", sys.exc_info()[0])

Deleting: datasets/books-lite\MAIN_18k5gdaboatzal4e.seg
Deleting: datasets/books-lite\MAIN_WRITELOCK
Deleting: datasets/books-lite\_MAIN_1.toc


In [736]:
print(os.listdir(booksource))

['acts.txt', 'numbers.txt', 'romans.txt']


## Create schema and load the data

In [737]:
schema = Schema(filename = ID (stored = True),
                content = TEXT (analyzer = StemmingAnalyzer())
               )

In [738]:
# this clears the existing index in the directory
ix = index.create_in(booksource, schema)

In [739]:
# get a writer from the created index
writer = ix.writer()

In [740]:
def loadFile(writer, fileName):
    # read contents of a file and load it into a database
    with open(fileName, "r") as infile:
        content = infile.read()
        print(content[0:25])
        writer.add_document(filename = fileName, content = content)
        print("Indexed:", fileName)
        print("================")

In [741]:
def processFolder(writer, folder):
    # process a folder for files and subfolders
    print("Processing folder:", folder)
    for root, dirs, files in os.walk(folder):
        print("root = ", root)
        
        # process the files
        for file in files:
            filename = os.path.join(root, file)
            if file.endswith(".txt"):                
                print("Processing File:", filename)
                loadFile(writer, filename)
            else:
                print("Unhandled File:", filename)
                
        # recurse into the subfolders
        for d in dirs:
            print("Recursing into:", d)
            processFolder(writer, d)

In [742]:
processFolder(writer, booksource)
writer.commit()

Processing folder: datasets/books-lite
root =  datasets/books-lite
Processing File: datasets/books-lite\acts.txt
ACTS OF THE APOSTLES 
1:1
Indexed: datasets/books-lite\acts.txt
Unhandled File: datasets/books-lite\MAIN_r56illgb683xbhv5.pst
Unhandled File: datasets/books-lite\MAIN_r56illgb683xbhv5.trm
Unhandled File: datasets/books-lite\MAIN_WRITELOCK
Processing File: datasets/books-lite\numbers.txt
The Fourth Book of Moses 
Indexed: datasets/books-lite\numbers.txt
Processing File: datasets/books-lite\romans.txt
ROMANS
  1:1: Paul, a ser
Indexed: datasets/books-lite\romans.txt
Unhandled File: datasets/books-lite\_MAIN_0.toc
Recursing into: MAIN.tmp
Processing folder: MAIN.tmp
root =  datasets/books-lite\MAIN.tmp
Unhandled File: datasets/books-lite\MAIN.tmp\i40h0vblm6b2dofx8movploakss3.ctmp


## Executing Queries
http://whoosh.readthedocs.io/en/latest/searching.html

In [743]:
from whoosh.qparser import QueryParser

In [744]:
def search(query):
    qp = QueryParser("content", schema = ix.schema)
    q = qp.parse(query)
    with ix.searcher() as s:
        results = s.search(q)
        print("Results:", len(results))
        print("\n")
        for hit in results:
            print(hit)

In [745]:
search(u"judged OR power")

Results: 3


<Hit {'filename': 'datasets/books-lite\\romans.txt'}>
<Hit {'filename': 'datasets/books-lite\\acts.txt'}>
<Hit {'filename': 'datasets/books-lite\\numbers.txt'}>


In [746]:
search(u"abode")

Results: 2


<Hit {'filename': 'datasets/books-lite\\acts.txt'}>
<Hit {'filename': 'datasets/books-lite\\numbers.txt'}>


In [747]:
search(u"wealth")

Results: 1


<Hit {'filename': 'datasets/books-lite\\acts.txt'}>


In [748]:
search(u"strong AND powerful")

Results: 3


<Hit {'filename': 'datasets/books-lite\\romans.txt'}>
<Hit {'filename': 'datasets/books-lite\\numbers.txt'}>
<Hit {'filename': 'datasets/books-lite\\acts.txt'}>


In [749]:
search(u"weak AND powerless")

Results: 0


