## 1. Buiding Whoosh Schema

In [19]:
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer

schema = Schema(filename=ID(stored=True),
                line_num=ID(stored=True),
                content=TEXT(analyzer=StemmingAnalyzer(), stored=True)
               )

## 2. Loading Data

In [20]:
import os, os.path
from whoosh import index

# Note, this clears the existing index in the directory
ix = index.create_in("hp", schema)

# Get a writer form the created index in 
writer = ix.writer()

In [21]:
def loadFile(writer, fname):
    '''
    Read file contents, load into database.
    '''
    line_no = 1
    with open(fname, 'r') as infile:
        # TODO: create indexes for each line in the input file
        
        for line in infile:
            line = line.rstrip("\n")
            line_no += 1
            writer.add_document(filename = fname, \
                               line_num = str(line_no), \
                                content = line
                               )
        
        print("Indexed: ", fname)


def processFolder(writer,folder):
    print('Processing folder: ',folder)
    for root, dirs, files in os.walk(folder):
        print("root = ", root)
        # Process Files
        for file in files:
            if file.endswith(".txt"):
                filename = os.path.join(root, file)
                loadFile(writer,filename)
            else:
                print("Unhandled File")
        # Recurse into subfolders
        for d in dirs:
            print("recursing into ",d)
            processFolder(writer,d)

In [22]:
processFolder(writer,"hp")
writer.commit() # save changes

Processing folder:  hp
root =  hp
Indexed:  hp/CHAPTER 1.txt
Indexed:  hp/CHAPTER 2.txt
Indexed:  hp/CHAPTER 3.txt
Indexed:  hp/CHAPTER 4.txt
Indexed:  hp/CHAPTER 5.txt
Indexed:  hp/CHAPTER 6.txt
Indexed:  hp/CHAPTER 7.txt
Indexed:  hp/CHAPTER 8.txt
Unhandled File
Unhandled File
Unhandled File
Unhandled File
Unhandled File
recursing into  MAIN.tmp
Processing folder:  MAIN.tmp
root =  hp/MAIN.tmp
Unhandled File


## 3. Executing Queries

In [26]:
from whoosh.qparser import QueryParser

qp = QueryParser("content", schema = ix.schema)
q = qp.parse(u"Harry")

with ix.searcher() as s:
    results = s.search(q)
    for hit in results:
        print(hit["filename"], hit["line_num"], hit["content"])

# Find the indexes of lines where the 
# string 'Harry' is appearing. 

hp/CHAPTER 6.txt 708     "Harry Potter," said Harry.
hp/CHAPTER 2.txt 396 "Harry was talking to it, weren't you, Harry?"
hp/CHAPTER 1.txt 97 son, Harry"
hp/CHAPTER 2.txt 45     Harry groaned.
hp/CHAPTER 3.txt 18 Harry Hunting.
hp/CHAPTER 3.txt 339 Harry in amazement.
hp/CHAPTER 5.txt 916     Harry swallowed.
hp/CHAPTER 6.txt 349     "Harry Potter!"
hp/CHAPTER 6.txt 424     Harry nodded.
hp/CHAPTER 6.txt 756     Harry stared.


In [29]:
from whoosh.qparser import QueryParser
from whoosh import scoring

qp = QueryParser("content", schema = ix.schema)
q = qp.parse(u"Harry")

with ix.searcher(weighting = scoring.TF_IDF()) as s:
    results = s.search(q)
    for hit in results:
        print(hit["filename"], hit["line_num"], hit["content"])

# Find the indexes of lines where the string
#'Harry' using TF_IDF as the scoring mechanism. 

hp/CHAPTER 2.txt 91 newspaper and shouted that Harry needed a haircut. Harry must have
hp/CHAPTER 2.txt 258 to complain about things: people at work, Harry, the council, Harry,
hp/CHAPTER 2.txt 396 "Harry was talking to it, weren't you, Harry?"
hp/CHAPTER 5.txt 296 his great hand on Harry's shoulder and making Harry's knees buckle.
hp/CHAPTER 5.txt 750 supply of some basic potion ingredients for Harry, Harry himself
hp/CHAPTER 5.txt 809     Mr. Ollivander moved closer to Harry. Harry wished he would
hp/CHAPTER 5.txt 933 Harry's lap. Up another escalator, out into Paddington station; Harry
hp/CHAPTER 6.txt 402     Harry shook his head and the boy sat down. He glanced at Harry
hp/CHAPTER 6.txt 708     "Harry Potter," said Harry.
hp/CHAPTER 6.txt 809     He held out his hand to shake Harry's, but Harry didn't take it.


In [30]:
from whoosh.query import *

with ix.searcher(weighting = scoring.TF_IDF()) as s:
    qp = QueryParser("content", ix.schema)
    user_q = qp.parse(u"Harry")
    
    allow_q = Term("filename", "hp/CHAPTER 6.txt")
    
    results = s.search(user_q, filter = allow_q)
    
    for hit in results:
        print(hit["filename"], hit["line_num"], hit["content"])

# Use a filter to list the indexes in chapter 6 
# corresponding to the search string 'Harry' 
# using TF_IDF as the scoring mechanism. 

hp/CHAPTER 6.txt 402     Harry shook his head and the boy sat down. He glanced at Harry
hp/CHAPTER 6.txt 708     "Harry Potter," said Harry.
hp/CHAPTER 6.txt 809     He held out his hand to shake Harry's, but Harry didn't take it.
hp/CHAPTER 6.txt 6     Harry's last month with the Dursleys wasn't fun. True, Dudley was
hp/CHAPTER 6.txt 7 now so scared of Harry he wouldn't stay in the same room, while Aunt
hp/CHAPTER 6.txt 8 Petunia and Uncle Vernon didn't shut Harry in his cupboard, force
hp/CHAPTER 6.txt 11 chair with Harry in it were empty. Although this was an improvement
hp/CHAPTER 6.txt 14     Harry kept to his room, with his new owl for company. He had
hp/CHAPTER 6.txt 20 mice. Every night before he went to sleep, Harry ticked off another
hp/CHAPTER 6.txt 41     Grunt. Harry supposed that meant yes.
