In [64]:
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.analysis import StemmingAnalyzer
import os

# Create index directory if it doesn't exist
if not os.path.exists("indexdir"):
    os.makedirs("indexdir")

schema = Schema(id=ID(stored=True), 
                title=TEXT(stored=True, analyzer=StemmingAnalyzer()), 
                content=TEXT(analyzer=StemmingAnalyzer()),
                author=TEXT(stored=True),
                publisher=TEXT(stored=True),
                url=ID(stored=True),
                tags=KEYWORD(stored=True, commas=True))
# Create index
index = create_in("indexdir", schema)

In [65]:
from collections import defaultdict
from nltk.tokenize import sent_tokenize
import yaml
import re
from pathlib import Path

# Pattern to match the front matter
p = re.compile(r'---(.*?)---', re.DOTALL)

# List to store the documents
documents = []

# Iterate over all .txt files in the data directory
for file_path in Path('data').glob('*.txt'):
    with open(file_path, 'r') as f:
        data = f.read()

    match = p.match(data)
    fm = match.group(1)
    meta = yaml.safe_load(fm)
    
    document = p.sub("", data)
    
    sentences = sent_tokenize(document)
    
    for sentence in sentences:
        # Create a dictionary for each sentence
        doc_dict = {
            'id': str(len(documents)),
            'title': meta.get('title', ''),
            'content': sentence,
            'author': meta.get('author', ''),
            'publisher': meta.get('publisher', ''),
            'url': meta.get('URL', ''),
            'tags': ','.join(meta.get('tags', []))
        }
        
        # Add the dictionary to the list
        documents.append(doc_dict)

In [66]:
from whoosh.fields import Schema, TEXT, ID

# Define schema
schema = Schema(id=ID(stored=True), 
                title=TEXT(stored=True), 
                content=TEXT(stored=True),
                author=TEXT(stored=True),
                publisher=TEXT(stored=True),
                url=ID(stored=True),
                tags=KEYWORD(stored=True, commas=True))

# Add documents to the index
writer = AsyncWriter(index)
for doc in documents:
    writer.add_document(id=doc['id'], 
                        title=doc['title'], 
                        content=doc['content'],
                        author=doc['author'],
                        publisher=doc['publisher'],
                        url=doc['url'],
                        tags=doc['tags'])
writer.commit()

In [67]:
# Define schema
schema = Schema(id=ID(stored=True), 
                title=TEXT(stored=True), 
                content=TEXT(stored=True),
                author=TEXT(stored=True),
                publisher=TEXT(stored=True),
                url=ID(stored=True),
                tags=KEYWORD(stored=True, commas=True))

# Call the search function with the query "burial site"
search("burial site")

Title: The Site


KeyError: 'content'

In [68]:
print(index.schema)

<Schema: ['author', 'content', 'id', 'publisher', 'tags', 'title', 'url']>


In [74]:
from whoosh.fields import Schema, TEXT, ID, KEYWORD

# Define schema
schema = Schema(id=ID(stored=True), 
                title=TEXT(stored=True), 
                content=TEXT(stored=True),
                author=TEXT(stored=True),
                publisher=TEXT(stored=True),
                url=ID(stored=True),
                tags=KEYWORD(stored=True, commas=False))

In [75]:
# Add documents to the index
writer = AsyncWriter(index)
for doc in documents:
    writer.add_document(**doc)
writer.commit()

In [76]:
# Search the index
def search(query):
    with index.searcher() as searcher:
        query = QueryParser("content", index.schema).parse(query)
        results = searcher.search(query)
        print(f"Found {len(results)} results.")  # Print out the number of results
        for hit in results:
            print(hit.fields())  # Print out the fields of the hit
            print("Title:", hit["title"])
            print("Content:", hit["content"])
            print("Author:", hit["author"])
            print("Publisher:", hit["publisher"])
            print("URL:", hit["url"])
            print("Tags:", hit["tags"])
            print("--------------------")

# Call the search function with the query "burial site"
search("burial site")

Found 96 results.
{'author': 'Mohawk Mothers', 'id': '229', 'publisher': 'Mohawk Mothers', 'tags': 'i,n,d,i,g,e,n,o,u,s', 'title': 'The Site', 'url': 'https://www.mohawkmothers.ca/the-site'}
Title: The Site


KeyError: 'content'