In [3]:
from whoosh import index
from whoosh.fields import Schema, ID, TEXT, KEYWORD
import os
import json
from tqdm import tqdm

# Step 1: Define the index schema.
# The ID fields (word, lang, pos) and the KEYWORD field are declared next to the
# free-text TEXT fields; every field is stored so search hits can return its
# original value.
schema_fields = {
    "word": ID(stored=True),
    "lang": ID(stored=True),
    "pos": ID(stored=True),
    "etymology_text": TEXT(stored=True),
    # Comma-separated, lowercased list of origin languages.
    "etymology_langs": KEYWORD(stored=True, commas=True, lowercase=True),
    "glosses": TEXT(stored=True),
}
schema = Schema(**schema_fields)

# Step 2: Create the index directory if needed, then create or open the index.
index_dir = "indexdir"

# exist_ok=True makes this cell safe to re-run.
os.makedirs(index_dir, exist_ok=True)

# Decide on "does an index exist here?" rather than "does the directory exist?".
# A directory left behind without an index (e.g. create_in failed after the
# directory was made, or the folder was created by hand) would otherwise be
# passed to open_dir, which raises because there is nothing to open.
if index.exists_in(index_dir):
    ix = index.open_dir(index_dir)
else:
    ix = index.create_in(index_dir, schema)

In [5]:
# Step 3: Location of the Kaikki JSONL dump (one JSON object per line).
jsonl_path = "kaikki.org-dictionary-English.jsonl"

# Step 4: Pre-count the lines so tqdm can show a percentage and ETA.
# This is optional: it costs one extra pass over the file.
with open(jsonl_path, "r", encoding="utf-8") as jsonl_file:
    total_lines = 0
    for _ in jsonl_file:
        total_lines += 1

In [None]:
# Step 5: Index documents
def _build_document(entry):
    """Map one parsed Kaikki entry to the fields passed to ``writer.add_document``.

    Args:
        entry: dict decoded from one line of the JSONL file.

    Returns:
        A dict of field name -> value, or ``None`` when the entry's ``lang``
        is not "English" and should not be indexed.
    """
    if entry.get("lang") != "English":
        return None

    # Extract origin languages from etymology_templates.
    # NOTE(review): this reads a "lang" key on each template; confirm the Kaikki
    # data actually provides it, otherwise etymology_langs will always be empty.
    ety_langs = set()
    for tpl in entry.get("etymology_templates", []):
        lang = tpl.get("lang")
        if lang:
            ety_langs.add(lang.lower())

    # Flatten the glosses of every sense into one searchable string.
    gloss_list = []
    for sense in entry.get("senses", []):
        gloss_list.extend(sense.get("glosses", []))

    return {
        "word": entry.get("word", ""),
        "lang": "English",
        "pos": entry.get("pos", ""),
        "etymology_text": entry.get("etymology_text", ""),
        # sorted() keeps the stored value stable between runs (set order is not).
        "etymology_langs": ",".join(sorted(ety_langs)),
        "glosses": "; ".join(gloss_list),
    }


# Re-running this cell against an index that already has documents would add
# every entry a second time, so only index into an empty index. The writer
# commits when its `with` block exits, so a non-zero count means an earlier run
# finished. Delete the index directory to force a rebuild.
existing_docs = ix.doc_count()
if existing_docs > 0:
    print(f"Index already contains {existing_docs} documents; skipping indexing.")
else:
    skipped = 0
    with ix.writer() as writer:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in tqdm(f, total=total_lines, desc="Indexing"):
                # Blank lines (e.g. a trailing newline) are not entries.
                if not line.strip():
                    continue
                try:
                    document = _build_document(json.loads(line))
                    if document is not None:
                        writer.add_document(**document)
                except Exception as e:
                    # Best effort: one malformed entry must not abort a
                    # multi-minute indexing run. tqdm.write keeps the progress
                    # bar intact, unlike a bare print().
                    skipped += 1
                    tqdm.write(f"Skipping entry due to error: {e}")
    if skipped:
        print(f"Skipped {skipped} entries due to errors.")

Indexing: 100%|██████████| 1383078/1383078 [07:40<00:00, 3003.49it/s] 


In [None]:
from whoosh.qparser import QueryParser
ix = index.open_dir("indexdir")

# Build the query against the free-text etymology field before opening a searcher.
# NOTE(review): the default text analyzer probably drops "from" as a stop word,
# so this likely matches on "latin" alone - confirm if phrase matching is intended.
parser = QueryParser("etymology_text", ix.schema)
query = parser.parse("from Latin")

# Print the top 10 hits, each with the first 100 characters of its etymology.
with ix.searcher() as searcher:
    for hit in searcher.search(query, limit=10):
        preview = hit['etymology_text'][:100]
        print(f"{hit['word']} ({hit['pos']}): {preview}...")