In [None]:
from whoosh.fields import Schema, TEXT, ID, KEYWORD
from whoosh import index
import os

# Index schema for one dictionary entry. Every field is stored, so its
# original value can be read back from a search hit.
_entry_fields = {
    # Exact-match identifiers (indexed as a single term, not tokenized).
    "word": ID(stored=True),
    "lang": ID(stored=True),
    "pos": ID(stored=True),
    # Free text, run through the analyzer for full-text search.
    "etymology_text": TEXT(stored=True),
    "glosses": TEXT(stored=True),  # all glosses of the entry, flattened
    # Comma-separated list of tags.
    "tags": KEYWORD(stored=True, commas=True),
}
schema = Schema(**_entry_fields)

# Open the on-disk index, creating the directory and the index when needed.
# The check is index.exists_in() rather than a bare directory test: a
# directory that exists but holds no index (e.g. after an interrupted first
# run) would otherwise make open_dir() fail with EmptyIndexError.
index_dir = "indexdir"
os.makedirs(index_dir, exist_ok=True)
if index.exists_in(index_dir):
    ix = index.open_dir(index_dir)
else:
    ix = index.create_in(index_dir, schema)



In [4]:
import json
from whoosh.writing import AsyncWriter

# Path to the JSON Lines dump: one JSON object (dictionary entry) per line.
jsonl_path = "kaikki.org-dictionary-English-words.jsonl"

# One writer is used for the whole import. Leaving the `with` block commits
# the batch on success and cancels it if an exception propagates.
# NOTE(review): add_document() appends, so re-running this cell against an
# already populated index will presumably duplicate entries — confirm.
with ix.writer() as writer, open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which json.loads() would reject.
        if not line.strip():
            continue

        entry = json.loads(line)
        if entry.get("lang") != "English":
            continue

        # Flatten the per-sense glosses and tags into entry-level values.
        glosses = []
        tags = set()
        for sense in entry.get("senses", []):
            glosses.extend(sense.get("glosses", []))
            tags.update(sense.get("tags", []))

        writer.add_document(
            word=entry.get("word", ""),
            lang="English",
            pos=entry.get("pos", ""),
            etymology_text=entry.get("etymology_text", ""),
            glosses="; ".join(glosses),
            # Sorted so the stored value is deterministic across runs
            # (set iteration order is not).
            tags=",".join(sorted(tags)),
        )
