In [10]:
TEMPLATE_TYPES = {"inh", "der", "bor", "cal", "cog", "clq", "rel", "root", "dbt", "clipping", "compound", "m", "short for", "back-form", "semantic loan", "blend", "lbor", "obor"}
POSITIONAL_ARG = {
    "cog": "2",
    "clq": "2",
    "rel": "2",
    "m": "2",
    "short for": "2",
    # default for most: 3rd positional argument
    "inh": "3",
    "der": "3",
    "bor": "3",
    "cal": "3",
    "lbor": "3",
    "obor": "3",
    "back-form": "3",
    "semantic loan": "3",
    "dbt": "3",
    "compound": "3",
    "clipping": "3",
    "blend": "3",
    "root": "3",
}

# Step 3: Path to Kaikki JSONL file
jsonl_path = "../kaikki.org-dictionary-English.jsonl"

# Step 4: Count lines (optional for tqdm progress bar)
with open(jsonl_path, "r", encoding="utf-8") as f:
    total_lines = sum(1 for _ in f)

In [None]:
from whoosh import index
from whoosh.fields import Schema, ID, TEXT, KEYWORD, STORED
import os
import json
from tqdm import tqdm

# Step 1: Define schema
# schema = Schema(
#     word=ID(stored=True),
#     lang=ID(stored=True),
#     pos=ID(stored=True),
#     etymology_text=TEXT(stored=True),
#     etymology_langs=KEYWORD(stored=True, commas=True, lowercase=True),
#     glosses=TEXT(stored=True)
# )
def get_schema():
    return Schema(
        word=ID(stored=True),
        lang=ID(stored=True),
        lang_code=ID(stored=True),
        pos=ID(stored=True),
        etymology_text=TEXT(stored=True),
        #etymology_langs=KEYWORD(stored=True, commas=True, lowercase=True),
        glosses=TEXT(stored=True),
        related_words=KEYWORD(stored=True, commas=True),
        cognates=KEYWORD(stored=True, commas=True),
        expansion_texts=TEXT(stored=True),
        #template_names=KEYWORD(stored=True, commas=True, lowercase=True),
        etymology_template=STORED
    )


def create_or_open_index(index_dir):
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
        ix = index.create_in(index_dir, schema=get_schema())
    else:
        ix = index.open_dir(index_dir)
    return ix


In [None]:
def build_index(jsonl_path, index_dir):
    ix = create_or_open_index(index_dir)
    with ix.writer() as writer:
    # Iterate with a progress bar over entries
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in tqdm(f, desc="Indexing", unit="entries"):
                entry = json.loads(line)
                related = []
                expansion_texts = []
                #etym_langs = []
                cognates = []
                templates = []
                
                # Extract source terms for any matching template
                for tpl in entry.get("etymology_templates", []):
                    name = tpl.get("name", "").lower()
                    if name not in TEMPLATE_TYPES:
                        continue
                    args = tpl.get("args", {})

                    # Determine which positional argument contains the source term
                    src = None
                    src_lang = None
                    src_key = POSITIONAL_ARG.get(name)
                    src_lang_key = str(eval(src_key) - 1)
                    if src_key and args.get(src_key):
                        src = args[src_key].strip()
                        src_lang = args.get(src_lang_key, "").strip()
  
                    if not src:
                        continue

                    if name == "cog":
                        cognates.append(src)
                    related.append(src)
                    expansion_text = tpl.get("expansion", "")
                    #print(expansion_text)
                    expansion_texts.append(expansion_text)
                    templates.append({
                        "word": src,
                        "lang": src_lang,
                        "rel": name,
                        "expansion_text": expansion_text
                    })
                # # Build etymology_langs from the first (target) language in each template
                # etyl_langs = [tpl.get("args", {}).get("1", "").lower()
                #               for tpl in entry.get("etymology_templates", [])
                #               if tpl.get("args", {}).get("1")]

                # Add document to index
                writer.add_document(
                    word=entry.get("word", ""),
                    lang=entry.get("lang", ""),
                    lang_code=entry.get("lang_code", ""),
                    pos=entry.get("pos", ""),
                    etymology_text=entry.get("etymology_text", ""),
                    #etymology_langs=",".join(etyl_langs),
                    glosses="; ".join(
                        gloss
                        for sense in entry.get("senses", [])
                        for gloss in sense.get("glosses", [])
                    ),
                    related_words=",".join(related),
                    cognates=",".join(cognates),
                    expansion_texts="; ".join(expansion_texts),
                    etymology_template=json.dumps(templates)
                )


In [26]:
index_dir = "indexdir"
jsonl_english = "../kaikki.org-dictionary-English.jsonl"
jsonl_german = "../kaikki.org-dictionary-German.jsonl"
build_index(jsonl_english, index_dir)
build_index(jsonl_german, index_dir)

Indexing: 0entries [00:00, ?entries/s]

Indexing: 1383078entries [08:07, 2834.74entries/s]
Indexing: 354723entries [02:02, 2891.35entries/s]


In [None]:

# Step 5: Index documents
with ix.writer() as writer:
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, total=total_lines, desc="Indexing"):
            try:
                entry = json.loads(line)
                if entry.get("lang") != "English":
                    continue

                word = entry.get("word", "")
                pos = entry.get("pos", "")
                etymology_text = entry.get("etymology_text", "")

                # Extract origin languages from etymology_templates
                ety_langs = set()
                for tpl in entry.get("etymology_templates", []):
                    lang = tpl.get("lang")
                    if lang:
                        ety_langs.add(lang.lower())

                # Extract glosses for context (optional)
                gloss_list = []
                for sense in entry.get("senses", []):
                    gloss_list.extend(sense.get("glosses", []))
                glosses = "; ".join(gloss_list)

                # Add document to index
                writer.add_document(
                    word=word,
                    lang="English",
                    pos=pos,
                    etymology_text=etymology_text,
                    etymology_langs=",".join(ety_langs),
                    glosses=glosses
                )
            except Exception as e:
                print("Skipping entry due to error:", e)

In [None]:
from whoosh.qparser import QueryParser
from whoosh import index

ix = index.open_dir("indexdir")
with ix.searcher() as searcher:
    query_str = "apple"
    query_info = searcher.document(word=query_str, lang_code="en")

    related = query_info.get("related_words", "").split(",") if query_info else []
    related = [t for t in related if t != "-"]
    related = [w.replace("(", "").replace(")", "") for w in related]
    # Create a unique union between related and query_str
    query_terms = set(related) | {query_str}
    print(query_terms)
    #print(related)rel
    #print(related)
    query_terms = " OR ".join(query_terms)
    print(query_terms)
    #print(query_str)
    query = QueryParser("related_words", ix.schema).parse(query_terms)
    #for result in searcher.search(query, limit=10):
        #print(result["word"], result["related_words"])
    # query = QueryParser("etymology_text", ix.schema).parse("from Latin")
    # results = searcher.search(query, limit=10)
    # for r in result
    #     print(f"{r['word']} ({r['pos']}): {r['etymology_text'][:100]}...")

SyntaxError: '(' was never closed (1775066455.py, line 14)

In [3]:
import requests
import os

os.environ["CEREBRAS_API_KEY"] = "csk-83pdj4yf9tnttnwcdhhpvr2mtvv8k94yvtdvt6jmv83myrvw"  # replace with your actual key


url = "https://api.cerebras.ai/v1/chat/completions"  # Confirm this endpoint

HEADERS = {
    "Authorization": f"Bearer {os.environ['CEREBRAS_API_KEY']}",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
    "Accept": "application/json",
    "Content-Type": "application/json",
    "Origin": "https://api.cerebras.ai",
    "Referer": "https://api.cerebras.ai/",
}

data = {
    "model": "llama-3.3-70b",
    "messages": [{"role": "user", "content": "hello"}],
    "temperature": 0.2,
    "max_tokens": 1024,
}

r = requests.post(url, headers=HEADERS, json=data)
print(r.status_code)
print(r.text)


403
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>
<title>Attention Required! | Cloudflare</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/cf.errors.css" />
<!--[if lt IE 9]><link rel="stylesheet" id='cf_styles-ie-css' href="/cdn-cgi/styles/cf.errors.ie.css" /><![endif]-->
<style>body{margin:0;padding:0}</style>


<!--[if gte IE 10]><!-->
<script>
  if (!navigator.cookieEnabled) {
    window.addEventListener('DOMContentLoaded

In [2]:
from wiktionaryparser import WiktionaryParser
parser = WiktionaryParser()
word = parser.fetch('test')
another_word = parser.fetch('test', 'french')
parser.set_default_language('french')
parser.exclude_part_of_speech('noun')
parser.include_relation('alternative forms')

In [14]:
import requests
import mwparserfromhell
import wiktextract

def fetch_etymology_text(word: str) -> str:
    """
    Fetches and returns the plain-text Etymology section(s) from English Wiktionary
    for the given word. If there are multiple Etymology headings (e.g. Etymology 1,
    Etymology 2), they’re concatenated with a blank line.
    """
    # 1. Query the page content (wikitext)
    resp = requests.get(
        "https://en.wiktionary.org/w/api.php",
        params={
            "action": "query",
            "prop": "revisions",
            "titles": word,
            "rvslots": "*",
            "rvprop": "content",
            "formatversion": 2,
            "format": "json",
        }
    )
    resp.raise_for_status()
    pages = resp.json().get("query", {}).get("pages", [])
    if not pages or "missing" in pages[0]:
        return ""  # no such page
    
    wikitext = pages[0]["revisions"][0]["slots"]["main"]["content"]
   
    # 2. Parse and extract level-2 “Etymology” sections
    parsed = mwparserfromhell.parse(wikitext)
    print(parsed.filter_templates())
    ety_sections = []
    for sec in parsed.get_sections(include_headings=True, levels=[2]):
        headers = sec.filter_headings()
        if headers and headers[0].title.strip().lower().startswith("etymology"):
            # strip all remaining wiki-markup
            plain = sec.strip_code().strip()
            if plain:
                ety_sections.append(plain)
    
    # 3. Join multiple etymologies, if present
    return "\n\n".join(ety_sections)


word = "apple"
ety = fetch_etymology_text(word)
#print(f"Etymology of {word!r}:\n\n{ety}")


['{{also|Apple|äpple}}', '{{was wotd|2024|October|21}}', '{{etymon|en|id=Q89|inh|enm>appel>apple|tree=1}}', "{{multiple images\n| direction = vertical\n| image1   = Red Apple.jpg\n| caption1 = An apple ({{senseno|en|Q89}}).\n| image2   = Apple Blossom Time at Oak Glen, CA 3-16 (26094251756).jpg\n| caption2 = An apple ({{senseno|en|apple tree}}) or [[apple tree]] in [[blossom#Noun|blossom]].\n| image3   = Frans Floris - The Fall of Man - Google Art Project.jpg\n| caption3 = ''{{w|The Fall of Man}}'' ({{circa2|1560|short=1}}) by {{w|Frans Floris}},{{ref|group=n|name=n1|From the collection of the {{w|Malmö Art Museum}} in [[Malmö]], [[Scania]], [[Sweden]].}} which depicts [[Adam and Eve]] in the [[Garden of Eden]] with apples ({{senseno|en|forbidden fruit}}) or [[forbidden fruit]].\n}}", '{{senseno|en|Q89}}', '{{senseno|en|apple tree}}', '{{w|The Fall of Man}}', '{{circa2|1560|short=1}}', '{{w|Frans Floris}}', '{{ref|group=n|name=n1|From the collection of the {{w|Malmö Art Museum}} in [[M

In [57]:
from mwparserfromhell import parse

def fetch_etymology_text(word: str) -> str:
    """Fetch the Etymology section as *plain text* from Wiktionary via the MediaWiki API."""
    import requests
    API = "https://en.wiktionary.org/w/api.php"
    # 1) grab the full wikitext
    resp = requests.get(API, params={
        "action":      "query",
        "prop":        "revisions",
        "titles":      word,
        "rvslots":     "*",
        "rvprop":      "content",
        "format":      "json",
        "formatversion":"2",
    }).json()
    try:
        wikitext = resp["query"]["pages"][0]["revisions"][0]["slots"]["main"]["content"]
    except (KeyError, IndexError):
        return ""
    # 2) parse it
    wikicode = parse(wikitext)
    # 3) pull out level-2 sections (== headings ==)
    for section in wikicode.get_sections(include_headings=True, levels=[2]):

        # the first heading in the section
        heads = section.filter_headings()
        #print(heads[1])
        if heads and "etymology" in heads[1].title.strip().lower():
            # strip *all* remaining wiki-markup (templates, refs, links)
            return section.strip()
    return ""

In [58]:
fetch_etymology_text("apple")

'==English==\n{{was wotd|2024|October|21}}\n\n===Etymology===\n{{etymon|en|id=Q89|inh|enm>appel>apple|tree=1}}\n{{multiple images\n| direction = vertical\n| image1   = Red Apple.jpg\n| caption1 = An apple ({{senseno|en|Q89}}).\n| image2   = Apple Blossom Time at Oak Glen, CA 3-16 (26094251756).jpg\n| caption2 = An apple ({{senseno|en|apple tree}}) or [[apple tree]] in [[blossom#Noun|blossom]].\n| image3   = Frans Floris - The Fall of Man - Google Art Project.jpg\n| caption3 = \'\'{{w|The Fall of Man}}\'\' ({{circa2|1560|short=1}}) by {{w|Frans Floris}},{{ref|group=n|name=n1|From the collection of the {{w|Malmö Art Museum}} in [[Malmö]], [[Scania]], [[Sweden]].}} which depicts [[Adam and Eve]] in the [[Garden of Eden]] with apples ({{senseno|en|forbidden fruit}}) or [[forbidden fruit]].\n}}\n\nThe {{glossary|noun}} is derived from {{inh|en|enm|appel|t=\'\'Malus domestica\'\' fruit or tree, apple; any type of fruit, nut, or tuber; tree bearing fruit; (\'\'figurative\'\') ball, sphere; (\