In [24]:
import requests, json, html, re
from typing import List, Dict
import json

OLLAMA_HOST = "http://130.237.3.106:11434"  # or the IP of your Ollama machine
MODEL = "gpt-oss"

def normalize_snippet(s: str) -> str:
    s = s.replace("\u00AD", "")
    s = s.replace("\u200B", "")
    s = s.replace("&amp;", "&")
    s = s.replace("&nbsp;", " ")
    s = s.replace("&shy;", "")
    s = html.unescape(s)
    return s

def extract_sections(text: str) -> List[Dict]:
    """
    Send one text chunk (one or more sections) to Ollama; return parsed JSON list.
    """
    text = normalize_snippet(text)

    # Note the doubled {{ }} below: that's only to show literal braces inside an f-string.
    prompt_head = """
You are a strict extractor. 
Convert Irish phonetic dictionary sections into structured JSON.

WORKS_ALLOWED = [
  "Dinneen","Windisch","Macbain","Atkinson","O’Reilly","Keating",
  "Meyer","Craig","Lloyd","Henebry","Rhys","Pedersen","Finck",
  "G. J.","D. P.","Cl. S.","Sg. Fearn.","Hogan","Molloy","Diss."
]
ABBREVIATIONS = {
  "Wi.": "Windisch", "Wi. Ir. T.": "Windisch", "O’R.": "O’Reilly", "Diss.": "Quiggin Dissertation",
  "Di.": "Dinneen", "G. J.": "Gaelic Journal", "D. P.": "Derry People", "Cl. S.": "Claidheamh Soluis",
  "Sg. Fearn.": "Sgeulaidhe Fearnmhuighe", "Finck": "Finck", "Henebry": "Henebry",
  "Macbain": "Macbain", "Meyer": "Meyer", "Molloy": "Molloy", "Pedersen": "Pedersen",
  "Rhys": "Rhys", "Hogan": "Hogan", "Craig": "Craig", "Lloyd": "Lloyd", "Atkinson": "Atkinson",
  "Keating": "Keating"
}

You are a strict extractor.
Convert Irish phonetic dictionary sections into JSON.

Rules:
- Output ONLY valid JSON (UTF-8), an array of entry objects. No prose.
- Include only data explicitly present in the text.
- If unknown, OMIT the field (do not use null or empty objects).
- Required when present: section, phonetic, gloss, transcription, etymology, source_refs, related, also, examples, derived_from, relations, components, see_section, notes, sic, warning, literature_refs.
- **Always include "raw": the exact substring of the input used for that entry.**
  It should start from the first opening two apostrophes that enclose the headword and include everything up to the next semicolon “;” or the end of that example clause (comma-separated) — exactly as printed (after HTML unescape).
- Etymology is historical stage only, e.g. {"language":"Middle Irish","form":"dér"} or {"language":"Old Irish","forms":[...]}.
- Dictionary/source citations belong in source_refs, not etymology.
- If “s.” (= see) appears, split as {"form":"X","see":"Y"}.
- Relations: only when stated (e.g., {"type":"genitive_singular_of","phonetic":"…"} or {"type":"comparative_of","phonetic":"…"}).
- “also”: variant/emphatic spellings of the same head (strings).
- “examples”: quotations/proverbs with optional gloss/source.
- Copy phonetic strings exactly. Strip soft hyphens/HTML entities before reasoning.

Few-shot:

INPUT
<<<
{{section|s27|§ 27.}} A remarkable reduction of ''uə'' > ''ɔ'' before the stress occurs in ''Lɔχ·pʹi꞉Nʹə'', ‘a pennyworth’ < ''Luəχ'', Di. luach.
>>>
OUTPUT
[
  {
    "section": "s27",
    "phonetic": "Lɔχ·pʹi꞉Nʹə",
    "raw": "''Lɔχ·pʹi꞉Nʹə'', ‘a pennyworth’ < ''Luəχ'', Di. luach",
    "gloss": "a pennyworth",
    "derived_from": {
      "phonetic": "Luəχ",
      "source_refs": { "work": "Dinneen", "form": "luach" }
    },
    "notes": ["uə → ɔ before the stress"]
  }
]

INPUT
<<<
{{section|s32|§ 32.}} ''tʹɔ꞉'', comp. of ''tʹe'', ‘hot’, cp. Wi. teou s. tee.
>>>
OUTPUT
[
  {
    "section": "s32",
    "phonetic": "tʹɔ꞉",
    "raw": "''tʹɔ꞉'', comp. of ''tʹe'', ‘hot’, cp. Wi. teou s. tee",
    "gloss": "hot",
    "relations": { "type": "comparative_of", "phonetic": "tʹe" },
    "source_refs": { "work": "Windisch", "form": "teou", "see": "tee" }
  }
]

INPUT
<<<
{{section|s27|§ 27.}} ''gɔl ·çɔ꞉lʹ'', ‘to sing’ (‘to sing a song’ is ''ɔ꞉rαn ə rα꞉''(''tʹ''), imperative ''αbwirʹ ɔ꞉rαn'') < gabháil cheóil, shews loss of palatalisation in a weakly stressed syllable.
>>>
OUTPUT
[
  {
    "section": "s27",
    "phonetic": "gɔl ·çɔ꞉lʹ",
    "raw": "''gɔl ·çɔ꞉lʹ'', ‘to sing’ (‘to sing a song’ is ''ɔ꞉rαn ə rα꞉''(''tʹ''), imperative ''αbwirʹ ɔ꞉rαn'') < gabháil cheóil",
    "transcription": "gabháil cheóil",
    "gloss": "to sing",
    "related": [
      { "phonetic": "ɔ꞉rαn ə rα꞉(tʹ)", "gloss": "to sing a song" },
      { "type": "imperative", "phonetic": "αbwirʹ ɔ꞉rαn" }
    ],
    "notes": ["Shows loss of palatalisation in weakly stressed syllable."]
  }
]

NOW EXTRACT
<<<
"""

    prompt_tail = """
>>>
""".strip()
    prompt = prompt_head + text.strip() + prompt_tail
    resp = requests.post(
        f"{OLLAMA_HOST}/api/generate",
        headers={"Content-Type": "application/json"},
        json={
            "model": MODEL,
            "prompt": prompt,
            "options": {"temperature": 0.1},
            "stream": False,
        },
        timeout=600,
    )
    resp.raise_for_status()
    data = resp.json()

    # Ollama returns the text under `response`
    raw = data.get("response", "").strip()

    # Try to locate the first JSON array in case the model adds stray text
    start = raw.find("[")
    end = raw.rfind("]")
    if start != -1 and end != -1 and end >= start:
        raw = raw[start:end+1]

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Helpful debug print
        print("⚠️ Could not parse JSON. Raw response:\n", raw)
        return []

In [25]:
from pathlib import Path

json_path = Path("/tmp/pron2/quiggin/raw")
wiki_path = Path("/tmp/pron2/quiggin/wiki")

for file in wiki_path.glob("*.wiki"):
    stem = file.stem
    if (json_path / f"{stem}.json").exists():
        print(f"Skipping {stem}, already done")
        continue
    with open(file, "r") as f:
        text = f.read()
    text = text.strip()
    output = extract_sections(text)
    with open(json_path / f"{stem}.json", "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

Skipping section433, already done
Skipping section126, already done
Skipping section258, already done


HTTPError: 404 Client Error: Not Found for url: http://130.237.3.106:11434/api/generate