In [19]:
import requests, json, html, re
from typing import List, Dict
import json

# Base URL of the Ollama server; extract_sections() POSTs to
# f"{OLLAMA_HOST}/api/generate".
OLLAMA_HOST = "http://130.237.3.106:11434"  # or the IP of your Ollama machine
# Model name sent in the "model" field of every generate request.
MODEL = "phi3"

def normalize_snippet(s: str) -> str:
    """Return *s* with HTML entities decoded and invisible characters removed.

    Soft hyphens (U+00AD) and zero-width spaces (U+200B) are dropped,
    ``&nbsp;`` becomes a plain ASCII space, and every remaining HTML entity
    is decoded with :func:`html.unescape`.

    Args:
        s: Raw text snippet, possibly containing HTML entities.

    Returns:
        The cleaned text.
    """
    invisible = str.maketrans("", "", "\u00AD\u200B")

    # First pass: remove literal invisible characters so they cannot sit in
    # the middle of an entity name and hide it from the replacements below.
    s = s.translate(invisible)

    # NOTE(review): "&amp;" is decoded before the other entities, so
    # double-escaped input such as "&amp;nbsp;" is resolved as well.
    # Confirm this double decoding is intended.
    s = s.replace("&amp;", "&")
    # Map to a plain space here; html.unescape would yield U+00A0 instead.
    s = s.replace("&nbsp;", " ")
    s = s.replace("&shy;", "")
    s = html.unescape(s)

    # Second pass: html.unescape may just have produced the same invisible
    # characters from numeric or alternative entities (e.g. "&#173;",
    # "&#x200B;"), which the first pass could not see.
    return s.translate(invisible)

def extract_sections(text: str) -> List[Dict]:
    """
    Send one text chunk (one or more sections) to Ollama; return parsed JSON list.

    The chunk is cleaned with ``normalize_snippet``, appended to a fixed
    few-shot prompt, and POSTed to ``{OLLAMA_HOST}/api/generate`` with the
    model named by ``MODEL``. The model's reply is then searched for a JSON
    array.

    Args:
        text: Wiki-markup text of one or more dictionary sections.

    Returns:
        The list decoded from the model's reply. A reply consisting of a
        single JSON object is wrapped in a one-element list. An empty list is
        returned (after printing the raw reply) when no JSON could be parsed.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``). Connection errors and timeouts from
            ``requests.post`` also propagate.
    """
    text = normalize_snippet(text)

    # The prompt is a plain (non-f) string: every {{ ... }} below is a literal
    # MediaWiki template call that must reach the model unchanged.
    #
    # NOTE(review): FIELD_CONTRACT declares source_refs as an ARRAY, but the
    # few-shot outputs give it as a single object -- confirm which shape is
    # wanted and align the two.
    # NOTE(review): FIELD_CONTRACT says to omit unknown fields (no null),
    # while Rules says an absent gloss should be null -- confirm and align.
    prompt_head = """
You are a strict extractor. 
Convert Irish phonetic dictionary sections into structured JSON.

WORKS_ALLOWED = [
  "Dinneen","Windisch","Macbain","Atkinson","O’Reilly","Keating",
  "Meyer","Craig","Lloyd","Henebry","Rhys","Pedersen","Finck",
  "G. J.","D. P.","Cl. S.","Sg. Fearn.","Hogan","Molloy","Diss."
]
ABBREVIATIONS = {
  "Wi.": "Windisch", "Wi. Ir. T.": "Windisch", "O’R.": "O’Reilly", "Diss.": "Quiggin Dissertation",
  "Di.": "Dinneen", "G. J.": "Gaelic Journal", "D. P.": "Derry People", "Cl. S.": "Claidheamh Soluis",
  "Sg. Fearn.": "Sgeulaidhe Fearnmhuighe", "Finck": "Finck", "Henebry": "Henebry",
  "Macbain": "Macbain", "Meyer": "Meyer", "Molloy": "Molloy", "Pedersen": "Pedersen",
  "Rhys": "Rhys", "Hogan": "Hogan", "Craig": "Craig", "Lloyd": "Lloyd", "Atkinson": "Atkinson",
  "Keating": "Keating"
}

FIELD_CONTRACT
- Allowed keys only: section, phonetic, gloss, transcription, etymology, source_refs, related, also, examples, derived_from, relations, components, see_section, notes, sic, warning, literature_refs.
- Omit unknown fields (no null / {}).
- etymology: {"language":"Old Irish|Middle Irish|Modern Irish|Latin|English|Norse|Scots","form":"…"} OR {"language":"…","forms":[…]}.
- source_refs: ARRAY; each = {"work":"<from WORKS_ALLOWED>","form":"…","pronounced":"…","see":"…"}.
  * Expand any abbreviation using ABBREVIATIONS before writing "work".
  * If you see “s.” (= ‘see’), split into {"form":"X","see":"Y"}.
- related: ARRAY of objects; relations: only if head is dependent.
- also: ARRAY of strings.
- transcription only if explicitly given.
- examples for quotations/proverbs when present.
- No invented data. Copy phonetics exactly; strip HTML entities.
- Output JSON array only.

Rules:
- Only include data explicitly present in the text.
- Do NOT invent sources, etymologies, or glosses.
- If a gloss is present, copy it exactly. If absent, use null.
- If an etymology is present, include { "language": "...", "form": "..." }.
- If a relation is explicit (e.g., "gen. sing. of ..."), encode as:
  "relations": { "type": "genitive_singular_of", "phonetic": "..." }.
- If multiple senses exist, output multiple JSON objects (one per sense).
- Do not add any extra works or dictionary references unless directly named.
- Do not add explanations, comments, or natural language outside of the JSON.

FEW-SHOT EXAMPLES

INPUT
<<<
{{section|s27|§ 27.}} An O.Ir. ó is reduced to ''ɔ'' in syllables with secondary stress, e.g. ''fi꞉dɔrʹ'', ‘weaver’, Di. figheadóir; ''spʹαlədɔrʹ'', ‘mower’; ''ti꞉dɔrʹ'', ‘thatcher’; ''bʹrʹ''ï''ŋlɔdʹ'', ‘dream’, Meyer brinnglóid; ''mα꞉lɔdʹ'', ‘a foolish woman’, Di. málaid; ''tʹrʹ''ï''blɔdʹ'', ‘trouble’, Di. trioblóid, M.Ir. treblait; ''ʃkʹ''ï''bɔl'' (''ʃkʹ''ï''bαl''), ‘barn’, Di. scioból; ''sæLʹɔrʹ'' beside ''sæLʹerʹ'', ‘evident’, Di. soilléir. ''Nʹαmɔrt'', ‘neglect, carelessness’, ''Nʹαmɔrtαχ'', ‘careless’, are peculiar. Dinneen writes neamháird. In the Derry People 2 xii ’05 p. 2 col. 5, we find neamart. A remarkable reduction of ''uə'' > ''ɔ'' before the stress occurs in ''Lɔχ·pʹi꞉Nʹə'', ‘a pennyworth’ < ''Luəχ'', Di. luach; ''krɔχ ·eirʹ'', ‘hay-stack’, < cruach. With this is to be compared ''sLɔ꞉ ʃi꞉'', ‘the fairies’, < sluagh.
>>>
OUTPUT
[
  {
    "section": "s27",
    "phonetic": "fi꞉dɔrʹ",
    "gloss": "weaver",
    "source_refs": {
      "work": "Dinneen",
      "form": "figheadóir"
    }
  },
  {
    "section": "s27",
    "phonetic": "spʹαlədɔrʹ",
    "gloss": "mower"
  },
  {
    "section": "s27",
    "phonetic": "ti꞉dɔrʹ",
    "gloss": "thatcher"
  },
  {
    "section": "s27",
    "phonetic": "bʹrʹïŋlɔdʹ",
    "gloss": "dream",
    "source_refs": {
      "work": "Meyer",
      "form": "brinnglóid"
    }
  },
  {
    "section": "s27",
    "phonetic": "mα꞉lɔdʹ",
    "gloss": "a foolish woman",
    "source_refs": {
      "work": "Dinneen",
      "form": "málaid"
    }
  },
  {
    "section": "s27",
    "phonetic": "tʹrʹïblɔdʹ",
    "gloss": "trouble",
    "source_refs": {
      "work": "Dinneen",
      "form": "trioblóid"
    },
    "etymology": {
      "language": "Middle Irish",
      "form": "treblait"
    }
  },
  {
    "section": "s27",
    "phonetic": "ʃkʹïbɔl",
    "gloss": "barn",
    "also": [
      "ʃkʹïbαl"
    ],
    "source_refs": {
      "work": "Dinneen",
      "form": "scioból"
    }
  },
  {
    "section": "s27",
    "phonetic": "sæLʹɔrʹ",
    "gloss": "evident",
    "also": [
      "sæLʹerʹ"
    ],
    "source_refs": {
      "work": "Dinneen",
      "form": "soilléir"
    }
  },
  {
    "section": "s27",
    "phonetic": "Nʹαmɔrt",
    "gloss": "neglect, carelessness",
    "source_refs": {
      "work": "Dinneen",
      "form": "neamháird"
    },
    "notes": [
      "Derry People (2 Dec 1905) gives neamart."
    ]
  },
  {
    "section": "s27",
    "phonetic": "Nʹαmɔrtαχ",
    "gloss": "careless"
  },
  {
    "section": "s27",
    "phonetic": "Lɔχ·pʹi꞉Nʹə",
    "gloss": "a pennyworth",
    "derived_from": {
      "phonetic": "Luəχ",
      "source_refs": {
        "work": "Dinneen",
        "form": "luach"
      }
    },
    "notes": [
      "uə → ɔ before the stress"
    ]
  },
  {
    "section": "s27",
    "phonetic": "krɔχ·eirʹ",
    "gloss": "hay-stack",
    "derived_from": {
      "transcription": "cruach"
    },
    "notes": [
      "uə → ɔ before the stress"
    ]
  },
  {
    "section": "s27",
    "phonetic": "sLɔ꞉ ʃi꞉",
    "gloss": "the fairies",
    "derived_from": {
      "transcription": "sluagh"
    }
  }
]

INPUT
<<<
{{section|s27|§ 27.}} ''gɔl ·çɔ꞉lʹ'', ‘to sing’ (‘to sing a song’ is ''ɔ꞉rαn ə rα꞉''(''tʹ''), imperative ''αbwirʹ ɔ꞉rαn'') < gabháil cheóil, shews loss of palatalisation in a weakly stressed syllable.
>>>
OUTPUT
[
  {
    "section": "s27",
    "phonetic": "gɔl ·çɔ꞉lʹ",
    "transcription": "gabháil cheóil",
    "gloss": "to sing",
    "notes": [
      "Shows loss of palatalisation in weakly stressed syllable."
    ],
    "related": [
      {
        "phonetic": "ɔ꞉rαn ə rα꞉(tʹ)",
        "gloss": "to sing a song"
      },
      {
        "type": "imperative",
        "phonetic": "αbwirʹ ɔ꞉rαn"
      }
    ]
  }
]

INPUT
<<<
{{section|s27|§ 27.}} ''mɔrαn'', ‘many, a quantity of’, Di. mórán is the usual form, as the word principally comes before the stress, but ''mɔ꞉rαn'', ''mo꞉rαn'' are the emphatic forms. Cp. § {{QDD|451}}.
>>>
OUTPUT
[
  {
    "section": "s27",
    "phonetic": "mɔrαn",
    "gloss": "many, a quantity of",
    "source_refs": {
      "work": "Dinneen",
      "form": "mórán"
    },
    "also": [
      "mɔ꞉rαn",
      "mo꞉rαn"
    ],
    "notes": [
      "mórán is the usual form, as the word principally comes before the stress, but ''mɔ꞉rαn'', ''mo꞉rαn'' are the emphatic forms"
    ],
    "see_section": [
      "s451"
    ]
  }
]

INPUT
<<<
{{section|s32|§ 32.}} ''tʹɔ꞉'', comp. of ''tʹe'', ‘hot’, cp. Wi. teou s. tee.
>>>
OUTPUT
[
  {
    "section": "s32",
    "phonetic": "tʹɔ꞉",
    "gloss": "hot",
    "relations": {
      "type": "comparative_of",
      "phonetic": "tʹe"
    },
    "source_refs": {
      "work": "Windisch",
      "form": "teou",
      "see": "tee"
    }
  }
]

INPUT
<<<
{{section|s30|§ 30.}} ''mʹαl̥ɔ꞉'', ‘interruption, delay’, Di. has meathlódh s. meathladh;
>>>
OUTPUT
[
  {
    "section": "s30",
    "phonetic": "mʹαl̥ɔ꞉",
    "gloss": "interruption, delay",
    "source_refs": {
      "work": "Dinneen",
      "pronounced": "meathlódh",
      "form": "meathladh"
    }
  }
]

NOW EXTRACT
<<<
"""

    # Close the input exactly the way the few-shot examples do: the ">>>"
    # delimiter on its own line, followed by the "OUTPUT" cue.
    prompt_tail = "\n>>>\nOUTPUT\n"
    prompt = prompt_head + text.strip() + prompt_tail
    resp = requests.post(
        f"{OLLAMA_HOST}/api/generate",
        headers={"Content-Type": "application/json"},
        json={
            "model": MODEL,
            "prompt": prompt,
            "options": {"temperature": 0.1},
            "stream": False,
        },
        timeout=600,
    )
    resp.raise_for_status()
    data = resp.json()

    # Ollama returns the text under `response`
    raw = data.get("response", "").strip()

    try:
        return _parse_json_array(raw)
    except ValueError:  # json.JSONDecodeError is a subclass of ValueError
        # Helpful debug print
        print("⚠️ Could not parse JSON. Raw response:\n", raw)
        return []


def _parse_json_array(raw: str) -> List[Dict]:
    """Decode the JSON array embedded in *raw*, tolerating surrounding text.

    Raises ValueError (or its subclass json.JSONDecodeError) when *raw* holds
    no decodable JSON array or object.
    """
    start = raw.find("[")
    if start == -1:
        # No array at all; the reply may still be a single bare object.
        parsed = json.loads(raw)
    else:
        # Decode one complete value starting at the first "[" and ignore
        # whatever follows it, so trailing chatter (even chatter containing
        # "]") cannot break parsing.
        parsed, _ = json.JSONDecoder().raw_decode(raw, start)

    if isinstance(parsed, dict):
        return [parsed]
    if not isinstance(parsed, list):
        raise ValueError("model reply is not a JSON array")
    return parsed

In [None]:
from pathlib import Path

# Batch driver: run extract_sections() over every *.wiki file and store the
# result as <stem>.json. Files whose JSON already exists are skipped, so the
# cell can be re-run to resume an interrupted batch.
json_path = Path("/tmp/pron2/quiggin/raw")
wiki_path = Path("/tmp/pron2/quiggin/wiki")

# Make sure the output directory exists before the first write.
json_path.mkdir(parents=True, exist_ok=True)

# Sorted so that every run processes the files in the same order.
for file in sorted(wiki_path.glob("*.wiki")):
    stem = file.stem
    target = json_path / f"{stem}.json"
    if target.exists():
        print(f"Skipping {stem}, already done")
        continue
    # Read as UTF-8 explicitly: the text is full of non-ASCII phonetic
    # symbols and the output below is written as UTF-8 as well.
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()
    text = text.strip()
    output = extract_sections(text)
    if not output:
        # extract_sections() returns [] when the reply could not be parsed.
        # Writing that to disk would mark the file as done and hide the
        # failure, so leave it unwritten and let the next run retry.
        print(f"No entries extracted for {stem}; not writing, will retry on next run")
        continue
    with open(target, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)