In [2]:
import requests
import json
from pathlib import Path

OLLAMA_HOST = "http://130.237.3.106:11434"  # or the IP of your Ollama machine
MODEL = "phi3"

def extract_sections(text: str) -> list[dict]:
    """
    Send one text chunk (sections) to Ollama, return parsed JSON.
    """
    prompt = f"""
You are a careful extractor. Convert Irish phonetic dictionary snippets into JSON.

Rules:
- Output ONLY JSON (UTF-8), an array of entry objects. No prose.
- Preserve phonetic strings exactly. Strip soft hyphens and HTML entities (&shy;, &nbsp;).
- Fields (include only when present): section, phonetic, gloss, transcription, etymology,
  source_refs, related, also, examples, derived_from, relations, components, see_section,
  notes, sic, warning, literature_refs.
- Etymology = historical language stage, not a bibliography.
- Dictionary citations go in source_refs (not etymology).
- Relations: use explicit *_of when dependent is stated (e.g. "genitive_singular_of").
- Variants/emphatic forms → "also":[...]
- Compounds → "components":[ {...}, {...} ]
- Quotes/proverbs → "examples":[{"quote":"…","gloss":"…","type":"proverb","source":"…"}]
- Don’t invent spellings. If an Irish spelling is given, put it in "transcription".

Now extract the following snippet to JSON:

<<<
{text}
>>>
    """

    r = requests.post(
        f"{OLLAMA_HOST}/api/generate",
        headers={"Content-Type": "application/json"},
        json={
            "model": MODEL,
            "prompt": prompt,
            "options": {"temperature": 0.1},
            "stream": False,
        },
        timeout=600,
    )
    r.raise_for_status()
    data = r.json()

    # Ollama wraps the text inside `response` as a string
    try:
        return json.loads(data["response"])
    except json.JSONDecodeError:
        print("⚠️ Could not parse response:")
        print(data["response"])
        return []


In [3]:
from pathlib import Path

json_path = Path("/tmp/pron2/quiggin/raw")
wiki_path = Path("/tmp/pron2/quiggin/wiki")

for file in wiki_path.glob("*.wiki"):
    stem = file.stem
    if (json_path / f"{stem}.json").exists():
        print(f"Skipping {stem}, already done")
        continue
    with open(file, "r") as f:
        text = f.read()
    text = text.strip()
    output = extract_sections(text)
    with open(json_path / f"{stem}.json", "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

ValueError: Invalid format specifier