In [34]:
import requests, json, html, re
from typing import List, Dict
import json

# Base URL (scheme, host, port) of the Ollama server; "/api/generate" is
# appended to it when a prompt is posted.
OLLAMA_HOST = "http://130.237.3.106:11434"  # replace with the address of your own Ollama machine
# Model name sent in the "model" field of every generate request.
MODEL = "quiggin-extractor:latest"

def normalize_snippet(s: str) -> str:
    """
    Clean a raw text snippet: decode HTML entities and drop invisible characters.

    Soft hyphens (U+00AD) and zero-width spaces (U+200B) are removed whether
    they appear as literal characters, as the named entity ``&shy;``, or as
    numeric character references such as ``&#173;`` / ``&#x200B;``.
    ``&nbsp;`` is turned into an ordinary space; every other entity is decoded
    by ``html.unescape``.

    Args:
        s: The raw snippet text.

    Returns:
        The cleaned text.
    """
    invisible_chars = ("\u00AD", "\u200B")

    # First pass: remove literal invisible characters so they cannot split an
    # entity name and hide it from the replacements below.
    for ch in invisible_chars:
        s = s.replace(ch, "")

    # "&amp;" is decoded before the generic unescape, so doubly-escaped
    # entities (e.g. "&amp;nbsp;") are resolved as well.
    s = s.replace("&amp;", "&")
    # Handled by hand because html.unescape would yield U+00A0 / U+00AD
    # instead of a plain space / nothing.
    s = s.replace("&nbsp;", " ")
    s = s.replace("&shy;", "")
    s = html.unescape(s)

    # Second pass: numeric references (&#173;, &#xAD;, &#x200B;) only become
    # real characters after unescaping, so they must be stripped here.
    for ch in invisible_chars:
        s = s.replace(ch, "")
    return s

def extract_sections(text: str) -> List[Dict]:
    """
    Send one text chunk (one or more sections) to Ollama; return parsed JSON list.

    The chunk is cleaned with ``normalize_snippet`` and posted as the prompt to
    ``{OLLAMA_HOST}/api/generate`` with streaming disabled. The outermost
    ``[ ... ]`` span of the model's reply is then decoded as JSON.

    Args:
        text: Raw text of the chunk to extract from.

    Returns:
        The decoded list. A reply that decodes to a single JSON object is
        wrapped in a one-element list. An empty list is returned (after
        printing the raw reply) when the reply is not valid JSON or decodes
        to something that is neither a list nor an object.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Connection and timeout errors from ``requests`` also propagate.
    """
    text = normalize_snippet(text)

    resp = requests.post(
        f"{OLLAMA_HOST}/api/generate",
        headers={"Content-Type": "application/json"},
        json={
            "model": MODEL,
            "prompt": text,
            "options": {"temperature": 0.1, "num_ctx": 8192},
            "stream": False,
        },
        timeout=1200,
    )
    resp.raise_for_status()
    data = resp.json()

    # `or ""` also covers a present-but-null "response" field, which would
    # otherwise crash on .strip().
    raw = (data.get("response") or "").strip()

    # Keep only the outermost bracketed span so that any chatter the model
    # adds before or after the JSON array is ignored.
    start = raw.find("[")
    end = raw.rfind("]")
    if start != -1 and end > start:
        raw = raw[start:end + 1]

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        print("⚠️ Could not parse JSON. Raw response:\n", raw)
        return []

    # Honour the declared return type: json.loads can also yield a dict or a
    # scalar when the reply was not an array.
    if isinstance(parsed, dict):
        return [parsed]
    if not isinstance(parsed, list):
        print("⚠️ Expected a JSON list. Raw response:\n", raw)
        return []
    return parsed

In [None]:
from pathlib import Path

json_path = Path("/tmp/pron2/quiggin/raw")
wiki_path = Path("/tmp/pron2/quiggin/wiki")

# Create the output directory up front; otherwise the first write fails only
# after a (potentially very long) model call has already completed.
json_path.mkdir(parents=True, exist_ok=True)

# Run every *.wiki file through the extractor and store the result as
# <stem>.json. Files that already have an output are skipped, so the cell can
# be re-run to resume an interrupted batch. sorted() makes the processing
# order reproducible (glob order is arbitrary).
#
# NOTE(review): extract_sections returns [] when the model reply cannot be
# parsed; that [] is still written here and the section is skipped on later
# runs. Delete the corresponding .json file to force a retry.
for file in sorted(wiki_path.glob("*.wiki")):
    stem = file.stem
    out_file = json_path / f"{stem}.json"
    if out_file.exists():
        print(f"Skipping {stem}, already done")
        continue
    # Read explicitly as UTF-8 so the input encoding matches the UTF-8 output
    # below instead of depending on the platform's locale default.
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()
    text = text.strip()
    output = extract_sections(text)
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

Skipping section433, already done
Skipping section126, already done
Skipping section258, already done
