In [2]:
import re
from typing import List, Dict, Optional

LANG_MAP = {
    "air.": "Old Irish",
    "mir.": "Middle Irish",
    "nir.": "Modern Irish",
    "aengl.": "Old English",
    "mengl.": "Middle English",
    "anord.": "Old Norse",
    "aisl.": "Old Icelandic",
    "aschott.": "Old Scots",
    "lat.": "Latin",
    "kymr.": "Welsh",
    "korn.": "Cornish",
    "bret.": "Breton",
    "span.": "Spanish",
}

WORK_MAP = {
    "Molloy": "Molloy",
    "Keat.": "Keating",
    "O’R.": "O’Reilly",
    "O'R.": "O’Reilly",
    "O’Clery": "O’Clery",
    "O'Clery": "O’Clery",
    "Atk.": "Atkinson",
    "Bk. of Deer": "Book of Deer",
    "Book of Deer": "Book of Deer",
    "Joyce": "Joyce",
}

ROMAN_RE = r"(?:I|II|III|IV|V|VI|VII|VIII|IX|X)"

def split_top_level_semicolons(text: str) -> List[str]:
    parts, buf, depth = [], [], 0
    for ch in text:
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth = max(0, depth - 1)
        if ch == ';' and depth == 0:
            chunk = ''.join(buf).strip()
            if chunk:
                parts.append(chunk)
            buf = []
        else:
            buf.append(ch)
    last = ''.join(buf).strip()
    if last:
        parts.append(last)
    return parts

def parse_neben(chunk: str) -> List[str]:
    alts = re.findall(r"neben\s+''([^']+)''", chunk)
    return alts

def parse_phonetic_head(chunk: str) -> Optional[List[str]]:
    m = re.search(r"^''([^']+)''", chunk.strip())
    if not m:
        return None
    return [p for p in m.group(1).split() if p]

def parse_gloss(chunk: str) -> Optional[str]:
    m = re.search(r"„([^“]+)“", chunk)
    return m.group(1).strip() if m else None

def parse_gender(chunk: str) -> Optional[str]:
    m = re.search(r"\b([mfn])\.\b", chunk)
    if not m: return None
    return {"m": "masculine", "f": "feminine", "n": "neuter"}[m.group(1)]

def parse_vgl_crossrefs(chunk: str) -> List[Dict[str, str]]:
    """
    Handles: (vgl. II 251, 15), (vgl. II 251, 15. 266, 5), (vgl. I 263)
    Returns list of {volume,page[,line]}
    """
    out = []
    for par in re.findall(r"\(([^)]*vgl\.[^)]*)\)", chunk):
        s = par
        vgl_m = re.search(r"vgl\.\s*(.*)$", s)
        if not vgl_m: 
            continue
        tail = vgl_m.group(1).strip()

        tokens = [t for t in re.split(r"[,\.\s]+", tail) if t]
        cur_vol = None
        i = 0
        while i < len(tokens):
            tok = tokens[i]
            if re.fullmatch(ROMAN_RE, tok):
                cur_vol = tok
                i += 1
                continue
            if tok.isdigit():
                page = tok
                line = None

                if i + 1 < len(tokens) and tokens[i+1].isdigit():
                    line = tokens[i+1]
                    i += 1
                if cur_vol:
                    entry = {"volume": cur_vol, "page": page}
                    if line: entry["line"] = line
                    out.append(entry)
                i += 1
                continue
            i += 1
    return out

def parse_etymology(chunk: str) -> List[Dict[str, str]]:
    """
    Collects historical stages like: air. bél; mir. bláth; aengl. bróc; anord. brókr
    Splits multiple forms after the same label (e.g. 'air. biaid bieid').
    """
    out = []

    labels = sorted(LANG_MAP.keys(), key=len, reverse=True)
    pattern = r"\b(" + "|".join(map(re.escape, labels)) + r")\s+([^;,()]+)"
    for abbr, forms_blob in re.findall(pattern, chunk):
        forms = [f.strip() for f in forms_blob.split() if f.strip()]
        for f in forms:
            f = f.rstrip(".,:;")
            if f:
                out.append({"language": LANG_MAP[abbr], "form": f})
    return out

def parse_sources(chunk: str) -> List[Dict[str, object]]:
    """
    Captures easy modern source references:
      - 'Molloy 49: áthúil' → {work:"Molloy", page:"49", forms:["áthúil"]}
      - 'Keat. breódhaim, breóghaim' → {work:"Keating", forms:[...]}
      - 'O’R.' → {work:"O’Reilly"} (form optional)
      - '(Bk. of Deer)' → {work:"Book of Deer"}
    """
    out = []
    for m in re.finditer(r"\b(Molloy)\s+(\d+)(?::\s*([^);]+))?", chunk):
        work = WORK_MAP[m.group(1)]
        page = m.group(2)
        forms = []
        if m.group(3):
            forms = [f.strip() for f in re.split(r",\s*", m.group(3).strip()) if f.strip()]
        entry = {"work": work, "page": page}
        if forms: entry["forms"] = forms
        out.append(entry)

    for key, label in WORK_MAP.items():
        if key == "Molloy":  # already handled
            continue

        for m in re.finditer(r"(?:\(|\b)"+re.escape(key)+r"(?:\)|\b)(?::\s*([^);]+))?", chunk):
            forms_blob = m.group(1)
            entry = {"work": label}
            if forms_blob:
                forms = [f.strip() for f in re.split(r",\s*", forms_blob.strip()) if f.strip()]
                if forms: entry["forms"] = forms

            if entry not in out:
                out.append(entry)
    return out

def extract_easy_entries(volume: str, page: str, section: str, page_text: str) -> List[Dict[str, object]]:
    """
    Minimal-but-useful pass:
      - returns a list of dicts with `volume`, `page`, `raw`
      - plus: phonetic, alongside (from 'neben'), gloss, gender, see_section, etymology, source_refs
    """
    results = []
    for raw in split_top_level_semicolons(page_text):
        item = {
            "volume": volume,
            "page": page,
            "section": section,
            "raw": raw.strip()
        }
        head = parse_phonetic_head(raw)
        if head: item["phonetic"] = head
        alts = parse_neben(raw)
        if alts:
            item["alongside"] = alts[0] if len(alts) == 1 else alts
        gloss = parse_gloss(raw)
        if gloss: item["gloss"] = gloss
        gender = parse_gender(raw)
        if gender: item["gender"] = gender
        refs = parse_vgl_crossrefs(raw)
        if refs: item["see_section"] = refs
        ety = parse_etymology(raw)
        if ety: item["etymology"] = ety
        src = parse_sources(raw)
        if src: item["source_refs"] = src
        results.append(item)
    return results

In [8]:
P4 = """''bīń'' „angenehm klingend“, air. bind; ''əḱī́nc'' „quidam“ mir. écin; ''īm'' „butter“, m.ir. imb; ''īnšīm'' „erzähle“ (neben ''inšīm inšĭm''), mir. innisim indisim; ''mīĺĭm'' „verderbe“ (transitiv), air. millim; ''cīń'' „krank“, mir. tind; ''drīm'' „rücken“, air. druimm; ''rīńc'' „anteil“, aus einem <sup>ei̯</sup>/i-stamme zu air. rann (vgl. mir. roinded); ''kīvnīm'' „erinnere mich“ (neben ''kivnīm kimnīm'' mir. cúimnigim; ''sīvnəx sīmnəx'' „ruhig“, suaimhneach, Keat."""

In [9]:
entries = []
for e in extract_easy_entries("I", "9", "11", P4):
    entries.append(e)


In [10]:
def enumerate_entries(entries: List[Dict[str, object]], start: int = 1) -> None:
    for i, entry in enumerate(entries):
        entry["id"] = f"{entry['volume']}_{entry['section']}_{i + start}"
enumerate_entries(entries)

In [13]:
def write_json(path = "/private/tmp/irish-attested-pronunciations/finck/raw/", data = [], section = "4"):
    with open(f"{path}/section{section}.json", "w", encoding="utf-8") as f:
        import json
        json.dump(data, f, ensure_ascii=False, indent=2)
    

In [None]:
import json
print(json.dumps(entries, indent=2, ensure_ascii=False))

[
  {
    "volume": "I",
    "page": "9",
    "section": "11",
    "raw": "''bīń'' „angenehm klingend“, air. bind",
    "phonetic": [
      "bīń"
    ],
    "gloss": "angenehm klingend",
    "etymology": [
      {
        "language": "Old Irish",
        "form": "bind"
      }
    ],
    "id": "I_11_1"
  },
  {
    "volume": "I",
    "page": "9",
    "section": "11",
    "raw": "''əḱī́nc'' „quidam“ mir. écin",
    "phonetic": [
      "əḱī́nc"
    ],
    "gloss": "quidam",
    "etymology": [
      {
        "language": "Middle Irish",
        "form": "écin"
      }
    ],
    "id": "I_11_2"
  },
  {
    "volume": "I",
    "page": "9",
    "section": "11",
    "raw": "''īm'' „butter“, m.ir. imb",
    "phonetic": [
      "īm"
    ],
    "gloss": "butter",
    "gender": "masculine",
    "id": "I_11_3"
  },
  {
    "volume": "I",
    "page": "9",
    "section": "11",
    "raw": "''īnšīm'' „erzähle“ (neben ''inšīm inšĭm''), mir. innisim indisim",
    "phonetic": [
      "īnšīm"
    ],
    "a