In [15]:
import re
from typing import List, Dict, Optional

# --- mappings (trim/add as needed) ---
LANG_MAP = {
    "air.": "Old Irish",
    "mir.": "Middle Irish",
    "nir.": "Modern Irish",
    "aengl.": "Old English",
    "mengl.": "Middle English",
    "anord.": "Old Norse",
    "aisl.": "Old Icelandic",
    "aschott.": "Old Scots",
    "lat.": "Latin",
    "kymr.": "Welsh",
    "korn.": "Cornish",
    "bret.": "Breton",
    "span.": "Spanish",
}

WORK_MAP = {  # abbreviations → preferred surname/work label
    "Molloy": "Molloy",
    "Keat.": "Keating",
    "O’R.": "O’Reilly",
    "O'R.": "O’Reilly",
    "O’Clery": "O’Clery",
    "O'Clery": "O’Clery",
    "Atk.": "Atkinson",
    "Bk. of Deer": "Book of Deer",
    "Book of Deer": "Book of Deer",
    "Joyce": "Joyce",
}

ROMAN_RE = r"(?:I|II|III|IV|V|VI|VII|VIII|IX|X)"  # enough for I/II

def split_top_level_semicolons(text: str) -> List[str]:
    parts, buf, depth = [], [], 0
    for ch in text:
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth = max(0, depth - 1)
        if ch == ';' and depth == 0:
            chunk = ''.join(buf).strip()
            if chunk:
                parts.append(chunk)
            buf = []
        else:
            buf.append(ch)
    last = ''.join(buf).strip()
    if last:
        parts.append(last)
    return parts

def parse_neben(chunk: str) -> List[str]:
    # e.g. (neben ''biəl'') or … neben ''X'' …
    alts = re.findall(r"neben\s+''([^']+)''", chunk)
    return alts

def parse_phonetic_head(chunk: str) -> Optional[List[str]]:
    m = re.search(r"^''([^']+)''", chunk.strip())
    if not m:
        return None
    return [p for p in m.group(1).split() if p]

def parse_gloss(chunk: str) -> Optional[str]:
    m = re.search(r"„([^“]+)“", chunk)
    return m.group(1).strip() if m else None

def parse_gender(chunk: str) -> Optional[str]:
    m = re.search(r"\b([mfn])\.\b", chunk)
    if not m: return None
    return {"m": "masculine", "f": "feminine", "n": "neuter"}[m.group(1)]

def parse_vgl_crossrefs(chunk: str) -> List[Dict[str, str]]:
    """
    Handles: (vgl. II 251, 15), (vgl. II 251, 15. 266, 5), (vgl. I 263)
    Returns list of {volume,page[,line]}
    """
    out = []
    for par in re.findall(r"\(([^)]*vgl\.[^)]*)\)", chunk):
        s = par
        # Find the 'vgl.' portion and parse tokens after it
        vgl_m = re.search(r"vgl\.\s*(.*)$", s)
        if not vgl_m: 
            continue
        tail = vgl_m.group(1).strip()
        # Tokenize on spaces, commas, and periods; keep roman numerals and numbers
        tokens = [t for t in re.split(r"[,\.\s]+", tail) if t]
        cur_vol = None
        i = 0
        while i < len(tokens):
            tok = tokens[i]
            if re.fullmatch(ROMAN_RE, tok):
                cur_vol = tok
                i += 1
                continue
            if tok.isdigit():
                page = tok
                line = None
                # next token may be a line number
                if i + 1 < len(tokens) and tokens[i+1].isdigit():
                    line = tokens[i+1]
                    i += 1
                if cur_vol:
                    entry = {"volume": cur_vol, "page": page}
                    if line: entry["line"] = line
                    out.append(entry)
                i += 1
                continue
            i += 1
    return out

def parse_etymology(chunk: str) -> List[Dict[str, str]]:
    """
    Collects historical stages like: air. bél; mir. bláth; aengl. bróc; anord. brókr
    Splits multiple forms after the same label (e.g. 'air. biaid bieid').
    """
    out = []
    # order keys by length to avoid prefix bleed (e.g. 'mir.' vs 'm.')
    labels = sorted(LANG_MAP.keys(), key=len, reverse=True)
    pattern = r"\b(" + "|".join(map(re.escape, labels)) + r")\s+([^;,()]+)"
    for abbr, forms_blob in re.findall(pattern, chunk):
        forms = [f.strip() for f in forms_blob.split() if f.strip()]
        for f in forms:
            # stop on trailing punctuation
            f = f.rstrip(".,:;")
            if f:
                out.append({"language": LANG_MAP[abbr], "form": f})
    return out

def parse_sources(chunk: str) -> List[Dict[str, object]]:
    """
    Captures easy modern source references:
      - 'Molloy 49: áthúil' → {work:"Molloy", page:"49", forms:["áthúil"]}
      - 'Keat. breódhaim, breóghaim' → {work:"Keating", forms:[...]}
      - 'O’R.' → {work:"O’Reilly"} (form optional)
      - '(Bk. of Deer)' → {work:"Book of Deer"}
    """
    out = []
    # Molloy with page and optional forms
    for m in re.finditer(r"\b(Molloy)\s+(\d+)(?::\s*([^);]+))?", chunk):
        work = WORK_MAP[m.group(1)]
        page = m.group(2)
        forms = []
        if m.group(3):
            forms = [f.strip() for f in re.split(r",\s*", m.group(3).strip()) if f.strip()]
        entry = {"work": work, "page": page}
        if forms: entry["forms"] = forms
        out.append(entry)
    # Other named works possibly followed by forms, no page required
    for key, label in WORK_MAP.items():
        if key == "Molloy":  # already handled
            continue
        # find occurrences like 'Keat. xxx' or '(Bk. of Deer)'
        for m in re.finditer(r"(?:\(|\b)"+re.escape(key)+r"(?:\)|\b)(?::\s*([^);]+))?", chunk):
            forms_blob = m.group(1)
            entry = {"work": label}
            if forms_blob:
                forms = [f.strip() for f in re.split(r",\s*", forms_blob.strip()) if f.strip()]
                if forms: entry["forms"] = forms
            # avoid duplicates (e.g., repeated parentheses)
            if entry not in out:
                out.append(entry)
    return out

def extract_easy_entries(volume: str, page: str, section: str, page_text: str) -> List[Dict[str, object]]:
    """
    Minimal-but-useful pass:
      - returns a list of dicts with `volume`, `page`, `raw`
      - plus: phonetic, alongside (from 'neben'), gloss, gender, see_section, etymology, source_refs
    """
    results = []
    for raw in split_top_level_semicolons(page_text):
        item = {
            "volume": volume,
            "page": page,
            "section": section,
            "raw": raw.strip()
        }
        head = parse_phonetic_head(raw)
        if head: item["phonetic"] = head
        alts = parse_neben(raw)
        if alts:
            # keep single string for simplest downstream merging
            item["alongside"] = alts[0] if len(alts) == 1 else alts
        gloss = parse_gloss(raw)
        if gloss: item["gloss"] = gloss
        gender = parse_gender(raw)
        if gender: item["gender"] = gender
        refs = parse_vgl_crossrefs(raw)
        if refs: item["see_section"] = refs
        ety = parse_etymology(raw)
        if ety: item["etymology"] = ety
        src = parse_sources(raw)
        if src: item["source_refs"] = src
        results.append(item)
    return results

# --- example ---
if __name__ == "__main__":
    sample = """''bēl'' (neben ''biəl'') „beil“, air. biail; ''āwul āwl̥'' „glücklich“, ághamhal (Molloy 49: áthúil); ''dauəx daux'', f., „fass“ (vgl. II 251, 15. 266, 5), aschott. dabach (Bk. of Deer)"""
    for e in extract_easy_entries("I", "4", "4", sample):
        print(e)


{'volume': 'I', 'page': '4', 'section': '4', 'raw': "''bēl'' (neben ''biəl'') „beil“, air. biail", 'phonetic': ['bēl'], 'alongside': 'biəl', 'gloss': 'beil', 'etymology': [{'language': 'Old Irish', 'form': 'biail'}]}
{'volume': 'I', 'page': '4', 'section': '4', 'raw': "''āwul āwl̥'' „glücklich“, ághamhal (Molloy 49: áthúil)", 'phonetic': ['āwul', 'āwl̥'], 'gloss': 'glücklich', 'source_refs': [{'work': 'Molloy', 'page': '49', 'forms': ['áthúil']}]}
{'volume': 'I', 'page': '4', 'section': '4', 'raw': "''dauəx daux'', f., „fass“ (vgl. II 251, 15. 266, 5), aschott. dabach (Bk. of Deer)", 'phonetic': ['dauəx', 'daux'], 'gloss': 'fass', 'see_section': [{'volume': 'II', 'page': '251', 'line': '15'}, {'volume': 'II', 'page': '266', 'line': '5'}], 'etymology': [{'language': 'Old Scots', 'form': 'dabach'}], 'source_refs': [{'work': 'Book of Deer'}]}


In [6]:
P4 = """''ai aiə'' „gesicht“, air. aged; ''aiəl̄ ail̄'' „hitze“ (vgl. II 251, 14), air. adall?; ''aiən'' „kessel“, air. aigen; ''aiərk airk'' (vgl. II 251, 15) „horn, geweih“, air. adarc; ''auən̄ aun̄'' (vgl. II 251, 15) „fluss“, air. abann; ''ā'' „glück, darrofen, furt“, mir. ág, mir. áith, air. áth; ''āwul āwl̥'' „glücklich“, ághamhal (Molloy 49: áthúil); ''bau'' „bogen“, mir. boga, aengl. boga; ''bauər baur'' (vgl. II 251, 16) „taub“, air. bodar; ''bā'' „zuneigung“, mir. báid báde (Molloy 35: báighe); ''bā'' „ertränken, baden“ (Molloy 81: bághthadh), air. bádud; ''bāĭm'' „ertränke, mache nass“, mir. báidim; ''bēl'' „mund“, air. bél; ''bēl'' (neben ''biəl'') „beil“, air. biail; ''bēs bēsə'' (vgl. II 262, 26 und Molloy 33) „sitte, betragen, gewohnheit“, air. bés; ''bǡ'' „nahrung“ (vgl. II 263, 15), beathughadh, Keat., ''bĭaiəx'' „tier“, beathadhach, Keat., beide von beatha, air. bethu; ''bĭai bai'', 3. sing. fut. zu ''tāĭm'' „ich bin“, air. biaid bieid; ''bĭō'' „lebendig“ air. béo bíu; ''blā'' „blüte, blume“, mir. bláth; ''blāklī́ blāḱlī́'' „Dublin“, Baile-atha-cliath (die erste silbe als schwach betonte ist eingipflig); ''blāx'' „buttermilch“, mir. bláthach; ''blā'' „melken“, wohl neugebildete infinitivform zu bleaghaim, O’R., statt blighim nach bleaghan aus mir. blegon, dabei unter einfluss von ''blāx'' „buttermilch“ (''blān'' „melken“, mir. blegon, scheint eingipflig zu sein); ''blīm'' „melke“, mir. bligim; ''bō'' „kuh“, air. bó; ''bōhr̥'' „weg“, mir. bóthar; ''bōhŕīn'' „feldweg, gasse“, von ''bōhr̥''; ''brāhŕ̥'' „klosterbruder“, air. bráthir; ''brāx'' „jüngstes gericht“, air. bráth; ''brōg'' „schuh“, mir. bróc, aengl. bróc, anord. brókr; ''bŕǡ'' „schön“, mir. breagha (O’Clery); ''bŕǡxə bŕǡxcə'' „schönheit“, von ''bŕǡ''; ''bŕēg'' „lüge“, air. bréc; ''bŕēgān'' „spielzeug“, von ''bŕēg''; ''bŕēgəx'' „lügnerisch“, mir. brécach; ''bŕīȷ'' „Brigitta“, air. Brigit; ''bŕō'' „bedrücken“, Keat. breódh; ''bŕōĭm'' „bedrücke“, Keat. breódhaim, breóghaim; ''dauəx daux'', f., „fass“ (vgl. II 251, 15. 266, 5), aschott. dabach (Bk. of Deer); ''dauən daun'' „welt“ (vgl. II 251, 16), air. domun; ''dauən daun'' „tiefe“, air. domain; ''dəlī́ ȷlī'' (neben ''dəlíə'') „gesetz“ (vgl. II 251, 17), air. dliged; ''dəlū''? „kette, werfte“ (des gewebes) (vgl. II 266, 23), air. dlúth; ''dō'' „brennen“, Molloy 81: dóghadh, ''dōĭm'' „brenne, verbrenne“, dóghaim dóighim, mir. dóud dóim; ''ȷauəl ȷaul'' „teufel“, air. diabul; ''ȷēgə'' „göttlich“, diaga (Molloy 50) (Keat. diadha aus air. diade); ''ȷēŕḱə'' (nicht ''ȷēŕḱ'', wie II 80, 32) „almosen“, déirc (Molloy 35: déirce) mir. déarc, desheirc, air. dearc, deircc, desercc aus de-shercc"""

In [16]:
entries = []
for e in extract_easy_entries("I", "4", "4", P4):
    entries.append(e)


In [18]:
def enumerate_entries(entries: List[Dict[str, object]]) -> None:
    for i, entry in enumerate(entries):
        entry["id"] = f"{entry['volume']}_{entry['section']}_{i + 1}"
enumerate_entries(entries)

In [19]:
import json
print(json.dumps(entries, indent=2, ensure_ascii=False))

[
  {
    "volume": "I",
    "page": "4",
    "section": "4",
    "raw": "''ai aiə'' „gesicht“, air. aged",
    "phonetic": [
      "ai",
      "aiə"
    ],
    "gloss": "gesicht",
    "etymology": [
      {
        "language": "Old Irish",
        "form": "aged"
      }
    ],
    "id": "I_4_1"
  },
  {
    "volume": "I",
    "page": "4",
    "section": "4",
    "raw": "''aiəl̄ ail̄'' „hitze“ (vgl. II 251, 14), air. adall?",
    "phonetic": [
      "aiəl̄",
      "ail̄"
    ],
    "gloss": "hitze",
    "see_section": [
      {
        "volume": "II",
        "page": "251",
        "line": "14"
      }
    ],
    "etymology": [
      {
        "language": "Old Irish",
        "form": "adall?"
      }
    ],
    "id": "I_4_2"
  },
  {
    "volume": "I",
    "page": "4",
    "section": "4",
    "raw": "''aiən'' „kessel“, air. aigen",
    "phonetic": [
      "aiən"
    ],
    "gloss": "kessel",
    "etymology": [
      {
        "language": "Old Irish",
        "form": "aigen"
      }
   