In [1]:
example_text = '''
{{a|Ulster}} {{IPA|ga|/mˠaːsˠ/|/mˠaːʃ/|qual2=before {{m|ga|é}}, {{m|ga|ea}}, {{m|ga|í}}, {{m|ga|iad}} and their emphatic equivalents}}
{{a|Galway}} {{IPA|ga|/lʲoːbˠ/}} {{a|corresponding to the spelling {{m|ga|leob}}}}<ref>{{R:ga:Finck|I|196}}</ref><ref>{{R:ga:GCFD|308}}</ref>
{{IPA|ga|/n̪ˠõːsˠ/|ref={{R:ga:Quiggin|17}}}}
'''

In [62]:
REFERENCE_TEMPLATES = {
    "Quiggin": {
        "positional": ["page", "section"],
    },
    "Finck": {
        "positional": ["volume", "page"],
        "named": ["section"]
    },
    "SjPh": {
        "positional": ["page", "page_end"],
        "named": ["section"]
    },
    "Achill": {
        "named": ["page", "section"]
    },
    "GCFD": {
        "positional": ["section"],
        "named": ["page", "edition"]
    },
    "IGT i": {
        "positional": ["page", "page_end", "section", "section_end", "text"],
        "named": ["page", "pages", "section", "sections", "text"]
    },
    "IGT ii": {
        "positional": ["page", "page_end", "section", "section_end"],
        "named": ["page", "pages", "section", "sections", "part"]
    },
    "IGT iii": {
        "positional": ["page", "page_end", "section", "section_end"],
        "named": ["page", "pages", "section"]
    },
    "TBhU": {
        "positional": ["page"],
        "named": ["section", "passage"]
    },
    "Ó Searcaigh": {
        "positional": ["section"],
        "named": ["page"]
    },
    "Ó Dónaill": {
        "positional": ["term"]
    },
    "Muskerry": {
        "named": ["page", "section", "line"]
    },
    "Torr": {
        "named": ["page", "section"]
    },
    "Tourmakeady": {
        "named": ["page", "section"]
    },
    "Corpas": {
        "positional": ["search_type", "term"],
        "named": ["pos"]
    },
    "DIL": {
        "positional": ["entry_id"],
        "named": ["head"]
    },
    "TYI": {
        "positional": ["page"]
    },
    "Ros Goill": {
        "named": ["page", "section"]
    },
    "Ros Muc": {
        "positional": ["page"]
    },
    "SnaG": {
        "named": ["page", "pages", "chapter", "section", "text", "t"]
    },
    "IPD": {
        "positional": ["term"]
    },
    "NEID": {
        "positional": ["term"]
    },
    "Foras": {
        "positional": ["term"]
    },
    "GCD": {
        "positional": ["section"],
        "named": ["page"]
    },
    "FFG": {
        "positional": ["page"]
    },
    "MacBain": {
        "positional": ["page", "page_end"],
        "named": ["head"]
    },
    "Gaelic Personal Names": {
        "positional": ["page"]
    },
    "Greene 1976": {
        "positional": ["term"]
    },
    "EID": {
        "positional": ["term"]
    },
    "EDPC": {
        "named": ["page", "pages", "head"]
    },
    "KPV": {
        "named": ["page", "pages", "head"]
    },
    "Breatnach": {
        "positional": ["page"],
        "named": ["page", "pages", "section", "line"]
    },
    "Hughes": {
        "positional": ["page"],
        "named": ["chapter", "passage"]
    },
}


In [64]:
from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
import json

# PEG grammar
grammar = Grammar(
    r"""
    block           = (template / ref / text)+
    template        = "{{" name ( "|" param )* "}}"
    ref             = "<ref>" template "</ref>"
    param           = named / positional
    named           = key "=" value
    positional      = value
    value           = (template / text)+
    name            = ~r"[a-zA-Z0-9:_]+"
    key             = ~r"[a-zA-Z0-9_]+"
    text            = ~r"[^\{\}\|\=<]+"
    """
)

# Parse tree visitor
class ParseTreeVisitor(NodeVisitor):
    def visit_block(self, node, children):
        return [item for child in children for item in (child if isinstance(child, list) else [child]) if item]

    def visit_template(self, node, children):
        _, name, *param_parts, _ = children
        params = []
        for part in param_parts:
            if isinstance(part, tuple) and len(part) == 2:
                _, param = part
                params.append(param)
            elif part:
                params.append(part)
        return {"template": name, "args": params}

    def visit_ref(self, node, children):
        _, content, _ = children
        return {"type": "ref", "content": content}

    def visit_param(self, node, children):
        return children[0]

    def visit_named(self, node, children):
        key, _, value = children
        return {key: value}

    def visit_positional(self, node, children):
        return children[0]

    def visit_value(self, node, children):
        return children

    def visit_name(self, node, _):
        return node.text.strip()

    def visit_key(self, node, _):
        return node.text.strip()

    def visit_text(self, node, _):
        return node.text.strip()

    def generic_visit(self, node, visited_children):
        return visited_children or node.text

# Flatten utility
def flatten(x):
    if isinstance(x, str):
        return x
    elif isinstance(x, dict):
        if x.get("template") and x.get("args"):
            return {
                "template": x["template"],
                "args": [flatten(a) for a in x["args"]]
            }
        return x
    elif isinstance(x, list):
        flat = []
        for item in x:
            f = flatten(item)
            if isinstance(f, list):
                flat.extend(f)
            else:
                flat.append(f)
        return flat
    return x

# Qualifier renderer
def render_qualifier(value):
    parts = []
    for v in value:
        if isinstance(v, str):
            parts.append(v)
        elif isinstance(v, dict) and v.get("template") == "m":
            args = flatten(v["args"])
            parts.append(args[-1])
    return "".join(parts).strip()

def normalize_reference(template):
    if not isinstance(template, dict) or template.get("template") is None:
        return {"raw": flatten(template)}

    template_name = template["template"]
    flat_args = flatten(template.get("args", []))
    raw_named = [arg for arg in flat_args if isinstance(arg, dict)]
    raw_positional = [arg for arg in flat_args if isinstance(arg, str)]

    # Strip common prefixes
    source = re.sub(r"^R:(?:ga:|ghc:|cel:)?", "", template_name)
    result = {"source": source}

    spec = REFERENCE_TEMPLATES.get(source)

    if spec:
        pos_keys = spec.get("positional", [])
        for i, key in enumerate(pos_keys):
            if i < len(raw_positional):
                result[key] = raw_positional[i]

        for arg in raw_named:
            for k, v in arg.items():
                if "named" not in spec or k in spec["named"]:
                    result[k] = v
    else:
        # fallback
        result["args"] = flat_args

    return result


# Dialect extractor
def extract_dialects(parsed):
    dialects = []
    for item in parsed:
        if isinstance(item, dict) and item.get("template") == "a":
            flat_args = flatten(item["args"])
            for arg in flat_args:
                if isinstance(arg, str) and arg.lower() != "also":
                    dialects.append(arg)
    return dialects

# Final normalization logic
def normalize_with_dialect(parsed):
    out = []
    current_ipa = None
    dialects = extract_dialects(parsed)

    for item in parsed:
        if isinstance(item, dict) and item.get("template") == "IPA":
            variants = []
            qual_map = {}
            ref_map = {}
            args = flatten(item["args"])
            for i, arg in enumerate(args):
                if isinstance(arg, dict) and len(arg) == 1:
                    k, v = list(arg.items())[0]
                    if k.startswith("qual") and len(k) > 4 and k[4:].isdigit():
                        idx = int(k[4:]) - 1
                        qual_map[idx] = flatten(v)
                    elif k.startswith("ref") and len(k) > 3 and k[3:].isdigit():
                        idx = int(k[3:]) - 1
                        ref_map.setdefault(idx, []).append(flatten(v))
                    elif k == "ref":
                        idx = len(variants) - 1
                        ref_map.setdefault(idx, []).append(flatten(v))
                elif isinstance(arg, str) and arg.startswith("/"):
                    variants.append({"ipa": arg.strip("/")})
                elif isinstance(arg, list):
                    for val in arg:
                        if isinstance(val, str) and val.startswith("/"):
                            variants.append({"ipa": val.strip("/")})

            for i, var in enumerate(variants):
                if i in qual_map:
                    var["qualifier"] = render_qualifier(qual_map[i])
                if i in ref_map:
                    var["refs"] = [normalize_reference(r) for r in ref_map[i]]
                if dialects:
                    var["dialects"] = dialects
            current_ipa = {"type": "ipa", "variants": variants}
            out.append(current_ipa)

        elif isinstance(item, dict) and item.get("type") == "ref":
            if current_ipa and current_ipa["variants"]:
                current_ipa["variants"][-1].setdefault("refs", []).append(normalize_reference(item["content"]))

    return out

# Main entry point
def parse_and_normalize_with_dialects(text):
    tree = grammar.parse(text)
    parsed = ParseTreeVisitor().visit(tree)
    print("=== PARSED STRUCTURE ===")
    print(json.dumps(parsed, indent=2, ensure_ascii=False))
    print("=========================")
    result = normalize_with_dialect(parsed)
    return json.dumps(result, indent=2, ensure_ascii=False)

text = "{{a|Ulster|Mayo}}{{IPA|ga|/x/|ref={{R:ga:Quiggin|17}}}}<ref>{{R:ga:Finck|I|196}}</ref>"
print(parse_and_normalize_with_dialects(text))


=== PARSED STRUCTURE ===
[
  {
    "template": "a",
    "args": [
      [
        [
          "|",
          [
            [
              "Ulster"
            ]
          ]
        ],
        [
          "|",
          [
            [
              "Mayo"
            ]
          ]
        ]
      ]
    ]
  },
  {
    "template": "IPA",
    "args": [
      [
        [
          "|",
          [
            [
              "ga"
            ]
          ]
        ],
        [
          "|",
          [
            [
              "/x/"
            ]
          ]
        ],
        [
          "|",
          {
            "ref": [
              [
                {
                  "template": "R:ga:Quiggin",
                  "args": [
                    [
                      [
                        "|",
                        [
                          [
                            "17"
                          ]
                        ]
                      ]
                  

In [65]:
result = parse_and_normalize_with_dialects("{{a|Ulster|Mayo}}{{IPA|ga|/x/|ref={{R:ga:Quiggin|17}}}}<ref>{{R:ga:Finck|I|196}}</ref>")
print(result)

=== PARSED STRUCTURE ===
[
  {
    "template": "a",
    "args": [
      [
        [
          "|",
          [
            [
              "Ulster"
            ]
          ]
        ],
        [
          "|",
          [
            [
              "Mayo"
            ]
          ]
        ]
      ]
    ]
  },
  {
    "template": "IPA",
    "args": [
      [
        [
          "|",
          [
            [
              "ga"
            ]
          ]
        ],
        [
          "|",
          [
            [
              "/x/"
            ]
          ]
        ],
        [
          "|",
          {
            "ref": [
              [
                {
                  "template": "R:ga:Quiggin",
                  "args": [
                    [
                      [
                        "|",
                        [
                          [
                            "17"
                          ]
                        ]
                      ]
                  

In [66]:
# res = parse_and_normalize_with_dialects(text)
print(res)

[
  {
    "type": "ipa",
    "variants": [
      {
        "ipa": "x",
        "refs": [
          {
            "raw": [
              {
                "template": "R:ga:Quiggin",
                "args": [
                  [
                    "|",
                    "17"
                  ]
                ]
              }
            ]
          },
          {
            "source": "R:ga:Finck",
            "book": "|",
            "page": "I",
            "extra": [
              "|",
              "196"
            ]
          }
        ],
        "dialects": [
          "|",
          "Ulster",
          "|",
          "Mayo"
        ]
      }
    ]
  }
]
