In [1]:
example_text = '''
{{a|Ulster}} {{IPA|ga|/mˠaːsˠ/|/mˠaːʃ/|qual2=before {{m|ga|é}}, {{m|ga|ea}}, {{m|ga|í}}, {{m|ga|iad}} and their emphatic equivalents}}
{{a|Galway}} {{IPA|ga|/lʲoːbˠ/}} {{a|corresponding to the spelling {{m|ga|leob}}}}<ref>{{R:ga:Finck|I|196}}</ref><ref>{{R:ga:GCFD|308}}</ref>
{{IPA|ga|/n̪ˠõːsˠ/|ref={{R:ga:Quiggin|17}}}}
'''

In [14]:
!pip install lark

Collecting lark
  Downloading lark-1.2.2-py3-none-any.whl (111 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lark
Successfully installed lark-1.2.2


In [25]:
from lark import Lark, Transformer, v_args
import json

grammar = r"""
start: template

template: "{{" name ( "|" param )* "}}"

param: key "=" value   -> named_param
     | value           -> positional_param

?value: (template | INLINE_TEXT)+

INLINE_TEXT: /[^{}|=]+(?=(}}|\|)|$)/

key: /[a-zA-Z0-9_]+/
name: /[a-zA-Z0-9_:]+/

%import common.WS
%ignore WS
"""

@v_args(inline=True)
class WiktionaryTransformer(Transformer):
    def template(self, name, *params):
        return {"template": name, "args": list(params)}

    def named_param(self, key, value):
        return {key: value if isinstance(value, list) else [value]}

    def positional_param(self, value):
        return value if isinstance(value, list) else [value]

    def INLINE_TEXT(self, token):
        return token.value

    def key(self, token):
        return token.value

    def name(self, token):
        return token.value

def parse_wiktionary(text):
    parser = Lark(grammar, start="start", parser="lalr")
    tree = parser.parse(text)
    return WiktionaryTransformer().transform(tree)

input_text = "{{IPA|ga|/x/|qual2=before {{m|ga|é}}, {{m|ga|ea}} and {{m|ga|í}}}}"
result = parse_wiktionary(input_text)
print(json.dumps(result, indent=2, ensure_ascii=False))


UnexpectedToken: Unexpected token Token('__ANON_3', 'before') at line 1, column 20.
Expected one of: 
	* "{{"
	* INLINE_TEXT
Previous tokens: [Token('EQUAL', '=')]


In [38]:
from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
import json

grammar = Grammar(
    r"""
    block           = (template / ref / text)+
    template        = "{{" name ( "|" param )* "}}"
    ref             = "<ref>" template "</ref>"
    param           = named / positional
    named           = key "=" value
    positional      = value
    value           = (template / text)+
    name            = ~r"[a-zA-Z0-9:_]+"
    key             = ~r"[a-zA-Z0-9_]+"
    text            = ~r"[^\{\}\|\=<]+"
    """
)

class ParseTreeVisitor(NodeVisitor):
    def visit_block(self, node, children):
        return children

    def visit_template(self, node, children):
        _, name, *param_parts, _ = children
        params = [p for part in param_parts if part for p in (part if isinstance(part, list) else [part])]
        return {"template": name.text, "args": params}

    def visit_ref(self, node, children):
        return {"type": "ref", "content": children[0]}

    def visit_param(self, node, children):
        return children[0]

    def visit_named(self, node, children):
        key, _, value = children
        return {key.text: value}

    def visit_positional(self, node, children):
        return children[0]

    def visit_value(self, node, children):
        return children

    def visit_text(self, node, _):
        return node.text

    def generic_visit(self, node, visited_children):
        return visited_children or node

def flatten(x):
    if isinstance(x, str):
        return x
    elif isinstance(x, dict):
        if x.get("template") and x.get("args"):
            return {
                "template": x["template"],
                "args": [flatten(a) for a in x["args"]]
            }
        return x
    elif isinstance(x, list):
        flat = []
        for item in x:
            f = flatten(item)
            if isinstance(f, list):
                flat.extend(f)
            else:
                flat.append(f)
        return flat
    return x

def render_qualifier(value):
    parts = []
    for v in value:
        if isinstance(v, str):
            parts.append(v)
        elif isinstance(v, dict) and v.get("template") == "m":
            parts.append(flatten(v["args"])[-1])
    return "".join(parts).strip()

def normalize(parsed):
    out = []
    current_ipa = None

    for item in parsed:
        if isinstance(item, dict) and item.get("template") == "IPA":
            variants = []
            qual_map = {}
            ref_map = {}
            for i, arg in enumerate(item["args"]):
                if isinstance(arg, dict) and len(arg) == 1:
                    k, v = list(arg.items())[0]
                    if k.startswith("qual"):
                        idx = int(k[4:]) - 1
                        qual_map[idx] = flatten(v)
                    elif k.startswith("ref"):
                        idx = int(k[3:]) - 1
                        ref_map.setdefault(idx, []).append(flatten(v))
                elif isinstance(arg, str) and arg.startswith("/"):
                    variants.append({"ipa": arg.strip("/")})
                elif isinstance(arg, list):
                    for val in arg:
                        if isinstance(val, str) and val.startswith("/"):
                            variants.append({"ipa": val.strip("/")})

            for i, var in enumerate(variants):
                if i in qual_map:
                    var["qualifier"] = render_qualifier(qual_map[i])
                if i in ref_map:
                    var["refs"] = ref_map[i]
            current_ipa = {"type": "ipa", "variants": variants}
            out.append(current_ipa)

        elif isinstance(item, dict) and item.get("type") == "ref":
            if current_ipa and current_ipa["variants"]:
                current_ipa["variants"][-1].setdefault("refs", []).append(flatten(item["content"]))

    return out

def parse_and_normalize(text):
    tree = grammar.parse(text)
    parsed = ParseTreeVisitor().visit(tree)
    return json.dumps(normalize(parsed), indent=2, ensure_ascii=False)

test_input = "{{IPA|ga|/x/}}"
print(parse_and_normalize(test_input))


[]
