In [1]:
example_text = '''
{{a|Ulster}} {{IPA|ga|/mˠaːsˠ/|/mˠaːʃ/|qual2=before {{m|ga|é}}, {{m|ga|ea}}, {{m|ga|í}}, {{m|ga|iad}} and their emphatic equivalents}}
{{a|Galway}} {{IPA|ga|/lʲoːbˠ/}} {{a|corresponding to the spelling {{m|ga|leob}}}}<ref>{{R:ga:Finck|I|196}}</ref><ref>{{R:ga:GCFD|308}}</ref>
{{IPA|ga|/n̪ˠõːsˠ/|ref={{R:ga:Quiggin|17}}}}
'''

In [14]:
!pip install lark

Collecting lark
  Downloading lark-1.2.2-py3-none-any.whl (111 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lark
Successfully installed lark-1.2.2


In [26]:
from lark import Lark, Transformer, v_args
import json

grammar = r"""
start: template

template: "{{" name ( "|" param )* "}}"

param: key "=" value   -> named_param
     | value           -> positional_param

?value: (template | INLINE_TEXT)+

INLINE_TEXT: /(.+?)(?={{|}}|\|)/

key: /[a-zA-Z0-9_]+/
name: /[a-zA-Z0-9_:]+/

%import common.WS
%ignore WS
"""

@v_args(inline=True)
class WiktionaryTransformer(Transformer):
    def template(self, name, *params):
        return {"template": name, "args": list(params)}

    def named_param(self, key, value):
        return {key: value if isinstance(value, list) else [value]}

    def positional_param(self, value):
        return value if isinstance(value, list) else [value]

    def INLINE_TEXT(self, token):
        return token.value

    def key(self, token):
        return token.value

    def name(self, token):
        return token.value

def parse_wiktionary(text):
    parser = Lark(grammar, start="start", parser="lalr")
    tree = parser.parse(text)
    return WiktionaryTransformer().transform(tree)

input_text = "{{IPA|ga|/x/|qual2=before {{m|ga|é}}, {{m|ga|ea}} and {{m|ga|í}}}}"
result = parse_wiktionary(input_text)
print(json.dumps(result, indent=2, ensure_ascii=False))


TypeError: Object of type Tree is not JSON serializable

In [27]:
result

Tree(Token('RULE', 'start'), [{'template': 'IPA', 'args': [[Tree(Token('RULE', 'value'), ['ga', '|/x/', '|qual2=before ', '{{m', '|ga', '|é', '}}, ', '{{m', '|ga', '|ea', '}} and ', '{{m', '|ga', '|í', '}', '}'])]]}])