In [2]:
!pip install Lark

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [24]:
from lark import Lark, Tree, Token

# EBNF meta-grammar written in Lark's grammar format.
#   rule        ID "=" expression "."      (every rule ends with a period)
#   expression  alternatives separated by "|"
#   term        one or more factors in sequence
#   factor      identifier, quoted string, "(...)" group, "[...]" optional,
#               "{...}" repetition, or a range written as STRING ".." STRING
# STRING accepts both double-quoted and single-quoted literals; whitespace is ignored.
ebnf_grammar = """
    start: rule+
    rule: ID "=" expression "."
    expression: term ("|" term)*
    term: factor+
    factor: ID
          | STRING
          | grouping
          | optional
          | repetition
          | range
    grouping: "(" expression ")"
    optional: "[" expression "]"
    repetition: "{" expression "}"
    ID: /[a-zA-Z_][a-zA-Z0-9_]*/
    STRING: (/"[^"]*"/) | (/'[^']*'/)
    range: STRING "." "." STRING
    %import common.WS
    %ignore WS
"""

# Build the parser; no `parser=` argument is given, so Lark's default algorithm is used.
ebnf_parser = Lark(ebnf_grammar, start='start')

# The example EBNF to parse is defined in the next cell.




In [25]:
# Sample input for the converter, written in the EBNF notation that
# `ebnf_grammar` accepts (rules end with "."; [..] optional; {..} repetition).
# Upper-case words that are never defined as rules (MODULE, BEGIN, ...) are keywords.
# NOTE(review): the rules look like the Component Pascal syntax — confirm the source.
example_ebnf = """
Module 	=	MODULE ident ";" [ImportList] DeclSeq [BEGIN StatementSeq] [CLOSE StatementSeq] END ident ".".
ImportList 	=	IMPORT [ident ":="] ident {"," [ident ":="] ident} ";".
DeclSeq 	=	{ CONST {ConstDecl ";" } | TYPE {TypeDecl ";"} | VAR {VarDecl ";"}} {ProcDecl ";" | ForwardDecl ";"}.
ConstDecl	=	IdentDef "=" ConstExpr.
TypeDecl	=	IdentDef "=" Type.
VarDecl	=	IdentList ":" Type.
ProcDecl 	=	PROCEDURE [Receiver] IdentDef [FormalPars] MethAttributes [";" DeclSeq [BEGIN StatementSeq] END ident].
MethAttributes	=	["," NEW] ["," (ABSTRACT | EMPTY | EXTENSIBLE)].
ForwardDecl 	=	PROCEDURE " ^ " [Receiver] IdentDef [FormalPars] MethAttributes.
FormalPars 	=	"(" [FPSection {";" FPSection}] ")" [":" Type].
FPSection 	=	[VAR | IN | OUT] ident {"," ident} ":" Type.
Receiver	=	"(" [VAR | IN] ident ":" ident ")".
Type 	=	Qualident | ARRAY [ConstExpr {"," ConstExpr}] OF Type | [ABSTRACT | EXTENSIBLE | LIMITED]| RECORD ["("Qualident")"] FieldList {";" FieldList} END | POINTER TO Type | PROCEDURE [FormalPars].
FieldList 	=	[IdentList ":" Type].
StatementSeq	=	Statement {";" Statement}.
Statement 	=	[ Designator ":=" Expr | Designator ["(" [ExprList] ")"] | IF Expr THEN StatementSeq {ELSIF Expr THEN StatementSeq}	[ELSE StatementSeq] END | CASE Expr OF Case {"|" Case} [ELSE StatementSeq] END | WHILE Expr DO StatementSeq END | REPEAT StatementSeq UNTIL Expr 
		| FOR ident ":=" Expr TO Expr [BY ConstExpr] DO StatementSeq END | LOOP StatementSeq END | WITH [ Guard DO StatementSeq ] {"|" [ Guard DO StatementSeq ] } [ELSE StatementSeq] END | EXIT | RETURN [Expr] ].
Case 	=	[CaseLabels {"," CaseLabels} ":" StatementSeq].
CaseLabels 	=	ConstExpr [".." ConstExpr].
Guard	=	Qualident ":" Qualident.
ConstExpr	=	Expr.
Expr 	=	SimpleExpr [Relation SimpleExpr].
SimpleExpr	=	["+" | "-"] Term {AddOp Term}.
Term 	=	Factor {MulOp Factor}.
Factor 	=	Designator | number | character | string | NIL | Set | "(" Expr ")" | " ~ " Factor.
Set	=	"{" [Element {"," Element}] "}".
Element 	=	Expr [".." Expr].
Relation 	=	"=" | "#" | "<" | "<=" | ">" | ">=" | IN | IS.
AddOp 	=	"+" | "-" | OR.
MulOp 	=	" * " | "/" | DIV | MOD | "&".
Designator 	=	Qualident {"." ident | "[" ExprList "]" | " ^ " | "(" Qualident ")" | "(" [ExprList] ")"} [ "$" ].
ExprList 	=	Expr {"," Expr}.
IdentList 	=	IdentDef {"," IdentDef}.
Qualident 	=	[ident "."] ident.
IdentDef 	=	ident [" * " | "-"].

ident = (letter | "_") {letter | "_" | digit}.
letter = "A" .. "Z" | "a" .. "z" | UnicodeLetter.
digit 	= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9".
number 	= integer | real.
integer 	= digit {digit} | digit {hexDigit} ( "H" | "L" ).
real 	= digit {digit} "." {digit} [ScaleFactor].
ScaleFactor 	= "E" ["+" | "-"] digit {digit}.
hexDigit 	= digit | "A" | "B" | "C" | "D" | "E" | "F".
character  = digit {hexDigit} "X".
string  = ' " ' {char} ' " ' | " ' " {char} " ' ".
"""

In [26]:
# Parse the sample grammar; `parsed` is the tree that the rebuild_* functions walk.
parsed = ebnf_parser.parse(example_ebnf)
# Debug aid: uncomment the next line to pretty-print the parse tree.
#print(parsed.pretty())

In [38]:
# Functions that rebuild grammar text from the parse tree

def rebuild_ebnf(tree):
    """Render every top-level node of the parse tree, one per output line.

    The names of all ``rule`` children are collected first so that the
    per-node renderers can tell references to defined rules apart from
    other identifiers.

    Args:
        tree: root of the parse tree; each child exposes ``.data`` and ``.children``.

    Returns:
        The rendered nodes joined with newlines.
    """
    rule_names = [node.children[0].value for node in tree.children if node.data == 'rule']

    lines = [
        rebuild_rule(node, rule_names)
        if node.data == "rule"
        else rebuild_expression(node, rule_names)
        for node in tree.children
    ]
    return "\n".join(lines)

def rebuild_rule(rule_node, rule_names):
    """Render one ``rule`` node as ``<name> ::= <expression>``.

    Args:
        rule_node: node whose first child is the rule-name token and whose
            second child is the expression subtree.
        rule_names: names of all defined rules, forwarded to the expression renderer.

    Returns:
        The rule on a single line, without a terminating period.
    """
    name_token = rule_node.children[0]
    body = rule_node.children[1]
    return "{} ::= {}".format(name_token.value, rebuild_expression(body, rule_names))

def rebuild_expression(expression_node, rule_names, wrap = False):
    """Render an expression: its terms joined by `` | ``.

    Args:
        expression_node: node whose children are term subtrees.
        rule_names: names of all defined rules, forwarded to the term renderer.
        wrap: when true, enclose the result in parentheses.

    Returns:
        The alternatives as one string, parenthesised only if ``wrap`` is set.
    """
    alternatives = " | ".join(
        rebuild_term(term, rule_names) for term in expression_node.children
    )
    if wrap:
        return "(" + alternatives + ")"
    return alternatives

def rebuild_term(term_node, rule_names):
    """Render a term: its factors in order, separated by single spaces.

    Args:
        term_node: node whose children are factor subtrees.
        rule_names: names of all defined rules, forwarded to the factor renderer.

    Returns:
        The space-joined factor strings.
    """
    return " ".join(rebuild_factor(factor, rule_names) for factor in term_node.children)

def rebuild_optional(node, rule_names):
    """Render an ``optional`` node as ``(<expression>)?``."""
    inner = rebuild_expression(node.children[0], rule_names, wrap=False)
    return "(" + inner + ")?"
def rebuild_grouping(node, rule_names):
    """Render a ``grouping`` node as ``(<expression>)``."""
    inner = rebuild_expression(node.children[0], rule_names, wrap=False)
    return "(" + inner + ")"
def rebuild_repetition(node, rule_names):
    """Render a ``repetition`` node as ``(<expression>)*``."""
    inner = rebuild_expression(node.children[0], rule_names, wrap=False)
    return "(" + inner + ")*"
def rebuild_range(node, rule_names):
    """Render a ``range`` node as a character class such as ``[A-Z]``.

    Args:
        node: range node; ``children[0]`` and ``children[1]`` are the string
            tokens for the lower and upper bound, quotes included.
        rule_names: unused; kept for signature parity with the other node renderers.

    Returns:
        ``"[<low>-<high>]"`` with the delimiting quotes of each bound removed.
    """
    def unquote(text):
        # The grammar's STRING accepts both "..." and '...'. Drop exactly one
        # matching pair of delimiters so a quote character *inside* the literal
        # (e.g. '"') is preserved.
        if len(text) >= 2 and text[0] == text[-1] and text[0] in "\"'":
            return text[1:-1]
        return text

    low = unquote(node.children[0].value)
    high = unquote(node.children[1].value)
    # Plain concatenation: reusing the enclosing quote character inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    return "[" + low + "-" + high + "]"
def rebuild_factor(factor_node, rule_names):
    """Render a single ``factor`` node.

    Subtrees (optional / grouping / repetition / range) are delegated to their
    dedicated renderer. Tokens are rendered as follows:

    * a quoted string literal (token type ``STRING``) is emitted unchanged,
      whichever quote style it uses;
    * an identifier that names a defined rule is emitted as-is;
    * any other identifier is treated as a keyword and wrapped in double quotes.

    Args:
        factor_node: node whose children are the factor's content.
        rule_names: names of all rules defined in the grammar.

    Returns:
        The rendered factor.

    Raises:
        ValueError: if a subtree has a kind with no renderer.
        TypeError: if a child is neither a ``Tree`` nor a ``Token``.
    """
    subtree_renderers = {
        "optional": rebuild_optional,
        "grouping": rebuild_grouping,
        "repetition": rebuild_repetition,
        "range": rebuild_range,
    }

    parts = []
    for child in factor_node.children:
        if isinstance(child, Tree):
            renderer = subtree_renderers.get(child.data)
            if renderer is None:
                # Dropping an unknown subtree would silently corrupt the output.
                raise ValueError(f"unsupported factor subtree: {child.data!r}")
            parts.append(renderer(child, rule_names))
        elif isinstance(child, Token):
            # Decide by token type, not by the first character: single-quoted
            # literals are STRING tokens too and must not be re-quoted as keywords.
            if child.type == "STRING" or child.value in rule_names:
                parts.append(child.value)
            else:
                parts.append(f"\"{child.value}\"")
        else:
            raise TypeError(f"unexpected factor child: {child!r}")
    return "".join(parts)
    



In [39]:
# Rebuild the grammar text from the parse tree and print it.
# Reference notes comparing source rules, the desired output and the current result.
# The desired form also inserts explicit whitespace nonterminals (ws / wsc),
# which the current result does not contain.
# original: Module = MODULE ident ";" [ImportList] DeclSeq [BEGIN StatementSeq] [CLOSE StatementSeq] END ident ".".
# expected: Module ::= "MODULE" ws ident ";" wsc (ImportList)? DeclSeq wsc ("BEGIN" StatementSeq)? ("CLOSE" StatementSeq)? "END" ws ident "."
# original: Type = Qualident | ARRAY [ConstExpr {"," ConstExpr}] OF Type | [ABSTRACT | EXTENSIBLE | LIMITED]| RECORD ["("Qualident")"] FieldList {";" FieldList} END | POINTER TO Type | PROCEDURE [FormalPars].
# expected: Type ::= Qualident | ("ARRAY" (ws ConstExpr ("," ConstExpr)*)? ws "OF" ws Type) | (("ABSTRACT" | "EXTENSIBLE" | "LIMITED") ws)? "RECORD" ws ("(" Qualident ")")? ws FieldList (";" wsc FieldList)* wsc "END" | "POINTER" ws "TO" ws Type | "PROCEDURE" (FormalPars)?
# result:   Type ::= Qualident | "ARRAY" (ConstExpr ("," ConstExpr)*)? "OF" Type | ("ABSTRACT" | "EXTENSIBLE" | "LIMITED")? | "RECORD" ("(" Qualident ")")? FieldList (";" FieldList)* "END" | "POINTER" "TO" Type | "PROCEDURE" (FormalPars)?


rebuilt_ebnf = rebuild_ebnf(parsed)
print(rebuilt_ebnf)

Module ::= "MODULE" ident ";" (ImportList)? DeclSeq ("BEGIN" StatementSeq)? ("CLOSE" StatementSeq)? "END" ident "."
ImportList ::= "IMPORT" (ident ":=")? ident ("," (ident ":=")? ident)* ";"
DeclSeq ::= ("CONST" (ConstDecl ";")* | "TYPE" (TypeDecl ";")* | "VAR" (VarDecl ";")*)* (ProcDecl ";" | ForwardDecl ";")*
ConstDecl ::= IdentDef "=" ConstExpr
TypeDecl ::= IdentDef "=" Type
VarDecl ::= IdentList ":" Type
ProcDecl ::= "PROCEDURE" (Receiver)? IdentDef (FormalPars)? MethAttributes (";" DeclSeq ("BEGIN" StatementSeq)? "END" ident)?
MethAttributes ::= ("," "NEW")? ("," ("ABSTRACT" | "EMPTY" | "EXTENSIBLE"))?
ForwardDecl ::= "PROCEDURE" " ^ " (Receiver)? IdentDef (FormalPars)? MethAttributes
FormalPars ::= "(" (FPSection (";" FPSection)*)? ")" (":" Type)?
FPSection ::= ("VAR" | "IN" | "OUT")? ident ("," ident)* ":" Type
Receiver ::= "(" ("VAR" | "IN")? ident ":" ident ")"
Type ::= Qualident | "ARRAY" (ConstExpr ("," ConstExpr)*)? "OF" Type | ("ABSTRACT" | "EXTENSIBLE" | "LIMITED")? | "RE