In [91]:
from lark import Lark, UnexpectedEOF

# Lark grammar for an Oberon / Component Pascal style module syntax.
#
# Notes on the literals and alternatives below:
#   * Operator literals ("^", "*", "~") are written WITHOUT surrounding blanks.
#     Whitespace is already discarded through `%ignore WS`, and a padded literal
#     such as " * " would only match when the source text happens to contain
#     exactly one blank on each side (so `a*b`, `p^` or `Foo*` would be rejected).
#   * In `type`, the optional ABSTRACT / EXTENSIBLE / LIMITED attribute is a
#     prefix of the RECORD alternative; it is not an alternative of its own
#     (which would also have allowed an empty type).
#   * COMMENT and WS are skipped everywhere via `%ignore`.
ebnf_grammar = """
    document: module ANY
    module: "MODULE" IDENT ";" (import_list)? decl_seq ("BEGIN" statement_seq)? ("CLOSE" statement_seq)? "END" IDENT "."
    import_list: "IMPORT" (IDENT ":=")? IDENT ("," (IDENT ":=")? IDENT)* ";"
    decl_seq: ("CONST" (const_decl ";")* | "TYPE" (type_decl ";")* | "VAR" (var_decl ";")*)* (proc_decl ";" | forward_decl ";")*
    const_decl: ident_def "=" const_expr
    type_decl: ident_def "=" type
    var_decl: ident_list ":" type
    proc_decl: "PROCEDURE" (receiver)? ident_def (formal_pars)? meth_attributes (";" decl_seq ("BEGIN" statement_seq)? "END" IDENT)?
    meth_attributes: ("," "NEW")? ("," ("ABSTRACT" | "EMPTY" | "EXTENSIBLE"))?
    forward_decl: "PROCEDURE" "^" (receiver)? ident_def (formal_pars)? meth_attributes
    formal_pars: "(" (f_p_section (";" f_p_section)*)? ")" (":" type)?
    f_p_section: ("VAR" | "IN" | "OUT")? IDENT ("," IDENT)* ":" type
    receiver: "(" ("VAR" | "IN")? IDENT ":" IDENT ")"
    type: qualident | "ARRAY" (const_expr ("," const_expr)*)? "OF" type | ("ABSTRACT" | "EXTENSIBLE" | "LIMITED")? "RECORD" ("(" qualident ")")? field_list (";" field_list)* "END" | "POINTER" "TO" type | "PROCEDURE" (formal_pars)?
    field_list: (ident_list ":" type)?
    statement_seq: statement (";" statement)*
    statement: (designator ":=" expr | designator ("(" (expr_list)? ")")? 
            | "IF" expr "THEN" statement_seq ("ELSIF" expr "THEN" statement_seq)* ("ELSE" statement_seq)? "END" 
            | "CASE" expr "OF" case ("|" case)* ("ELSE" statement_seq)? "END" 
            | "WHILE" expr "DO" statement_seq "END" 
            | "REPEAT" statement_seq "UNTIL" expr 
            | "FOR" IDENT ":=" expr "TO" expr ("BY" const_expr)? "DO" statement_seq "END" 
            | "LOOP" statement_seq "END" 
            | "WITH" (guard "DO" statement_seq)? ("|" (guard "DO" statement_seq)?)* ("ELSE" statement_seq)? "END" 
            | "EXIT" 
            | "RETURN" (expr)?)?
    case: (case_labels ("," case_labels)* ":" statement_seq)?
    case_labels: const_expr (".." const_expr)?
    guard: qualident ":" qualident
    const_expr: expr
    expr: simple_expr (relation simple_expr)?
    simple_expr: ("+" | "-")? term (add_op term)*
    term: factor (mul_op factor)*
    factor: designator | NUMBER | CHARACTER | STRING | "NIL" | set | "(" expr ")" | "~" factor
    set: "{" (element ("," element)*)? "}"
    element: expr (".." expr)?
    relation: "=" | "#" | "<" | "<=" | ">" | ">=" | "IN" | "IS"
    add_op: "+" | "-" | "OR"
    mul_op: "*" | "/" | "DIV" | "MOD" | "&"
    designator: qualident ("." IDENT | "[" expr_list "]" | "^" | "(" qualident ")" | "(" (expr_list)? ")")* ("$")?
    expr_list: expr ("," expr)*
    ident_list: ident_def ("," ident_def)*
    qualident: (IDENT ".")? IDENT
    ident_def: IDENT ("*" | "-")?

    ANY: /[^ ]+/    
    IDENT: CNAME
    NUMBER: INT | FLOAT | HEX
    HEX: /0x[0-9A-Fa-f]+/
    CHARACTER: /'[^']*'/
    STRING: /"[^"]*"/
    COMMENT: "(*" /(.|\n)*?/ "*)"
    %import common.CNAME
    %import common.INT
    %import common.FLOAT
    %import common.WS
    %ignore COMMENT
    %ignore WS
"""

# Minimal Lark grammar that defines only a COMMENT terminal ("(*" ... "*)",
# matched non-greedily across lines) and ignores whitespace.
# NOTE(review): this grammar declares no rules, while is_comment() and
# is_comment_start() pass start="COMMENT" (a terminal name) to Lark — confirm
# that Lark accepts a terminal as the start symbol here.
comment_grammar = """
    COMMENT : "(*" /(.|\n)*?/ "*)"
    %import common.WS
    %ignore WS
"""


In [92]:
# Commented-out Lark keyword arguments: parser='lalr', lexer='basic'

# Grammar rule names collected in one list; not referenced by the cells visible here.
starts = ["module", "import_list", "expr", "statement"]





In [93]:
def rule_match(grammar, rule, text):
    """Try to parse ``text`` with ``grammar`` starting from ``rule``.

    Args:
        grammar: Lark grammar source passed to ``Lark``.
        rule: Name of the start symbol handed to ``Lark(..., start=rule)``.
        text: The input string to parse.

    Returns:
        1 if ``text`` parsed without raising,
        2 if the parser raised ``UnexpectedEOF`` (the input ended too early),
        0 if parsing raised any other exception.

    Errors raised while constructing the parser (for example an invalid
    grammar) are not caught and propagate to the caller.
    """
    parser = Lark(grammar, start=rule)
    try:
        # Parse the argument itself, not the notebook-level `code` variable.
        parser.parse(text)
    except UnexpectedEOF:
        return 2
    except Exception:
        # Any other parse failure counts as "no match"; KeyboardInterrupt and
        # SystemExit are deliberately not swallowed.
        return 0
    return 1

def is_document(text):
    """Return True when rule_match reports status 1 for ``text`` under the "document" rule."""
    status = rule_match(ebnf_grammar, "document", text)
    return status == 1

def is_module(text):
    """Return True when rule_match reports status 1 for ``text`` under the "module" rule."""
    status = rule_match(ebnf_grammar, "module", text)
    return status == 1

def is_comment(text):
    """Return True when rule_match reports status 1 for ``text`` with comment_grammar and start "COMMENT"."""
    status = rule_match(comment_grammar, "COMMENT", text)
    return status == 1

def is_comment_start(text):
    """Return True when rule_match reports status 2 (UnexpectedEOF) for ``text`` with comment_grammar."""
    status = rule_match(comment_grammar, "COMMENT", text)
    return status == 2



def try_module(text, rule):
    """Try to parse ``text`` with ``ebnf_grammar`` starting from ``rule``.

    Args:
        text: The input string to parse.
        rule: Name of the start symbol handed to ``Lark(..., start=rule)``.

    Returns:
        1 if ``text`` parsed without raising,
        2 if the parser raised ``UnexpectedEOF`` (the input ended too early),
        0 if parsing raised any other exception.

    Errors raised while constructing the parser are not caught.
    """
    parser = Lark(ebnf_grammar, start=rule)
    try:
        # Parse the argument itself, not the notebook-level `code` variable.
        parser.parse(text)
    except UnexpectedEOF:
        return 2
    except Exception:
        # Other parse failures mean "no match"; KeyboardInterrupt/SystemExit
        # are deliberately not swallowed.
        return 0
    return 1

In [105]:
# Usage example: a small module parsed with the "document" rule.
code = """
MODULE HelloWorld;
IMPORT Out;
BEGIN
  a := b;
END HelloWorld.
"""
# rule_match takes (grammar, rule, text); pass the grammar explicitly.
rule_match(ebnf_grammar, "document", code)

1

In [106]:
# Build a parser for the "document" rule with position propagation enabled,
# then parse the sample module held in `code`.
parser = Lark(
    ebnf_grammar,
    start="document",
    propagate_positions=True,
)
tree = parser.parse(code)

In [108]:
# Display the first child of the parsed tree's root node.
tree.children[0]

Token('ANY', '\n')

In [88]:
# Display the attribute dictionary of the root node's meta object.
vars(tree._meta)

{'empty': False,
 'line': 2,
 'column': 1,
 'start_pos': 1,
 'container_line': 2,
 'container_column': 1,
 'container_start_pos': 1,
 'end_line': 6,
 'end_column': 16,
 'end_pos': 63,
 'container_end_line': 6,
 'container_end_column': 16,
 'container_end_pos': 63}

In [86]:
# Read the whole AlienTool source file into `text`, decoding it as UTF-8.
# NOTE(review): the path is absolute and machine-specific; .odc is presumably a
# BlackBox document that was already converted to plain text — confirm.
with open('/app/datasets/oberon/docs/bb_ru/Dev/Mod/AlienTool.odc', 'r', encoding='utf-8') as f:
   text = f.read()

In [104]:
# rule_match takes (grammar, rule, text); check the loaded file against the "document" rule.
rule_match(ebnf_grammar, "document", text)

2