In [173]:
from lark import Lark, UnexpectedEOF

# Lark grammar for an Oberon-family module syntax. The `document` rule is a
# module followed by TAIL, a terminal that swallows whatever text comes after it.
#
# The literal is a raw string because the regex terminals contain backslashes
# (\s, \S, \(, \*, \)) that are invalid escape sequences in a normal string.
#
# Fixes relative to the previous revision:
#   * type:       the record attributes are an optional prefix of "RECORD"
#                 (a stray "|" used to make them a separate, possibly empty,
#                 alternative, so an empty type was accepted).
#   * factor:     "~" is matched without surrounding spaces.
#   * designator: "^" is matched without surrounding spaces.
ebnf_grammar = r"""
    document: module TAIL
    module: "MODULE" IDENT ";" (import_list)? decl_seq ("BEGIN" statement_seq)? ("CLOSE" statement_seq)? "END" IDENT "."
    import_list: "IMPORT" (IDENT ":=")? IDENT ("," (IDENT ":=")? IDENT)* ";"
    decl_seq: ("CONST" (const_decl ";")* | "TYPE" (type_decl ";")* | "VAR" (var_decl ";")*)* (proc_decl ";" | forward_decl ";")*
    const_decl: ident_def "=" const_expr
    type_decl: ident_def "=" type
    var_decl: ident_list ":" type
    proc_decl: "PROCEDURE" (receiver)? ident_def (formal_pars)? meth_attributes (";" decl_seq ("BEGIN" statement_seq)? "END" IDENT)?
    meth_attributes: ("," "NEW")? ("," ("ABSTRACT" | "EMPTY" | "EXTENSIBLE"))?
    forward_decl: "PROCEDURE" "^" (receiver)? ident_def (formal_pars)? meth_attributes
    formal_pars: "(" (f_p_section (";" f_p_section)*)? ")" (":" type)?
    f_p_section: ("VAR" | "IN" | "OUT")? IDENT ("," IDENT)* ":" type
    receiver: "(" ("VAR" | "IN")? IDENT ":" IDENT ")"
    type: qualident | "ARRAY" (const_expr ("," const_expr)*)? "OF" type | ("ABSTRACT" | "EXTENSIBLE" | "LIMITED")? "RECORD" ("(" qualident ")")? field_list (";" field_list)* "END" | "POINTER" "TO" type | "PROCEDURE" (formal_pars)?
    field_list: (ident_list ":" type)?
    statement_seq: statement (";" statement)*
    statement: (designator ":=" expr | designator ("(" (expr_list)? ")")?
            | "IF" expr "THEN" statement_seq ("ELSIF" expr "THEN" statement_seq)* ("ELSE" statement_seq)? "END"
            | "CASE" expr "OF" case ("|" case)* ("ELSE" statement_seq)? "END"
            | "WHILE" expr "DO" statement_seq "END"
            | "REPEAT" statement_seq "UNTIL" expr
            | "FOR" IDENT ":=" expr "TO" expr ("BY" const_expr)? "DO" statement_seq "END"
            | "LOOP" statement_seq "END"
            | "WITH" (guard "DO" statement_seq)? ("|" (guard "DO" statement_seq)?)* ("ELSE" statement_seq)? "END"
            | "EXIT"
            | "RETURN" (expr)?)?
    case: (case_labels ("," case_labels)* ":" statement_seq)?
    case_labels: const_expr (".." const_expr)?
    guard: qualident ":" qualident
    const_expr: expr
    expr: simple_expr (relation simple_expr)?
    simple_expr: ("+" | "-")? term (add_op term)*
    term: factor (mul_op factor)*
    factor: designator | NUMBER | CHARACTER | STRING | "NIL" | set | "(" expr ")" | "~" factor
    set: "{" (element ("," element)*)? "}"
    element: expr (".." expr)?
    relation: "=" | "#" | "<" | "<=" | ">" | ">=" | "IN" | "IS"
    add_op: "+" | "-" | "OR"
    mul_op: "*" | "/" | "DIV" | "MOD" | "&"
    designator: qualident ("." IDENT | "[" expr_list "]" | "^" | "(" qualident ")" | "(" (expr_list)? ")")* ("$")?
    expr_list: expr ("," expr)*
    ident_list: ident_def ("," ident_def)*
    qualident: (IDENT ".")? IDENT
    ident_def: IDENT ("*" | "-")?
    TAIL: /[\s\S]+/
    ANY: /[^ ]+/
    IDENT: CNAME
    NUMBER: INT | FLOAT | HEX
    HEX: /0x[0-9A-Fa-f]+/
    CHARACTER: /'[^']*'/
    STRING: /"[^"]*"/

    COMMENT: /\(\*([^*]|\*+[^*)])*\*+\)/
    %import common.NEWLINE
    %import common.CNAME
    %import common.INT
    %import common.FLOAT
    %import common.WS

    %ignore WS
    %ignore COMMENT
"""
#
#%ignore COMMENT



  ebnf_grammar = """


In [174]:
# Minimal Lark grammar with a single rule, `_comment`, that matches an optional
# "(* ... *)" comment; whitespace around it is ignored.
# Raw string: the COMMENT regex contains \( \* \) which are invalid escape
# sequences in a normal string literal (Python warns about them).
comment_grammar = r"""
    _comment: COMMENT?
    COMMENT: /\(\*([^*]|\*+[^*)])*\*+\)/
    %import common.WS
    %import common.NEWLINE
    %ignore WS
"""


# Smoke test: parse a multi-line "(** ... **)" block surrounded by blank lines
# with the `_comment` rule; the resulting tree is the cell's displayed value.
parser = Lark(comment_grammar, start="_comment")
parser.parse("\n\n(**\ns \n**)\n\n")

  comment_grammar = """


Tree(Token('RULE', '_comment'), [Token('COMMENT', '(**\ns \n**)')])

In [175]:
# Names of four rules declared in ebnf_grammar.
# NOTE(review): presumably candidate start rules; nothing in the visible cells
# reads this list - confirm it is still needed.
starts = ["module", "import_list", "expr", "statement"]

In [188]:
# Building a Lark parser is far more expensive than running it, and the
# helpers below call rule_match once per input line, so parsers are memoised
# per (grammar, start rule) pair.
_PARSER_CACHE = {}


def rule_match(grammar, rule, text):
    """Try to parse `text` with `grammar`, starting from `rule`.

    Parameters:
        grammar: Lark grammar source.
        rule: name of the start rule.
        text: the string to parse.

    Returns:
        1 if the whole text parses,
        2 if the parser raised UnexpectedEOF (input ended too early),
        0 if parsing failed with any other exception.

    Errors raised while *constructing* the parser (e.g. an invalid grammar)
    are not caught and propagate to the caller.
    """
    cache_key = (grammar, rule)
    parser = _PARSER_CACHE.get(cache_key)
    if parser is None:
        parser = _PARSER_CACHE[cache_key] = Lark(grammar, start=rule)

    try:
        parser.parse(text)
    except UnexpectedEOF:
        return 2
    except Exception:
        # Deliberately broad: any parse failure means "no match". Unlike a bare
        # `except:`, this lets KeyboardInterrupt / SystemExit through so a long
        # classification loop can still be interrupted.
        return 0
    return 1

def is_code_document(text):
    """Return True when `text` parses completely under the `document` rule.

    `document` is a module followed by the TAIL terminal; only a rule_match
    result of 1 (full parse) counts as a match.
    """
    outcome = rule_match(ebnf_grammar, "document", text)
    return outcome == 1

def is_module(text):
    """Return True when `text` parses completely under the `module` rule.

    Only a rule_match result of 1 (full parse) counts as a match.
    """
    outcome = rule_match(ebnf_grammar, "module", text)
    return outcome == 1

def is_statement(text):
    """Return True when `text` parses completely under the `statement` rule.

    Only a rule_match result of 1 (full parse) counts as a match. The
    `statement` rule is entirely optional in the grammar, so callers that care
    about blank input should filter it out themselves.
    """
    outcome = rule_match(ebnf_grammar, "statement", text)
    return outcome == 1

def get_module_text_len(text):
    """Return how many leading characters of `text` precede the trailing part.

    The text is parsed with the `document` rule; the second child of the
    resulting tree (the TAIL token in the grammar's `document: module TAIL`)
    is measured and its length subtracted from the total length.

    Any exception raised while parsing propagates to the caller.
    """
    document_parser = Lark(ebnf_grammar, start="document")
    parsed = document_parser.parse(text)
    tail_token = parsed.children[1]
    return len(text) - len(tail_token.value)
    
def is_comment(text):
    """Return True when rule_match reports a full parse (code 1) for `text`.

    NOTE(review): "COMMENT" is declared as a terminal in comment_grammar, whose
    only rule is `_comment`; confirm that Lark accepts a terminal name as the
    start symbol - if it does not, this call cannot succeed.
    """
    outcome = rule_match(comment_grammar, "COMMENT", text)
    return outcome == 1

def is_comment_start(text):
    """Return True when rule_match reports premature end of input (code 2).

    NOTE(review): "COMMENT" is declared as a terminal in comment_grammar, whose
    only rule is `_comment`; confirm that Lark accepts a terminal name as the
    start symbol and that an unterminated comment really surfaces as
    UnexpectedEOF.
    """
    outcome = rule_match(comment_grammar, "COMMENT", text)
    return outcome == 2



def try_module(text, rule):
    """Try to parse `text` with ebnf_grammar, starting from `rule`.

    Parameters:
        text: the string to parse.
        rule: name of the start rule in ebnf_grammar.

    Returns:
        1 if the whole text parses,
        2 if the parser raised UnexpectedEOF (input ended too early),
        0 if parsing failed with any other exception.

    Errors raised while constructing the parser are not caught.
    """
    parser = Lark(ebnf_grammar, start=rule)
    try:
        # Parse the argument. The previous revision parsed the global `code`,
        # so the result ignored `text` (or was 0 when `code` was undefined).
        parser.parse(text)
    except UnexpectedEOF:
        return 2
    except Exception:
        # Broad on purpose (any parse failure is "no match"), but narrower
        # than a bare `except:` so KeyboardInterrupt / SystemExit propagate.
        return 0
    return 1

def print_lines_with_prefix(text, pref):
    """Print each line of `text` on its own row, prefixed by `pref`.

    The text is split on the newline character only; every piece, including
    empty ones, is emitted as the prefix, a colon, a tab and then the piece.
    """
    tagged_rows = (f"{pref}:\t{row}" for row in text.split('\n'))
    print("\n".join(tagged_rows))

def load_file(path):
    """Read the file at `path` as UTF-8 text and return its full contents.

    Errors from opening or decoding the file propagate to the caller.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

In [189]:
# Usage example: parse a minimal module (with a comment block before IMPORT)
# using the `document` rule; the parse tree is the cell's displayed value.
text = (
    "MODULE HelloWorld;\n"
    "(**\n"
    "hello\n"
    "**)\n"
    "IMPORT Out;\n"
    "BEGIN\n"
    "  a := b;\n"
    "END HelloWorld.\n"
)
parser = Lark(ebnf_grammar, start="document")
parser.parse(text)

Tree(Token('RULE', 'document'), [Tree(Token('RULE', 'module'), [Token('IDENT', 'HelloWorld'), Tree(Token('RULE', 'import_list'), [Token('IDENT', 'Out')]), Tree(Token('RULE', 'decl_seq'), []), Tree(Token('RULE', 'statement_seq'), [Tree(Token('RULE', 'statement'), [Tree(Token('RULE', 'designator'), [Tree(Token('RULE', 'qualident'), [Token('IDENT', 'a')])]), Tree(Token('RULE', 'expr'), [Tree(Token('RULE', 'simple_expr'), [Tree(Token('RULE', 'term'), [Tree(Token('RULE', 'factor'), [Tree(Token('RULE', 'designator'), [Tree(Token('RULE', 'qualident'), [Token('IDENT', 'b')])])])])])])]), Tree(Token('RULE', 'statement'), [])]), Token('IDENT', 'HelloWorld')]), Token('TAIL', '\n')])

In [190]:
# Display the repr of the sample source string (newlines shown as escapes).
text

'MODULE HelloWorld;\n(**\nhello\n**)\nIMPORT Out;\nBEGIN\n  a := b;\nEND HelloWorld.\n'

In [191]:
# Build the document parser again, this time passing propagate_positions=True,
# and keep the parse tree of the sample in `tree`.
parser = Lark(ebnf_grammar, start="document", propagate_positions=True)
tree = parser.parse(text)

In [211]:
# Read the AlienTool document as UTF-8 text.
# NOTE(review): absolute, environment-specific path - adjust when running elsewhere.
orig_text = load_file('/app/datasets/oberon/docs/bb_ru/Dev/Mod/AlienTool.odc')

In [212]:
# Offset just past the first "END DevAlienTool." marker in the file.
# Caveat: str.find returns -1 when the marker is absent, which would make this
# expression evaluate to len(marker) - 1 instead of signalling a miss.
orig_text.find("END DevAlienTool.") + len("END DevAlienTool.")

4760

In [213]:
#print(text)

In [214]:
# Check that the whole file parses with the `document` rule; the tree is
# assigned to `_` so that the cell displays nothing.
parser = Lark(ebnf_grammar, start="document")
_= parser.parse(orig_text)

In [215]:
# Does the file parse as a module followed by trailing text (`document` rule)?
is_code_document(orig_text)

True

In [216]:
# Does the entire file parse as a module with nothing after it (`module` rule)?
is_module(orig_text)

False

In [218]:
# Split the file into a leading module (`code`) and whatever follows it
# (`text`), then label every line of the remainder.
if is_code_document(orig_text):
    module_len = get_module_text_len(orig_text)
    code = orig_text[:module_len]
    text = orig_text[module_len:]
elif is_module(orig_text):
    # The whole file is a module with no trailing text. (This branch used to
    # test the stale `text` variable left over from an earlier cell.)
    code = orig_text
    text = ""
else:
    code = ""
    text = orig_text

print_lines_with_prefix(code, "CODE")

for line in text.split('\n'):
    # Blank lines are handled first: the grammar's `statement` rule is optional
    # as a whole, so an empty line would otherwise be classified as code.
    if not line or line.isspace():
        print("EMPT:")
    elif is_statement(line):
        print(f"CODE:\t{line}")
    else:
        print(f"TEXT:\t{line}")


CODE:	MODULE DevAlienTool;
CODE:	(**
CODE:		project	= "BlackBox"
CODE:		organization	= "www.oberon.ch"
CODE:		contributors	= "Oberon microsystems"
CODE:		version	= "System/Rsrc/About"
CODE:		copyright	= "System/Rsrc/About"
CODE:		license	= "Docu/BB-License"
CODE:		changes	= "
CODE:		- YYYYMMDD, nn, ...
CODE:		"
CODE:		issues	= "
CODE:		- ...
CODE:		"
CODE:	
CODE:	**)
CODE:	
CODE:		IMPORT
CODE:			Services, Ports, Stores, Models, Views, Controllers, Properties, Dialog, Containers, Documents,
CODE:			TextModels, TextMappers, TextViews, StdFolds;
CODE:	
CODE:		PROCEDURE Indent (VAR f: TextMappers.Formatter; level: INTEGER);
CODE:		BEGIN
CODE:			WHILE level > 0 DO f.WriteTab; DEC(level) END
CODE:		END Indent;
CODE:	
CODE:		PROCEDURE WriteCause (VAR f: TextMappers.Formatter; cause: INTEGER);
CODE:		BEGIN
CODE:			f.rider.SetAttr(TextModels.NewColor(f.rider.attr, Ports.red));
CODE:			CASE cause OF
CODE:			| Stores.typeNotFound: f.WriteString("type in module not found")
CODE:			| Stores.inconsM

In [209]:
# Is the first line of the remaining text made up solely of whitespace?
# (str.isspace is False for an empty string as well.)
text.split('\n')[0].isspace()

False

In [11]:
# `is_document` is not defined anywhere in this notebook (it only existed in an
# earlier kernel session), so the call failed on Restart & Run All; the
# document-rule check is now named is_code_document.
is_code_document(text)

True