In [216]:
from lark import Lark, UnexpectedEOF, Token

ebnf_grammar = """
    document_t: module [TAIL]
    procedure_t: proc_decl [TAIL]
    statement_seq_t: statement ((";" statement_seq_t) | [TAIL])
    decl_seq_t: decl_seq [TAIL]
    
    module: MODULE IDENT ";" (import_list)? decl_seq (BEGIN statement_seq)? (CLOSE statement_seq)? END IDENT "."
    import_list: IMPORT (IDENT ":=")? IDENT ("," (IDENT ":=")? IDENT)* ";"
    decl_seq: (CONST (const_decl ";")* | TYPE (type_decl ";")* | VAR (var_decl ";")*)* (proc_decl ";" | forward_decl ";")*
    const_decl: ident_def "=" const_expr
    type_decl: ident_def "=" type
    var_decl: ident_list ":" type
    proc_decl: PROCEDURE (receiver)? ident_def (formal_pars)? meth_attributes (";" decl_seq (BEGIN statement_seq)? END IDENT)?
    meth_attributes: ("," NEW)? ("," (ABSTRACT | EMPTY | EXTENSIBLE))?
    forward_decl: PROCEDURE "^" (receiver)? ident_def (formal_pars)? meth_attributes
    formal_pars: "(" (f_p_section (";" f_p_section)*)? ")" (":" type)?
    f_p_section: (VAR | IN | OUT)? IDENT ("," IDENT)* ":" type
    receiver: "(" (VAR | IN)? IDENT ":" IDENT ")"
    type: qualident | ARRAY (const_expr ("," const_expr)*)? OF type | (ABSTRACT | EXTENSIBLE | LIMITED)? | RECORD ("(" qualident ")")? field_list (";" field_list)* END | POINTER TO type | PROCEDURE (formal_pars)?
    field_list: (ident_list ":" type)?
    statement_seq: (statement)? (";" (statement)?)*
    statement: designator ":=" expr 
            | IF expr THEN statement_seq (ELSIF expr THEN statement_seq)* (ELSE statement_seq)? END
            | CASE expr OF case ("|" case)* (ELSE statement_seq)? END 
            | WHILE expr DO statement_seq END 
            | REPEAT statement_seq UNTIL expr 
            | FOR IDENT ":=" expr TO expr ("BY" const_expr)? DO statement_seq END 
            | LOOP statement_seq END 
            | WITH (guard DO statement_seq)? ("|" (guard DO statement_seq)?)* (ELSE statement_seq)? END 
            | EXIT 
            | RETURN (expr)?
            | designator ("(" (expr_list)? ")")? 
    
    case: (case_labels ("," case_labels)* ":" statement_seq)?
    case_labels: const_expr (".." const_expr)?
    guard: qualident ":" qualident
    const_expr: expr
    expr: simple_expr (relation simple_expr)?
    simple_expr: ("+" | "-")? term (add_op term)*
    term: factor (mul_op factor)*
    factor: designator | NUMBER | CHARACTER | STRING | NIL | set | "(" expr ")" | " ~ " factor
    set: "{" (element ("," element)*)? "}"
    element: expr (".." expr)?
    relation: "=" | "#" | "<" | "<=" | ">" | ">=" | IN | IS
    add_op: "+" | "-" | OR
    mul_op: "*" | "/" | DIV | MOD | "&"
    designator: qualident ("." IDENT | "[" expr_list "]" | " ^ " | "(" qualident ")" | "(" (expr_list)? ")")* ("$")?
    expr_list: expr ("," expr)*
    ident_list: ident_def ("," ident_def)*
    qualident: (IDENT ".")? IDENT
    ident_def: IDENT ("*" | "-")?
    TAIL: /[\s\S]+/
    ANY: /[^ ]+/    
    
    NUMBER: INT | FLOAT | HEX
    HEX: /0x[0-9A-Fa-f]+/
    CHARACTER: /'[^']*'/
    STRING: /"[^"]*"/

    IN.1: "IN"
    IS.1: "IS"
    OR.1: "OR"
    DIV.1: "DIV"
    MOD.1: "MOD"
    NIL.1: "NIL"
    VAR.1: "VAR"
    CONST.1: "CONST"
    TYPE.1: "TYPE"
    PROCEDURE.1: "PROCEDURE"
    ABSTRACT.1: "ABSTRACT"
    EMPTY.1: "EMPTY"
    EXTENSIBLE.1: "EXTENSIBLE"
    LIMITED.1: "LIMITED"
    RECORD.1: "RECORD"
    POINTER.1: "POINTER"
    TO.1: "TO"
    IF.1: "IF"
    THEN.1: "THEN"
    ELSIF.1: "ELSIF"
    ELSE.1: "ELSE"
    CASE.1: "CASE"
    OF.1: "OF"
    WHILE.1: "WHILE"
    DO.1: "DO"
    REPEAT.1: "REPEAT"
    UNTIL.1: "UNTIL"
    FOR.1: "FOR"
    LOOP.1: "LOOP"
    WITH.1: "WITH"
    EXIT.1: "EXIT"
    RETURN.1: "RETURN"
    BEGIN.1: "BEGIN"
    CLOSE.1: "CLOSE"
    END.1: "END"
    IMPORT.1: "IMPORT"
    NEW.1: "NEW"
    MODULE.1: "MODULE"
    OUT.1: "OUT"
    ARRAY.1: "ARRAY"
    IDENT.0: CNAME
    COMMENT: /\(\*([^*]|\*+[^*)])*\*+\)/
    %import common.NEWLINE
    %import common.CNAME
    %import common.INT
    %import common.FLOAT
    %import common.WS
    
    %ignore WS
    %ignore COMMENT
"""
#
#%ignore COMMENT



  ebnf_grammar = """


In [217]:
comment_grammar = """
    _comment: COMMENT?
    COMMENT: /\(\*([^*]|\*+[^*)])*\*+\)/
    %import common.WS
    %import common.NEWLINE
    %ignore WS
"""


parser = Lark(comment_grammar, start="_comment")
parser.parse("""

(**
s 
**)

""")

  comment_grammar = """


Tree(Token('RULE', '_comment'), [Token('COMMENT', '(**\ns \n**)')])

In [218]:
starts = [
    "document_t",
    "procedure_t",
    "statement_seq_t",
    "decl_seq_t"
]
parsers = [Lark(ebnf_grammar, start=s) for s in starts]

In [227]:
MAX_TEXT_SIZE = 512
def get_code_and_tail(orig_text, max_size = MAX_TEXT_SIZE):
    for parser in parsers:
        try:
            if len(orig_text) > max_size:
                text = orig_text[:max_size]
            else:
                text = orig_text
            #print(f"checking rule {rule}")
            tree = parser.parse(text)
            #print(f"matched rule {rule}")
            split_idx = len(text)-len(tree.children[1].value)
            return orig_text[:split_idx], orig_text[split_idx:]
        except UnexpectedEOF:
            if len(orig_text) > max_size:
                return get_code_and_tail(orig_text, max_size * 2)
            else:
                return text, ""#whole text is code
        except:
            pass
    return "", text # no code

MD_CODE_START = "```oberon\n"
MD_CODE_END = "```\n"

def load_file(path):
   with open(path, 'r', encoding='utf-8') as f:
      return f.read()

In [228]:
# Пример использования
text = """MODULE HelloWorld;
(**
hello
**)
IMPORT Out;
BEGIN
  a := b;
END HelloWorld.
"""
get_code_and_tail(text)

('MODULE HelloWorld;\n(**\nhello\n**)\nIMPORT Out;\nBEGIN\n  a := b;\nEND HelloWorld.',
 '\n')

In [229]:
parser = Lark(ebnf_grammar, start="document_t", propagate_positions=True)
tree = parser.parse(text)
text[tree.children[1].start_pos:]


'\n'

In [238]:
#datasets\oberon\docs\bb_ru\Docu\ru\CP-Lang.odc
fname = '/app/datasets/oberon/docs/bb_ru/Dev/Mod/AlienTool.odc'
fname = '/app/datasets/oberon/docs/bb_ru/Docu/ru/CP-Lang.odc'
orig_text = load_file(fname)

In [239]:
t = """CASE ch OF
	"A" .. "Z": ReadIdentifier 
|	"0" .. "9": ReadNumber 
|	"'", '"': ReadString
ELSE SpecialCharacter
END

"""

get_code_and_tail(t)
#Lark(ebnf_grammar, start="statement_seq_t").parse(orig_text[:MAX_TEXT_SIZE])

('CASE ch OF\n\t"A" .. "Z": ReadIdentifier \n|\t"0" .. "9": ReadNumber \n|\t"\'", \'"\': ReadString\nELSE SpecialCharacter\nEND',
 '\n\n')

In [241]:
parser = Lark(ebnf_grammar, start="statement_seq_t", propagate_positions=True)#, ambiguity="explicit")
r = parser.parse(t)
r

Tree(Token('RULE', 'statement_seq_t'), [Tree(Token('RULE', 'statement'), [Token('CASE', 'CASE'), Tree(Token('RULE', 'expr'), [Tree(Token('RULE', 'simple_expr'), [Tree(Token('RULE', 'term'), [Tree(Token('RULE', 'factor'), [Tree(Token('RULE', 'designator'), [Tree(Token('RULE', 'qualident'), [Token('IDENT', 'ch')])])])])])]), Token('OF', 'OF'), Tree(Token('RULE', 'case'), [Tree(Token('RULE', 'case_labels'), [Tree(Token('RULE', 'const_expr'), [Tree(Token('RULE', 'expr'), [Tree(Token('RULE', 'simple_expr'), [Tree(Token('RULE', 'term'), [Tree(Token('RULE', 'factor'), [Token('STRING', '"A"')])])])])]), Tree(Token('RULE', 'const_expr'), [Tree(Token('RULE', 'expr'), [Tree(Token('RULE', 'simple_expr'), [Tree(Token('RULE', 'term'), [Tree(Token('RULE', 'factor'), [Token('STRING', '"Z"')])])])])])]), Tree(Token('RULE', 'statement_seq'), [Tree(Token('RULE', 'statement'), [Tree(Token('RULE', 'designator'), [Tree(Token('RULE', 'qualident'), [Token('IDENT', 'ReadIdentifier')])])])])]), Tree(Token('RULE

In [242]:
code, tail = get_code_and_tail(orig_text)
code[:100], code[-100:], tail[:100]

('',
 '',
 'Сообщение о языке Компонентный Паскаль\nENGLISH\nCopyright © 1994-2001 by Oberon microsystems, Inc., S')

In [None]:
is_code_started = False
md = ""
text = orig_text
while len(text) > 0 and not text.isspace():
    #print(f"text len:{len(text)}")
    code, text = get_code_and_tail(text)
    if len(code) > 0:
        if not is_code_started:
            md = md + '\n' + MD_CODE_START + '\n'
            is_code_started = True
        md += code.strip('\n\r')
    else:
        if is_code_started:
            md = md + '\n' + MD_CODE_END + '\n'
            is_code_started = False
        text = text.strip('\n\r')
        if len(text) > 0 and not text.isspace():
            nl_idx = text.find('\n')
            if nl_idx == -1:
                nl_idx = len(text)
            else:
                nl_idx += 1
            
            md += text[:nl_idx]
            text = text[nl_idx:]

In [244]:
print(md)

Сообщение о языке Компонентный Паскаль

```oberon

ENGLISHCopyright
```

 © 1994-2001 by Oberon microsystems, Inc., Switzerland.

```oberon

All rights reserved
```

. No part of this publication may be reproduced in any form or by any means, without prior written permission by Oberon microsystems. The only exception is the free electronic distribution of the education version of BlackBox (see the accompanying copyright notice for details).

```oberon

Oberon microsystems
```

, Inc. 

```oberon

Technoparkstrasse
```

 1

```oberon

CH
```

-8005 Zuerich

```oberon

SwitzerlandOberon is a trademark of Prof
```

. Niklaus Wirth.

```oberon

Component Pascal is a trademark of Oberon microsystems
```

, Inc.

```oberon

All other trademarks and registered trademarks belong to their respective owners
```

.

```oberon

Authors	Oberon microsystems
```

, Inc.

```oberon

	March
```

 2001

```oberon

Authors of Oberon
```

-2 report

```oberon

	H
```

. Moessenboeck, N. Wirth

```oberon



In [236]:
from IPython.display import display, Markdown, Latex
display(Markdown(md))


```oberon

CASE ch OF
	"A" .. "Z": ReadIdentifier 
|	"0" .. "9": ReadNumber 
|	"'", '"': ReadString
ELSE SpecialCharacter
END