In [21]:
from lark import Lark, UnexpectedEOF

ebnf_grammar = """
    document_t: module [TAIL]
    procedure_t: proc_decl [TAIL]
    statement_seq_t: statement_seq [TAIL]
    decl_seq_t: decl_seq [TAIL]
    
    module: "MODULE" IDENT ";" (import_list)? decl_seq ("BEGIN" statement_seq)? ("CLOSE" statement_seq)? "END" IDENT "."
    import_list: "IMPORT" (IDENT ":=")? IDENT ("," (IDENT ":=")? IDENT)* ";"
    decl_seq: ("CONST" (const_decl ";")* | "TYPE" (type_decl ";")* | "VAR" (var_decl ";")*)* (proc_decl ";" | forward_decl ";")*
    const_decl: ident_def "=" const_expr
    type_decl: ident_def "=" type
    var_decl: ident_list ":" type
    proc_decl: "PROCEDURE" (receiver)? ident_def (formal_pars)? meth_attributes (";" decl_seq ("BEGIN" statement_seq)? "END" IDENT)?
    meth_attributes: ("," "NEW")? ("," ("ABSTRACT" | "EMPTY" | "EXTENSIBLE"))?
    forward_decl: "PROCEDURE" "^" (receiver)? ident_def (formal_pars)? meth_attributes
    formal_pars: "(" (f_p_section (";" f_p_section)*)? ")" (":" type)?
    f_p_section: ("VAR" | "IN" | "OUT")? IDENT ("," IDENT)* ":" type
    receiver: "(" ("VAR" | "IN")? IDENT ":" IDENT ")"
    type: qualident | "ARRAY" (const_expr ("," const_expr)*)? "OF" type | ("ABSTRACT" | "EXTENSIBLE" | "LIMITED")? | "RECORD" ("(" qualident ")")? field_list (";" field_list)* "END" | "POINTER" "TO" type | "PROCEDURE" (formal_pars)?
    field_list: (ident_list ":" type)?
    statement_seq: statement (";" statement)*
    statement: (designator ":=" expr | designator ("(" (expr_list)? ")")? 
            | "IF" expr "THEN" statement_seq ("ELSIF" expr "THEN" statement_seq)* ("ELSE" statement_seq)? "END" 
            | "CASE" expr "OF" case ("|" case)* ("ELSE" statement_seq)? "END" 
            | "WHILE" expr "DO" statement_seq "END" 
            | "REPEAT" statement_seq "UNTIL" expr 
            | "FOR" IDENT ":=" expr "TO" expr ("BY" const_expr)? "DO" statement_seq "END" 
            | "LOOP" statement_seq "END" 
            | "WITH" (guard "DO" statement_seq)? ("|" (guard "DO" statement_seq)?)* ("ELSE" statement_seq)? "END" 
            | "EXIT" 
            | "RETURN" (expr)?)?
    case: (case_labels ("," case_labels)* ":" statement_seq)?
    case_labels: const_expr (".." const_expr)?
    guard: qualident ":" qualident
    const_expr: expr
    expr: simple_expr (relation simple_expr)?
    simple_expr: ("+" | "-")? term (add_op term)*
    term: factor (mul_op factor)*
    factor: designator | NUMBER | CHARACTER | STRING | "NIL" | set | "(" expr ")" | " ~ " factor
    set: "{" (element ("," element)*)? "}"
    element: expr (".." expr)?
    relation: "=" | "#" | "<" | "<=" | ">" | ">=" | "IN" | "IS"
    add_op: "+" | "-" | "OR"
    mul_op: "*" | "/" | "DIV" | "MOD" | "&"
    designator: qualident ("." IDENT | "[" expr_list "]" | " ^ " | "(" qualident ")" | "(" (expr_list)? ")")* ("$")?
    expr_list: expr ("," expr)*
    ident_list: ident_def ("," ident_def)*
    qualident: (IDENT ".")? IDENT
    ident_def: IDENT ("*" | "-")?
    TAIL: /[\s\S]+/
    ANY: /[^ ]+/    
    IDENT: CNAME
    NUMBER: INT | FLOAT | HEX
    HEX: /0x[0-9A-Fa-f]+/
    CHARACTER: /'[^']*'/
    STRING: /"[^"]*"/

    COMMENT: /\(\*([^*]|\*+[^*)])*\*+\)/
    %import common.NEWLINE
    %import common.CNAME
    %import common.INT
    %import common.FLOAT
    %import common.WS
    
    %ignore WS
    %ignore COMMENT
"""
#
#%ignore COMMENT



  ebnf_grammar = """


In [23]:
comment_grammar = """
    _comment: COMMENT?
    COMMENT: /\(\*([^*]|\*+[^*)])*\*+\)/
    %import common.WS
    %import common.NEWLINE
    %ignore WS
"""


parser = Lark(comment_grammar, start="_comment")
parser.parse("""

(**
s 
**)

""")

  comment_grammar = """


Tree(Token('RULE', '_comment'), [Token('COMMENT', '(**\ns \n**)')])

In [24]:
starts = [
    "document_t",
    "procedure_t",
    "statement_seq_t",
    "decl_seq_t"
]

In [41]:
def get_code_and_tail(text):
    for rule in starts:
        parser = Lark(ebnf_grammar, start=rule)
        try:
            #print(f"checking rule {rule}")
            tree = parser.parse(text)
            #print(f"matched rule {rule}")
            return text[:-len(tree.children[1].value)], tree.children[1].value
        except UnexpectedEOF:
            return text, ""
        except:
            pass
    return "", text

MD_CODE_START = "```oberon\n"
MD_CODE_END = "```\n"

def load_file(path):
   with open(path, 'r', encoding='utf-8') as f:
      return f.read()

In [26]:
# Пример использования
text = """MODULE HelloWorld;
(**
hello
**)
IMPORT Out;
BEGIN
  a := b;
END HelloWorld.

текст
"""
get_code_and_tail(text)

checking rule document_t
matched rule document_t


('MODULE HelloWorld;\n(**\nhello\n**)\nIMPORT Out;\nBEGIN\n  a := b;\nEND HelloWorld.',
 '\n\nтекст\n')

In [22]:
parser = Lark(ebnf_grammar, start="document_t", propagate_positions=True)
tree = parser.parse(text)


Tree(Token('RULE', 'document_t'), [Tree(Token('RULE', 'module'), [Token('IDENT', 'HelloWorld'), Tree(Token('RULE', 'import_list'), [Token('IDENT', 'Out')]), Tree(Token('RULE', 'decl_seq'), []), Tree(Token('RULE', 'statement_seq'), [Tree(Token('RULE', 'statement'), [Tree(Token('RULE', 'designator'), [Tree(Token('RULE', 'qualident'), [Token('IDENT', 'a')])]), Tree(Token('RULE', 'expr'), [Tree(Token('RULE', 'simple_expr'), [Tree(Token('RULE', 'term'), [Tree(Token('RULE', 'factor'), [Tree(Token('RULE', 'designator'), [Tree(Token('RULE', 'qualident'), [Token('IDENT', 'b')])])])])])])]), Tree(Token('RULE', 'statement'), [])]), Token('IDENT', 'HelloWorld')]), Token('TAIL', '\n\nтекст\n')])

In [61]:
#datasets\oberon\docs\bb_ru\Docu\ru\CP-Lang.odc
fname = '/app/datasets/oberon/docs/bb_ru/Dev/Mod/AlienTool.odc'
fname = '/app/datasets/oberon/docs/bb_ru/Docu/ru/CP-Lang.odc'
orig_text = load_file(fname)

In [62]:
code, tail = get_code_and_tail(orig_text)
code[:100], code[-100:], tail[:100]

('',
 '',
 'Сообщение о языке Компонентный Паскаль\nENGLISH\nCopyright © 1994-2001 by Oberon microsystems, Inc., S')

In [None]:
is_code_started = False
md = ""
text = orig_text
while len(text) > 0 and not text.isspace():
    print(f"text len:{len(text)}")
    code, text = get_code_and_tail(text)
    if len(code) > 0:
        if not is_code_started:
            md = md + '\n' + MD_CODE_START + '\n'
            is_code_started = True
        md += code.strip('\n\r')
    else:
        if is_code_started:
            md = md + '\n' + MD_CODE_END + '\n'
            is_code_started = False
        if len(text) > 0 and not text.isspace():
            if (text[0] == '\n'):
                text = text[1:]
            nl_idx = text.find('\n')
            if nl_idx == -1:
                nl_idx = len(text)
            md += text[:nl_idx].strip('\n\r')
            text = text[nl_idx:]

text len:67902
text len:67864
text len:67856
text len:67791
text len:67790
text len:67492
text len:67491
text len:67464
text len:67444
text len:67428
text len:67416
text len:67415
text len:67369
text len:67308
text len:67226
text len:67225
text len:67217
text len:67190
text len:67178
text len:67177
text len:67150
text len:67123
text len:67079
text len:67065
text len:67064
text len:67040
text len:67030
text len:66986
text len:66980
text len:66979
text len:66822
text len:66821
text len:66624
text len:66577
text len:66078
text len:65853
text len:65852
text len:65851
text len:65840
text len:65828
text len:65815
text len:65790
text len:65758
text len:65737
text len:65719
text len:65696
text len:65675
text len:65655
text len:65632
text len:65610
text len:65583
text len:65560
text len:65547
text len:65533
text len:65519
text len:65506
text len:65488
text len:65467
text len:65431
text len:65404
text len:65377
text len:65338
text len:65300
text len:65278
text len:65251
text len:65205
text len:6

In [60]:
print(md)


```oberon

MODULE DevAlienTool;
(**
	project	= "BlackBox"
	organization	= "www.oberon.ch"
	contributors	= "Oberon microsystems"
	version	= "System/Rsrc/About"
	copyright	= "System/Rsrc/About"
	license	= "Docu/BB-License"
	changes	= "
	- YYYYMMDD, nn, ...
	"
	issues	= "
	- ...
	"

**)

	IMPORT
		Services, Ports, Stores, Models, Views, Controllers, Properties, Dialog, Containers, Documents,
		TextModels, TextMappers, TextViews, StdFolds;

	PROCEDURE Indent (VAR f: TextMappers.Formatter; level: INTEGER);
	BEGIN
		WHILE level > 0 DO f.WriteTab; DEC(level) END
	END Indent;

	PROCEDURE WriteCause (VAR f: TextMappers.Formatter; cause: INTEGER);
	BEGIN
		f.rider.SetAttr(TextModels.NewColor(f.rider.attr, Ports.red));
		CASE cause OF
		| Stores.typeNotFound: f.WriteString("type in module not found")
		| Stores.inconsModuleVersion: f.WriteString("inconsistent module version")
		| Stores.invalidModuleFile: f.WriteString("invalid module file")
		| Stores.moduleFileNotFound: f.WriteString("module n

In [55]:
from IPython.display import display, Markdown, Latex
display(Markdown(md))

```oberon
MODULE DevAlienTool;
(**
	project	= "BlackBox"
	organization	= "www.oberon.ch"
	contributors	= "Oberon microsystems"
	version	= "System/Rsrc/About"
	copyright	= "System/Rsrc/About"
	license	= "Docu/BB-License"
	changes	= "
	- YYYYMMDD, nn, ...
	"
	issues	= "
	- ...
	"

**)

	IMPORT
		Services, Ports, Stores, Models, Views, Controllers, Properties, Dialog, Containers, Documents,
		TextModels, TextMappers, TextViews, StdFolds;

	PROCEDURE Indent (VAR f: TextMappers.Formatter; level: INTEGER);
	BEGIN
		WHILE level > 0 DO f.WriteTab; DEC(level) END
	END Indent;

	PROCEDURE WriteCause (VAR f: TextMappers.Formatter; cause: INTEGER);
	BEGIN
		f.rider.SetAttr(TextModels.NewColor(f.rider.attr, Ports.red));
		CASE cause OF
		| Stores.typeNotFound: f.WriteString("type in module not found")
		| Stores.inconsModuleVersion: f.WriteString("inconsistent module version")
		| Stores.invalidModuleFile: f.WriteString("invalid module file")
		| Stores.moduleFileNotFound: f.WriteString("module not found")
		| Stores.inconsistentType: f.WriteString("type path in module inconsistent with stored version")
		| Stores.inconsistentVersion: f.WriteString("inconsistent version / program error")
		| Stores.alienVersion: f.WriteString("alien version - outdated program")
		| Stores.alienComponent: f.WriteString("alien component - required sub-part failed to internalize")
		ELSE f.WriteString("unknown (code"); f.WriteInt(cause); f.WriteChar(")")
		END;
		f.rider.SetAttr(TextModels.NewColor(f.rider.attr, Ports.black));
	END WriteCause;

	PROCEDURE Out (VAR f: TextMappers.Formatter; level: INTEGER; st: Stores.Store);
		VAR t: Stores.TypeName;

		PROCEDURE OutAlien (VAR f: TextMappers.Formatter; 
											path: Stores.TypePath; cause: INTEGER; c: Stores.AlienComp);
			VAR i: INTEGER; t: TextModels.Model; form: TextMappers.Formatter;
		BEGIN
			f.WriteString(" ");
			t := TextModels.dir.New(); form.ConnectTo(t);

			form.rider.SetAttr(TextModels.NewColor(form.rider.attr, Ports.blue));
			form.WriteString(path[0]); 
			form.rider.SetAttr(TextModels.NewColor(form.rider.attr, Ports.black));
			form.WriteLn;
			INC(level);
			IF path[1] # "" THEN
				Indent(form, level); form.WriteString("path: (");
				i := 1;
				WHILE path[i] # "" DO
					form.WriteString(path[i]);
					INC(i);
					IF path[i] # "" THEN form.WriteString(", ") END
				END;
				form.WriteChar(")"); form.WriteLn
			END;
			Indent(form, level); form.WriteString("cause: "); WriteCause(form, cause); form.WriteLn;
			Indent(form, level); form.WriteString("comps: "); form.WriteLn;
			INC(level);
			WHILE c # NIL DO
				WITH c: Stores.AlienPiece DO
					Indent(form, level); form.WriteInt(c.len); form.WriteString(" bytes data"); form.WriteLn
				| c: Stores.AlienPart DO
					IF c.store # NIL THEN
						Out(form, level, c.store)
					ELSE Indent(form, level); form.WriteString("NIL reference"); form.WriteLn
					END
				END;
				c := c.next
			END;
			DEC(level, 2);
			Indent(form, level);
			
			f.WriteView(StdFolds.dir.New(StdFolds.collapsed, "", t));
			f.rider.SetAttr(TextModels.NewColor(f.rider.attr, Ports.blue));
			f.WriteString(path[0]); 
			f.rider.SetAttr(TextModels.NewColor(f.rider.attr, Ports.black));
			f.WriteView(StdFolds.dir.New(StdFolds.collapsed, "", NIL));
			f.WriteLn;
		END OutAlien;

	BEGIN
		Indent(f, level);
		WITH st: Stores.Alien DO
			f.WriteString("Alien Store"); OutAlien(f, st.path, st.cause, st.comps)
		ELSE
			Services.GetTypeName(st, t);
			WITH st: Documents.Document DO f.WriteString("Document")
			| st: Containers.Controller DO f.WriteString("Container Controller")
			| st: Containers.View DO f.WriteString("Container View")
			| st: Containers.Model DO f.WriteString("Container Model")
			| st: Controllers.Controller DO f.WriteString("Controller")
			| st: Views.View DO f.WriteString("View")
			| st: Models.Model DO f.WriteString("Model")
			ELSE f.WriteString("Store")
			END;
			f.WriteString(' "'); f.WriteString(t); f.WriteChar('"'); f.WriteLn
		END
	END Out;

	PROCEDURE Analyze*;
		VAR v: Views.View; f: TextMappers.Formatter; d: Documents.Document;
			ops: Controllers.PollOpsMsg; bp: Properties.BoundsPref; t: TextModels.Model;
	BEGIN
		Controllers.PollOps(ops); v := ops.singleton;
		IF v # NIL THEN
			IF v IS Views.Alien THEN
				t := TextModels.dir.New();
				f.ConnectTo(t);
				Out(f, 0, v(Views.Alien).store);
				StdFolds.ExpandFolds(t, FALSE, "");
				v := TextViews.dir.New(t);
				Views.OpenAux(v, "Alien Info");
(*
				bp.w := Views.undefined; bp.h := Views.undefined; 
				Views.HandlePropMsg(v, bp);
				d := Documents.dir.New(v, bp.w, bp.h);
				Views.OpenAux(d, "Alien Info")
*)
			ELSE Dialog.ShowMsg("#Dev:NoAlienView")
			END
		ELSE Dialog.ShowMsg("#Dev:NoSingletonFound")
		END
	END Analyze;

END DevAlienTool.```
StringsNoAlienView	no alien viewNoSingletonFound	no singleton foundInfoSEPARATOR"&Aliens" "" "DevAlienTool.Analyze" "DevAlienTool.SingletonGuard"ENDC