From d48e037ca736cca471e313b9676a7415aaa4f207 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Apr 2019 14:25:44 +0300 Subject: [PATCH] Updated standalone example --- examples/standalone/create_standalone.sh | 2 +- examples/standalone/json_parser.py | 1827 +++++++++++++++++----- 2 files changed, 1444 insertions(+), 385 deletions(-) diff --git a/examples/standalone/create_standalone.sh b/examples/standalone/create_standalone.sh index a4fa8799..141ab895 100755 --- a/examples/standalone/create_standalone.sh +++ b/examples/standalone/create_standalone.sh @@ -1 +1 @@ -python -m lark.tools.standalone json.lark > json_parser.py +PYTHONPATH=../.. python -m lark.tools.standalone json.lark > json_parser.py diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index 8c51baf7..d424f1bf 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -1,4 +1,4 @@ -# The file was automatically generated by Lark v0.5.5 +# The file was automatically generated by Lark v0.7.0 # # # Lark Stand-alone Generator Tool @@ -18,6 +18,9 @@ # If you wish to purchase a commercial license for this tool and its # generated code, contact me via email. # +# If GPL is incompatible with your free or open-source project, +# contact me and we'll work it out (for free). +# # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or @@ -32,91 +35,218 @@ # # +class LarkError(Exception): + pass -import types -import functools -from contextlib import contextmanager +class GrammarError(LarkError): + pass -Str = type(u'') +class ParseError(LarkError): + pass -def inline_args(f): - # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) - if isinstance(f, types.FunctionType): - @functools.wraps(f) - def _f_func(self, args): - return f(self, *args) - return _f_func - elif isinstance(f, (type, types.BuiltinFunctionType)): - @functools.wraps(f) - def _f_builtin(_self, args): - return f(*args) - return _f_builtin - elif isinstance(f, types.MethodType): - @functools.wraps(f.__func__) - def _f(self, args): - return f.__func__(self, *args) - return _f - else: - @functools.wraps(f.__call__.__func__) - def _f(self, args): - return f.__call__.__func__(self, *args) - return _f +class LexError(LarkError): + pass +class UnexpectedInput(LarkError): + pos_in_stream = None + + def get_context(self, text, span=40): + pos = self.pos_in_stream + start = max(pos - span, 0) + end = pos + span + before = text[start:pos].rsplit('\n', 1)[-1] + after = text[pos:end].split('\n', 1)[0] + return before + after + '\n' + ' ' * len(before) + '^\n' + + def match_examples(self, parse_fn, examples): + """ Given a parser instance and a dictionary mapping some label with + some malformed syntax examples, it'll return the label for the + example that bests matches the current error. + """ + assert self.state is not None, "Not supported for this exception" -try: - from contextlib import suppress # Python 3 -except ImportError: - @contextmanager - def suppress(*excs): - '''Catch and dismiss the provided exception - - >>> x = 'hello' - >>> with suppress(IndexError): - ... x = x[10] - >>> x - 'hello' - ''' - try: - yield - except excs: - pass + candidate = None + for label, example in examples.items(): + assert not isinstance(example, STRING_TYPE) + for malformed in example: + try: + parse_fn(malformed) + except UnexpectedInput as ut: + if ut.state == self.state: + try: + if ut.token == self.token: # Try exact match first + return label + except AttributeError: + pass + if not candidate: + candidate = label -def is_terminal(sym): - return sym.isupper() + return candidate -class GrammarError(Exception): - pass -class ParseError(Exception): - pass +class UnexpectedCharacters(LexError, UnexpectedInput): + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None): + message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) + + self.line = line + self.column = column + self.allowed = allowed + self.considered_tokens = considered_tokens + self.pos_in_stream = lex_pos + self.state = state -class UnexpectedToken(ParseError): - def __init__(self, token, expected, seq, index, considered_rules=None): + message += '\n\n' + self.get_context(seq) + if allowed: + message += '\nExpecting: %s\n' % allowed + + super(UnexpectedCharacters, self).__init__(message) + + + +class UnexpectedToken(ParseError, UnexpectedInput): + def __init__(self, token, expected, considered_rules=None, state=None): self.token = token - self.expected = expected + self.expected = expected # XXX str shouldn't necessary self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') self.considered_rules = considered_rules + self.state = state + self.pos_in_stream = getattr(token, 'pos_in_stream', None) - try: - context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]]) - except AttributeError: - context = seq[index:index+5] - except TypeError: - context = "" message = ("Unexpected token %r at line %s, column %s.\n" - "Expected: %s\n" - "Context: %s" % (token, self.line, self.column, expected, context)) + "Expected one of: \n\t* %s\n" + % (token, self.line, self.column, '\n\t* '.join(self.expected))) super(UnexpectedToken, self).__init__(message) +class VisitError(LarkError): + def __init__(self, tree, orig_exc): + self.tree = tree + self.orig_exc = orig_exc + + message = 'Error trying to process rule "%s":\n\n%s' % (tree.data, orig_exc) + super(VisitError, self).__init__(message) + +def _deserialize(data, namespace, memo): + if isinstance(data, dict): + if '__type__' in data: # Object + class_ = namespace[data['__type__']] + return class_.deserialize(data, memo) + elif '@' in data: + return memo[data['@']] + return {key:_deserialize(value, namespace, memo) for key, value in data.items()} + elif isinstance(data, list): + return [_deserialize(value, namespace, memo) for value in data] + return data + + +class Serialize(object): + def memo_serialize(self, types_to_memoize): + memo = SerializeMemoizer(types_to_memoize) + return self.serialize(memo), memo.serialize() + + def serialize(self, memo=None): + if memo and memo.in_types(self): + return {'@': memo.memoized.get(self)} + + fields = getattr(self, '__serialize_fields__') + res = {f: _serialize(getattr(self, f), memo) for f in fields} + res['__type__'] = type(self).__name__ + postprocess = getattr(self, '_serialize', None) + if postprocess: + postprocess(res, memo) + return res + + @classmethod + def deserialize(cls, data, memo): + namespace = getattr(cls, '__serialize_namespace__', {}) + namespace = {c.__name__:c for c in namespace} + + fields = getattr(cls, '__serialize_fields__') + + if '@' in data: + return memo[data['@']] + + inst = cls.__new__(cls) + for f in fields: + setattr(inst, f, _deserialize(data[f], namespace, memo)) + postprocess = getattr(inst, '_deserialize', None) + if postprocess: + postprocess() + return inst + + +class SerializeMemoizer(Serialize): + __serialize_fields__ = 'memoized', + + def __init__(self, types_to_memoize): + self.types_to_memoize = tuple(types_to_memoize) + self.memoized = Enumerator() + + def in_types(self, value): + return isinstance(value, self.types_to_memoize) + + def serialize(self): + return _serialize(self.memoized.reversed(), None) + + @classmethod + def deserialize(cls, data, namespace, memo): + return _deserialize(data, namespace, memo) + + + +try: + STRING_TYPE = basestring +except NameError: # Python 3 + STRING_TYPE = str + + +import types +from functools import wraps, partial +from contextlib import contextmanager + +Str = type(u'') +try: + classtype = types.ClassType # Python2 +except AttributeError: + classtype = type # Python3 + +def smart_decorator(f, create_decorator): + if isinstance(f, types.FunctionType): + return wraps(f)(create_decorator(f, True)) + + elif isinstance(f, (classtype, type, types.BuiltinFunctionType)): + return wraps(f)(create_decorator(f, False)) + + elif isinstance(f, types.MethodType): + return wraps(f)(create_decorator(f.__func__, True)) + + elif isinstance(f, partial): + # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445 + return create_decorator(f.__func__, True) + + else: + return create_decorator(f.__func__.__call__, True) + +import sys, re +Py36 = (sys.version_info[:2] >= (3, 6)) +class Meta: + def __init__(self): + self.empty = True class Tree(object): - def __init__(self, data, children): + def __init__(self, data, children, meta=None): self.data = data self.children = children + self._meta = meta + + @property + def meta(self): + if self._meta is None: + self._meta = Meta() + return self._meta def __repr__(self): return 'Tree(%s, %s)' % (self.data, self.children) @@ -139,33 +269,111 @@ def _pretty(self, level, indent_str): def pretty(self, indent_str=' '): return ''.join(self._pretty(0, indent_str)) -class Transformer(object): - def _get_func(self, name): - return getattr(self, name) - def transform(self, tree): - items = [] - for c in tree.children: - try: - items.append(self.transform(c) if isinstance(c, Tree) else c) - except Discard: - pass + def __eq__(self, other): + try: + return self.data == other.data and self.children == other.children + except AttributeError: + return False + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return hash((self.data, tuple(self.children))) + +from inspect import getmembers, getmro + +class Discard(Exception): + pass + +# Transformers + +class Transformer: + """Visits the tree recursively, starting with the leaves and finally the root (bottom-up) + + Calls its methods (provided by user via inheritance) according to tree.data + The returned value replaces the old one in the structure. + + Can be used to implement map or reduce. + """ + + def _call_userfunc(self, tree, new_children=None): + # Assumes tree is already transformed + children = new_children if new_children is not None else tree.children try: - f = self._get_func(tree.data) + f = getattr(self, tree.data) except AttributeError: - return self.__default__(tree.data, items) + return self.__default__(tree.data, children, tree.meta) else: - return f(items) + try: + if getattr(f, 'meta', False): + return f(children, tree.meta) + elif getattr(f, 'inline', False): + return f(*children) + elif getattr(f, 'whole_tree', False): + if new_children is not None: + raise NotImplementedError("Doesn't work with the base Transformer class") + return f(tree) + else: + return f(children) + except (GrammarError, Discard): + raise + except Exception as e: + raise VisitError(tree, e) + + def _transform_children(self, children): + for c in children: + try: + yield self._transform_tree(c) if isinstance(c, Tree) else c + except Discard: + pass - def __default__(self, data, children): - return Tree(data, children) + def _transform_tree(self, tree): + children = list(self._transform_children(tree.children)) + return self._call_userfunc(tree, children) + + def transform(self, tree): + return self._transform_tree(tree) def __mul__(self, other): return TransformerChain(self, other) + def __default__(self, data, children, meta): + "Default operation on tree (for override)" + return Tree(data, children, meta) + + @classmethod + def _apply_decorator(cls, decorator, **kwargs): + mro = getmro(cls) + assert mro[0] is cls + libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)} + for name, value in getmembers(cls): + if name.startswith('_') or name in libmembers: + continue + if not callable(cls.__dict__[name]): + continue + + # Skip if v_args already applied (at the function level) + if hasattr(cls.__dict__[name], 'vargs_applied'): + continue + + static = isinstance(cls.__dict__[name], (staticmethod, classmethod)) + setattr(cls, name, decorator(value, static=static, **kwargs)) + return cls + + +class InlineTransformer(Transformer): # XXX Deprecated + def _call_userfunc(self, tree, new_children=None): + # Assumes tree is already transformed + children = new_children if new_children is not None else tree.children + try: + f = getattr(self, tree.data) + except AttributeError: + return self.__default__(tree.data, children, tree.meta) + else: + return f(*children) -class Discard(Exception): - pass class TransformerChain(object): def __init__(self, *transformers): @@ -180,13 +388,57 @@ def __mul__(self, other): return TransformerChain(*self.transformers + (other,)) +class Transformer_InPlace(Transformer): + "Non-recursive. Changes the tree in-place instead of returning new instances" + def _transform_tree(self, tree): # Cancel recursion + return self._call_userfunc(tree) + + def transform(self, tree): + for subtree in tree.iter_subtrees(): + subtree.children = list(self._transform_children(subtree.children)) + + return self._transform_tree(tree) + + +class Transformer_InPlaceRecursive(Transformer): + "Recursive. Changes the tree in-place instead of returning new instances" + def _transform_tree(self, tree): + tree.children = list(self._transform_children(tree.children)) + return self._call_userfunc(tree) + + + +# Visitors + +class VisitorBase: + def _call_userfunc(self, tree): + return getattr(self, tree.data, self.__default__)(tree) + + def __default__(self, tree): + "Default operation on tree (for override)" + return tree + + +class Visitor(VisitorBase): + """Bottom-up visitor, non-recursive + + Visits the tree, starting with the leaves and finally the root (bottom-up) + Calls its methods (provided by user via inheritance) according to tree.data + """ + + + def visit(self, tree): + for subtree in tree.iter_subtrees(): + self._call_userfunc(subtree) + return tree -class InlineTransformer(Transformer): - def _get_func(self, name): # use super()._get_func - return inline_args(getattr(self, name)).__get__(self) +class Visitor_Recursive(VisitorBase): + """Bottom-up visitor, recursive + Visits the tree, starting with the leaves and finally the root (bottom-up) + Calls its methods (provided by user via inheritance) according to tree.data + """ -class Visitor(object): def visit(self, tree): for child in tree.children: if isinstance(child, Tree): @@ -196,50 +448,109 @@ def visit(self, tree): f(tree) return tree - def __default__(self, tree): - pass -class Visitor_NoRecurse(Visitor): +def visit_children_decor(func): + "See Interpreter" + @wraps(func) + def inner(cls, tree): + values = cls.visit_children(tree) + return func(cls, values) + return inner + + +class Interpreter: + """Top-down visitor, recursive + + Visits the tree, starting with the root and finally the leaves (top-down) + Calls its methods (provided by user via inheritance) according to tree.data + + Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches. + The user has to explicitly call visit_children, or use the @visit_children_decor + """ def visit(self, tree): - subtrees = list(tree.iter_subtrees()) + return getattr(self, tree.data)(tree) - for subtree in (subtrees): - getattr(self, subtree.data, self.__default__)(subtree) - return tree + def visit_children(self, tree): + return [self.visit(child) if isinstance(child, Tree) else child + for child in tree.children] + def __getattr__(self, name): + return self.__default__ -class Transformer_NoRecurse(Transformer): - def transform(self, tree): - subtrees = list(tree.iter_subtrees()) + def __default__(self, tree): + return self.visit_children(tree) - def _t(t): - # Assumes t is already transformed - try: - f = self._get_func(t.data) - except AttributeError: - return self.__default__(t) - else: - return f(t) - for subtree in subtrees: - children = [] - for c in subtree.children: - try: - children.append(_t(c) if isinstance(c, Tree) else c) - except Discard: - pass - subtree.children = children - return _t(tree) - def __default__(self, t): - return t +# Decorators + +def _apply_decorator(obj, decorator, **kwargs): + try: + _apply = obj._apply_decorator + except AttributeError: + return decorator(obj, **kwargs) + else: + return _apply(decorator, **kwargs) + + + +def _inline_args__func(func): + @wraps(func) + def create_decorator(_f, with_self): + if with_self: + def f(self, children): + return _f(self, *children) + else: + def f(self, children): + return _f(*children) + return f + + return smart_decorator(func, create_decorator) + + +def inline_args(obj): # XXX Deprecated + return _apply_decorator(obj, _inline_args__func) + + + +def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, static=False): + assert [whole_tree, meta, inline].count(True) <= 1 + def create_decorator(_f, with_self): + if with_self: + def f(self, *args, **kwargs): + return _f(self, *args, **kwargs) + else: + def f(self, *args, **kwargs): + return _f(*args, **kwargs) + return f + + if static: + f = wraps(func)(create_decorator(func, False)) + else: + f = smart_decorator(func, create_decorator) + f.vargs_applied = True + f.inline = inline + f.meta = meta + f.whole_tree = whole_tree + return f + +def v_args(inline=False, meta=False, tree=False): + "A convenience decorator factory, for modifying the behavior of user-supplied visitor methods" + if [tree, meta, inline].count(True) > 1: + raise ValueError("Visitor functions can either accept tree, or meta, or be inlined. These cannot be combined.") + def _visitor_args_dec(obj): + return _apply_decorator(obj, _visitor_args_func_dec, inline=inline, meta=meta, whole_tree=tree) + return _visitor_args_dec + + class Indenter: def __init__(self): - self.paren_level = 0 - self.indent_level = [0] + self.paren_level = None + self.indent_level = None + assert self.tab_len > 0 def handle_NL(self, token): if self.paren_level > 0: @@ -260,7 +571,7 @@ def handle_NL(self, token): assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) - def process(self, stream): + def _process(self, stream): for token in stream: if token.type == self.NL_type: for t in self.handle_NL(token): @@ -280,43 +591,213 @@ def process(self, stream): assert self.indent_level == [0], self.indent_level + def process(self, stream): + self.paren_level = 0 + self.indent_level = [0] + return self._process(stream) + # XXX Hack for ContextualLexer. Maybe there's a more elegant solution? @property def always_accept(self): return (self.NL_type,) -class LexError(Exception): - pass -class UnexpectedInput(LexError): - def __init__(self, seq, lex_pos, line, column, allowed=None, considered_rules=None): - context = seq[lex_pos:lex_pos+5] - message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column) - if allowed: - message += '\n\nExpecting: %s\n' % allowed +class Symbol(Serialize): + is_term = NotImplemented + + def __init__(self, name): + self.name = name + + def __eq__(self, other): + assert isinstance(other, Symbol), other + return self.is_term == other.is_term and self.name == other.name + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return hash(self.name) + + def __repr__(self): + return '%s(%r)' % (type(self).__name__, self.name) + + fullrepr = property(__repr__) + + +class Terminal(Symbol): + __serialize_fields__ = 'name', 'filter_out' + + is_term = True + + def __init__(self, name, filter_out=False): + self.name = name + self.filter_out = filter_out + + @property + def fullrepr(self): + return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out) + + + +class NonTerminal(Symbol): + __serialize_fields__ = 'name', + + is_term = False + + + +class RuleOptions(Serialize): + __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices' + + def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()): + self.keep_all_tokens = keep_all_tokens + self.expand1 = expand1 + self.priority = priority + self.empty_indices = empty_indices + + def __repr__(self): + return 'RuleOptions(%r, %r, %r)' % ( + self.keep_all_tokens, + self.expand1, + self.priority, + ) + + +class Rule(Serialize): + """ + origin : a symbol + expansion : a list of symbols + order : index of this expansion amongst all rules of the same name + """ + __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash') + + __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options' + __serialize_namespace__ = Terminal, NonTerminal, RuleOptions + + def __init__(self, origin, expansion, order=0, alias=None, options=None): + self.origin = origin + self.expansion = expansion + self.alias = alias + self.order = order + self.options = options + self._hash = hash((self.origin, tuple(self.expansion))) + + def _deserialize(self): + self._hash = hash((self.origin, tuple(self.expansion))) + + def __str__(self): + return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion)) + + def __repr__(self): + return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + + def __hash__(self): + return self._hash + + def __eq__(self, other): + if not isinstance(other, Rule): + return False + return self.origin == other.origin and self.expansion == other.expansion + + + + + +class Pattern(Serialize): + __serialize_fields__ = 'value', 'flags' + + def __init__(self, value, flags=()): + self.value = value + self.flags = frozenset(flags) + + def __repr__(self): + return repr(self.to_regexp()) + + # Pattern Hashing assumes all subclasses have a different priority! + def __hash__(self): + return hash((type(self), self.value, self.flags)) + def __eq__(self, other): + return type(self) == type(other) and self.value == other.value and self.flags == other.flags + + def to_regexp(self): + raise NotImplementedError() + + if Py36: + # Python 3.6 changed syntax for flags in regular expression + def _get_flags(self, value): + for f in self.flags: + value = ('(?%s:%s)' % (f, value)) + return value + + else: + def _get_flags(self, value): + for f in self.flags: + value = ('(?%s)' % f) + value + return value + + +class PatternStr(Pattern): + def to_regexp(self): + return self._get_flags(re.escape(self.value)) + + @property + def min_width(self): + return len(self.value) + max_width = min_width + +class PatternRE(Pattern): + def to_regexp(self): + return self._get_flags(self.value) + + @property + def min_width(self): + return get_regexp_width(self.to_regexp())[0] + @property + def max_width(self): + return get_regexp_width(self.to_regexp())[1] + + +class TerminalDef(Serialize): + __serialize_fields__ = 'name', 'pattern', 'priority' + __serialize_namespace__ = PatternStr, PatternRE + + def __init__(self, name, pattern, priority=1): + assert isinstance(pattern, Pattern), pattern + self.name = name + self.pattern = pattern + self.priority = priority + + def __repr__(self): + return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - super(UnexpectedInput, self).__init__(message) - self.line = line - self.column = column - self.context = context - self.allowed = allowed - self.considered_rules = considered_rules class Token(Str): - def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None): - inst = Str.__new__(cls, value) - inst.type = type_ - inst.pos_in_stream = pos_in_stream - inst.value = value - inst.line = line - inst.column = column - return inst + __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column') + + def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None): + try: + self = super(Token, cls).__new__(cls, value) + except UnicodeDecodeError: + value = value.decode('latin1') + self = super(Token, cls).__new__(cls, value) + + self.type = type_ + self.pos_in_stream = pos_in_stream + self.value = value + self.line = line + self.column = column + self.end_line = end_line + self.end_column = end_column + return self @classmethod def new_borrow_pos(cls, type_, value, borrow_t): - return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column) + return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column) + + def __reduce__(self): + return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, )) def __repr__(self): return 'Token(%s, %r)' % (self.type, self.value) @@ -338,7 +819,7 @@ def __init__(self): self.newline_char = '\n' self.char_pos = 0 self.line = 1 - self.column = 0 + self.column = 1 self.line_start_pos = 0 def feed(self, token, test_newline=True): @@ -353,45 +834,51 @@ def feed(self, token, test_newline=True): self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 self.char_pos += len(token) - self.column = self.char_pos - self.line_start_pos + self.column = self.char_pos - self.line_start_pos + 1 class _Lex: "Built to serve both Lexer and ContextualLexer" - def __init__(self, lexer): + def __init__(self, lexer, state=None): self.lexer = lexer + self.state = state def lex(self, stream, newline_types, ignore_types): - newline_types = list(newline_types) - ignore_types = list(ignore_types) + newline_types = frozenset(newline_types) + ignore_types = frozenset(ignore_types) line_ctr = LineCounter() - t = None - while True: + while line_ctr.char_pos < len(stream): lexer = self.lexer for mre, type_from_index in lexer.mres: m = mre.match(stream, line_ctr.char_pos) - if m: - value = m.group(0) - type_ = type_from_index[m.lastindex] - if type_ not in ignore_types: + if not m: + continue + + t = None + value = m.group(0) + type_ = type_from_index[m.lastindex] + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + if not isinstance(t, Token): + raise ValueError("Callbacks must return a token (returned %r)" % t) + yield t + else: + if type_ in lexer.callback: t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - if t.type in lexer.callback: - t = lexer.callback[t.type](t) - yield t - else: - if type_ in lexer.callback: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - lexer.callback[type_](t) - - line_ctr.feed(value, type_ in newline_types) - if t: - t.end_line = line_ctr.line - t.end_column = line_ctr.column - break - else: - if line_ctr.char_pos < len(stream): - raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) + lexer.callback[type_](t) + + line_ctr.feed(value, type_ in newline_types) + if t: + t.end_line = line_ctr.line + t.end_column = line_ctr.column + break + else: + allowed = [v for m, tfi in lexer.mres for v in tfi.values()] + raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state) + class UnlessCallback: def __init__(self, mres): @@ -401,14 +888,183 @@ def __call__(self, t): for mre, type_from_index in self.mres: m = mre.match(t.value) if m: - value = m.group(0) t.type = type_from_index[m.lastindex] break return t +class CallChain: + def __init__(self, callback1, callback2, cond): + self.callback1 = callback1 + self.callback2 = callback2 + self.cond = cond + + def __call__(self, t): + t2 = self.callback1(t) + return self.callback2(t) if self.cond(t2) else t2 + + + + + +def _create_unless(terminals): + tokens_by_type = classify(terminals, lambda t: type(t.pattern)) + assert len(tokens_by_type) <= 2, tokens_by_type.keys() + embedded_strs = set() + callback = {} + for retok in tokens_by_type.get(PatternRE, []): + unless = [] # {} + for strtok in tokens_by_type.get(PatternStr, []): + if strtok.priority > retok.priority: + continue + s = strtok.pattern.value + m = re.match(retok.pattern.to_regexp(), s) + if m and m.group(0) == s: + unless.append(strtok) + if strtok.pattern.flags <= retok.pattern.flags: + embedded_strs.add(strtok) + if unless: + callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True)) + + terminals = [t for t in terminals if t not in embedded_strs] + return terminals, callback + + +def _build_mres(terminals, max_size, match_whole): + # Python sets an unreasonable group limit (currently 100) in its re module + # Worse, the only way to know we reached it is by catching an AssertionError! + # This function recursively tries less and less groups until it's successful. + postfix = '$' if match_whole else '' + mres = [] + while terminals: + try: + mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size])) + except AssertionError: # Yes, this is what Python provides us.. :/ + return _build_mres(terminals, max_size//2, match_whole) + + # terms_from_name = {t.name: t for t in terminals[:max_size]} + mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) + terminals = terminals[max_size:] + return mres + +def build_mres(terminals, match_whole=False): + return _build_mres(terminals, len(terminals), match_whole) + +def _regexp_has_newline(r): + """Expressions that may indicate newlines in a regexp: + - newlines (\n) + - escaped newline (\\n) + - anything but ([^...]) + - any-char (.) when the flag (?s) exists + """ + return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r) + +class Lexer(Serialize): + """Lexer interface + + Method Signatures: + lex(self, stream) -> Iterator[Token] + + set_parser_state(self, state) # Optional + """ + set_parser_state = NotImplemented + lex = NotImplemented + + +class TraditionalLexer(Lexer): + __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types' + __serialize_namespace__ = TerminalDef, + + def _deserialize(self): + self.mres = build_mres(self.terminals) + self.callback = {} # TODO implement + + + def __init__(self, terminals, ignore=(), user_callbacks={}): + assert all(isinstance(t, TerminalDef) for t in terminals), terminals + + terminals = list(terminals) + + # Sanitization + for t in terminals: + try: + re.compile(t.pattern.to_regexp()) + except: + raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) + + if t.pattern.min_width == 0: + raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern)) + + assert set(ignore) <= {t.name for t in terminals} + + # Init + self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())] + self.ignore_types = list(ignore) + + terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) + + terminals, self.callback = _create_unless(terminals) + assert all(self.callback.values()) + + for type_, f in user_callbacks.items(): + if type_ in self.callback: + # Already a callback there, probably UnlessCallback + self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_) + else: + self.callback[type_] = f + + self.terminals = terminals + + self.mres = build_mres(terminals) + + + def lex(self, stream): + return _Lex(self).lex(stream, self.newline_types, self.ignore_types) + + + +class ContextualLexer(Lexer): + __serialize_fields__ = 'root_lexer', 'lexers' + __serialize_namespace__ = TraditionalLexer, + + def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): + tokens_by_name = {} + for t in terminals: + assert t.name not in tokens_by_name, t + tokens_by_name[t.name] = t + + lexer_by_tokens = {} + self.lexers = {} + for state, accepts in states.items(): + key = frozenset(accepts) + try: + lexer = lexer_by_tokens[key] + except KeyError: + accepts = set(accepts) | set(ignore) | set(always_accept) + state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] + lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks) + lexer_by_tokens[key] = lexer + + self.lexers[state] = lexer + + self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks) + + self.set_parser_state(None) # Needs to be set on the outside + + def set_parser_state(self, state): + self.parser_state = state + + def lex(self, stream): + l = _Lex(self.lexers[self.parser_state], self.parser_state) + for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): + yield x + l.lexer = self.lexers[self.parser_state] + l.state = self.parser_state + + +from functools import partial, wraps +from itertools import repeat, product -from functools import partial class ExpandSingleChild: def __init__(self, node_builder): @@ -420,18 +1076,6 @@ def __call__(self, children): else: return self.node_builder(children) - -class CreateToken: - "Used for fixing the results of scanless parsing" - - def __init__(self, token_name, node_builder): - self.node_builder = node_builder - self.token_name = token_name - - def __call__(self, children): - return self.node_builder( [Token(self.token_name, ''.join(children))] ) - - class PropagatePositions: def __init__(self, node_builder): self.node_builder = node_builder @@ -439,23 +1083,83 @@ def __init__(self, node_builder): def __call__(self, children): res = self.node_builder(children) - if children: - for a in children: - with suppress(AttributeError): - res.line = a.line - res.column = a.column - break + if isinstance(res, Tree): + for c in children: + if isinstance(c, Tree) and c.children and not c.meta.empty: + res.meta.line = c.meta.line + res.meta.column = c.meta.column + res.meta.start_pos = c.meta.start_pos + res.meta.empty = False + break + elif isinstance(c, Token): + res.meta.line = c.line + res.meta.column = c.column + res.meta.start_pos = c.pos_in_stream + res.meta.empty = False + break - for a in reversed(children): - with suppress(AttributeError): - res.end_line = a.end_line - res.end_column = a.end_column - break + for c in reversed(children): + if isinstance(c, Tree) and c.children and not c.meta.empty: + res.meta.end_line = c.meta.end_line + res.meta.end_column = c.meta.end_column + res.meta.end_pos = c.meta.end_pos + res.meta.empty = False + break + elif isinstance(c, Token): + res.meta.end_line = c.end_line + res.meta.end_column = c.end_column + res.meta.end_pos = c.pos_in_stream + len(c.value) + res.meta.empty = False + break return res class ChildFilter: + def __init__(self, to_include, append_none, node_builder): + self.node_builder = node_builder + self.to_include = to_include + self.append_none = append_none + + def __call__(self, children): + filtered = [] + + for i, to_expand, add_none in self.to_include: + if add_none: + filtered += [None] * add_none + if to_expand: + filtered += children[i].children + else: + filtered.append(children[i]) + + if self.append_none: + filtered += [None] * self.append_none + + return self.node_builder(filtered) + +class ChildFilterLALR(ChildFilter): + "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" + + def __call__(self, children): + filtered = [] + for i, to_expand, add_none in self.to_include: + if add_none: + filtered += [None] * add_none + if to_expand: + if filtered: + filtered += children[i].children + else: # Optimize for left-recursion + filtered = children[i].children + else: + filtered.append(children[i]) + + if self.append_none: + filtered += [None] * self.append_none + + return self.node_builder(filtered) + +class ChildFilterLALR_NoPlaceholders(ChildFilter): + "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" def __init__(self, to_include, node_builder): self.node_builder = node_builder self.to_include = to_include @@ -470,77 +1174,158 @@ def __call__(self, children): filtered = children[i].children else: filtered.append(children[i]) - return self.node_builder(filtered) def _should_expand(sym): - return not is_terminal(sym) and sym.startswith('_') + return not sym.is_term and sym.name.startswith('_') + +def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices): + # Prepare empty_indices as: How many Nones to insert at each index? + if _empty_indices: + assert _empty_indices.count(False) == len(expansion) + s = ''.join(str(int(b)) for b in _empty_indices) + empty_indices = [len(ones) for ones in s.split('0')] + assert len(empty_indices) == len(expansion)+1, (empty_indices, len(expansion)) + else: + empty_indices = [0] * (len(expansion)+1) -def maybe_create_child_filter(expansion, filter_out): - to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out] + to_include = [] + nones_to_add = 0 + for i, sym in enumerate(expansion): + nones_to_add += empty_indices[i] + if keep_all_tokens or not (sym.is_term and sym.filter_out): + to_include.append((i, _should_expand(sym), nones_to_add)) + nones_to_add = 0 - if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): - return partial(ChildFilter, to_include) + nones_to_add += empty_indices[len(expansion)] + if _empty_indices or len(to_include) < len(expansion) or any(to_expand for i, to_expand,_ in to_include): + if _empty_indices or ambiguous: + return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include, nones_to_add) + else: + # LALR without placeholders + return partial(ChildFilterLALR_NoPlaceholders, [(i, x) for i,x,_ in to_include]) + +class AmbiguousExpander: + """Deal with the case where we're expanding children ('_rule') into a parent but the children + are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself + ambiguous with as many copies as their are ambiguous children, and then copy the ambiguous children + into the right parents in the right places, essentially shifting the ambiguiuty up the tree.""" + def __init__(self, to_expand, tree_class, node_builder): + self.node_builder = node_builder + self.tree_class = tree_class + self.to_expand = to_expand -class Callback(object): - pass + def __call__(self, children): + def _is_ambig_tree(child): + return hasattr(child, 'data') and child.data == '_ambig' + + #### When we're repeatedly expanding ambiguities we can end up with nested ambiguities. + # All children of an _ambig node should be a derivation of that ambig node, hence + # it is safe to assume that if we see an _ambig node nested within an ambig node + # it is safe to simply expand it into the parent _ambig node as an alternative derivation. + ambiguous = [] + for i, child in enumerate(children): + if _is_ambig_tree(child): + if i in self.to_expand: + ambiguous.append(i) + + to_expand = [j for j, grandchild in enumerate(child.children) if _is_ambig_tree(grandchild)] + child.expand_kids_by_index(*to_expand) + + if not ambiguous: + return self.node_builder(children) + + expand = [ iter(child.children) if i in ambiguous else repeat(child) for i, child in enumerate(children) ] + return self.tree_class('_ambig', [self.node_builder(list(f[0])) for f in product(zip(*expand))]) + +def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens): + to_expand = [i for i, sym in enumerate(expansion) + if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))] + if to_expand: + return partial(AmbiguousExpander, to_expand, tree_class) + +def ptb_inline_args(func): + @wraps(func) + def f(children): + return func(*children) + return f class ParseTreeBuilder: - def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False): + def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False): self.tree_class = tree_class self.propagate_positions = propagate_positions self.always_keep_all_tokens = keep_all_tokens + self.ambiguous = ambiguous + self.maybe_placeholders = maybe_placeholders self.rule_builders = list(self._init_builders(rules)) - self.user_aliases = {} - def _init_builders(self, rules): - filter_out = {rule.origin for rule in rules if rule.options and rule.options.filter_out} - filter_out |= {sym for rule in rules for sym in rule.expansion if is_terminal(sym) and sym.startswith('_')} - assert all(x.startswith('_') for x in filter_out) - for rule in rules: options = rule.options keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) expand_single_child = options.expand1 if options else False - create_token = options.create_token if options else False - wrapper_chain = filter(None, [ - create_token and partial(CreateToken, create_token), + wrapper_chain = list(filter(None, [ (expand_single_child and not rule.alias) and ExpandSingleChild, - maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out), + maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None), self.propagate_positions and PropagatePositions, - ]) + self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), + ])) yield rule, wrapper_chain def create_callback(self, transformer=None): - callback = Callback() + callbacks = {} for rule, wrapper_chain in self.rule_builders: - internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) - user_callback_name = rule.alias or rule.origin + user_callback_name = rule.alias or rule.origin.name try: - f = transformer._get_func(user_callback_name) + f = getattr(transformer, user_callback_name) + assert not getattr(f, 'meta', False), "Meta args not supported for internal transformer" + # XXX InlineTransformer is deprecated! + if getattr(f, 'inline', False) or isinstance(transformer, InlineTransformer): + f = ptb_inline_args(f) except AttributeError: f = partial(self.tree_class, user_callback_name) - self.user_aliases[rule] = rule.alias - rule.alias = internal_callback_name - for w in wrapper_chain: f = w(f) - if hasattr(callback, internal_callback_name): + if rule in callbacks: raise GrammarError("Rule '%s' already exists" % (rule,)) - setattr(callback, internal_callback_name, f) - return callback + callbacks[rule] = f + + return callbacks + +class LALR_Parser(object): + def __init__(self, parser_conf, debug=False): + assert all(r.options is None or r.options.priority is None + for r in parser_conf.rules), "LALR doesn't yet support prioritization" + analysis = LALR_Analyzer(parser_conf, debug=debug) + analysis.compute_lookahead() + callbacks = parser_conf.callbacks + + self._parse_table = analysis.parse_table + self.parser_conf = parser_conf + self.parser = _Parser(analysis.parse_table, callbacks) + + @classmethod + def deserialize(cls, data, memo, callbacks): + inst = cls.__new__(cls) + inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks) + return inst + + def serialize(self, memo): + return self._parse_table.serialize(memo) + + def parse(self, *args): + return self.parser.parse(*args) class _Parser: @@ -551,7 +1336,6 @@ def __init__(self, parse_table, callbacks): self.callbacks = callbacks def parse(self, seq, set_state=None): - i = 0 token = None stream = iter(seq) states = self.states @@ -561,14 +1345,13 @@ def parse(self, seq, set_state=None): if set_state: set_state(self.start_state) - def get_action(key): + def get_action(token): state = state_stack[-1] try: - return states[state][key] + return states[state][token.type] except KeyError: - expected = states[state].keys() - - raise UnexpectedToken(token, expected, seq, i) + expected = [s for s in states[state].keys() if s.isupper()] + raise UnexpectedToken(token, expected, state=state) def reduce(rule): size = len(rule.expansion) @@ -581,15 +1364,15 @@ def reduce(rule): value = self.callbacks[rule](s) - _action, new_state = get_action(rule.origin) + _action, new_state = states[state_stack[-1]][rule.origin.name] assert _action is Shift state_stack.append(new_state) value_stack.append(value) # Main LALR-parser loop - for i, token in enumerate(stream): + for token in stream: while True: - action, arg = get_action(token.type) + action, arg = get_action(token) assert arg != self.end_state if action is Shift: @@ -600,8 +1383,9 @@ def reduce(rule): else: reduce(arg) + token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) while True: - _action, arg = get_action('$END') + _action, arg = get_action(token) if _action is Shift: assert arg == self.end_state val ,= value_stack @@ -611,169 +1395,444 @@ def reduce(rule): -class Rule(object): +class Action: + def __init__(self, name): + self.name = name + def __str__(self): + return self.name + def __repr__(self): + return str(self) + +Shift = Action('Shift') +Reduce = Action('Reduce') + +class ParseTable: + def __init__(self, states, start_state, end_state): + self.states = states + self.start_state = start_state + self.end_state = end_state + + def serialize(self, memo): + tokens = Enumerator() + rules = Enumerator() + + states = { + state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg)) + for token, (action, arg) in actions.items()} + for state, actions in self.states.items() + } + + return { + 'tokens': tokens.reversed(), + 'states': states, + 'start_state': self.start_state, + 'end_state': self.end_state, + } + + @classmethod + def deserialize(cls, data, memo): + tokens = data['tokens'] + states = { + state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg)) + for token, (action, arg) in actions.items()} + for state, actions in data['states'].items() + } + return cls(states, data['start_state'], data['end_state']) + + +class IntParseTable(ParseTable): + + @classmethod + def from_ParseTable(cls, parse_table): + enum = list(parse_table.states) + state_to_idx = {s:i for i,s in enumerate(enum)} + int_states = {} + + for s, la in parse_table.states.items(): + la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v + for k,v in la.items()} + int_states[ state_to_idx[s] ] = la + + + start_state = state_to_idx[parse_table.start_state] + end_state = state_to_idx[parse_table.end_state] + return cls(int_states, start_state, end_state) + + + +def get_frontend(parser, lexer): + if parser=='lalr': + if lexer is None: + raise ValueError('The LALR parser requires use of a lexer') + elif lexer == 'standard': + return LALR_TraditionalLexer + elif lexer == 'contextual': + return LALR_ContextualLexer + elif issubclass(lexer, Lexer): + return partial(LALR_CustomLexer, lexer) + else: + raise ValueError('Unknown lexer: %s' % lexer) + elif parser=='earley': + if lexer=='standard': + return Earley + elif lexer=='dynamic': + return XEarley + elif lexer=='dynamic_complete': + return XEarley_CompleteLex + elif lexer=='contextual': + raise ValueError('The Earley parser does not support the contextual parser') + else: + raise ValueError('Unknown lexer: %s' % lexer) + elif parser == 'cyk': + if lexer == 'standard': + return CYK + else: + raise ValueError('CYK parser requires using standard parser.') + else: + raise ValueError('Unknown parser: %s' % parser) + + + + +class WithLexer(Serialize): + lexer = None + parser = None + lexer_conf = None + + __serialize_fields__ = 'parser', 'lexer' + __serialize_namespace__ = Rule, ContextualLexer, TraditionalLexer + + @classmethod + def deserialize(cls, data, memo, callbacks, postlex): + inst = super(WithLexer, cls).deserialize(data, memo) + inst.postlex = postlex + inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) + return inst + + def _serialize(self, data, memo): + data['parser'] = data['parser'].serialize(memo) + + def init_traditional_lexer(self, lexer_conf): + self.lexer_conf = lexer_conf + self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks) + self.postlex = lexer_conf.postlex + + def init_contextual_lexer(self, lexer_conf): + self.lexer_conf = lexer_conf + self.postlex = lexer_conf.postlex + states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} + always_accept = self.postlex.always_accept if self.postlex else () + self.lexer = ContextualLexer(lexer_conf.tokens, states, + ignore=lexer_conf.ignore, + always_accept=always_accept, + user_callbacks=lexer_conf.callbacks) + + def lex(self, text): + stream = self.lexer.lex(text) + return self.postlex.process(stream) if self.postlex else stream + + def parse(self, text): + token_stream = self.lex(text) + sps = self.lexer.set_parser_state + return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else []) + + +class LALR_TraditionalLexer(WithLexer): + def __init__(self, lexer_conf, parser_conf, options=None): + debug = options.debug if options else False + self.parser = LALR_Parser(parser_conf, debug=debug) + self.init_traditional_lexer(lexer_conf) + +class LALR_ContextualLexer(WithLexer): + def __init__(self, lexer_conf, parser_conf, options=None): + debug = options.debug if options else False + self.parser = LALR_Parser(parser_conf, debug=debug) + self.init_contextual_lexer(lexer_conf) + + + +class LarkOptions(Serialize): + """Specifies the options for Lark + """ - origin : a symbol - expansion : a list of symbols + OPTIONS_DOC = """ + parser - Decides which parser engine to use, "earley" or "lalr". (Default: "earley") + Note: "lalr" requires a lexer + + lexer - Decides whether or not to use a lexer stage + "standard": Use a standard lexer + "contextual": Stronger lexer (only works with parser="lalr") + "dynamic": Flexible and powerful (only with parser="earley") + "dynamic_complete": Same as dynamic, but tries *every* variation + of tokenizing possible. (only with parser="earley") + "auto" (default): Choose for me based on grammar and parser + + ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley" + "resolve": The parser will automatically choose the simplest derivation + (it chooses consistently: greedy for tokens, non-greedy for rules) + "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). + + transformer - Applies the transformer to every parse tree + debug - Affects verbosity (default: False) + keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) + cache_grammar - Cache the Lark grammar (Default: False) + postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. + start - The start symbol (Default: start) + profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False) + priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto) + propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches. + lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. + maybe_placeholders - Experimental feature. Instead of omitting optional rules (i.e. rule?), replace them with None """ - def __init__(self, origin, expansion, alias=None, options=None): - self.origin = origin - self.expansion = expansion - self.alias = alias - self.options = options + if __doc__: + __doc__ += OPTIONS_DOC + + _defaults = { + 'debug': False, + 'keep_all_tokens': False, + 'tree_class': None, + 'cache_grammar': False, + 'postlex': None, + 'parser': 'earley', + 'lexer': 'auto', + 'transformer': None, + 'start': 'start', + 'profile': False, + 'priority': 'auto', + 'ambiguity': 'auto', + 'propagate_positions': False, + 'lexer_callbacks': {}, + 'maybe_placeholders': False, + } + + def __init__(self, options_dict): + o = dict(options_dict) + + options = {} + for name, default in self._defaults.items(): + if name in o: + value = o.pop(name) + if isinstance(default, bool): + value = bool(value) + else: + value = default - def __str__(self): - return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) + options[name] = value - def __repr__(self): - return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + self.__dict__['options'] = options + assert self.parser in ('earley', 'lalr', 'cyk', None) -class RuleOptions: - def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): - self.keep_all_tokens = keep_all_tokens - self.expand1 = expand1 - self.create_token = create_token # used for scanless postprocessing - self.priority = priority + if self.parser == 'earley' and self.transformer: + raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm.' + 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)') + + if o: + raise ValueError("Unknown options: %s" % o.keys()) + + def __getattr__(self, name): + return self.options[name] + def __setattr__(self, name, value): + assert name in self.options + self.options[name] = value + + def serialize(self, memo): + return self.options + + @classmethod + def deserialize(cls, data, memo): + return cls(data) + + +class Profiler: + def __init__(self): + self.total_time = defaultdict(float) + self.cur_section = '__init__' + self.last_enter_time = time.time() + + def enter_section(self, name): + cur_time = time.time() + self.total_time[self.cur_section] += cur_time - self.last_enter_time + self.last_enter_time = cur_time + self.cur_section = name + + def make_wrapper(self, name, f): + def wrapper(*args, **kwargs): + last_section = self.cur_section + self.enter_section(name) + try: + return f(*args, **kwargs) + finally: + self.enter_section(last_section) + + return wrapper + + +class Lark(Serialize): + def __init__(self, grammar, **options): + """ + grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax) + options : a dictionary controlling various aspects of Lark. + """ + self.options = LarkOptions(options) + + # Some, but not all file-like objects have a 'name' attribute + try: + self.source = grammar.name + except AttributeError: + self.source = '' + + # Drain file-like objects to get their contents + try: + read = grammar.read + except AttributeError: + pass + else: + grammar = read() + + assert isinstance(grammar, STRING_TYPE) - self.filter_out = filter_out # remove this rule from the tree - # used for "token"-rules in scanless + if self.options.cache_grammar: + raise NotImplementedError("Not available yet") + + assert not self.options.profile, "Feature temporarily disabled" + # self.profiler = Profiler() if self.options.profile else None + + if self.options.lexer == 'auto': + if self.options.parser == 'lalr': + self.options.lexer = 'contextual' + elif self.options.parser == 'earley': + self.options.lexer = 'dynamic' + elif self.options.parser == 'cyk': + self.options.lexer = 'standard' + else: + assert False, self.options.parser + lexer = self.options.lexer + assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer) + + if self.options.ambiguity == 'auto': + if self.options.parser == 'earley': + self.options.ambiguity = 'resolve' + else: + disambig_parsers = ['earley', 'cyk'] + assert self.options.parser in disambig_parsers, ( + 'Only %s supports disambiguation right now') % ', '.join(disambig_parsers) + + if self.options.priority == 'auto': + if self.options.parser in ('earley', 'cyk', ): + self.options.priority = 'normal' + elif self.options.parser in ('lalr', ): + self.options.priority = None + elif self.options.priority in ('invert', 'normal'): + assert self.options.parser in ('earley', 'cyk'), "priorities are not supported for LALR at this time" + + assert self.options.priority in ('auto', None, 'normal', 'invert'), 'invalid priority option specified: {}. options are auto, none, normal, invert.'.format(self.options.priority) + assert self.options.ambiguity not in ('resolve__antiscore_sum', ), 'resolve__antiscore_sum has been replaced with the option priority="invert"' + assert self.options.ambiguity in ('resolve', 'explicit', 'auto', ) + + # Parse the grammar file and compose the grammars (TODO) + self.grammar = load_grammar(grammar, self.source) + + # Compile the EBNF grammar into BNF + self.terminals, self.rules, self.ignore_tokens = self.grammar.compile() + + # If the user asked to invert the priorities, negate them all here. + # This replaces the old 'resolve__antiscore_sum' option. + if self.options.priority == 'invert': + for rule in self.rules: + if rule.options and rule.options.priority is not None: + rule.options.priority = -rule.options.priority + # Else, if the user asked to disable priorities, strip them from the + # rules. This allows the Earley parsers to skip an extra forest walk + # for improved performance, if you don't need them (or didn't specify any). + elif self.options.priority == None: + for rule in self.rules: + if rule.options and rule.options.priority is not None: + rule.options.priority = None + self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) + + if self.options.parser: + self.parser = self._build_parser() + elif lexer: + self.lexer = self._build_lexer() + + if __init__.__doc__: + __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC + + __serialize_fields__ = 'parser', 'rules', 'options' + + def _build_lexer(self): + return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) + + def _prepare_callbacks(self): + self.parser_class = get_frontend(self.options.parser, self.options.lexer) + self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders) + self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer) + + def _build_parser(self): + self._prepare_callbacks() + parser_conf = ParserConf(self.rules, self._callbacks, self.options.start) + return self.parser_class(self.lexer_conf, parser_conf, options=self.options) + + @classmethod + def deserialize(cls, data, namespace, memo, transformer=None, postlex=None): + if memo: + memo = SerializeMemoizer.deserialize(memo, namespace, {}) + inst = cls.__new__(cls) + options = dict(data['options']) + options['transformer'] = transformer + options['postlex'] = postlex + inst.options = LarkOptions.deserialize(options, memo) + inst.rules = [Rule.deserialize(r, memo) for r in data['rules']] + inst._prepare_callbacks() + inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks, inst.options.postlex) + return inst + + + @classmethod + def open(cls, grammar_filename, rel_to=None, **options): + """Create an instance of Lark with the grammar given by its filename + + If rel_to is provided, the function will find the grammar filename in relation to it. + + Example: + + >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr") + Lark(...) + + """ + if rel_to: + basepath = os.path.dirname(rel_to) + grammar_filename = os.path.join(basepath, grammar_filename) + with open(grammar_filename, encoding='utf8') as f: + return cls(f, **options) def __repr__(self): - return 'RuleOptions(%r, %r, %r, %r, %r)' % ( - self.keep_all_tokens, - self.expand1, - self.create_token, - self.priority, - self.filter_out - ) + return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer) -Shift = 0 -Reduce = 1 -import re -MRES = ( -[(u'(?P(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+))|(?P\\"(?:(?:\\\\\\"|[^"]))*\\")|(?P(?:[ \t\x0c\r\n])+)|(?P<__FALSE1>false)|(?P<__NULL2>null)|(?P<__TRUE0>true)|(?P<__COLON>\\:)|(?P<__COMMA>\\,)|(?P<__LBRACE>\\{)|(?P<__LSQB>\\[)|(?P<__RBRACE>\\})|(?P<__RSQB>\\])', - {1: u'SIGNED_NUMBER', - 2: u'ESCAPED_STRING', - 3: u'WS', - 4: u'__FALSE1', - 5: u'__NULL2', - 6: u'__TRUE0', - 7: u'__COLON', - 8: u'__COMMA', - 9: u'__LBRACE', - 10: u'__LSQB', - 11: u'__RBRACE', - 12: u'__RSQB'})] -) -LEXER_CALLBACK = ( -{} + + def lex(self, text): + "Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'" + if not hasattr(self, 'lexer'): + self.lexer = self._build_lexer() + stream = self.lexer.lex(text) + if self.options.postlex: + return self.options.postlex.process(stream) + return stream + + def parse(self, text): + "Parse the given text, according to the options provided. Returns a tree, unless specified otherwise." + return self.parser.parse(text) + + +DATA = ( +{'rules': [{'@': 27}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 24}, {'@': 18}, {'@': 16}, {'@': 23}, {'@': 21}, {'@': 17}, {'@': 28}, {'@': 30}, {'@': 25}, {'@': 29}, {'@': 20}, {'@': 22}, {'@': 15}, {'@': 19}, {'@': 12}, {'@': 14}], 'parser': {'parser': {'tokens': {0: 'COMMA', 1: 'RBRACE', 2: u'pair', 3: u'ESCAPED_STRING', 4: u'string', 5: 'COLON', 6: 'RSQB', 7: '$END', 8: 'LBRACE', 9: u'FALSE', 10: u'object', 11: u'SIGNED_NUMBER', 12: u'value', 13: 'LSQB', 14: u'NULL', 15: u'TRUE', 16: u'array', 17: '__anon_star_1', 18: '__anon_star_0', 19: 'start'}, 'states': {0: {0: (0, 1), 1: (0, 32)}, 1: {2: (0, 5), 3: (0, 21), 4: (0, 3)}, 2: {0: (1, {'@': 12}), 1: (1, {'@': 12})}, 3: {5: (0, 13)}, 4: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 6: (1, {'@': 13}), 7: (1, {'@': 13})}, 5: {0: (1, {'@': 14}), 1: (1, {'@': 14})}, 6: {0: (1, {'@': 15}), 6: (1, {'@': 15})}, 7: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 6: (1, {'@': 16}), 7: (1, {'@': 16})}, 8: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 12), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 9: {0: (1, {'@': 17}), 1: (1, {'@': 17}), 6: (1, {'@': 17}), 7: (1, {'@': 17})}, 10: {0: (0, 22), 17: (0, 0), 1: (0, 26)}, 11: {0: (1, {'@': 18}), 1: (1, {'@': 18}), 6: (1, {'@': 18}), 7: (1, {'@': 18})}, 12: {0: (1, {'@': 19}), 6: (1, {'@': 19})}, 13: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 15), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 14: {3: (0, 21), 4: (0, 4), 6: (0, 30), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 23), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 15: {0: (1, {'@': 20}), 1: (1, {'@': 20})}, 16: {0: (1, {'@': 21}), 1: (1, {'@': 21}), 6: (1, {'@': 21}), 7: (1, {'@': 21})}, 17: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 6), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 18: {}, 19: {7: (0, 18)}, 20: {0: (0, 8), 6: (0, 16)}, 21: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 5: (1, {'@': 22}), 6: (1, {'@': 22}), 7: (1, {'@': 22})}, 22: {2: (0, 2), 3: (0, 21), 4: (0, 3)}, 23: {0: (0, 17), 18: (0, 20), 6: (0, 9)}, 24: {0: (1, {'@': 23}), 1: (1, {'@': 23}), 6: (1, {'@': 23}), 7: (1, {'@': 23})}, 25: {0: (1, {'@': 24}), 1: (1, {'@': 24}), 6: (1, {'@': 24}), 7: (1, {'@': 24})}, 26: {0: (1, {'@': 25}), 1: (1, {'@': 25}), 6: (1, {'@': 25}), 7: (1, {'@': 25})}, 27: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 6: (1, {'@': 26}), 7: (1, {'@': 26})}, 28: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 29), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27), 19: (0, 19)}, 29: {7: (1, {'@': 27})}, 30: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 6: (1, {'@': 28}), 7: (1, {'@': 28})}, 31: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 6: (1, {'@': 29}), 7: (1, {'@': 29})}, 32: {0: (1, {'@': 30}), 1: (1, {'@': 30}), 6: (1, {'@': 30}), 7: (1, {'@': 30})}, 33: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 6: (1, {'@': 31}), 7: (1, {'@': 31})}, 34: {1: (0, 31), 2: (0, 10), 3: (0, 21), 4: (0, 3)}}, 'end_state': 18, 'start_state': 28}, '__type__': 'LALR_TraditionalLexer', 'lexer': {'ignore_types': [u'WS'], 'terminals': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], '__type__': 'TraditionalLexer', 'newline_types': [u'WS']}}, '__type__': 'Lark', 'options': {'profile': False, 'transformer': None, 'lexer': 'standard', 'lexer_callbacks': {}, 'postlex': None, 'parser': 'lalr', 'cache_grammar': False, 'tree_class': None, 'priority': None, 'start': 'start', 'keep_all_tokens': False, 'ambiguity': 'auto', 'debug': False, 'propagate_positions': False, 'maybe_placeholders': False}} ) -NEWLINE_TYPES = [u'WS'] -IGNORE_TYPES = [u'WS'] -class LexerRegexps: pass -lexer_regexps = LexerRegexps() -lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES] -lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres]) - for n, mres in LEXER_CALLBACK.items()} -lexer = _Lex(lexer_regexps) -def lex(stream): - return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES) -RULES = { - 0: Rule(u'start', [u'value'], None, RuleOptions(False, True, None, None, False)), - 1: Rule(u'value', [u'string'], None, RuleOptions(False, True, None, None, False)), - 2: Rule(u'value', [u'__TRUE0'], u'true', RuleOptions(False, True, None, None, False)), - 3: Rule(u'value', [u'array'], None, RuleOptions(False, True, None, None, False)), - 4: Rule(u'value', [u'__NULL2'], u'null', RuleOptions(False, True, None, None, False)), - 5: Rule(u'value', [u'SIGNED_NUMBER'], u'number', RuleOptions(False, True, None, None, False)), - 6: Rule(u'value', [u'object'], None, RuleOptions(False, True, None, None, False)), - 7: Rule(u'value', [u'__FALSE1'], u'false', RuleOptions(False, True, None, None, False)), - 8: Rule(u'array', ['__LSQB', u'value', '__RSQB'], None, RuleOptions(False, False, None, None, False)), - 9: Rule(u'array', ['__LSQB', u'value', '__anon_star_0', '__RSQB'], None, RuleOptions(False, False, None, None, False)), - 10: Rule(u'array', ['__LSQB', '__RSQB'], None, RuleOptions(False, False, None, None, False)), - 11: Rule(u'object', ['__LBRACE', u'pair', '__anon_star_1', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), - 12: Rule(u'object', ['__LBRACE', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), - 13: Rule(u'object', ['__LBRACE', u'pair', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), - 14: Rule(u'pair', [u'string', '__COLON', u'value'], None, RuleOptions(False, False, None, None, False)), - 15: Rule(u'string', [u'ESCAPED_STRING'], None, RuleOptions(False, False, None, None, False)), - 16: Rule('__anon_star_0', ['__anon_star_0', '__COMMA', u'value'], None, None), - 17: Rule('__anon_star_0', ['__COMMA', u'value'], None, None), - 18: Rule('__anon_star_1', ['__COMMA', u'pair'], None, None), - 19: Rule('__anon_star_1', ['__anon_star_1', '__COMMA', u'pair'], None, None), -} -parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree) -class ParseTable: pass -parse_table = ParseTable() -STATES = { - 0: {0: (1, 4), 1: (1, 4), 2: (1, 4), 3: (1, 4)}, - 1: {1: (1, 14), 2: (1, 14)}, - 2: {0: (0, 29), 1: (0, 32), 4: (0, 9)}, - 3: {1: (0, 13), 2: (0, 12)}, - 4: {0: (1, 1), 1: (1, 1), 2: (1, 1), 3: (1, 1)}, - 5: {0: (1, 10), 1: (1, 10), 2: (1, 10), 3: (1, 10)}, - 6: {2: (0, 15), 5: (0, 27), 6: (0, 16), 7: (0, 26)}, - 7: {5: (0, 34), 6: (0, 16), 7: (0, 26)}, - 8: {0: (1, 2), 1: (1, 2), 2: (1, 2), 3: (1, 2)}, - 9: {0: (0, 11), 1: (0, 22)}, - 10: {0: (1, 6), 1: (1, 6), 2: (1, 6), 3: (1, 6)}, - 11: {0: (1, 9), 1: (1, 9), 2: (1, 9), 3: (1, 9)}, - 12: {0: (1, 11), 1: (1, 11), 2: (1, 11), 3: (1, 11)}, - 13: {5: (0, 20), 6: (0, 16), 7: (0, 26)}, - 14: {6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 17), 14: (0, 33), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, - 15: {0: (1, 12), 1: (1, 12), 2: (1, 12), 3: (1, 12)}, - 16: {0: (1, 15), 1: (1, 15), 2: (1, 15), 3: (1, 15), 18: (1, 15)}, - 17: {3: (1, 0)}, - 18: {}, - 19: {0: (1, 3), 1: (1, 3), 2: (1, 3), 3: (1, 3)}, - 20: {1: (1, 19), 2: (1, 19)}, - 21: {0: (1, 5), 1: (1, 5), 2: (1, 5), 3: (1, 5)}, - 22: {6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 30), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, - 23: {6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 1), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, - 24: {0: (0, 5), 6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 2), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, - 25: {0: (1, 13), 1: (1, 13), 2: (1, 13), 3: (1, 13)}, - 26: {18: (0, 23)}, - 27: {1: (0, 7), 2: (0, 25), 19: (0, 3)}, - 28: {0: (1, 17), 1: (1, 17)}, - 29: {0: (1, 8), 1: (1, 8), 2: (1, 8), 3: (1, 8)}, - 30: {0: (1, 16), 1: (1, 16)}, - 31: {0: (1, 7), 1: (1, 7), 2: (1, 7), 3: (1, 7)}, - 32: {6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 28), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, - 33: {3: (0, 18)}, - 34: {1: (1, 18), 2: (1, 18)}, -} -TOKEN_TYPES = ( -{0: '__RSQB', - 1: '__COMMA', - 2: '__RBRACE', - 3: '$END', - 4: '__anon_star_0', - 5: u'pair', - 6: u'ESCAPED_STRING', - 7: u'string', - 8: '__LBRACE', - 9: u'__FALSE1', - 10: '__LSQB', - 11: u'object', - 12: u'SIGNED_NUMBER', - 13: u'value', - 14: 'start', - 15: u'__NULL2', - 16: u'array', - 17: u'__TRUE0', - 18: '__COLON', - 19: '__anon_star_1'} +MEMO = ( +{0: {'priority': 1, 'pattern': {'__type__': 'PatternRE', 'flags': [], 'value': u'(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+)'}, '__type__': 'TerminalDef', 'name': u'SIGNED_NUMBER'}, 1: {'priority': 1, 'pattern': {'__type__': 'PatternRE', 'flags': [], 'value': u'\\".*?(?