diff --git a/lark/grammar.py b/lark/grammar.py
index 405086a2..e6502ec4 100644
--- a/lark/grammar.py
+++ b/lark/grammar.py
@@ -1,6 +1,7 @@
 from .utils import Serialize

 ###{standalone
+END = '__$END$__'

 class Symbol(Serialize):
     __slots__ = ('name',)
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 4e8d2987..47fa82f1 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -14,7 +14,7 @@
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import ParsingFrontend
 from .common import LexerConf, ParserConf
-from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
+from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, END
 from .utils import classify, suppress, dedup_list, Str
 from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError
@@ -99,6 +99,7 @@
     '_EXTEND': r'%extend',
     '_IMPORT': r'%import',
    'NUMBER': r'[+-]?\d+',
+    '_END': r'\$',
 }

 RULES = {
@@ -135,6 +136,7 @@
               'nonterminal',
               'literal',
               'range',
+              'end',
               'template_usage'],

     'terminal': ['TERMINAL'],
@@ -144,6 +146,7 @@
     'maybe': ['_LBRA expansions _RBRA'],

     'range': ['STRING _DOTDOT STRING'],
+    'end': ['_END'],

     'template_usage': ['RULE _LBRACE _template_args _RBRACE'],
     '_template_args': ['value',
@@ -791,6 +794,9 @@ def terminal(self, name):
     def nonterminal(self, name):
         return name

+    def end(self):
+        return Token('TERMINAL', END)
+

 def _find_used_symbols(tree):
     assert tree.data == 'expansions'
@@ -938,6 +944,8 @@ def __init__(self, global_keep_all_tokens=False, import_paths=None):
         self._definitions = {}
         self._ignore_names = []

+        self._definitions[END] = ((), Tree('expansions', []), self._check_options(END, None))
+
     def _is_term(self, name):
         # Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME`
         # Only the last part is the actual name, and the rest might contain mixed case
diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py
index 737cb02a..3ba22cac 100644
--- a/lark/parsers/grammar_analysis.py
+++ b/lark/parsers/grammar_analysis.py
@@ -2,7 +2,7 @@
 from ..utils import bfs, fzset, classify
 from ..exceptions import GrammarError
-from ..grammar import Rule, Terminal, NonTerminal
+from ..grammar import Rule, Terminal, NonTerminal, END


 class RulePtr(object):
@@ -125,7 +125,7 @@ class GrammarAnalyzer(object):
     def __init__(self, parser_conf, debug=False):
         self.debug = debug

-        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
+        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal(END)])
                       for start in parser_conf.start}

         rules = parser_conf.rules + list(root_rules.values())
diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index f6a993b9..d52c7dc6 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -12,7 +12,7 @@
 from ..exceptions import GrammarError

 from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
-from ..grammar import Rule
+from ..grammar import Rule, END

 ###{standalone
@@ -177,7 +177,7 @@ def compute_reads_relations(self):
             assert(len(root.kernel) == 1)
             for rp in root.kernel:
                 assert(rp.index == 0)
-                self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])
+                self.directly_reads[(root, rp.next)] = set([ Terminal(END) ])

         for state in self.lr0_states:
             seen = set()
@@ -261,11 +261,12 @@ def compute_lalr1_states(self):
                         rules = [best[1]]
                     else:
                         reduce_reduce.append((state, la, rules))
-                if la in actions:
+                if la in actions and la.name != END:
                     if self.debug:
                         logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
                         logger.warning(' * %s', list(rules)[0])
                 else:
+                    # No shift action found for la, or la is the end-of-input terminal, in which case Reduce takes precedence over Shift.
                     actions[la] = (Reduce, list(rules)[0])

             m[state] = { k.name: v for k, v in actions.items() }
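
The `la.name != END` guard above is the whole behavioral change in the table builder: on a shift/reduce conflict, Shift still wins for ordinary terminals, but when the lookahead is the explicit end-of-input terminal, the Reduce action is installed instead, so that pending rules can finish before the parser accepts. A minimal runnable sketch of the decision rule, with hypothetical names (this is not lark's internal API):

    END = '__$END$__'

    def resolve_conflict(la_name, shift_action, reduce_action):
        # Shift/reduce conflict: shift beats reduce, except when the
        # lookahead is the end-of-input terminal, where reduce must win.
        if shift_action is not None and la_name != END:
            return shift_action
        return reduce_action

    assert resolve_conflict('COMMA', 'shift', 'reduce') == 'shift'
    assert resolve_conflict(END, 'shift', 'reduce') == 'reduce'
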
diff --git a/lark/parsers/lalr_interactive_parser.py b/lark/parsers/lalr_interactive_parser.py
index ce596b5d..6c6a58df 100644
--- a/lark/parsers/lalr_interactive_parser.py
+++ b/lark/parsers/lalr_interactive_parser.py
@@ -4,6 +4,7 @@
 from .. import Token
 from ..exceptions import UnexpectedToken
+from ..grammar import END


 class InteractiveParser(object):
@@ -21,18 +22,18 @@ def feed_token(self, token):
         Note that ``token`` has to be an instance of ``Token``.
         """
-        return self.parser_state.feed_token(token, token.type == '$END')
+        return self.parser_state.feed_token(token, token.type == END)

     def exhaust_lexer(self):
         """Try to feed the rest of the lexer state into the interactive parser.

-        Note that this modifies the instance in place and does not feed an '$END' Token"""
+        Note that this modifies the instance in place and does not feed an END Token"""
         for token in self.lexer_state.lex(self.parser_state):
             self.parser_state.feed_token(token)

     def feed_eof(self, last_token=None):
-        """Feed a '$END' Token. Borrows from 'last_token' if given."""
-        eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else Token('$END', '', 0, 1, 1)
+        """Feed an END Token. Borrows from 'last_token' if given."""
+        eof = Token.new_borrow_pos(END, '', last_token) if last_token is not None else Token(END, '', 0, 1, 1)
         return self.feed_token(eof)
@@ -116,7 +117,7 @@ def feed_token(self, token):
     def exhaust_lexer(self):
         """Try to feed the rest of the lexer state into the parser.

-        Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token"""
+        Note that this returns a new ImmutableInteractiveParser and does not feed an END Token"""
         cursor = self.as_mutable()
         cursor.exhaust_lexer()
         return cursor.as_immutable()
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index fe40791e..b5c3e946 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -10,6 +10,7 @@
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_interactive_parser import InteractiveParser
 from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
+from ..grammar import END

 ###{standalone
@@ -60,7 +61,7 @@ def parse(self, lexer, start, on_error=None):
                     return e.interactive_parser.resume_parse()
                 except UnexpectedToken as e2:
                     if (isinstance(e, UnexpectedToken)
-                        and e.token.type == e2.token.type == '$END'
+                        and e.token.type == e2.token.type == END
                         and e.interactive_parser == e2.interactive_parser):
                         # Prevent infinite loop
                         raise e2
@@ -132,10 +133,14 @@ def feed_token(self, token, is_end=False):
             if action is Shift:
                 # shift once and return
-                assert not is_end
+                # is_end is allowed here now: END itself may be shifted when the grammar uses '$'
                 state_stack.append(arg)
                 value_stack.append(token if token.type not in callbacks else callbacks[token.type](token))
-                return
+                if not is_end:
+                    return
+
+                # If it's the end, keep feeding the same token until we get to a reduce
+                assert token.type == END
             else:
                 # reduce+shift as many times as necessary
                 rule = arg
@@ -178,7 +183,7 @@ def parse_from_state(self, state):
             for token in state.lexer.lex(state):
                 state.feed_token(token)

-            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+            token = Token.new_borrow_pos(END, '', token) if token else Token(END, '', 0, 1, 1)
             return state.feed_token(token, True)
         except UnexpectedInput as e:
             try:
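
The `feed_token` hunk above is the runtime half of the feature: since a grammar may now mention `$`, END becomes an ordinary shiftable terminal, so after shifting it the parser keeps reducing on the same END lookahead until the root rule is reduced, instead of returning after the first shift. A condensed, runnable sketch of that loop over a toy action table (the table layout is hypothetical, not lark's serialized format):

    END = '__$END$__'

    # Toy LALR table for:  $root -> a END ;  a -> 'A'
    # Reduce entries carry (rule_name, symbols_to_pop, goto_nonterminal).
    TABLE = {
        0: {'A': ('shift', 1)},
        1: {END: ('reduce', ('a', 1, 'a'))},
        2: {END: ('shift', 3)},
        3: {END: ('reduce', ('$root', 2, None))},
    }
    GOTO = {0: {'a': 2}}

    def feed_token(stack, ttype, is_end=False):
        while True:
            action, arg = TABLE[stack[-1]][ttype]
            if action == 'shift':
                stack.append(arg)
                if not is_end:
                    return  # ordinary token: shift once and return
                # END itself was shifted; keep reducing on the same lookahead
            else:
                rule, size, goto = arg
                del stack[-size:]
                if goto is None:
                    return rule  # reduced the root rule: parse is complete
                stack.append(GOTO[stack[-1]][goto])

    stack = [0]
    feed_token(stack, 'A')
    assert feed_token(stack, END, is_end=True) == '$root'
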
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 18b70fca..685d155b 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -2467,6 +2467,43 @@ def ignore_errors(e):
             s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
             tree = g.parse(s, on_error=ignore_errors)

+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol(self):
+        grammar = """
+        start: a b?
+        a: "a" $
+        b: "b"
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('a'), Tree('start', [Tree('a', [])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol2(self):
+        grammar = """
+        start: (a|b)+
+        a: "a" ("x"|$)
+        b: "b"
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []), Tree('a', [])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol3(self):
+        grammar = """
+        start: (a|b)+
+        a: "a" (e|"x")
+        b: "b"
+        e: $
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []), Tree('a', [Tree('e', [])])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
 _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
 _TestParser.__name__ = _NAME
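
Taken together, the tests pin down the user-facing behavior: `$` inside a rule matches only at end of input, so in `test_end_symbol` the trailing `b?` becomes unreachable after `a`. Something like the following exercises it outside the test harness, assuming a lark build with this patch applied (stock releases reject `$` in a grammar):

    from lark import Lark, UnexpectedInput

    # Same grammar as test_end_symbol: the $ after "a" pins rule `a` to
    # end of input, so 'a' parses but 'ab' is rejected despite the `b?`.
    parser = Lark("""
        start: a b?
        a: "a" $
        b: "b"
    """, parser='lalr')

    print(parser.parse('a').pretty())
    try:
        parser.parse('ab')
    except UnexpectedInput:
        print("'ab' rejected: nothing may follow the 'a'")
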