Reimplementation of end symbol (Issue #237) #880

Open · wants to merge 3 commits into master
1 change: 1 addition & 0 deletions lark/grammar.py
@@ -1,6 +1,7 @@
 from .utils import Serialize

 ###{standalone
+END = '__$END$__'

 class Symbol(Serialize):
     __slots__ = ('name',)
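A note on the sentinel: END is an ordinary terminal name as far as the rest of the code is concerned, and the `__$END$__` spelling presumably avoids any collision with user terminals, since `$` cannot appear in a grammar identifier. A minimal sketch of how downstream code matches it, assuming this branch is installed:

    from lark import Token
    from lark.grammar import END   # '__$END$__' on this branch

    eof = Token(END, '')    # the synthetic end-of-input token the parser feeds itself
    assert eof.type == END  # downstream checks compare token.type against END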
10 changes: 9 additions & 1 deletion lark/load_grammar.py
@@ -14,7 +14,7 @@
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import ParsingFrontend
 from .common import LexerConf, ParserConf
-from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
+from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, END
 from .utils import classify, suppress, dedup_list, Str
 from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError

@@ -99,6 +99,7 @@
     '_EXTEND': r'%extend',
     '_IMPORT': r'%import',
     'NUMBER': r'[+-]?\d+',
+    '_END': r'\$',
 }

 RULES = {
@@ -135,6 +136,7 @@
               'nonterminal',
               'literal',
               'range',
+              'end',
               'template_usage'],

     'terminal': ['TERMINAL'],
@@ -144,6 +146,7 @@

     'maybe': ['_LBRA expansions _RBRA'],
     'range': ['STRING _DOTDOT STRING'],
+    'end': ['_END'],

     'template_usage': ['RULE _LBRACE _template_args _RBRACE'],
     '_template_args': ['value',
@@ -791,6 +794,9 @@ def terminal(self, name):
     def nonterminal(self, name):
         return name

+    def end(self):
+        return Token('TERMINAL', END)
+

 def _find_used_symbols(tree):
     assert tree.data == 'expansions'
@@ -938,6 +944,8 @@ def __init__(self, global_keep_all_tokens=False, import_paths=None):
         self._definitions = {}
         self._ignore_names = []

+        self._definitions[END] = ((), Tree('expansions', []), self._check_options(END, None))
+
     def _is_term(self, name):
         # Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME`
         # Only the last part is the actual name, and the rest might contain mixed case
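Taken together, these load_grammar.py changes let a grammar author write `$` anywhere a terminal is allowed. A minimal usage sketch, mirroring the new tests at the bottom of this diff (requires this branch; parser='lalr' because the feature currently works for LALR only):

    from lark import Lark
    from lark.exceptions import UnexpectedInput

    parser = Lark("""
        start: a b?
        a: "a" $
        b: "b"
    """, parser='lalr')

    print(parser.parse('a'))   # Tree('start', [Tree('a', [])]) - the $ matched end-of-input
    try:
        parser.parse('ab')     # rule `a` demands end-of-input right after "a", so this fails
    except UnexpectedInput:
        print('rejected, as expected')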
4 changes: 2 additions & 2 deletions lark/parsers/grammar_analysis.py
@@ -2,7 +2,7 @@

 from ..utils import bfs, fzset, classify
 from ..exceptions import GrammarError
-from ..grammar import Rule, Terminal, NonTerminal
+from ..grammar import Rule, Terminal, NonTerminal, END


 class RulePtr(object):
@@ -125,7 +125,7 @@ class GrammarAnalyzer(object):
     def __init__(self, parser_conf, debug=False):
         self.debug = debug

-        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
+        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal(END)])
                       for start in parser_conf.start}

         rules = parser_conf.rules + list(root_rules.values())
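For context, this is the standard LR augmentation: every start symbol gets a synthetic root rule ending in the END terminal, so acceptance amounts to shifting END in the root rule; the change only swaps the hard-coded '$END' string for the shared sentinel. A sketch of the rule being built, assuming this branch's classes:

    from lark.grammar import Rule, Terminal, NonTerminal, END

    # What GrammarAnalyzer builds for a start symbol named 'start':
    #     $root_start : start __$END$__
    root = Rule(NonTerminal('$root_start'), [NonTerminal('start'), Terminal(END)])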
7 changes: 4 additions & 3 deletions lark/parsers/lalr_analysis.py
@@ -12,7 +12,7 @@
 from ..exceptions import GrammarError

 from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
-from ..grammar import Rule
+from ..grammar import Rule, END

 ###{standalone

@@ -177,7 +177,7 @@ def compute_reads_relations(self):
         assert(len(root.kernel) == 1)
         for rp in root.kernel:
             assert(rp.index == 0)
-            self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])
+            self.directly_reads[(root, rp.next)] = set([ Terminal(END) ])

         for state in self.lr0_states:
             seen = set()
@@ -261,11 +261,12 @@ def compute_lalr1_states(self):
                     rules = [best[1]]
                 else:
                     reduce_reduce.append((state, la, rules))
-            if la in actions:
+            if la in actions and la.name != END:
                 if self.debug:
                     logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
                     logger.warning(' * %s', list(rules)[0])
             else:
+                # No shift found for la, or it's End Of Input, in which case Reduce should come before Shift.
                 actions[la] = (Reduce, list(rules)[0])
         m[state] = { k.name: v for k, v in actions.items() }
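This resolution tweak is the crux of the LALR support: once `$` may appear inside user rules, a state can legitimately both shift END and reduce on lookahead END. Preferring Reduce is safe because the parser (see lalr_parser.py below) keeps re-feeding the same END token, so the shift still happens after all pending reductions are done. A hypothetical restatement of the policy, not the real data structures:

    def resolve(la, actions, rules):
        if la in actions and la.name != END:
            pass                              # ordinary terminal: keep the Shift (lark's default)
        else:
            actions[la] = (Reduce, rules[0])  # END: Reduce first, so inner rules finish
                                              # before the root rule finally shifts END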
11 changes: 6 additions & 5 deletions lark/parsers/lalr_interactive_parser.py
@@ -4,6 +4,7 @@

 from .. import Token
 from ..exceptions import UnexpectedToken
+from ..grammar import END


 class InteractiveParser(object):
@@ -21,18 +22,18 @@ def feed_token(self, token):

         Note that ``token`` has to be an instance of ``Token``.
         """
-        return self.parser_state.feed_token(token, token.type == '$END')
+        return self.parser_state.feed_token(token, token.type == END)

     def exhaust_lexer(self):
         """Try to feed the rest of the lexer state into the interactive parser.

-        Note that this modifies the instance in place and does not feed an '$END' Token"""
+        Note that this modifies the instance in place and does not feed an END Token"""
         for token in self.lexer_state.lex(self.parser_state):
             self.parser_state.feed_token(token)

     def feed_eof(self, last_token=None):
-        """Feed a '$END' Token. Borrows from 'last_token' if given."""
-        eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else Token('$END', '', 0, 1, 1)
+        """Feed a END Token. Borrows from 'last_token' if given."""
+        eof = Token.new_borrow_pos(END, '', last_token) if last_token is not None else Token(END, '', 0, 1, 1)
         return self.feed_token(eof)

Review thread on feed_eof:

    Member: Sorry, overlooked this in the review: Shouldn't we feed multiple ENDs here as well?
    Member Author: Hmm I think you're right.

@@ -116,7 +117,7 @@ def feed_token(self, token):
     def exhaust_lexer(self):
         """Try to feed the rest of the lexer state into the parser.

-        Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token"""
+        Note that this returns a new ImmutableInteractiveParser and does not feed an END Token"""
         cursor = self.as_mutable()
         cursor.exhaust_lexer()
         return cursor.as_immutable()
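A short usage sketch of the interactive API touched here, assuming this branch (per the review thread above, whether a single END suffices is still an open question):

    from lark import Lark

    parser = Lark('start: "a"+', parser='lalr')
    ip = parser.parse_interactive('aaa')
    ip.exhaust_lexer()    # feeds 'a', 'a', 'a'; deliberately does not feed END
    tree = ip.feed_eof()  # feeds the synthetic END token and completes the parse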
13 changes: 9 additions & 4 deletions lark/parsers/lalr_parser.py
@@ -10,6 +10,7 @@
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_interactive_parser import InteractiveParser
 from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
+from ..grammar import END

 ###{standalone

@@ -60,7 +61,7 @@ def parse(self, lexer, start, on_error=None):
                     return e.interactive_parser.resume_parse()
                 except UnexpectedToken as e2:
                     if (isinstance(e, UnexpectedToken)
-                        and e.token.type == e2.token.type == '$END'
+                        and e.token.type == e2.token.type == END
                         and e.interactive_parser == e2.interactive_parser):
                         # Prevent infinite loop
                         raise e2
@@ -132,10 +133,14 @@ def feed_token(self, token, is_end=False):

             if action is Shift:
                 # shift once and return
-                assert not is_end
+                # assert not is_end
                 state_stack.append(arg)
                 value_stack.append(token if token.type not in callbacks else callbacks[token.type](token))
-                return
+                if not is_end:
+                    return
+
+                # If it's the end, keep feeding the same token until we get to a reduce
+                assert token.type == END
             else:
                 # reduce+shift as many times as necessary
                 rule = arg

Review thread on the commented-out assert:

    Contributor: Wonder what would happen if you undid this
    Member Author: It's just an assert. It doesn't matter.

@@ -178,7 +183,7 @@ def parse_from_state(self, state):
             for token in state.lexer.lex(state):
                 state.feed_token(token)

-            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+            token = Token.new_borrow_pos(END, '', token) if token else Token(END, '', 0, 1, 1)
             return state.feed_token(token, True)
         except UnexpectedInput as e:
             try:
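To make the new loop concrete, here is a hypothetical trace for the grammar `a: "a" $` (with `start: a b?`) on input `a`; state numbers omitted, token names illustrative:

    # feed_token(Token('A', 'a'))   -> Shift; not is_end, so return
    # lexer exhausted; parse_from_state feeds Token(END, '') with is_end=True:
    # feed_token(eof, True)         -> Shift END (the `$` inside rule `a`), keep looping
    #                               -> Reduce a : "a" $
    #                               -> Reduce start : a
    #                               -> Shift END again for $root_start : start END, accept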
37 changes: 37 additions & 0 deletions tests/test_parser.py
@@ -2467,6 +2467,43 @@ def ignore_errors(e):
         s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
         tree = g.parse(s, on_error=ignore_errors)

+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol(self):
+        grammar = """
+        start: a b?
+        a: "a" $
+        b: "b"
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('a'), Tree('start', [Tree('a', [])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol2(self):
+        grammar = """
+        start: (a|b)+
+        a: "a" ("x"|$)
+        b: "b"
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []),Tree('a', [])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol3(self):
+        grammar = """
+        start: (a|b)+
+        a: "a" (e|"x")
+        b: "b"
+        e: $
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []),Tree('a', [Tree('e', [])])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
+
 _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
 _TestParser.__name__ = _NAME