Reimplementation of end symbol (Issue #237) #880

Open · wants to merge 3 commits into master
1 change: 1 addition & 0 deletions lark/grammar.py
@@ -1,6 +1,7 @@
 from .utils import Serialize

 ###{standalone
+END = '__$END$__'

 class Symbol(Serialize):
     __slots__ = ('name',)
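A note on the sentinel: END is an ordinary terminal name as far as the rest of the code is concerned, and the `__$END$__` spelling presumably avoids any collision with user terminals, since `$` cannot appear in a grammar identifier. A minimal sketch of how downstream code matches it, assuming this branch is installed:

    from lark import Token
    from lark.grammar import END   # '__$END$__' on this branch

    eof = Token(END, '')    # the synthetic end-of-input token the parser feeds itself
    assert eof.type == END  # downstream checks compare token.type against END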
10 changes: 9 additions & 1 deletion lark/load_grammar.py
@@ -14,7 +14,7 @@
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import ParsingFrontend
 from .common import LexerConf, ParserConf
-from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
+from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, END
 from .utils import classify, suppress, dedup_list, Str
 from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError

@@ -99,6 +99,7 @@
     '_EXTEND': r'%extend',
     '_IMPORT': r'%import',
     'NUMBER': r'[+-]?\d+',
+    '_END': r'\$',
 }

 RULES = {
@@ -135,6 +136,7 @@
               'nonterminal',
               'literal',
               'range',
+              'end',
               'template_usage'],

     'terminal': ['TERMINAL'],
@@ -144,6 +146,7 @@

     'maybe': ['_LBRA expansions _RBRA'],
     'range': ['STRING _DOTDOT STRING'],
+    'end': ['_END'],

     'template_usage': ['RULE _LBRACE _template_args _RBRACE'],
     '_template_args': ['value',
@@ -791,6 +794,9 @@ def terminal(self, name):
     def nonterminal(self, name):
         return name

+    def end(self):
+        return Token('TERMINAL', END)
+

 def _find_used_symbols(tree):
     assert tree.data == 'expansions'
@@ -938,6 +944,8 @@ def __init__(self, global_keep_all_tokens=False, import_paths=None):
         self._definitions = {}
         self._ignore_names = []

+        self._definitions[END] = ((), Tree('expansions', []), self._check_options(END, None))
+
     def _is_term(self, name):
         # Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME`
         # Only the last part is the actual name, and the rest might contain mixed case
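Taken together, these load_grammar.py changes let a grammar author write `$` anywhere a terminal is allowed. A minimal usage sketch, mirroring the new tests at the bottom of this diff (requires this branch; parser='lalr' because the feature currently works for LALR only):

    from lark import Lark
    from lark.exceptions import UnexpectedInput

    parser = Lark("""
        start: a b?
        a: "a" $
        b: "b"
    """, parser='lalr')

    print(parser.parse('a'))   # Tree('start', [Tree('a', [])]) - the $ matched end-of-input
    try:
        parser.parse('ab')     # rule `a` demands end-of-input right after "a", so this fails
    except UnexpectedInput:
        print('rejected, as expected')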
4 changes: 2 additions & 2 deletions lark/parsers/grammar_analysis.py
@@ -2,7 +2,7 @@

 from ..utils import bfs, fzset, classify
 from ..exceptions import GrammarError
-from ..grammar import Rule, Terminal, NonTerminal
+from ..grammar import Rule, Terminal, NonTerminal, END


 class RulePtr(object):
@@ -125,7 +125,7 @@ class GrammarAnalyzer(object):
     def __init__(self, parser_conf, debug=False):
         self.debug = debug

-        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
+        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal(END)])
                       for start in parser_conf.start}

         rules = parser_conf.rules + list(root_rules.values())
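For context, this is the standard LR augmentation: every start symbol gets a synthetic root rule ending in the END terminal, so acceptance amounts to shifting END in the root rule; the change only swaps the hard-coded '$END' string for the shared sentinel. A sketch of the rule being built, assuming this branch's classes:

    from lark.grammar import Rule, Terminal, NonTerminal, END

    # What GrammarAnalyzer builds for a start symbol named 'start':
    #     $root_start : start __$END$__
    root = Rule(NonTerminal('$root_start'), [NonTerminal('start'), Terminal(END)])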
7 changes: 4 additions & 3 deletions lark/parsers/lalr_analysis.py
@@ -12,7 +12,7 @@
 from ..exceptions import GrammarError

 from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
-from ..grammar import Rule
+from ..grammar import Rule, END

 ###{standalone

@@ -177,7 +177,7 @@ def compute_reads_relations(self):
         assert(len(root.kernel) == 1)
         for rp in root.kernel:
             assert(rp.index == 0)
-            self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])
+            self.directly_reads[(root, rp.next)] = set([ Terminal(END) ])

         for state in self.lr0_states:
             seen = set()
@@ -261,11 +261,12 @@ def compute_lalr1_states(self):
                     rules = [best[1]]
                 else:
                     reduce_reduce.append((state, la, rules))
-            if la in actions:
+            if la in actions and la.name != END:
                 if self.debug:
                     logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
                     logger.warning(' * %s', list(rules)[0])
             else:
+                # No shift found for la, or it's End Of Input, in which case Reduce should come before Shift.
                 actions[la] = (Reduce, list(rules)[0])
         m[state] = { k.name: v for k, v in actions.items() }
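This resolution tweak is the crux of the LALR support: once `$` may appear inside user rules, a state can legitimately both shift END and reduce on lookahead END. Preferring Reduce is safe because the parser (see lalr_parser.py below) keeps re-feeding the same END token, so the shift still happens after all pending reductions are done. A hypothetical restatement of the policy, not the real data structures:

    def resolve(la, actions, rules):
        if la in actions and la.name != END:
            pass                              # ordinary terminal: keep the Shift (lark's default)
        else:
            actions[la] = (Reduce, rules[0])  # END: Reduce first, so inner rules finish
                                              # before the root rule finally shifts END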
11 changes: 6 additions & 5 deletions lark/parsers/lalr_interactive_parser.py
@@ -4,6 +4,7 @@

 from .. import Token
 from ..exceptions import UnexpectedToken
+from ..grammar import END


 class InteractiveParser(object):
@@ -21,18 +22,18 @@ def feed_token(self, token):

         Note that ``token`` has to be an instance of ``Token``.
         """
-        return self.parser_state.feed_token(token, token.type == '$END')
+        return self.parser_state.feed_token(token, token.type == END)

     def exhaust_lexer(self):
         """Try to feed the rest of the lexer state into the interactive parser.

-        Note that this modifies the instance in place and does not feed an '$END' Token"""
+        Note that this modifies the instance in place and does not feed an END Token"""
         for token in self.lexer_state.lex(self.parser_state):
             self.parser_state.feed_token(token)

     def feed_eof(self, last_token=None):
-        """Feed a '$END' Token. Borrows from 'last_token' if given."""
-        eof = Token.new_borrow_pos('$END', '', last_token) if last_token is not None else Token('$END', '', 0, 1, 1)
+        """Feed a END Token. Borrows from 'last_token' if given."""
+        eof = Token.new_borrow_pos(END, '', last_token) if last_token is not None else Token(END, '', 0, 1, 1)
         return self.feed_token(eof)

Review thread on feed_eof:

    Member: Sorry, overlooked this in the review: Shouldn't we feed multiple ENDs here as well?
    Member Author: Hmm I think you're right.

@@ -116,7 +117,7 @@ def feed_token(self, token):
     def exhaust_lexer(self):
         """Try to feed the rest of the lexer state into the parser.

-        Note that this returns a new ImmutableInteractiveParser and does not feed an '$END' Token"""
+        Note that this returns a new ImmutableInteractiveParser and does not feed an END Token"""
         cursor = self.as_mutable()
         cursor.exhaust_lexer()
         return cursor.as_immutable()
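A short usage sketch of the interactive API touched here, assuming this branch (per the review thread above, whether a single END suffices is still an open question):

    from lark import Lark

    parser = Lark('start: "a"+', parser='lalr')
    ip = parser.parse_interactive('aaa')
    ip.exhaust_lexer()    # feeds 'a', 'a', 'a'; deliberately does not feed END
    tree = ip.feed_eof()  # feeds the synthetic END token and completes the parse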
13 changes: 9 additions & 4 deletions lark/parsers/lalr_parser.py
@@ -10,6 +10,7 @@
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_interactive_parser import InteractiveParser
 from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
+from ..grammar import END

 ###{standalone

@@ -60,7 +61,7 @@ def parse(self, lexer, start, on_error=None):
                     return e.interactive_parser.resume_parse()
                 except UnexpectedToken as e2:
                     if (isinstance(e, UnexpectedToken)
-                        and e.token.type == e2.token.type == '$END'
+                        and e.token.type == e2.token.type == END
                         and e.interactive_parser == e2.interactive_parser):
                         # Prevent infinite loop
                         raise e2
@@ -132,10 +133,14 @@ def feed_token(self, token, is_end=False):

             if action is Shift:
                 # shift once and return
-                assert not is_end
+                # assert not is_end
                 state_stack.append(arg)
                 value_stack.append(token if token.type not in callbacks else callbacks[token.type](token))
-                return
+                if not is_end:
+                    return
+
+                # If it's the end, keep feeding the same token until we get to a reduce
+                assert token.type == END
             else:
                 # reduce+shift as many times as necessary
                 rule = arg

Review thread on the commented-out assert:

    Contributor: Wonder what would happen if you undid this
    Member Author: It's just an assert. It doesn't matter.

@@ -178,7 +183,7 @@ def parse_from_state(self, state):
             for token in state.lexer.lex(state):
                 state.feed_token(token)

-            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+            token = Token.new_borrow_pos(END, '', token) if token else Token(END, '', 0, 1, 1)
             return state.feed_token(token, True)
         except UnexpectedInput as e:
             try:
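To make the new loop concrete, here is a hypothetical trace for the grammar `a: "a" $` (with `start: a b?`) on input `a`; state numbers omitted, token names illustrative:

    # feed_token(Token('A', 'a'))   -> Shift; not is_end, so return
    # lexer exhausted; parse_from_state feeds Token(END, '') with is_end=True:
    # feed_token(eof, True)         -> Shift END (the `$` inside rule `a`), keep looping
    #                               -> Reduce a : "a" $
    #                               -> Reduce start : a
    #                               -> Shift END again for $root_start : start END, accept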
37 changes: 37 additions & 0 deletions tests/test_parser.py
@@ -2467,6 +2467,43 @@ def ignore_errors(e):
         s = "[0 1, 2,@, 3,,, 4, 5 6 ]$"
         tree = g.parse(s, on_error=ignore_errors)

+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol(self):
+        grammar = """
+        start: a b?
+        a: "a" $
+        b: "b"
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('a'), Tree('start', [Tree('a', [])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol2(self):
+        grammar = """
+        start: (a|b)+
+        a: "a" ("x"|$)
+        b: "b"
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []),Tree('a', [])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol3(self):
+        grammar = """
+        start: (a|b)+
+        a: "a" (e|"x")
+        b: "b"
+        e: $
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('axa'), Tree('start', [Tree('a', []),Tree('a', [Tree('e', [])])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
+
 _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
 _TestParser.__name__ = _NAME