Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Factor out _check_regex_collisions. Limit the total number of collisions that will be reported, and limit the maximum time spent searching for examples.
  • Loading branch information
MegaIng committed Mar 11, 2023
1 parent 6c766f9 commit 8fba797
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 23 deletions.
54 changes: 33 additions & 21 deletions lark/lexer.py
Expand Up @@ -457,8 +457,39 @@ def make_lexer_state(self, text):
return LexerState(text)


class BasicLexer(Lexer):
def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8):
    """Detect terminals of equal priority whose regexps collide, and report them.

    Terminals are grouped by ``priority`` and each group is handed to
    ``comparator.check``. Every colliding pair it yields is marked on the
    comparator and then reported: as a ``LexError`` in strict mode, as a
    logged warning otherwise.

    Parameters:
        terminal_to_regexp: Mapping from terminal definition to its regexp
            source. Used to build a comparator when none is supplied, and
            to group the terminals by priority.
        comparator: An existing comparator to reuse, or a falsy value to
            have one built from ``terminal_to_regexp``. Pairs already marked
            on it are skipped and count towards the reporting limit.
        strict_mode: If true, the first collision raises instead of warning.
        max_collisions_to_show: Stop checking once the comparator has at
            least this many marked pairs.

    Raises:
        LexError: In strict mode, on the first collision found.
    """
    if not comparator:
        comparator = interegular.Comparator.from_regexes(terminal_to_regexp)

    # When in strict mode, we only ever try to provide one example (the first
    # collision raises), so taking a long time for that should be fine.
    # NOTE(review): the unit of max_time is defined by
    # comparator.get_example_overlap (presumably seconds) -- confirm.
    max_time = 2 if strict_mode else 0.2

    # We don't want to show too many collisions. The comparator may already
    # carry marked pairs, so the limit can be reached before we check anything.
    if comparator.count_marked_pairs() >= max_collisions_to_show:
        return

    for group in classify(terminal_to_regexp, lambda t: t.priority).values():
        for a, b in comparator.check(group, skip_marked=True):
            assert a.priority == b.priority
            # Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision
            comparator.mark(a, b)

            # Notify the user
            message = f"Collision between Terminals {a.name} and {b.name}. "
            try:
                example = comparator.get_example_overlap(a, b, max_time).format_multiline()
            except ValueError:
                # Couldn't find an example within the max_time budget.
                example = "No example could be found fast enough. However, the collision does still exists"
            if strict_mode:
                raise LexError(f"{message}\n{example}")
            logger.warning("%s The lexer will choose between them arbitrarily.\n%s", message, example)

            # Report the actual number of marked pairs rather than a
            # hard-coded figure, so the message stays correct when the
            # caller passes a non-default max_collisions_to_show.
            marked_pairs = comparator.count_marked_pairs()
            if marked_pairs >= max_collisions_to_show:
                logger.warning("Found %s regex collisions, will not check for more.", marked_pairs)
                return


class BasicLexer(Lexer):
terminals: Collection[TerminalDef]
ignore_types: FrozenSet[str]
newline_types: FrozenSet[str]
Expand Down Expand Up @@ -491,26 +522,7 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None:
raise LexError("Ignore terminals are not defined: %s" % (set(conf.ignore) - {t.name for t in terminals}))

if has_interegular:
if not comparator:
comparator = interegular.Comparator.from_regexes(terminal_to_regexp)
for group in classify(terminal_to_regexp, lambda t: t.priority).values():
for a, b in comparator.check(group, skip_marked=True):
assert a.priority == b.priority
# Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision
comparator.mark(a, b)

# Notify the user
message = f"Collision between Terminals {a.name} and {b.name}. "
try:
example = comparator.get_example_overlap(a, b, 10000).format_multiline()
except ValueError:
# Couldn't find an example within 10000 steps.
# This value was chosen since it should still guarantee that get_example_overlap
# terminates within < 1s, but it should be able to provide examples for almost everything
example = "No example could be found fast enough. However, the collision does still exists"
if conf.strict:
raise LexError(f"{message}\n{example}")
logger.warning("%s The lexer will choose between them arbitrarily.\n%s", message, example)
_check_regex_collisions(terminal_to_regexp, comparator, conf.strict)
elif conf.strict:
raise LexError("interegular must be installed for strict mode. Use `pip install 'lark[interegular]'`.")

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -15,7 +15,7 @@
"regex": ["regex"],
"nearley": ["js2py"],
"atomic_cache": ["atomicwrites"],
"interegular": ["interegular>=0.2.7,<0.3.0"],
"interegular": ["interegular>=0.3.1,<0.4.0"],
},

package_data = {'': ['*.md', '*.lark'], 'lark': ['py.typed']},
Expand Down
2 changes: 1 addition & 1 deletion test-requirements.txt
@@ -1,3 +1,3 @@
interegular>=0.2.7,<0.3.0
interegular>=0.3.1,<0.4.0
Js2Py==0.68
regex

0 comments on commit 8fba797

Please sign in to comment.