From 8fba797a9a67157615f604d8161afba04af93c81 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Sat, 11 Mar 2023 16:52:18 +0100 Subject: [PATCH] Factor out _check_regex_collisions. Limit total collision count that will be outputted, and limit max time that is being spent searching for examples. --- lark/lexer.py | 54 ++++++++++++++++++++++++++----------------- setup.py | 2 +- test-requirements.txt | 2 +- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 7c1dbb59..72bfb01a 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -457,8 +457,39 @@ def make_lexer_state(self, text): return LexerState(text) -class BasicLexer(Lexer): +def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparator, strict_mode, max_collisions_to_show=8): + if not comparator: + comparator = interegular.Comparator.from_regexes(terminal_to_regexp) + + # When in strict mode, we only ever try to provide one example, so taking + # a long time for that should be fine + max_time = 2 if strict_mode else 0.2 + + # We don't want to show too many collisions. + if comparator.count_marked_pairs() >= max_collisions_to_show: + return + for group in classify(terminal_to_regexp, lambda t: t.priority).values(): + for a, b in comparator.check(group, skip_marked=True): + assert a.priority == b.priority + # Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision + comparator.mark(a, b) + + # Notify the user + message = f"Collision between Terminals {a.name} and {b.name}. " + try: + example = comparator.get_example_overlap(a, b, max_time).format_multiline() + except ValueError: + # Couldn't find an example within the max_time budget. + example = "No example could be found fast enough. 
However, the collision does still exists" + if strict_mode: + raise LexError(f"{message}\n{example}") + logger.warning("%s The lexer will choose between them arbitrarily.\n%s", message, example) + if comparator.count_marked_pairs() >= max_collisions_to_show: + logger.warning("Found %d regex collisions, will not check for more.", max_collisions_to_show) + return + +class BasicLexer(Lexer): terminals: Collection[TerminalDef] ignore_types: FrozenSet[str] newline_types: FrozenSet[str] @@ -491,26 +522,7 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None: raise LexError("Ignore terminals are not defined: %s" % (set(conf.ignore) - {t.name for t in terminals})) if has_interegular: - if not comparator: - comparator = interegular.Comparator.from_regexes(terminal_to_regexp) - for group in classify(terminal_to_regexp, lambda t: t.priority).values(): - for a, b in comparator.check(group, skip_marked=True): - assert a.priority == b.priority - # Mark this pair to not repeat warnings when multiple different BasicLexers see the same collision - comparator.mark(a, b) - - # Notify the user - message = f"Collision between Terminals {a.name} and {b.name}. " - try: - example = comparator.get_example_overlap(a, b, 10000).format_multiline() - except ValueError: - # Couldn't find an example within 10000 steps. - # This value was chosen since it should still guarantee that get_example_overlap - # terminates within < 1s, but it should be able to provide examples for almost everything - example = "No example could be found fast enough. However, the collision does still exists" - if conf.strict: - raise LexError(f"{message}\n{example}") - logger.warning("%s The lexer will choose between them arbitrarily.\n%s", message, example) + _check_regex_collisions(terminal_to_regexp, comparator, conf.strict) elif conf.strict: raise LexError("interegular must be installed for strict mode. 
Use `pip install 'lark[interegular]'`.") diff --git a/setup.py b/setup.py index 9e0ff4e8..1d7f9296 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ "regex": ["regex"], "nearley": ["js2py"], "atomic_cache": ["atomicwrites"], - "interegular": ["interegular>=0.2.7,<0.3.0"], + "interegular": ["interegular>=0.3.1,<0.4.0"], }, package_data = {'': ['*.md', '*.lark'], 'lark': ['py.typed']}, diff --git a/test-requirements.txt b/test-requirements.txt index 97b640ac..e807354a 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -1,3 +1,3 @@ -interegular>=0.2.7,<0.3.0 +interegular>=0.3.1,<0.4.0 Js2Py==0.68 regex