Skip to content

Commit

Permalink
refs #37 Add greedy variants of +, *, and ?
Browse files Browse the repository at this point in the history
  • Loading branch information
igordejanovic committed Jul 14, 2021
1 parent 5f94f1f commit 517d33e
Show file tree
Hide file tree
Showing 3 changed files with 201 additions and 16 deletions.
57 changes: 57 additions & 0 deletions docs/grammar_language.md
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,63 @@ Same as `one or more` this operator may use separator modifiers.
matched `a` and empty list if no match is found.


### Greedy repetitions

The `*`, `+`, and `?` operators have greedy counterparts. To make a
repetition operator greedy, append `!` (i.e. `*!`, `+!`, and `?!`). These versions
will consume as much input as possible before proceeding. You can think of greedy
repetitions as a way to resolve a class of ambiguities that arises in
a sequence of rules where an earlier constituent can match input of varying
length, leaving the rest for the next rule to consume.

Consider this example:

S: "a"* "a"*;

It is easy to see that this grammar is ambiguous, as for the input:

a a

We have 3 solutions:

1:S[0->3]
a_0[0->1]
a_1[0->1]
a[0->1, "a"]
a_0[2->3]
a_1[2->3]
a[2->3, "a"]
2:S[0->3]
a_0[0->0]
a_0[0->3]
a_1[0->3]
a_1[0->1]
a[0->1, "a"]
a[2->3, "a"]
3:S[0->3]
a_0[0->3]
a_1[0->3]
a_1[0->1]
a[0->1, "a"]
a[2->3, "a"]
a_0[3->3]

If we apply greedy zero-or-more to the first element of the sequence:

S: "a"*! "a"*;

We have only one solution where all `a` tokens are consumed by the first part of
the rule:

S[0->3]
a_0[0->3]
a_1[0->3]
a_1[0->1]
a[0->1, "a"]
a[2->3, "a"]
a_0[3->3]


### Parenthesized groups

You can use parenthesized groups at any place you can use a rule reference. For example:
Expand Down
51 changes: 35 additions & 16 deletions parglare/grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,15 @@ class Reference(object):
multiplicty(str): Multiplicity of the RHS reference (used for regex
operators ?, *, +). See MULT_* constants above. By default
multiplicity is MULT_ONE.
greedy(bool): If the multiplicity was greedy (e.g. ?!, *! or +!).
separator (symbol or Reference): A reference to the separator symbol or
the separator symbol itself if resolved.
"""
def __init__(self, location, name):
self.name = name
self.location = location
self.multiplicity = MULT_ONE
self.greedy = False
self.separator = None

@property
Expand Down Expand Up @@ -695,6 +697,7 @@ def make_multiplicity_symbol(self, symbol_ref, base_symbol, separator,
separators.
"""
mult = symbol_ref.multiplicity
assoc = ASSOC_RIGHT if symbol_ref.greedy else ASSOC_NONE
if mult in [MULT_ONE_OR_MORE, MULT_ZERO_OR_MORE]:
symbol_name = make_multiplicity_name(
symbol_ref.name, MULT_ONE_OR_MORE,
Expand Down Expand Up @@ -738,9 +741,11 @@ def make_multiplicity_symbol(self, symbol_ref, base_symbol, separator,

productions.extend([Production(symbol,
ProductionRHS([symbol_one]),
assoc=assoc,
nops=True),
Production(symbol,
ProductionRHS([EMPTY]))])
ProductionRHS([EMPTY]),
assoc=assoc)])

def action(_, nodes):
if nodes:
Expand All @@ -752,6 +757,19 @@ def action(_, nodes):

self.register_symbol(symbol)

else:
if symbol_ref.greedy:
productions = []
symbol_one = symbol
symbol = NonTerminal('{}_g'.format(symbol_name), productions,
base_symbol.location,
imported_with=imported_with)
productions.extend([Production(symbol,
ProductionRHS([symbol_one]),
assoc=ASSOC_RIGHT)])
symbol.action_name = 'pass_single'
self.register_symbol(symbol)

else:
# MULT_OPTIONAL
if separator:
Expand All @@ -768,7 +786,8 @@ def action(_, nodes):
productions.extend([Production(symbol,
ProductionRHS([base_symbol])),
Production(symbol,
ProductionRHS([EMPTY]))])
ProductionRHS([EMPTY]),
assoc=assoc)])

symbol.action_name = 'optional'

Expand Down Expand Up @@ -1285,9 +1304,7 @@ class GrammarContext:

GSYMBOL_REFERENCE,
OPT_REP_OPERATOR,
REP_OPERATOR_ZERO,
REP_OPERATOR_ONE,
REP_OPERATOR_OPTIONAL,
REP_OPERATOR,
OPT_REP_MODIFIERS_EXP,
OPT_REP_MODIFIERS,
OPT_REP_MODIFIER,
Expand Down Expand Up @@ -1325,9 +1342,7 @@ class GrammarContext:

'GrammarSymbolReference',
'OptRepeatOperator',
'RepeatOperatorZero',
'RepeatOperatorOne',
'RepeatOperatorOptional',
'RepeatOperator',
'OptionalRepeatModifiersExpression',
'OptionalRepeatModifiers',
'OptionalRepeatModifier',
Expand Down Expand Up @@ -1444,13 +1459,14 @@ class GrammarContext:
# Regex-like repeat operators
[GSYMBOL_REFERENCE, [GSYMBOL, OPT_REP_OPERATOR]],
[GSYMBOL_REFERENCE, [PRODUCTION_GROUP, OPT_REP_OPERATOR]],
[OPT_REP_OPERATOR, [REP_OPERATOR_ZERO]],
[OPT_REP_OPERATOR, [REP_OPERATOR_ONE]],
[OPT_REP_OPERATOR, [REP_OPERATOR_OPTIONAL]],
[OPT_REP_OPERATOR, [REP_OPERATOR]],
[OPT_REP_OPERATOR, [EMPTY]],
[REP_OPERATOR_ZERO, ['*', OPT_REP_MODIFIERS_EXP]],
[REP_OPERATOR_ONE, ['+', OPT_REP_MODIFIERS_EXP]],
[REP_OPERATOR_OPTIONAL, ['?', OPT_REP_MODIFIERS_EXP]],
[REP_OPERATOR, ['*', OPT_REP_MODIFIERS_EXP]],
[REP_OPERATOR, ['*!', OPT_REP_MODIFIERS_EXP]],
[REP_OPERATOR, ['+', OPT_REP_MODIFIERS_EXP]],
[REP_OPERATOR, ['+!', OPT_REP_MODIFIERS_EXP]],
[REP_OPERATOR, ['?', OPT_REP_MODIFIERS_EXP]],
[REP_OPERATOR, ['?!', OPT_REP_MODIFIERS_EXP]],
[OPT_REP_MODIFIERS_EXP, ['[', OPT_REP_MODIFIERS, ']']],
[OPT_REP_MODIFIERS_EXP, [EMPTY]],
[OPT_REP_MODIFIERS, [OPT_REP_MODIFIERS, ',', OPT_REP_MODIFIER]],
Expand Down Expand Up @@ -1809,13 +1825,16 @@ def act_gsymbol_reference(context, nodes):
sep_ref = Reference(Location(context), sep_ref)
symbol_ref.separator = sep_ref

if rep_op == '*':
if rep_op.startswith('*'):
symbol_ref.multiplicity = MULT_ZERO_OR_MORE
elif rep_op == '+':
elif rep_op.startswith('+'):
symbol_ref.multiplicity = MULT_ONE_OR_MORE
else:
symbol_ref.multiplicity = MULT_OPTIONAL

if rep_op.endswith('!'):
symbol_ref.greedy = True

return symbol_ref


Expand Down
109 changes: 109 additions & 0 deletions tests/func/parsing/test_greedy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import pytest
from parglare import GLRParser, Grammar


def test_greedy_zero_or_more():
    """
    Check that `*!` removes the ambiguity of two adjacent `*` repetitions.
    """
    def count_solutions(grammar_text):
        # Build a GLR parser for the grammar and count the solutions in the
        # forest produced for six "a" tokens.
        parser = GLRParser(Grammar.from_string(grammar_text))
        return len(parser.parse("a a a a a a"))

    # The non-greedy grammar is ambiguous: seven solutions are expected.
    assert count_solutions(r"""
S: A* A*;
terminals
A: "a";
""") == 7

    # With the greedy variant the first A*! collects all tokens, so only a
    # single solution remains.
    assert count_solutions(r"""
S: A*! A*;
terminals
A: "a";
""") == 1


def test_greedy_zero_or_more_complex():
    """
    Check greedy zero-or-more applied to a parenthesized group.
    """
    text = "a a a b c a b c a a a"

    non_greedy = r"""
S: ("a" | "b" "c")* "a"*;
"""
    greedy = r"""
S: ("a" | "b" "c")*! "a"*;
"""

    # Each grammar is paired with the number of solutions expected for `text`;
    # the non-greedy grammar is checked first, then the greedy one.
    for grammar_text, expected in ((non_greedy, 4), (greedy, 1)):
        parser = GLRParser(Grammar.from_string(grammar_text))
        assert len(parser.parse(text)) == expected


def test_greedy_one_or_more():
    """
    Check that `+!` removes the ambiguity between `+` and a following `*`.
    """
    tokens = "a a a a a a"
    expected_solutions = {
        # Non-greedy: six solutions are expected.
        r"""
S: A+ A*;
terminals
A: "a";
""": 6,
        # Greedy: the first A+! collects all tokens, leaving one solution.
        r"""
S: A+! A*;
terminals
A: "a";
""": 1,
    }

    # Dicts keep insertion order, so the non-greedy grammar is checked first.
    for grammar_text, expected in expected_solutions.items():
        parser = GLRParser(Grammar.from_string(grammar_text))
        assert len(parser.parse(tokens)) == expected


def test_greedy_optional():
    """
    Test greedy variant of optional.

    The two grammars differ only in the `!` on the optional operator, so the
    drop in the number of solutions is attributable to greediness alone.
    """
    # Non-greedy optional: A? may be empty or match the first token, which
    # gives two solutions.
    grammar = r"""
S: A? A*;
terminals
A: "a";
"""

    g = Grammar.from_string(grammar)
    p = GLRParser(g)
    forest = p.parse("a a a a a a")
    assert len(forest) == 2

    # But greedy variant has only one solution where first A?! is non-empty
    # if possible.
    grammar = r"""
S: A?! A*;
terminals
A: "a";
"""

    g = Grammar.from_string(grammar)
    p = GLRParser(g)
    forest = p.parse("a a a a a a")
    assert len(forest) == 1

0 comments on commit 517d33e

Please sign in to comment.