In [None]:
from IPython.display import HTML
# Read the stylesheet inside a `with` block so the file handle is closed
# deterministically, then hand the text to IPython for rendering.
with open('../style.css') as css_file:
    css = css_file.read()
HTML(css)

# A Shift-Reduce Parser for Arithmetic Expressions

In this notebook we implement a generic *shift-reduce parser*.  The parse table that we use 
implements the following grammar for arithmetic expressions:
$$
  \begin{eqnarray*}
  \mathrm{expr}        & \rightarrow & \mathrm{expr}\;\;\texttt{'+'}\;\;\mathrm{product}   \\
                       & \mid        & \mathrm{expr}\;\;\texttt{'-'}\;\;\mathrm{product}   \\
                       & \mid        & \mathrm{product}                                    \\[0.2cm]
  \mathrm{product}     & \rightarrow & \mathrm{product}\;\;\texttt{'*'}\;\;\mathrm{factor} \\
                       & \mid        & \mathrm{product}\;\;\texttt{'/'}\;\;\mathrm{factor} \\
                       & \mid        & \mathrm{factor}                                     \\[0.2cm]
  \mathrm{factor}      & \rightarrow & \texttt{'('} \;\;\mathrm{expr} \;\;\texttt{')'}     \\
                       & \mid        & \texttt{NUMBER} 
  \end{eqnarray*}
$$

In [None]:
import re

The function `tokenize` transforms the string `s` into a list of tokens, where each token is a pair of the form `(type, value)`. See below for an example.

In [None]:
def tokenize(s):
    '''Split the string s into a list of (type, value) pairs.

    Numbers become ('NUMBER', digits), each operator or parenthesis c
    becomes (c, c), and any other non-blank character c becomes
    ('ERROR', c).  Blanks, tabs and newlines are dropped.
    '''
    lexSpec = r'''([ \t\n]+)      |  # blanks and tabs
                  ([1-9][0-9]*|0) |  # number
                  ([-+*/()])      |  # arithmetical operators
                  (.)                # unrecognized character
               '''
    tokens = []
    for hit in re.finditer(lexSpec, s, re.VERBOSE):
        # Exactly one of the four alternatives takes part in every match;
        # lastindex says which: 1 = blanks, 2 = number, 3 = operator, 4 = other.
        which  = hit.lastindex
        lexeme = hit.group(which)
        if which == 1:
            continue
        if which == 2:
            tokens.append(('NUMBER', lexeme))
        elif which == 3:
            tokens.append((lexeme, lexeme))
        else:
            tokens.append(('ERROR', lexeme))
    return tokens

In [None]:
# Example: numbers come back as ('NUMBER', digits), operators and parentheses
# as (c, c); the blanks between them are skipped.
tokenize('1 + 2 * (3 - 4)')

In [None]:
class ShiftReduceParser():
    def __init__(self, actionTable, gotoTable, stateTable):
        self.mActionTable = actionTable
        self.mGotoTable   = gotoTable
        self.mStateTable  = stateTable
        
    def parse(self, TL: list[str]) -> bool:
        return None 

In [None]:
def parse(self, TL): 
    index   = 0                 # points to next token
    Symbols: list[str] = []     # stack of grammar symbols (stores types, e.g., 'NUMBER')
    States : list[str] = ['s0'] # stack of states
    # The Forest stack stores the tree nodes/values
    Forest : list[str | tuple] = [] 
    TL += [('EOF', 'EOF')]
    while True:
        q = States[-1]
        # Unpack the current lookahead token into type and value
        token_type, token_value = TL[index]
        # Debug output (optional formatting)
        # We map the remaining tokens to their string representation for cleaner printing
        remaining_tokens = [val for _, val in TL[index:]]
        print(f'States:  [ {", ".join(States)} ]')
        print('Symbols:', " ".join(Symbols + ['|'] + remaining_tokens).strip())
        # Use token_type for the table lookup
        action = self.mActionTable.get((q, token_type), 'error')
        match action:
            case 'error': 
                print(f'Action({q}, {token_type}) undefined.')
                print('Syntax error!\n')
                return False
            case 'accept':
                print('Accepting!\n')
                return Forest[0] 
            case 'shift', s:
                print(f'Shifting state {s}\n')
                Symbols += [token_type] # Stack tracks the grammar type
                States  += [s]
                # Forest tracks the actual value ('1', '+', etc.)
                Forest += [token_value]
                index   += 1
            case 'reduce', rule:
                head, body = rule
                print(f'Reducing with rule {head} â†’ {" ".join(body)}')
                n        = len(body)
                children = tuple(Forest[-n:]) if n > 0 else ()
                Forest   = Forest[:-n]
                Forest  += [(head, children)]
                Symbols = Symbols[:-n]
                States  = States [:-n] 
                Symbols = Symbols + [head]
                state   = States[-1]
                States += [ self.mGotoTable[state, head] ]
            
# Install the function defined above as the `parse` method of the class,
# replacing the placeholder from the class body, then remove the
# module-level name so that only the method remains.
ShiftReduceParser.parse = parse 
del parse

In [None]:
# NOTE(review): presumably this defines the globals actionTable, gotoTable and
# stateTable that test() reads below; confirm against Parse-Table.ipynb.
%run Parse-Table.ipynb

## Visualization

In [None]:
import graphviz

The function `draw_tree` takes a parse tree as returned by `ShiftReduceParser.parse` and renders it as a `graphviz` graph.

In [None]:
def draw_tree(tree: tuple | str) -> graphviz.Digraph:
    """Build a graphviz digraph that depicts the given parse tree.

    A tuple is treated as an inner node of the form (head, children) and
    drawn with graphviz' default node style; anything else is treated as a
    leaf and drawn as a box labelled with str(node) in Courier.  Nodes are
    named node1, node2, ... in the order in which they are visited.
    """
    graph  = graphviz.Digraph(format='png')
    issued = []  # node names handed out so far; its length drives the numbering

    def emit(subtree, parent=None):
        # Reserve the next unused node name for this subtree
        ident = f'node{len(issued) + 1}'
        issued.append(ident)
        if isinstance(subtree, tuple):
            # Inner node: label it with the head, descend into the children below
            head, kids = subtree
            graph.node(ident, label=head)
        else:
            # Leaf: token value such as '1' or '+'
            kids = ()
            graph.node(ident, label=str(subtree), shape='box', fontname='Courier')
        # Every node except the root is linked to its parent
        if parent:
            graph.edge(parent, ident)
        for kid in kids:
            emit(kid, ident)

    emit(tree)
    return graph

## Testing

In [None]:
def test(s: str):
    """Tokenize s, parse it, and report the outcome.

    The parser is built from the globals actionTable, gotoTable and
    stateTable.  Returns the graph produced by draw_tree when the parser
    yields a tree, and None (after printing a failure message) otherwise.
    """
    engine = ShiftReduceParser(actionTable, gotoTable, stateTable)
    tokens = tokenize(s)
    print(f'tokenlist: {tokens}\n')
    outcome = engine.parse(tokens)
    # Guard clause: a falsy result means the parser rejected the input
    if not outcome:
        print('Parse failed!')
        return None
    print('Parse successful!')
    return draw_tree(outcome)
In [None]:
# Keep the result of test() (the graph on success, None on failure) in `pt`;
# the bare `pt` on the last line makes the notebook display it.
pt = test('1 + 2 * 3')
pt

In [None]:
# Input with a parenthesised sub-expression, i.e. one that needs the
# grammar rule factor -> '(' expr ')'.
test('1 + 2 * (3 - 4)')

In [None]:
# Negative example: the grammar above has no rule that lets '*' follow '+'
# directly, so this input is expected to be reported as a syntax error.
test('1 + * 2')