In [1]:
from IPython.core.display import HTML
# Read the stylesheet that lives one directory above this notebook and hand
# its text to IPython's HTML wrapper.  HTML(css) is the last expression of the
# cell, so the resulting object is what the cell displays.
with open ("../style.css", "r") as file:
    css = file.read()
HTML(css)

# A Shift-Reduce Parser for Arithmetic Expressions

In this notebook we implement a simple *shift-reduce* parser for arithmetic expressions.
This parser will implement the following grammar:
$$
  \begin{eqnarray*}
  \mathrm{expr}        & \rightarrow & \mathrm{expr}\;\;\texttt{'+'}\;\;\mathrm{product}   \\
                       & \mid        & \mathrm{expr}\;\;\texttt{'-'}\;\;\mathrm{product}   \\
                       & \mid        & \mathrm{product}                                    \\[0.2cm]
  \mathrm{product}     & \rightarrow & \mathrm{product}\;\;\texttt{'*'}\;\;\mathrm{factor} \\
                       & \mid        & \mathrm{product}\;\;\texttt{'/'}\;\;\mathrm{factor} \\
                       & \mid        & \mathrm{factor}                                     \\[0.2cm]
  \mathrm{factor}      & \rightarrow & \texttt{'('} \;\;\mathrm{expr} \;\;\texttt{')'}     \\
                       & \mid        & \texttt{NUMBER} 
  \end{eqnarray*}
$$

In [2]:
import re

The function `tokenize` transforms the string `s` into a list of tokens. See below for an example.

In [3]:
def tokenize(s):
    '''Split the string s, which is expected to hold an arithmetic
       expression, into a list of token names.

       Every number becomes the token 'int', parentheses and the operators
       '+', '-', '*', '/' stand for themselves, and blanks and tabs are
       dropped.  Any other character c that the scanner matches is reported
       as the token 'ERROR(c)'.
    '''
    lexSpec = r'''([ \t]+)        |  # blanks and tabs
                  ([1-9][0-9]*|0) |  # number
                  ([()])          |  # parentheses 
                  ([-+*/])        |  # arithmetical operators
                  (.)                # unrecognized character
               '''
    tokens = []
    for match in re.finditer(lexSpec, s, re.VERBOSE):
        # Exactly one of the five groups is non-empty for every match.
        blanks, digits, paren, op, unknown = match.groups()
        if blanks:
            continue                      # white space carries no token
        if digits:
            tokens.append('int')
        elif paren:
            tokens.append(paren)
        elif op:
            tokens.append(op)
        else:
            tokens.append(f'ERROR({unknown})')
    return tokens

In [4]:
tokenize('1 + 2 * (3 - 4)')

['int', '+', 'int', '*', '(', 'int', '-', 'int', ')']

In [5]:
class ShiftReduceParser:
    '''A table-driven shift-reduce parser.

       The parser itself contains no grammar knowledge; everything it does
       is looked up in the three tables handed to the constructor.
    '''
    def __init__(self, actionTable, gotoTable, stateTable):
        '''Remember the parse tables.

           actionTable: looked up with (state, token); yields 'accept',
                        ('shift', state) or ('reduce', (head, body)).
           gotoTable:   looked up with (state, head) after a reduction;
                        yields the state to push.
           stateTable:  maps a state name to a collection of strings that
                        is printed when tracing the parse.
        '''
        self.mStateTable  = stateTable
        self.mGotoTable   = gotoTable
        self.mActionTable = actionTable

In [6]:
def parse(self, TL):
    '''Parse the token list TL with the tables stored in self and print a
       trace of every step.

       TL is a sequence of token names as produced by a scanner.  It is
       not modified: the end marker 'EOF' is appended to a private copy.

       The parser keeps a stack of states (starting with 's0') and a
       parallel stack of grammar symbols.  In every step the action for the
       pair (current state, next token) is looked up in self.mActionTable:

       - no entry:                    syntax error, the result is False.
       - 'accept':                    the result is True.
       - ('shift', s):                push the token and the state s and
                                      advance to the next token.
       - ('reduce', (head, body)):    pop len(body) entries from both
                                      stacks, push head and the state
                                      self.mGotoTable[state, head], where
                                      state is the state uncovered by the
                                      pop.

       Raises ValueError if the action table contains an entry that is
       none of the above, and KeyError if the goto table has no entry for
       a reduction.
    '''
    index   = 0                    # points to next token
    Symbols = []                   # stack of symbols
    States  = ['s0']               # stack of states, s0 is start state
    TL      = list(TL) + ['EOF']   # copy, so the caller's list stays untouched
    while True:
        q = States[-1]
        t = TL[index]
        print(f'States:  [ {", ".join(States)} ]')
        print('Symbols:', ' '.join(Symbols + ['|'] + TL[index:]).strip())
        print('State:   {', ", ".join(self.mStateTable[q]), '}')
        p = self.mActionTable.get((q, t), 'error')
        if p == 'error': 
            print(f'Action({q}, {t}) undefined.')
            print('Syntax error!\n')
            return False
        elif p == 'accept':
            print('Accepting!\n')
            return True
        elif p[0] == 'shift':
            s = p[1]
            print(f'Shifting state {s}')
            print('State:   {', ', '.join(self.mStateTable[s]), '}\n')
            Symbols += [t]
            States  += [s]
            index   += 1
        elif p[0] == 'reduce':
            head, body = p[1]
            print(f'Reducing with rule {head} → {" ".join(body)}')
            n = len(body)
            # For an empty rule body nothing must be popped.  The slice
            # [:-0] is the same as [:0] and would wipe the whole stack.
            if n > 0:
                Symbols = Symbols[:-n]
                States  = States [:-n]
            Symbols = Symbols + [head]
            state   = States[-1]
            target  = self.mGotoTable[state, head]
            States += [target]
            print('State:   {', ', '.join(self.mStateTable[target]), '}\n')
        else:
            # Without this branch a malformed table entry would leave both
            # stacks and the index unchanged and the loop would never end.
            raise ValueError(f'Action({q}, {t}) has unknown form: {p!r}')
            
# Install the function defined above as the method `parse` of the class
# ShiftReduceParser, then delete the module-level name so that the function
# is only reachable through the class.
ShiftReduceParser.parse = parse
del parse

In [7]:
%run Parse-Table.ipynb

## Testing

In [8]:
def test(s):
    '''Tokenize the string s, print the token list, run the shift-reduce
       parser on it and report whether the parse succeeded.

       The parser is built from the global tables actionTable, gotoTable
       and stateTable.
    '''
    parser = ShiftReduceParser(actionTable, gotoTable, stateTable)
    tokens = tokenize(s)
    print(f'tokenlist: {tokens}\n')
    accepted = parser.parse(tokens)
    verdict  = 'Parse successful!' if accepted else 'Parse failed!'
    print(verdict)

In [9]:
test('1 + 2 * 3')

tokenlist: ['int', '+', 'int', '*', 'int']

States:  [ s0 ]
Symbols: | int + int * int EOF
State:   { E -> • P, E -> • E "+" P, S -> • E, F -> • int, E -> • E "-" P, P -> • P "/" F, P -> • F, P -> • P "*" F, F -> • "(" E ")" }
Shifting state s2
State:   { F -> int • }

States:  [ s0, s2 ]
Symbols: int | + int * int EOF
State:   { F -> int • }
Reducing with rule F → int
State:   { P -> F • }

States:  [ s0, s1 ]
Symbols: F | + int * int EOF
State:   { P -> F • }
Reducing with rule P → F
State:   { P -> P • "*" F, E -> P •, P -> P • "/" F }

States:  [ s0, s3 ]
Symbols: P | + int * int EOF
State:   { P -> P • "*" F, E -> P •, P -> P • "/" F }
Reducing with rule E → P
State:   { E -> E • "+" P, S -> E •, E -> E • "-" P }

States:  [ s0, s4 ]
Symbols: E | + int * int EOF
State:   { E -> E • "+" P, S -> E •, E -> E • "-" P }
Shifting state s8
State:   { P -> • P "/" F, P -> • F, P -> • P "*" F, E -> E "+" • P, F -> • int, F -> • "(" E ")" }

States:  [ s0, s4, s8 ]
Symbols: E + | int * int 

In [None]:
test('1 + 2 * (3 - 4)')