In [1]:
from IPython.core.display import HTML
# Read the stylesheet that sits one directory above this notebook; the
# `with` block closes the file as soon as its content has been read.
with open ("../style.css", "r") as file:
    css = file.read()
# Being the last expression of the cell, this HTML object is what the cell displays.
# NOTE(review): presumably style.css holds HTML/<style> markup -- confirm.
HTML(css)

# A Shift-Reduce Parser for Arithmetic Expressions

In this notebook we implement a simple *shift-reduce* parser for arithmetic expressions.
This parser will implement the following grammar:
$$
  \begin{eqnarray*}
  \mathrm{expr}        & \rightarrow & \mathrm{expr}\;\;\texttt{'+'}\;\;\mathrm{product}   \\
                       & \mid        & \mathrm{expr}\;\;\texttt{'-'}\;\;\mathrm{product}   \\
                       & \mid        & \mathrm{product}                                    \\[0.2cm]
  \mathrm{product}     & \rightarrow & \mathrm{product}\;\;\texttt{'*'}\;\;\mathrm{factor} \\
                       & \mid        & \mathrm{product}\;\;\texttt{'/'}\;\;\mathrm{factor} \\
                       & \mid        & \mathrm{factor}                                     \\[0.2cm]
  \mathrm{factor}      & \rightarrow & \texttt{'('} \;\;\mathrm{expr} \;\;\texttt{')'}     \\
                       & \mid        & \texttt{NUMBER} 
  \end{eqnarray*}
$$

## Implementing a Scanner

In [2]:
import re

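The function `tokenize` transforms the string `s` into a list of tokens. Every number is turned into the token `'int'`, which corresponds to $\texttt{NUMBER}$ in the grammar above. See below for an example.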

In [3]:
def tokenize(s):
    '''Convert the string s, which is supposed to represent an arithmetic
       expression, into a list of token strings.

       - blanks and tabs are dropped,
       - every number yields the token 'int' (the numeric value is discarded),
       - parentheses and the operators + - * / are tokens themselves,
       - any other character c yields the token 'ERROR(c)'.
    '''
    lexSpec = r'''([ \t]+)        |  # blanks and tabs
                  ([1-9][0-9]*|0) |  # number
                  ([()])          |  # parentheses 
                  ([-+*/])        |  # arithmetical operators
                  (.)                # unrecognized character
               '''
    tokens = []
    # findall returns one 5-tuple per match; exactly one component is non-empty,
    # namely the one belonging to the alternative that matched.
    for blank, digits, paren, op, bad in re.findall(lexSpec, s, re.VERBOSE):
        if blank:
            continue                      # white space carries no information
        if digits:
            tokens.append('int')
        elif paren or op:
            tokens.append(paren or op)    # these tokens stand for themselves
        else:
            tokens.append(f'ERROR({bad})')
    return tokens

In [4]:
# Example: every number shows up as the generic token 'int' and the blanks are dropped.
tokenize('1 + 2 * (3 - 4)')

['int', '+', 'int', '*', '(', 'int', '-', 'int', ')']

In [5]:
class ShiftReduceParser:
    '''A table-driven shift-reduce parser.

       An instance only stores the two parse tables it is given.  The method
       `parse` is attached to this class in a later cell of the notebook.
    '''

    def __init__(self, actionTable, gotoTable):
        '''Remember the action table and the goto table.

           actionTable: stored as self.mActionTable
           gotoTable:   stored as self.mGotoTable
        '''
        self.mActionTable, self.mGotoTable = actionTable, gotoTable

In [6]:
def parse(self, TL):
    '''Run the shift-reduce algorithm on the token list TL.

       The parser is driven by two tables stored on self:
       - self.mActionTable maps a pair (state, token) to either the string
         'accept', a pair ('shift', newState), or a pair
         ('reduce', (head, body)).  A missing entry signals a syntax error.
       - self.mGotoTable maps a pair (state, head) to the state that is
         entered after reducing to head.

       TL is not modified; the end marker 'EOF' is appended to a private copy.
       Before every step the symbol stack and the unread tokens are printed.

       Returns True if the tokens are accepted and False on a syntax error.
       Raises ValueError if the action table contains an entry that is none
       of the kinds listed above.
    '''
    index   = 0      # points to next token
    Symbols = []     # stack of symbols
    States  = ['s0'] # stack of states, s0 is start state
    # Copy instead of `TL += [...]`: the in-place version would leave an
    # extra 'EOF' in the list that the caller still holds.
    TL      = list(TL) + ['EOF']
    while True:
        q = States[-1]
        t = TL[index]
        print('Symbols:', ' '.join(Symbols + ['|'] + TL[index:]).strip())
        p = self.mActionTable.get((q, t), 'error')
        if p == 'error':
            return False
        elif p == 'accept':
            return True
        elif p[0] == 'shift':
            s = p[1]
            Symbols.append(t)
            States.append(s)
            index += 1
        elif p[0] == 'reduce':
            head, body = p[1]
            n = len(body)
            # Pop n entries from both stacks.  Do not slice with [:-n]:
            # for a rule with an empty body n is 0 and x[:-0] is x[:0],
            # which would throw away the complete stack.
            Symbols = Symbols[:len(Symbols) - n] + [head]
            States  = States[:len(States) - n]
            States.append(self.mGotoTable[States[-1], head])
        else:
            # Without this branch an unknown entry would loop forever,
            # since neither the stacks nor the index would change.
            raise ValueError(f'unknown action {p!r} for state {q!r} and token {t!r}')
            
# Install the function defined above as the method `parse` of ShiftReduceParser ...
ShiftReduceParser.parse = parse
# ... and remove the module-level name, so that the function is reachable
# only through the class.
del parse

In [7]:
# Run the notebook Parse-Table.ipynb.
# NOTE(review): the test cell below uses `actionTable` and `gotoTable`, which are
# not defined anywhere in this notebook, so presumably this run creates them -- confirm.
%run Parse-Table.ipynb

## Testing

In [8]:
def test(s):
    '''Tokenize the string s, print the token list, run a ShiftReduceParser
       built from the global tables actionTable and gotoTable on it, and
       print whether the parse succeeded.
    '''
    shiftReduceParser = ShiftReduceParser(actionTable, gotoTable)
    tokens = tokenize(s)
    print(f'tokenlist: {tokens}\n')
    verdict = 'Parse successful!' if shiftReduceParser.parse(tokens) else 'Parse failed!'
    print(verdict)

In [9]:
# The parenthesized sum is reduced to a single factor before the multiplication is handled.
test('(1 + 2) * 3')

tokenlist: ['(', 'int', '+', 'int', ')', '*', 'int']

Symbols: | ( int + int ) * int EOF
Symbols: ( | int + int ) * int EOF
Symbols: ( int | + int ) * int EOF
Symbols: ( F | + int ) * int EOF
Symbols: ( P | + int ) * int EOF
Symbols: ( E | + int ) * int EOF
Symbols: ( E + | int ) * int EOF
Symbols: ( E + int | ) * int EOF
Symbols: ( E + F | ) * int EOF
Symbols: ( E + P | ) * int EOF
Symbols: ( E | ) * int EOF
Symbols: ( E ) | * int EOF
Symbols: F | * int EOF
Symbols: P | * int EOF
Symbols: P * | int EOF
Symbols: P * int | EOF
Symbols: P * F | EOF
Symbols: P | EOF
Symbols: E | EOF
Parse successful!


In [10]:
# Uses all four operators together with a parenthesized subexpression.
test('1 * 2 + 3 * (4 - 5) / 2')

tokenlist: ['int', '*', 'int', '+', 'int', '*', '(', 'int', '-', 'int', ')', '/', 'int']

Symbols: | int * int + int * ( int - int ) / int EOF
Symbols: int | * int + int * ( int - int ) / int EOF
Symbols: F | * int + int * ( int - int ) / int EOF
Symbols: P | * int + int * ( int - int ) / int EOF
Symbols: P * | int + int * ( int - int ) / int EOF
Symbols: P * int | + int * ( int - int ) / int EOF
Symbols: P * F | + int * ( int - int ) / int EOF
Symbols: P | + int * ( int - int ) / int EOF
Symbols: E | + int * ( int - int ) / int EOF
Symbols: E + | int * ( int - int ) / int EOF
Symbols: E + int | * ( int - int ) / int EOF
Symbols: E + F | * ( int - int ) / int EOF
Symbols: E + P | * ( int - int ) / int EOF
Symbols: E + P * | ( int - int ) / int EOF
Symbols: E + P * ( | int - int ) / int EOF
Symbols: E + P * ( int | - int ) / int EOF
Symbols: E + P * ( F | - int ) / int EOF
Symbols: E + P * ( P | - int ) / int EOF
Symbols: E + P * ( E | - int ) / int EOF
Symbols: E + P * ( E - | int ) / 

In [None]:
# Multi-digit numbers, no blanks between the tokens, and nested parentheses.
test('11+22*(33-44)/(5-10*5/(4-3))')

In [None]:
# Negative test: the trailing '-' has no right operand, so according to the
# grammar at the top of this notebook the parse is expected to fail.
test('1+2*3-')