Arith4 is a simple grammar for arithmetic.
We'll build a parser for this grammar.

```
<expr> ::= <term> { ("+" | "-") <term> }
<term> ::= <factor> { ("*" | "/") <factor> }
<factor> ::= "(" <expr> ")" | <atom>
<atom> ::= <identifier> | <numeral>
<identifier> ::= <letter> { <letter> | <digit> }
<letter> ::= [a-z] 
<numeral> ::= <positive_digit> { <digit> }
<digit> ::= [0-9]
<positive_digit> ::= [1-9]
```

In [42]:
class Token:
  """A single lexical token: its text (`value`) and its category (`token_type`).

  `token_type` is one of 'op_type1', 'op_type2', 'lparen', 'rparen',
  'numeral' or 'identifier'.
  """

  def __init__(self, value):
    # Callers are expected to pass text that is already a valid token;
    # anything unclassifiable raises ValueError from _classify().
    self.value = value
    self.token_type = self._classify(value)

  @staticmethod
  def _classify(text):
    """Return the category name for `text`, or raise ValueError if none fits."""
    if text in ("+", "-"):
      return 'op_type1'  # precedence 1
    if text in ("*", "/"):
      return 'op_type2'  # precedence 2
    if text == "(":
      return 'lparen'
    if text == ")":
      return 'rparen'
    if text.isdecimal():
      return 'numeral'
    # identifiers are alphanumeric and must start with a letter
    if text.isalnum() and text[0].isalpha():
      return 'identifier'
    raise ValueError(f"'{text}' is invalid token")

  def __str__(self):
    """Render as '<value> (<token_type>)'."""
    return f'{self.value} ({self.token_type})'

In [43]:
# Smoke-test Token with one sample for each category it can assign.
print(Token("+"))    # additive operator       -> op_type1
print(Token("*"))    # multiplicative operator -> op_type2
print(Token("("))    # opening parenthesis     -> lparen
print(Token(")"))    # closing parenthesis     -> rparen
print(Token("13"))   # digit string            -> numeral
print(Token("abc"))  # letter-led alphanumeric -> identifier


+ (op_type1)
* (op_type2)
( (lparen)
) (rparen)
13 (numeral)
abc (identifier)


In [44]:
import re

def tokenizer(input_text):
  """Split an Arith4 source string into a list of Token objects.

  Accepted pieces are:
    * runs of the characters + - * / ( ), emitted one Token per character;
    * numerals: decimal digits that do not start with '0';
    * identifiers: lowercase alphanumerics that start with a letter.

  Args:
    input_text: the expression text to tokenize.

  Returns:
    A list of Token objects in input order (empty for blank input).

  Raises:
    ValueError: if a piece contains non-ASCII characters or is not one of
      the accepted token forms.
  """
  tokens = []
  # Split the text at word boundaries and whitespace, then drop the pieces
  # that are empty or blank and strip the rest.
  # NOTE: flags must be passed by keyword. The third positional parameter of
  # re.split is `maxsplit`, so a positional re.ASCII (== 256) would cap the
  # number of splits at 256 instead of enabling ASCII-only matching.
  pieces = [s.strip() for s in re.split(r"\b|\s", input_text, flags=re.ASCII)
            if s.strip()]
  for s in pieces:  # s is a non-empty string
    if not s.isascii():
      raise ValueError(f"'{s}' is invalid (non-ASCII)")
    if set(s).issubset("+-*/()"):
      # Adjacent operators/parentheses such as "*(" arrive as one piece;
      # emit one token per character.
      tokens.extend(Token(c) for c in s)
    elif ((s.isdecimal() and s[0] != '0') or                   # numeral
          (s.isalnum() and s[0].isalpha() and s.islower())):   # identifier
      tokens.append(Token(s))
    else:
      raise ValueError(f"'{s}' is invalid (non-token)")

  return tokens

In [45]:
def testTokenizer(input_text):
  """Tokenize `input_text` and print each token on its own line.

  If the tokenizer raises ValueError, print the message prefixed with
  'Tokenizer: ' instead.
  """
  try:
    token_list = tokenizer(input_text)
  except ValueError as err:
    print(f"Tokenizer: {err}")
    return

  for tok in token_list:
    print(tok)

In [46]:
# Valid input with irregular spacing: adjacent operators/parentheses and
# identifiers containing digits are still separated into individual tokens.
testTokenizer("first + second* (hello + c1)+a23")

first (identifier)
+ (op_type1)
second (identifier)
* (op_type2)
( (lparen)
hello (identifier)
+ (op_type1)
c1 (identifier)
) (rparen)
+ (op_type1)
a23 (identifier)


In [47]:
# Inputs the tokenizer must reject:
testTokenizer("first + second* +Hello + 23+2")  # identifier with an uppercase letter
testTokenizer("first + second*-hello + 023+2")  # numeral with a leading zero

Tokenizer: 'Hello' is invalid (non-token)
Tokenizer: '023' is invalid (non-token)


In [48]:
class Node:
  """A syntax-tree node labeled with a Token and holding a list of child Nodes."""

  def __init__(self, token, children=None):
    self.token = token              # Token object labeling this node
    self.children = children or []  # list of Node objects (fresh list if none given)

  def __str__(self):
    return self.build_polish_notation()

  def build_polish_notation(self):
    """Return this subtree in prefix form: the node's value, then each child."""
    pieces = [f"{self.token.value}"]
    pieces.extend(sub.build_polish_notation() for sub in self.children)
    return ' '.join(pieces)

In [49]:
class Parser:
  """Recursive-descent parser for the Arith4 grammar.

  Each grammar rule (<expr>, <term>, <factor>, <atom>) has a method of the
  same name that consumes tokens from `tokens` and returns a Node.
  Binary operators are combined left-associatively.
  """

  def __init__(self, tokens):
    self.tokens = tokens
    self.current_token = None
    self.index = -1
    self.advance()  # load the first token (None when `tokens` is empty)

  def advance(self):
    """Step to the next token; current_token becomes None past the end."""
    self.index += 1
    if self.index < len(self.tokens):
      self.current_token = self.tokens[self.index]
    else:
      self.current_token = None

  def _at(self, token_type):
    """Return True if there is a current token and it has exactly this type."""
    # Equality, not `in ('op_type1')`: parentheses without a comma are just a
    # string, so `in` would perform a substring test.
    return (self.current_token is not None and
            self.current_token.token_type == token_type)

  def parse(self):
    """Parse the whole token list and return the root Node.

    Raises:
      SyntaxError: if the tokens do not form an <expr>, or if tokens remain
        after a complete <expr> (e.g. "a b" or "a )").
    """
    node = self.expr()  # expr() corresponds to the starting symbol <expr>
    if self.current_token is not None:
      # Without this check, trailing tokens would be silently ignored.
      raise SyntaxError(
          f"Unexpected token after expression, in parse(): {self.current_token}")
    return node

  def expr(self):
    """<expr> ::= <term> { ("+" | "-") <term> }"""
    node = self.term()

    while self._at('op_type1'):
      # Remember the operator, step past it, then parse the right operand;
      # term() leaves current_token on whatever follows that operand.
      token = self.current_token
      self.advance()
      right_term = self.term()
      node = Node(token, [node, right_term])  # left associative

    return node

  def term(self):
    """<term> ::= <factor> { ("*" | "/") <factor> }"""
    node = self.factor()

    while self._at('op_type2'):
      token = self.current_token
      self.advance()
      right = self.factor()
      node = Node(token, [node, right])  # left associative

    return node

  def factor(self):
    """<factor> ::= "(" <expr> ")" | <atom>"""
    if self._at('lparen'):
      self.advance()
      node = self.expr()
      if self._at('rparen'):
        self.advance()
      else:
        raise SyntaxError("Expected ')' after expression, in factor()")
    else:
      node = self.atom()

    return node

  def atom(self):
    """<atom> ::= <identifier> | <numeral>"""
    if self.current_token is None:
      raise SyntaxError("Unexpected end of input, in atom()")

    token = self.current_token
    if token.token_type in ('numeral', 'identifier'):
      self.advance()
      return Node(token)
    raise SyntaxError(f"Expected numeral or identifier, in atom(): {token}")
      
def parse_input(input_text):
  """Tokenize `input_text`, parse the tokens, and return the resulting tree.

  The return value is whatever Parser.parse() produces (the root of the
  abstract syntax tree). Exceptions from tokenizer() and the parser propagate.
  """
  return Parser(tokenizer(input_text)).parse()

def testParser(input_text):
  """Parse `input_text` and print the tree, or print the error that stopped it.

  ValueError and SyntaxError raised by parse_input() are reported on one
  line, prefixed with the exception name; other exceptions propagate.
  """
  try:
    ast_root = parse_input(input_text)
  except ValueError as err:
    print(f"ValueError: {err}")
    return
  except SyntaxError as err:
    print(f"SyntaxError: {err}")
    return

  print(ast_root)

In [50]:
# Valid expressions: printed in prefix (Polish) notation.
testParser("a + b * (c - d) + ab")
testParser("(a/b + 102)*(const - 2*var)")
# Rejected by the tokenizer (ValueError): identifiers must be lowercase.
testParser("c - a + UpperCaseVar")
testParser("c1 - a + UpperCaseVar")
# Rejected by the parser (SyntaxError):
testParser("a + + b")   # operator where an operand is expected
testParser("a + b *")   # input ends right after an operator
testParser("-a + b *")  # leading '-' : the parser expects an operand first

+ + a * b - c d ab
* + / a b 102 - const * 2 var
ValueError: 'UpperCaseVar' is invalid (non-token)
ValueError: 'UpperCaseVar' is invalid (non-token)
SyntaxError: Expected numeral or identifier, in atom(): + (op_type1)
SyntaxError: Unexpected end of input, in atom()
SyntaxError: Expected numeral or identifier, in atom(): - (op_type1)
