Arith4 is a simple grammar for arithmetic.
We'll build a parser for this grammar.

```
<expr> ::= <expr> ("+" | "-") <term> | <term>
<term> ::= <term> ("*" | "/") <factor> | <factor>
<factor> ::= "(" <expr> ")" | <atom>
<atom> ::= <identifier> | <numeral>
<identifier> ::= <letter> { <letter>  }
<letter> ::= [a-z] 
<numeral> ::= [1-9] { [0-9] }
```

In [1]:
class Token:
  """A single lexical token of an arithmetic expression.

  Attributes:
    text: the token's text.
    pos:  position (= index) of the token in the list of tokens obtained
          from the tokenizer.
    type: 'operator' when text is one of "+", "-", "*", "/",
          otherwise 'operand'.
  """

  def __init__(self, text, pos):
    self.text = text
    self.pos = pos
    # The caller guarantees that text is either an operator or an operand,
    # so whatever is not one of the four operators is classified as operand.
    if text in ("+", "-", "*", "/"):
      self.type = 'operator'
    else:
      self.type = 'operand'

  def __str__(self):
    """Render the token as '<text> (<type>) at <pos>'."""
    return f'{self.text} ({self.type}) at {self.pos}'

In [2]:
# Build one operator token and one operand token, then show how each prints.
t1, t2 = Token("+", 3), Token("123", 5)
print(t1)
print(t2)

+ (operator) at 3
123 (operand) at 5


In [3]:
import re

def tokenizer(text):
  """Split an arithmetic expression into a list of Token objects.

  The text is cut at word boundaries and at whitespace. Every non-blank
  piece must be one of:
    * a run of operator characters from "+-*/" (one token is emitted per
      character, so "*+" yields two tokens),
    * a numeral: decimal digits that do not start with '0',
    * an identifier: lowercase letters only.
  Anything else, including parentheses, is rejected.

  Args:
    text: the expression to tokenize.

  Returns:
    A list of Token objects; each token's pos is its index in that list.

  Raises:
    ValueError: if a piece contains non-ASCII characters or is not a valid
      operator run, numeral or identifier.
  """
  tokens = []
  # Split the input text at word boundaries and whitespace, then drop empty
  # strings and strip leading/trailing whitespace from what remains.
  # re.ASCII has to be passed as the flags keyword: the third positional
  # parameter of re.split is maxsplit, so passing the flag positionally would
  # silently cap the number of splits at 256 and leave the tail of a long
  # expression unsplit.
  pieces = [s.strip() for s in re.split(r"\b|\s", text, flags=re.ASCII)
                      if s.strip()]
  for s in pieces:
    if not s.isascii():
      raise ValueError(f"'{s}' is invalid (non-ASCII)")
    is_operator_run = set(s).issubset("+-*/")
    is_numeral = s.isdecimal() and s[0] != '0'
    is_identifier = s.isalpha() and s.islower()
    if not (is_operator_run or is_numeral or is_identifier):
      raise ValueError(f"'{s}' is invalid")
    # Consecutive operators arrive as a single piece; emit them one character
    # at a time. Operands are emitted whole.
    parts = s if is_operator_run else (s,)
    for part in parts:
      # The next token's position is simply the number of tokens so far.
      tokens.append(Token(part, len(tokens)))

  return tokens

In [4]:
def testTokenizer(text):
  """Tokenize text and print every token, one per line.

  If the tokenizer rejects the input with a ValueError, print the error
  message prefixed with 'Tokenizer: ' instead.
  """
  try:
    result = tokenizer(text)
  except ValueError as err:
    print(f"Tokenizer: {err}")
    return
  for tok in result:
    print(tok)

In [5]:
# A valid expression: identifiers, numerals and operators, with and without
# surrounding spaces. The adjacent operators "*+" come out as two tokens.
testTokenizer("first + second*+hello + 23+2")

first (operand) at 0
+ (operator) at 1
second (operand) at 2
* (operator) at 3
+ (operator) at 4
hello (operand) at 5
+ (operator) at 6
23 (operand) at 7
+ (operator) at 8
2 (operand) at 9


In [6]:
# Each input below breaks one lexical rule, so the tokenizer reports an error.
for bad_input in (
    "first + second* +Hello + 23+2",   # 'Hello' contains an uppercase letter
    "first + second* +hello + 23+c2",  # 'c2' mixes a letter with a digit
    "first + second*-hello + 023+2",   # '023' starts with a zero
):
  testTokenizer(bad_input)

Tokenizer: 'Hello' is invalid
Tokenizer: 'c2' is invalid
Tokenizer: '023' is invalid
