Arith4 is a simple grammar for arithmetic.
We'll build a parser for this grammar.

```
<expr> ::= <expr> ("+" | "-") <term> | <term>
<term> ::= <term> ("*" | "/") <factor> | <factor>
<factor> ::= "(" <expr> ")" | <atom>
<atom> ::= <identifier> | <numeral>
<identifier> ::= <letter> { <letter>  }
<letter> ::= [a-z] 
<numeral> ::= [1-9] { [0-9] }
```

In [1]:
class Token:
  """One lexical token: its text, its index, and its category.

  Attributes:
    text: the token's text as given by the caller.
    pos:  position (= index) of the token in the list of tokens
          obtained from the tokenizer.
    type: 'operator' when text is one of "+", "-", "*", "/";
          'operand' for anything else.
  """

  def __init__(self, text, pos):
    self.text = text
    self.pos = pos
    # The caller guarantees that text is an operator or an operand, so
    # whatever is not one of the four operators is labelled an operand.
    if text in ("+", "-", "*", "/"):
      self.type = 'operator'
    else:
      self.type = 'operand'

  def __str__(self):
    """Return the token as '<text> (<type>) at <pos>'."""
    return f'{self.text} ({self.type}) at {self.pos}'

In [2]:
# Quick check of Token's string form: one operator, one operand.
t1, t2 = Token("+", 3), Token("123", 5)
print(t1, t2, sep="\n")

+ (operator) at 3
123 (operand) at 5


In [3]:
import re

class Tokenizer:
  """Split an Arith4 source string into a flat list of Token objects.

  Tokenizing runs eagerly in the constructor; the result is available
  as ``self.tokens``.

  Attributes:
    text:   the source string passed to the constructor.
    tokens: Token objects in source order.
    pos:    index that the next token will receive (equal to
            len(tokens) once tokenize() has finished).

  Raises:
    ValueError: from the constructor (via tokenize()) when a piece of
      the input is not an operator, a numeral or an identifier.
  """

  def __init__(self, text):
    self.text = text
    self.pos = 0
    self.tokens = []
    self.tokenize()

  def tokenize(self):
    """Rebuild self.tokens from self.text.

    Accepted pieces:
      * runs of the operators + - * /  (one token per character),
      * numerals: decimal digits that do not start with 0,
      * identifiers: lowercase letters only.

    Raises:
      ValueError: on the first piece that is non-ASCII or matches none
        of the categories above.  Tokens collected before the offending
        piece remain in self.tokens.
    """
    # Start from a clean slate so that calling tokenize() a second time
    # rebuilds the list instead of appending duplicates whose positions
    # keep growing.
    self.pos = 0
    self.tokens = []

    # flags= must be passed by keyword: the third positional parameter
    # of re.split is maxsplit, so a positional re.ASCII (== 256) capped
    # the number of splits at 256 and never enabled the flag.
    pieces = re.split(r"\b|\s", self.text, flags=re.ASCII)

    for piece in pieces:
      s = piece.strip()
      if not s:
        # Splitting on word boundaries and whitespace leaves empty and
        # whitespace-only strings behind; they carry no token.
        continue
      if not s.isascii():
        raise ValueError(f"'{s}' is invalid (non-ASCII)")

      is_operator_run = set(s).issubset("+-*/")
      is_numeral = s.isdecimal() and s[0] != '0'
      is_identifier = s.isalpha() and s.islower()
      if not (is_operator_run or is_numeral or is_identifier):
        raise ValueError(f"'{s}' is invalid")

      # Adjacent operators (e.g. "*+") arrive as a single piece because
      # there is no word boundary between them: emit one token per
      # character.  Any other piece is a single token.
      for lexeme in (s if is_operator_run else (s,)):
        self.tokens.append(Token(lexeme, self.pos))
        self.pos += 1

In [4]:
# Well-formed input: every piece is an identifier, a numeral or an
# operator, and the adjacent operators "*+" come out as two tokens.
textall = "first + second*+hello + 23+2"
try:
  tokens = Tokenizer(textall).tokens
except ValueError as e:
  # Show the tokenizer's message instead of a traceback.
  print(e)
else:
  for t in tokens:
    print(t)

first (operand) at 0
+ (operator) at 1
second (operand) at 2
* (operator) at 3
+ (operator) at 4
hello (operand) at 5
+ (operator) at 6
23 (operand) at 7
+ (operator) at 8
2 (operand) at 9


In [5]:
# Invalid input: "Hello" contains an uppercase letter, and identifiers
# must be all lowercase, so the tokenizer raises ValueError.
textall = "first + second* +Hello + 23+2"
try:
  tokens = Tokenizer(textall).tokens
except ValueError as e:
  # The constructor raised before `tokens` was assigned here; only the
  # error message is shown.
  print(e)
else:
  for t in tokens:
    print(t)

'Hello' is invalid


In [6]:
# Invalid input: "c2" mixes a letter with a digit, so it is neither an
# identifier (letters only) nor a numeral (digits only).
textall = "first + second* +hello + 23+c2"
try:
  tokens = Tokenizer(textall).tokens
except ValueError as e:
  # The constructor raised before `tokens` was assigned here; only the
  # error message is shown.
  print(e)
else:
  for t in tokens:
    print(t)

'c2' is invalid


In [9]:
# Invalid input: "023" starts with 0, and the tokenizer rejects any
# numeral whose first digit is 0.
textall = "first + second*-hello + 023+2"
try:
  tokens = Tokenizer(textall).tokens
except ValueError as e:
  # The constructor raised before `tokens` was assigned here; only the
  # error message is shown.
  print(e)
else:
  for t in tokens:
    print(t)

'023' is invalid
