Arith6 is an extension of Arith5.  

We add function symbols.

```perl
<expr> ::= (<term> | <nterm>) { ("+" | "-") <term> }
<nterm> ::= "-" { "-" } <term>
<term> ::= <factor> { ("*" | "/") <factor> }
<factor> ::= { <factor_exp> "^" } <factor_exp>
<factor_exp> ::= <factor_post> { ("!" | "'") }
<factor_post> ::= "(" <expr> ")"  | <func_call> | <atom>
<func_call> ::= <func_symb> "(" <expr> { "," <expr> } ")"
  # 0-ary functions are not allowed
<atom> ::= <identifier> | <numeral>
<identifier> ::= <letter> { <letter> | <digit> }
<letter> ::= [a-zA-Z] 
<numeral> ::= <nonzero_digit> { <digit> }
<digit> ::= [0-9]
<nonzero_digit> ::= [1-9]
<func_symb> ::= "sin" | "cos" | "max" | "min" | "f"
  # arities are given in Token constructor
  # sin and cos are unary; max and min are binary; f is ternary
```

In [2]:
class Token:
  """A single lexical token of the Arith6 language.

  Attributes:
    value:      the lexeme exactly as given to the constructor
    token_type: one of 'comma', 'op_bin_1', 'op_bin_2', 'lparen', 'rparen',
                'op_postfix', 'op_bin_exp', 'numeral', 'func_symb',
                'identifier'
    arity:      number of arguments for a function symbol, otherwise None
  """

  # Fixed lexemes, grouped by the token type they belong to.
  _SYMBOL_TYPES = (
    ((",",), 'comma'),
    (("+", "-"), 'op_bin_1'),    # precedence 1
    (("*", "/"), 'op_bin_2'),    # precedence 2
    (("(",), 'lparen'),
    ((")",), 'rparen'),
    (("!", "'"), 'op_postfix'),
    (("^",), "op_bin_exp"),
  )

  # Function symbols paired with the number of arguments each one takes.
  _FUNC_ARITIES = (("sin", 1), ("cos", 1), ("max", 2), ("min", 2), ("f", 3))

  def __init__(self, value):
    # The caller is expected to pass an already validated lexeme; anything
    # that matches no category is still rejected with ValueError.
    self.value = value
    self.arity = None

    for lexemes, kind in self._SYMBOL_TYPES:
      if value in lexemes:
        self.token_type = kind
        return

    if value.isdecimal():
      self.token_type = 'numeral'
      return

    # Function symbols must be recognised before the identifier test below,
    # because they look like identifiers.
    for name, n_args in self._FUNC_ARITIES:
      if value == name:
        self.token_type = 'func_symb'
        self.arity = n_args
        return

    if value.isalnum() and value[0].isalpha():
      self.token_type = 'identifier'
      return

    raise ValueError(f"'{value}' is invalid (Token)")

  def __str__(self):
    # "<value> (<type>)", followed by the arity for function symbols only.
    text = f'{self.value} ({self.token_type})'
    if self.arity is None:
      return text
    return text + f' arity: {self.arity}'

In [3]:
# Show one sample line per token category; tokens that share a line are
# separated by ", ".
print("\n".join(
  ", ".join(str(Token(lexeme)) for lexeme in group)
  for group in (
    (",",),
    ("+", "*"),
    ("(", ")"),
    ("13",),
    ("abc", "a1"),
    ("!", "'"),
    ("^",),
    ("sin", "max", "f"),
  )
))

, (comma)
+ (op_bin_1), * (op_bin_2)
( (lparen), ) (rparen)
13 (numeral)
abc (identifier), a1 (identifier)
! (op_postfix), ' (op_postfix)
^ (op_bin_exp)
sin (func_symb) arity: 1, max (func_symb) arity: 2, f (func_symb) arity: 3


In [4]:
import re

def tokenizer(input_text):
  """Split `input_text` into a list of Token objects.

  The text is cut at word boundaries and at whitespace. Each non-empty piece
  must be one of:
    * a run of operator / parenthesis / comma characters, which is emitted
      as one Token per character,
    * a numeral without a leading zero,
    * an alphanumeric word starting with a letter (identifier or function
      symbol).

  Raises:
    ValueError: if a piece contains non-ASCII characters or matches none of
      the categories above.
  """
  operator_chars = "+-*/()!'^,"
  tokens = []

  # Split the input text at word boundaries and whitespace.
  # `flags` must be passed by keyword: the third positional parameter of
  # re.split is `maxsplit`, so a positional re.ASCII would be read as a
  # split limit of 256 and the flag itself would never be applied.
  pieces = re.split(r"\b|\s", input_text, flags=re.ASCII)

  for piece in pieces:
    s = piece.strip()
    if not s:  # drop the empty strings produced by adjacent split points
      continue
    if not s.isascii():
      raise ValueError(f"'{s}' is invalid (non-ASCII)")

    is_operator_run = set(s).issubset(operator_chars)
    is_numeral = s.isdecimal() and s[0] != '0'
    is_word = s.isalnum() and s[0].isalpha()  # identifier or function symbol
    if not (is_operator_run or is_numeral or is_word):
      raise ValueError(f"'{s}' is invalid (non-token)")

    if is_operator_run:
      # Consecutive operators such as "*(-" arrive as one piece; emit one
      # token per character.
      tokens.extend(Token(c) for c in s)
    else:
      tokens.append(Token(s))

  return tokens

In [5]:
def testTokenizer(input_text):
  # Tokenize input_text and print one token per line; a tokenizer error is
  # reported on a single line instead of being propagated.
  try:
    result = tokenizer(input_text)
  except ValueError as err:
    print(f"Tokenizer: {err}")
    return

  for tok in result:
    print(tok)

In [6]:
# A valid expression that produces every token type at least once.
testTokenizer("max(a/b!+ 2, 10) + b1^b'*(-c1!)* (hello - cc1)+a23")

max (func_symb) arity: 2
( (lparen)
a (identifier)
/ (op_bin_2)
b (identifier)
! (op_postfix)
+ (op_bin_1)
2 (numeral)
, (comma)
10 (numeral)
) (rparen)
+ (op_bin_1)
b1 (identifier)
^ (op_bin_exp)
b (identifier)
' (op_postfix)
* (op_bin_2)
( (lparen)
- (op_bin_1)
c1 (identifier)
! (op_postfix)
) (rparen)
* (op_bin_2)
( (lparen)
hello (identifier)
- (op_bin_1)
cc1 (identifier)
) (rparen)
+ (op_bin_1)
a23 (identifier)


In [7]:
# Some invalid inputs:
#   1) '1hello' starts with a digit, so it is neither a numeral nor an
#      identifier
#   2) '023' is a numeral with a leading zero, which the tokenizer rejects
testTokenizer("First + Second2* +1hello + 23+2")
testTokenizer("first + second*-hello + 023+2")

Tokenizer: '1hello' is invalid (non-token)
Tokenizer: '023' is invalid (non-token)


In [8]:
class Node:
  """A node of the abstract syntax tree.

  Attributes:
    token:    the Token object labelling this node
    children: list of child Node objects (empty for a leaf)
  """

  def __init__(self, token, children=None):
    self.token = token
    # A missing (or empty) children argument yields a fresh list per node.
    self.children = children or []

  def __str__(self):
    return self.build_polish_notation()

  def build_polish_notation(self, opt=False):
    # Prefix (Polish) notation: this node's label followed by its children,
    # all separated by single spaces. With opt=True every label is followed
    # by its token type in parentheses.
    label = (f"{self.token.value}({self.token.token_type})" if opt
             else f"{self.token.value}")
    parts = [label]
    for child in self.children:
      parts.append(child.build_polish_notation(opt))
    return ' '.join(parts)

In [9]:
# Scratch check: list membership with `in` (evaluates to True).
1 in [1]

True

In [10]:
class Parser:
  """Recursive-descent parser that turns a token list into a tree of Nodes.

  Each grammar rule is implemented by the method of the same name:

    expr           ::= (term | nterm) { ("+" | "-") term }
    nterm          ::= "-" { "-" } term
    term           ::= factor { ("*" | "/") factor }
    factor         ::= factor_exp [ "^" factor ]      (right associative)
    factor_exp     ::= factor_postfix { "!" | "'" }
    factor_postfix ::= "(" expr ")" | func_call | atom
    func_call      ::= func_symb "(" expr { "," expr } ")"
    atom           ::= identifier | numeral

  Tokens must expose `value`, `token_type` and (for function symbols)
  `arity`. Every rule method returns a Node and raises SyntaxError when the
  tokens do not match the rule.
  """

  def __init__(self, tokens):
    self.tokens = tokens
    self.current_token = None
    self.index = -1
    self.advance()  # set self.current_token to
                    # the first (i.e. self.index=0) element of tokens

  def advance(self):
    """Move to the next token; current_token becomes None past the end."""
    self.index += 1
    if self.index < len(self.tokens):
      self.current_token = self.tokens[self.index]
    else:
      self.current_token = None

  def check_token_type(self, token_types):
    """Tell whether the current token has (one of) the given type(s).

    token_types is either a single type name or a tuple of type names.
    Returns False when the input is exhausted.
    """
    token = self.current_token
    if token is None:
      return False
    if isinstance(token_types, tuple):
      return token.token_type in token_types
    return token.token_type == token_types  # a single type name

  def check_token_value(self, token_value):
    """Tell whether the current token exists and has value token_value."""
    token = self.current_token
    if token is None:
      return False
    return bool(token.value == token_value)

  def parse(self):
    """Parse one <expr> starting at the current token and return its Node.

    Trailing tokens are not rejected here; the caller has to check that
    current_token is None afterwards.
    """
    return self.expr()  # expr() corresponds to the starting symbol <expr>

  def expr(self):
    if self.check_token_value('-'):  # unary minus
      node = self.nterm()  # negative term
    else:  # ordinary term
      node = self.term()

    while self.check_token_type('op_bin_1'):  # '+' or '-'
      # If we are at '+' in "a + b * c - ..." then, after self.advance() and
      # self.term() have consumed "+ b * c", the current token is '-'.
      token = self.current_token
      self.advance()
      right_term = self.term()
      node = Node(token, [node, right_term])  # left associative

    return node

  def nterm(self):
    token = self.current_token
    # On the first visit token.value == '-' is guaranteed because expr() has
    # checked it; on the recursive calls below it may be anything.
    if token is None or token.value != '-':
      return self.term()

    # Side effect: the '-' token is relabelled in place so that the tree
    # distinguishes unary minus from binary minus.
    token.token_type = 'op_unary_prefix'
    self.advance()
    operand = self.nterm()  # recursive call handles "- - ... term"
    return Node(token, [operand])

  def term(self):
    node = self.factor()

    while self.check_token_type('op_bin_2'):  # '*' or '/'
      token = self.current_token
      self.advance()
      right_factor = self.factor()
      node = Node(token, [node, right_factor])  # left associative

    return node

  def factor(self):
    node = self.factor_exp()

    if self.check_token_type('op_bin_exp'):  # '^'
      token = self.current_token
      self.advance()
      right_factor = self.factor()  # recursive call for right associativity
      node = Node(token, [node, right_factor])

    return node

  def factor_exp(self):
    node = self.factor_postfix()

    # Postfix operators stack: in "b'!" the "'" is applied first, then "!".
    while self.check_token_type('op_postfix'):
      token = self.current_token
      self.advance()
      node = Node(token, [node])

    return node

  def factor_postfix(self):
    if self.check_token_type('lparen'):
      self.advance()
      node = self.expr()
      if not self.check_token_type('rparen'):
        raise SyntaxError(f"Expected ')' after expression at {self.index} in factor_postfix(), but {self.current_token} is given.")
      self.advance()
      return node

    if self.check_token_type('func_symb'):
      return self.func_call()

    return self.atom()

  def func_call(self):
    if self.current_token is None:
      raise SyntaxError("Unexpected end of input, in func_call()")

    token = self.current_token
    if not self.check_token_type('func_symb'):
      raise SyntaxError(f"Expected function symbol at {self.index} in func_call(), but {token} is given.")

    self.advance()
    if not self.check_token_type('lparen'):
      raise SyntaxError(f"Expected '(' after function symbol at {self.index} in func_call(), but {self.current_token} is given.")

    self.advance()
    args = []
    while True:
      # At least one argument is required: 0-ary calls are not allowed.
      args.append(self.expr())
      if self.check_token_type('comma'):
        self.advance()
      elif self.check_token_type('rparen'):
        break
      else:
        raise SyntaxError(f"Expected ',' or ')' after function argument at {self.index} in func_call(), but {self.current_token} is given.")

    # arity check (done before the closing ')' is consumed)
    if token.arity is None or token.arity != len(args):
      raise SyntaxError(f"Function {token.value} expects {token.arity} arguments, but {len(args)} were given")

    self.advance()  # consume ')'
    return Node(token, args)

  def atom(self):
    if self.current_token is None:
      raise SyntaxError("Unexpected end of input, in atom()")

    token = self.current_token
    if not self.check_token_type(('numeral', 'identifier')):
      raise SyntaxError(f"Expected numeral or identifier at {self.index}, in atom(), but {token} is given.")

    self.advance()
    return Node(token)
      
def parse_input(input_text):
  # Tokenize and parse input_text, returning the root of the abstract
  # syntax tree. Any token left over after the expression is an error.
  parser = Parser(tokenizer(input_text))
  tree = parser.parse()
  if parser.current_token is None:
    return tree
  raise SyntaxError(f"Unexpected token {parser.current_token} at {parser.index}, in parse_input(). Expected end of input.")

def testParser(input_text, showOperType=False):
  # Parse input_text and print the tree in Polish notation; with
  # showOperType=True each label is followed by its token type.
  # Tokenizer and parser errors are printed instead of being propagated.
  try:
    tree = parse_input(input_text)
  except ValueError as err:
    print(f"ValueError: {err}")
    return
  except SyntaxError as err:
    print(f"SyntaxError: {err}")
    return

  print(tree.build_polish_notation(showOperType))

In [11]:
# Nested function calls (f, sin, max) combined with a unary minus and
# stacked postfix operators; the tree is printed in Polish notation.
testParser("a + f(sin(a),max(b, -a/b'! + 3), 12)")

+ a f sin a max b + - / a ! ' b 3 12


In [12]:
# Some illegal expressions:
#   1) f is ternary but is called with only two arguments
#   2) the trailing comma makes the parser look for another argument, so
#      atom() fails on ')'
testParser("a + f(a, b)")
testParser("a + f(a, b, )")

SyntaxError: Function f expects 3 arguments, but 2 were given
SyntaxError: Expected numeral or identifier at 8, in atom(), but ) (rparen) is given.
