Arith6 is an extension of Arith5.  

We add function symbols.

```perl
<expr> ::= (<term> | <nterm>) { ("+" | "-") <term> }
<nterm> ::= "-" { "-" } <term>
<term> ::= <factor> { ("*" | "/") <factor> }
<factor> ::= { <factor_exp> "^" } <factor_exp>
<factor_exp> ::= <factor_post> { ("!" | "'") }
<factor_post> ::= "(" <expr> ")"  | <func_call> | <atom>
<func_call> ::= <func_symb> '(' <expr> {',' <expr>} ')' 
  # 0-ary functions are not allowed
<atom> ::= <identifier> | <numeral>
<identifier> ::= <letter> { <letter> | <digit> }
<letter> ::= [a-zA-Z] 
<numeral> ::= <nonzero_digit> { <digit> }
<digit> ::= [0-9]
<nonzero_digit> ::= [1-9]
<func_symb> ::= "sin" | "cos" | "max" | "min" | "f"
  # arities are given in Token constructor
  # sin and cos are unary, max and min are binary, f is ternary
```

In [191]:
class Token:
  """A single lexical token: its text, its type and its parsing attributes.

  Attributes set by the constructor:
    value       the token text, exactly as given
    token_type  'comma', 'lparen', 'rparen', 'op_bin_1', 'op_bin_2',
                'op_bin_exp', 'op_postfix', 'numeral', 'func_symb'
                or 'identifier'
    precedence  an integer (larger binds tighter), or None for commas and
                parentheses
    arity       number of arguments for a function symbol, otherwise None

  There is one more token_type, 'op_unary_prefix', which is never assigned
  here. It is given to '-' during parsing; during tokenization '-' gets the
  provisional type 'op_bin_1'. A unary '-' has precedence 1, the same as the
  binary '-'. It can only appear as the first token in an expression:
    a * -b, a + -b, a ^ -b are all invalid.
    a * (-b), a + (-b), a ^ (-b) are all valid. f(-x, -y) is valid too.
  """

  def __init__(self, value):
    """Classify `value`; raise ValueError if it matches no token kind."""
    # You can put more function symbols here.
    # For example, 'tan': 1, 'log': 1, 'choose': 2, 'g': 2, 'f1': 1.
    FUNCTION_SYMBOLS = {'sin': 1, 'cos': 1, 'max': 2, 'min': 2, 'f': 3}

    # Operators and punctuation: text -> (token_type, precedence).
    fixed_symbols = {
      ',': ('comma', None),
      '+': ('op_bin_1', 1),      # weak precedence
      '-': ('op_bin_1', 1),
      '*': ('op_bin_2', 2),      # medium precedence
      '/': ('op_bin_2', 2),
      '(': ('lparen', None),
      ')': ('rparen', None),
      '!': ('op_postfix', 4),    # strongest precedence among operators
      "'": ('op_postfix', 4),
      '^': ('op_bin_exp', 3),    # strong precedence
    }

    self.value = value
    self.arity = None
    self.precedence = None

    # The input value is expected to be a valid token already.
    if value in fixed_symbols:
      self.token_type, self.precedence = fixed_symbols[value]
    elif value.isdecimal():
      self.token_type, self.precedence = 'numeral', 6
    elif value in FUNCTION_SYMBOLS:
      # Must be tested before the identifier case: function names also
      # look like identifiers.
      self.token_type, self.precedence = 'func_symb', 5
      self.arity = FUNCTION_SYMBOLS[value]
    elif value.isalnum() and value[0].isalpha():
      self.token_type, self.precedence = 'identifier', 6
    else:
      raise ValueError(f"'{value}' is invalid (Token)")

  def __str__(self):
    """Return 'value (token_type)', plus ' arity: n' for function symbols."""
    arity_part = '' if self.arity is None else f' arity: {self.arity}'
    return f'{self.value} ({self.token_type}){arity_part}'

In [192]:
# Test: build one Token of each kind and print it.
# Each line shows the value, the token type and, for function symbols, the arity.
print(Token(","))
print(Token("+"), Token("*"), sep=", ")
print(Token("("), Token(")"), sep=", ")
print(Token("13"))
print(Token("abc"), Token("a1"), sep=", ")
print(Token("!"), Token("'"), sep=", ")
print(Token("^"))
print(Token("sin"), Token("max"), Token("f"), sep=", ")

, (comma)
+ (op_bin_1), * (op_bin_2)
( (lparen), ) (rparen)
13 (numeral)
abc (identifier), a1 (identifier)
! (op_postfix), ' (op_postfix)
^ (op_bin_exp)
sin (func_symb) arity: 1, max (func_symb) arity: 2, f (func_symb) arity: 3


In [193]:
import re

def tokenizer(input_text):
  """Split input_text into a list of Token objects.

  The text is cut at word boundaries and at whitespace. Every resulting
  piece must be one of:
    - a run of operator / parenthesis / comma characters, which is further
      split into one token per character,
    - a numeral (decimal digits, not starting with '0'),
    - an identifier or function symbol (alphanumeric, starting with a letter).

  Raises ValueError for a piece that contains non-ASCII characters or that
  is none of the above. No syntax checking is done here.
  """
  symbol_chars = "+-*/()!'^,"

  # split the input text into pieces at word boundaries and whitespaces,
  # then strip each piece and drop the empty ones.
  # NOTE: flags must be passed by keyword. The third positional parameter of
  # re.split is maxsplit, so a positional re.ASCII would silently mean
  # "at most 256 splits" and leave the flag unset.
  pieces = [piece.strip()
            for piece in re.split(r"\b|\s", input_text, flags=re.ASCII)
            if piece.strip()]

  tokens = []
  for s in pieces: # s is a non-empty string without surrounding whitespace
    if not s.isascii():
      raise ValueError(f"'{s}' is invalid (non-ASCII)")

    only_symbols = set(s).issubset(symbol_chars)  # operators, parens, commas
    is_numeral = s.isdecimal() and s[0] != '0'
    is_name = s.isalnum() and s[0].isalpha()      # identifier or function symbol
    if not (only_symbols or is_numeral or is_name):
      raise ValueError(f"'{s}' is invalid (non-token)")

    if only_symbols and len(s) > 1:
      # consecutive operators arrive as one piece, e.g. ")*(" : one token each
      tokens.extend(Token(c) for c in s)
    else:
      tokens.append(Token(s))

  return tokens

In [194]:
def testTokenizer(input_text):
  """Tokenize input_text and print one token per line.

  If the tokenizer rejects the input, print its message prefixed with
  'Tokenizer: ' instead.
  """
  try:
    found = tokenizer(input_text)
  except ValueError as err:
    print(f"Tokenizer: {err}")
    return

  if found:
    print("\n".join(str(tok) for tok in found))

In [195]:
# A long mixed expression. The tokenizer only classifies tokens and does no
# syntax checking, so the extra ')' before "+a23" is accepted here.
testTokenizer("max(a/b!+ min(n,k), 10) + b1^b'*(-c1!)* (hello - cos(pi)))+a23")

max (func_symb) arity: 2
( (lparen)
a (identifier)
/ (op_bin_2)
b (identifier)
! (op_postfix)
+ (op_bin_1)
min (func_symb) arity: 2
( (lparen)
n (identifier)
, (comma)
k (identifier)
) (rparen)
, (comma)
10 (numeral)
) (rparen)
+ (op_bin_1)
b1 (identifier)
^ (op_bin_exp)
b (identifier)
' (op_postfix)
* (op_bin_2)
( (lparen)
- (op_bin_1)
c1 (identifier)
! (op_postfix)
) (rparen)
* (op_bin_2)
( (lparen)
hello (identifier)
- (op_bin_1)
cos (func_symb) arity: 1
( (lparen)
pi (identifier)
) (rparen)
) (rparen)
) (rparen)
+ (op_bin_1)
a23 (identifier)


In [196]:
# Some invalid inputs: an identifier must start with a letter ('1hello'),
# and a numeral must not start with '0' ('023').
testTokenizer("First + Second2* +1hello + 23+2")
testTokenizer("first + second*-hello + 023+2")

Tokenizer: '1hello' is invalid (non-token)
Tokenizer: '023' is invalid (non-token)


In [197]:
class Node:
  """A node of the abstract syntax tree.

  A node is labeled with a token object (anything providing `value`,
  `token_type` and `precedence`) and keeps an ordered list of child nodes.
  Leaves are numerals and identifiers; inner nodes are operators and
  function symbols.
  """

  def __init__(self, token, children=None):
    self.token = token # the node is labeled with a Token object
    self.children = children if children else [] # list of Node objects

  def __str__(self):
    return self.build_polish_notation()

  def _label(self, opt):
    # Text for this node alone; with opt=True the token type is appended.
    if opt:
      return f"{self.token.value}({self.token.token_type})"
    return f"{self.token.value}"

  def build_polish_notation(self, opt=False):
    """Return the tree in prefix order: the node first, then its children.

    With opt=True every symbol is followed by its token type in parentheses.
    """
    parts = [self._label(opt)]
    parts.extend(child.build_polish_notation(opt) for child in self.children)
    return ' '.join(parts)

  def build_RPN(self, opt=False):
    """Return the tree in postfix order: the children first, then the node.

    With opt=True every symbol is followed by its token type in parentheses.
    """
    parts = [child.build_RPN(opt) for child in self.children]
    parts.append(self._label(opt))
    return ' '.join(parts)

  def _leads_with_unary_minus(self):
    # True if the infix rendering of this subtree begins with a unary '-'.
    # Only precedence-1 nodes are rendered without parentheses around their
    # first operand, so it is enough to follow first children through them.
    node = self
    while node.children and node.token.precedence == 1:
      if node.token.token_type == 'op_unary_prefix':
        return True
      node = node.children[0]
    return False

  def build_infix_latex(self):
    """Return the expression as an infix LaTeX string.

    The recursion is  root sub1 sub2 => (sub1) root (sub2),  but parentheses
    are written only where they are needed. In *(+(a, b), *(c, d)) =>
    (a + b) * (c * d) we need the parentheses around a + b but not around
    c * d, because * has higher precedence than +.

    Function names longer than one character are wrapped in \\operatorname{},
    and the exponent of '^' is wrapped in braces.
    """
    if not self.children: # leaf node, i.e. a numeral or an identifier
      return self.token.value

    token = self.token
    kids = self.children
    texts = [kid.build_infix_latex() for kid in kids]

    def paren(text):
      return '(' + text + ')'

    if token.token_type == 'func_symb':
      label = token.value
      if len(label) > 1:
        label = r'\operatorname{' + label + '}'
      return label + '(' + ', '.join(texts) + ')'

    # From here on, token is an operator symbol.
    if token.precedence == 1: # op_unary_prefix or op_bin_1
      if token.token_type == 'op_unary_prefix':
        operand = texts[0]
        if kids[0].token.precedence == 1:
          operand = paren(operand)
        return token.value + operand
      # op_bin_1
      left, right = texts
      right_kid = kids[1]
      # a - (b + c) needs its parentheses, a + (b - c) does not.
      # A right operand that begins with a unary minus always needs them:
      # "a + -b" is not a valid expression, "a + (-b)" is.
      if (token.value == '-' and right_kid.token.precedence == 1) \
          or right_kid._leads_with_unary_minus():
        right = paren(right)
      return left + ' ' + token.value + ' ' + right

    if token.precedence == 2: # op_bin_2
      left, right = texts
      left_kid, right_kid = kids
      if left_kid.token.precedence < token.precedence:
        left = paren(left)
      # a / (b * c) and a / (b / c) keep their parentheses; a * (b / c) does not.
      if right_kid.token.precedence < token.precedence or \
          (token.value == '/' and right_kid.token.precedence == 2):
        right = paren(right)
      return left + ' ' + token.value + ' ' + right

    if token.precedence == 3: # op_bin_exp
      base, exponent = texts
      if kids[0].token.precedence <= token.precedence:
        # <= instead of < because of right-associativity
        base = paren(base)
      # In a^(b+c), the braces already group b+c, so no parentheses are added.
      return base + ' ' + token.value + ' ' + '{' + exponent + '}'

    # precedence == 4: must be of type op_postfix
    operand = texts[0]
    if kids[0].token.precedence < token.precedence:
      operand = paren(operand)
    return operand + token.value

In [198]:
class Parser:
  """Recursive-descent parser that turns a list of tokens into a Node tree.

  There is one method per grammar rule (expr, nterm, term, factor,
  factor_exp, factor_postfix, func_call, atom). Each method consumes the
  tokens of its rule starting at self.current_token and returns the Node
  built from them. Syntax problems are reported by raising SyntaxError.

  Attributes:
    tokens         the token list being parsed
    index          position of the current token in tokens
    current_token  tokens[index], or None once the input is exhausted
  """

  def __init__(self, tokens):
    self.tokens = tokens
    self.current_token = None
    self.index = -1
    self.advance()  # set self.current_token to
                    # the first(i.e. self.index=0) element of tokens

  def advance(self):
    """Move to the next token; current_token becomes None past the end."""
    self.index += 1
    if self.index < len(self.tokens):
      self.current_token = self.tokens[self.index]
    else:
      self.current_token = None

  def check_token_type(self, token_types):
    """Tell whether the current token has one of the given types.

    token_types is a single type name or a tuple of type names.
    Returns False at end of input.
    """
    token = self.current_token
    if token is None:
      return False
    if isinstance(token_types, tuple):
      return token.token_type in token_types
    return token.token_type == token_types

  def check_token_value(self, token_value):
    """Tell whether the current token's text equals token_value."""
    token = self.current_token
    if token is None:
      return False
    return token.value == token_value

  def parse(self):
    """Parse from the starting symbol <expr> and return the root Node."""
    return self.expr()

  def expr(self):
    """<expr> ::= (<term> | <nterm>) { ("+" | "-") <term> }"""
    if self.check_token_value('-'): # unary minus
      node = self.nterm() # negative term
    else: # ordinary term
      node = self.term()

    while self.check_token_type('op_bin_1'): # '+' or '-'
      # If we are at '+' in "a + b * c - ..." then after self.advance() and
      # self.term() the current token is the '-'.
      token = self.current_token
      self.advance()
      right_term = self.term()
      node = Node(token, [node, right_term]) # left associative

    return node

  def nterm(self):
    """<nterm> ::= "-" { "-" } <term>"""
    token = self.current_token
    # On the first visit token.value == '-' is guaranteed because expr()
    # has checked it. On the recursive calls below it can be otherwise,
    # and then the term after the minus signs is parsed.
    if token is None or token.value != '-':
      return self.term()

    # The tokenizer typed this '-' as a binary operator; retype it in place.
    token.token_type = 'op_unary_prefix'
    self.advance()
    operand = self.nterm() # recursive call
    return Node(token, [operand])

  def term(self):
    """<term> ::= <factor> { ("*" | "/") <factor> }"""
    node = self.factor()

    while self.check_token_type('op_bin_2'): # '*' or '/'
      token = self.current_token
      self.advance()
      right_factor = self.factor()
      node = Node(token, [node, right_factor]) # left associative

    return node

  def factor(self):
    """<factor> ::= { <factor_exp> "^" } <factor_exp>"""
    node = self.factor_exp()

    if self.check_token_type('op_bin_exp'): # '^'
      token = self.current_token
      self.advance()
      right_factor = self.factor() # recursive call for right associativity
      node = Node(token, [node, right_factor])

    return node

  def factor_exp(self):
    """<factor_exp> ::= <factor_post> { ("!" | "'") }"""
    node = self.factor_postfix()

    while self.check_token_type('op_postfix'):
      token = self.current_token
      self.advance()
      node = Node(token, [node])

    return node

  def factor_postfix(self):
    """<factor_post> ::= "(" <expr> ")" | <func_call> | <atom>"""
    if self.check_token_type('func_symb'):
      return self.func_call()
    if not self.check_token_type('lparen'):
      return self.atom()

    self.advance() # skip '('
    node = self.expr()
    if not self.check_token_type('rparen'):
      raise SyntaxError(f"Expected ')' after expression at {self.index} " +
                        f"in factor_postfix(), but {self.current_token} is given.")
    self.advance() # skip ')'
    return node

  def func_call(self):
    """<func_call> ::= <func_symb> '(' <expr> {',' <expr>} ')'

    The number of arguments must equal the arity of the function symbol.
    """
    token = self.current_token
    if token is None:
      raise SyntaxError("Unexpected end of input, in func_call()")
    if not self.check_token_type('func_symb'):
      raise SyntaxError(f"Expected function symbol at {self.index} in" +
                        f" func_call(), but {token} is given.")

    self.advance()
    if not self.check_token_type('lparen'):
      raise SyntaxError(f"Expected '(' after function symbol at {self.index}" +
                        f" in func_call(), but {self.current_token} is given.")
    self.advance()

    # At least one argument is required; each one must be followed by
    # ',' (another argument follows) or ')' (end of the list).
    args = []
    while True:
      args.append(self.expr())
      if self.check_token_type('comma'):
        self.advance()
      elif self.check_token_type('rparen'):
        break
      else:
        raise SyntaxError(f"Expected ',' or ')' after function argument at " +
                          f"{self.index} in func_call(), " +
                          f"but {self.current_token} is given.")

    # arity check
    if token.arity is None or token.arity != len(args):
      raise SyntaxError(f"Function {token.value} expects {token.arity} " +
                        f"arguments, but {len(args)} were given")

    self.advance() # skip ')'
    return Node(token, args)

  def atom(self):
    """<atom> ::= <identifier> | <numeral>"""
    token = self.current_token
    if token is None:
      raise SyntaxError("Unexpected end of input, in atom()")
    if not self.check_token_type(('numeral', 'identifier')):
      raise SyntaxError(f"Expected numeral or identifier at {self.index}," +
                        f" in atom(), but {token} is given.")
    self.advance()
    return Node(token)
    

In [199]:
def parse_text(input_text):
  """Tokenize and parse input_text; return the root of the syntax tree.

  Raises SyntaxError if tokens are left over after a complete expression
  has been parsed. Errors raised by the tokenizer and the parser pass
  through unchanged.
  """
  parser = Parser(tokenizer(input_text))
  ast = parser.parse() # ast = Abstract Syntax Tree
  leftover = parser.current_token
  if leftover is None:
    return ast
  raise SyntaxError(f"Unexpected token {leftover} at {parser.index}," +
                    f" in parse_text(). Expected end of input.")

def testParser(input_text, showOption='polish', showOperType=False):
  """Parse input_text and show the result, or print the error.

  showOption ::= 'polish' | 'RPN' | 'infix_latex' | 'tree'
    'polish'       print prefix notation
    'RPN'          print postfix notation
    'infix_latex'  print the LaTeX string and display it rendered
    'tree'         accepted, but nothing is shown (drawing is not implemented)
  showOperType has effect only when showOption == 'polish' or 'RPN'.
  """
  from IPython.display import display, Math

  try:
    tree = parse_text(input_text)
  except ValueError as err:
    print(f"ValueError: {err}")
    return
  except SyntaxError as err:
    print(f"SyntaxError: {err}")
    return

  if showOption == 'polish':
    print(tree.build_polish_notation(showOperType))
  elif showOption == 'RPN':
    print(tree.build_RPN(showOperType))
  elif showOption == 'infix_latex':
    s = tree.build_infix_latex()
    print(s)
    display(Math(f"${s}$"))
  # showOption == 'tree': nothing to do yet (no tree drawing available)
  

In [200]:
# The default output is Polish (prefix) notation.
# showOperType=True appends the token type to every symbol.
testParser("a + b * c - d")
testParser("a + b * c - d", showOperType=True)
testParser("a + f(sin(a),max(b, -a/b'! + 3), 12)")

- + a * b c d
-(op_bin_1) +(op_bin_1) a(identifier) *(op_bin_2) b(identifier) c(identifier) d(identifier)
+ a f sin a max b + - / a ! ' b 3 12


In [201]:
# A leading unary minus applies to the whole term: -a*b is parsed as -(a*b).
# Parentheses are needed to negate a alone.
testParser('-a*b')
testParser('(-a)*b')

- * a b
* - a b


In [202]:
# The same expressions in reverse Polish notation, then as infix LaTeX.
# The redundant parentheses in the last input do not change the LaTeX output.
testParser("a + b * c - d", showOption='RPN')
testParser("a + b * c - d", showOption='RPN', showOperType=True)
testParser("a + f(sin(a),max(b, -a/b'! + 3), 12)", showOption='RPN')
testParser("a + f(sin(a),max(b, -a/b'! + 3), 12)", showOption='infix_latex')
testParser("a + f(sin(a),max(b, (-a/b'! + 3)), 12)", showOption='infix_latex')

a b c * + d -
a(identifier) b(identifier) c(identifier) *(op_bin_2) +(op_bin_1) d(identifier) -(op_bin_1)
a a sin b a b ' ! / - 3 + max 12 f +
a + f(\operatorname{sin}(a), \operatorname{max}(b, -a / b'! + 3), 12)


<IPython.core.display.Math object>

a + f(\operatorname{sin}(a), \operatorname{max}(b, -a / b'! + 3), 12)


<IPython.core.display.Math object>

In [203]:
# Some illegal expressions: f is ternary but gets two arguments,
# and a trailing comma with no argument after it.
testParser("a + f(a, b)")
testParser("a + f(a, b, )")

SyntaxError: Function f expects 3 arguments, but 2 were given
SyntaxError: Expected numeral or identifier at 8, in atom(), but ) (rparen) is given.


In [204]:
# Chains of '+' and of '*' print without parentheses however they were grouped.
# For '^', only the left-grouped (a ^ b) ^ c keeps its parentheses.
testParser("a + b + c", showOption='infix_latex')
testParser("(a + b) + c", showOption='infix_latex')
testParser("a + (b + c)", showOption='infix_latex') 
testParser("a * b * c", showOption='infix_latex') 
testParser("a ^ b ^ c", showOption='infix_latex') 
testParser("(a ^ b) ^ c", showOption='infix_latex') 
testParser("a ^ (b ^ c)", showOption='infix_latex') 



a + b + c


<IPython.core.display.Math object>

a + b + c


<IPython.core.display.Math object>

a + b + c


<IPython.core.display.Math object>

a * b * c


<IPython.core.display.Math object>

a ^ {b ^ {c}}


<IPython.core.display.Math object>

(a ^ {b}) ^ {c}


<IPython.core.display.Math object>

a ^ {b ^ {c}}


<IPython.core.display.Math object>

In [205]:
# Under a unary minus, only a '+' or '-' operand keeps its parentheses.
# Operands built with '*', '/' or '^' are printed without them.
testParser("-(a + b)", showOption='infix_latex')
testParser("-(a - b)", showOption='infix_latex')
testParser("-(a * b)", showOption='infix_latex')
testParser("-(a / b)", showOption='infix_latex')
testParser("-(a ^ b)", showOption='infix_latex')


-(a + b)


<IPython.core.display.Math object>

-(a - b)


<IPython.core.display.Math object>

-a * b


<IPython.core.display.Math object>

-a / b


<IPython.core.display.Math object>

-a ^ {b}


<IPython.core.display.Math object>

In [206]:
# A '+'/'-' expression on the right of '-' keeps its parentheses.
# On the right of '+' they are dropped.
testParser("a - (b - c)", showOption='infix_latex')
testParser("a - (b + c)", showOption='infix_latex')
testParser("a + (b - c)", showOption='infix_latex')
testParser("a + (b + c)", showOption='infix_latex')

a - (b - c)


<IPython.core.display.Math object>

a - (b + c)


<IPython.core.display.Math object>

a + b - c


<IPython.core.display.Math object>

a + b + c


<IPython.core.display.Math object>

In [177]:
# A '*' or '/' expression on the right of '+' or '-' never needs parentheses.
testParser("a - (b * c)", showOption='infix_latex')
testParser("a - (b / c)", showOption='infix_latex')
testParser("a + (b * c)", showOption='infix_latex')
testParser("a + (b / c)", showOption='infix_latex')

a - b * c


<IPython.core.display.Math object>

a - b / c


<IPython.core.display.Math object>

a + b * c


<IPython.core.display.Math object>

a + b / c


<IPython.core.display.Math object>

In [207]:
# A '+' or '-' expression on the left of '*' or '/' keeps its parentheses.
testParser("(a - b) * c", showOption='infix_latex')
testParser("(a - b) / c", showOption='infix_latex')
testParser("(a + b) * c", showOption='infix_latex')
testParser("(a + b) / c", showOption='infix_latex')

(a - b) * c


<IPython.core.display.Math object>

(a - b) / c


<IPython.core.display.Math object>

(a + b) * c


<IPython.core.display.Math object>

(a + b) / c


<IPython.core.display.Math object>

In [208]:
# Around '*', parentheses on a '*' or '/' operand are dropped on either side.
testParser("a * b / c", showOption='infix_latex')
testParser("(a * b) / c", showOption='infix_latex')
testParser("a * (b / c)", showOption='infix_latex')
testParser("a * (b * c)", showOption='infix_latex')

a * b / c


<IPython.core.display.Math object>

a * b / c


<IPython.core.display.Math object>

a * b / c


<IPython.core.display.Math object>

a * b * c


<IPython.core.display.Math object>

In [180]:
# A '/' expression on the right of '/' keeps its parentheses.
testParser("a / (b / c)", showOption='infix_latex')


a / (b / c)


<IPython.core.display.Math object>

In [210]:
# '/' is left-associative: a left-hand (a / b) loses its parentheses,
# while a right-hand '*' or '/' operand keeps them.
testParser("a / b / c", showOption='infix_latex')
testParser("(a / b) / c", showOption='infix_latex')
testParser("a / (b / c)", showOption='infix_latex')
testParser("a / (b * c)", showOption='infix_latex')


a / b / c


<IPython.core.display.Math object>

a / b / c


<IPython.core.display.Math object>

a / (b / c)


<IPython.core.display.Math object>

a / (b * c)


<IPython.core.display.Math object>

In [211]:
# The exponent of '^' is wrapped in braces, so a ^ (b + c) needs no parentheses.
# A compound base such as (b*c), or a negated postfix operand, is parenthesized.
testParser("a ^ b' * c", showOption='infix_latex')
testParser("(a ^ (-b)!) * c", showOption='infix_latex')
testParser("a ^ (b + c)", showOption='infix_latex')
testParser("(b*c)^ a", showOption='infix_latex')

a ^ {b'} * c


<IPython.core.display.Math object>

a ^ {(-b)!} * c


<IPython.core.display.Math object>

a ^ {b + c}


<IPython.core.display.Math object>

(b * c) ^ {a}


<IPython.core.display.Math object>