In [None]:
from IPython.display import HTML
# Read the stylesheet inside a `with` block so that the file handle is closed
# deterministically instead of being left open until garbage collection.
with open('../style.css') as handle:
    css = handle.read()
HTML(css)  # last expression of the cell, hence shown as the cell's output

In [None]:
%load_ext nb_mypy

# An Interpreter for a Simple Programming Language

In this notebook we develop an interpreter for a small programming language.

In [None]:
import ply.lex as lex

In [None]:
# Names of all token types the scanner may produce.  The comment next to a
# name quotes the regular expression used by the corresponding `t_` rule.
tokens = [
    'NUMBER',       # r'0|[1-9][0-9]*'
    'IDENTIFIER',   # r'[a-zA-Z][a-zA-Z0-9_]*'
    'ASSIGN',       # r':='
    'EQ',           # r'=='
    'NE',           # r'!='
    'LE',           # r'<='
    'GE',           # r'>='
    'IF',           # keyword, has no regular expression of its own
    'WHILE',        # keyword, has no regular expression of its own
]

We allow both *single-line comments* and *multi-line comments*.
- The regular expression `/\*(.|\n)*?\*/` recognizes multi-line comments.
  Multi-line comments start with the string `/*` and end with the string `*/`.
  Note the use of the *non-greedy* quantifier `*?`.  If we have code like
  ```
  /* blah */ a := 1; /* blub */
  ```
  the greedy quantifier `*` would recognize the whole line as one comment.
- The regular expression `//.*` recognizes single-line comments.
  A single line comment starts with the string `//` and extends to the end of the line.

In [None]:
def t_COMMENT(t):
    r'/\*(.|\n)*?\*/|//.*'
    # Nothing is returned, so a comment never yields a token.  Only the line
    # counter is advanced by the number of newlines inside the comment text.
    newlines = t.value.count('\n')
    t.lexer.lineno = t.lexer.lineno + newlines

The token `NUMBER` specifies a natural number.

In [None]:
def t_NUMBER(t):
    r'0|[1-9][0-9]*'
    # Replace the matched digit string by the integer it denotes.
    digits = t.value
    t.value = int(digits)
    return t

In [None]:
# Regular expressions for the operators that consist of two characters.
# Each name is `t_` followed by the token type listed in `tokens`.
t_ASSIGN = r':='
t_EQ     = r'=='
t_NE     = r'!='
t_LE     = r'<='
t_GE     = r'>='

The *keywords* `'if'` and `'while'` have to be dealt with separately, as they are syntactically identical to identifiers. The dictionary `Keywords` shown below maps every keyword to its token type.

In [None]:
# Maps every keyword to its token type; the token type is simply the
# keyword written in upper case.
Keywords = {word: word.upper() for word in ('if', 'while')}

When an identifier is read, we first have to check whether the identifier is one of our *keywords*.  If so, we assign the corresponding token type that is stored in the dictionary `Keywords`.  Otherwise, the token type is set to `IDENTIFIER`.

In [None]:
def t_IDENTIFIER(t):
    r'[a-zA-Z][a-zA-Z0-9_]*'
    # Keywords match the identifier pattern as well, so the token type is
    # decided only after the lexeme has been read.
    if t.value in Keywords:
        t.type = Keywords[t.value]
    else:
        t.type = 'IDENTIFIER'
    return t

Operators consisting of a single character do not need an associated token type.
They are declared via the keyword `literals`.

In [None]:
# Every character of the string below is a single-character operator or
# punctuation symbol that is passed to the parser as a literal token.
literals = list('+-*/%(){};<>,')

The *white space* characters *blank*, *tabulator*, and *carriage return* are ignored. 

In [None]:
# Characters that the scanner skips silently.
t_ignore = (
    ' '    # blank
    '\t'   # tabulator
    '\r'   # carriage return
)

Syntactically, *newline* characters are ignored. However, we still need to keep track of them in order to know the current line number, which is used for error messages.

In [None]:
def t_newline(t):
    r'\n'
    # A newline produces no token; it is only counted so that error
    # messages can report the current line number.
    t.lexer.lineno = t.lexer.lineno + 1
    return None

Given a `token`, the function `find_column` returns the column where `token` starts.  This is possible because every token contains a reference to the current lexer as `token.lexer`, and this lexer in turn stores the string that is given to it via the reference `lexer.lexdata`.  Furthermore, `token.lexpos` is the number of characters that precede `token`.
The function `rfind` searches the string *in reverse*, so we find the last newline that precedes the current position.  If there is no such newline, `rfind` returns `-1`, so columns are counted starting from 1.

In [None]:
def find_column(token):
    """
    Return the column (counted from 1) at which `token` starts.

    `token.lexer.lexdata` is the complete text given to the scanner and
    `token.lexpos` is the number of characters that precede `token`.
    """
    preceding = token.lexer.lexdata[:token.lexpos]  # everything before the token
    last_newline = preceding.rfind('\n')            # -1 if the token is on the first line
    return token.lexpos - last_newline

The function `t_error` is called for any token `t` that cannot be scanned by the lexer.  In this case, `t.value[0]` is the first character that is not recognized by the scanner.  This character is discarded.  After that, scanning proceeds as if nothing had happened.

In [None]:
def t_error(t):
    # Report the offending character together with its position, then drop
    # exactly this one character so that scanning continues behind it.
    column = find_column(t)
    message = f"Illegal character '{t.value[0]}' in line {t.lineno}, column {column}."
    print(message)
    t.lexer.skip(1)

In [None]:
# NOTE(review): presumably a workaround for running PLY inside a notebook,
# where no `__file__` is defined for the module -- TODO confirm.
__file__ = 'main'

In [None]:
# Create the scanner.  This object is shared by the functions below, which
# reset its attribute `lineno` before scanning a new program.
lexer = lex.lex()

In [None]:
def test_scanner(file_name):
    """
    Print the program stored in the file `file_name`, followed by every
    token that the scanner produces for it.
    """
    with open(file_name, 'r') as handle:
        source = handle.read()
    print(source)
    lexer.input(source)
    lexer.lineno = 1              # line numbers start at 1 for every program
    while True:
        token = lexer.token()     # returns None once the input is exhausted
        if token is None:
            break
        print(token)

In [None]:
test_scanner('sum.sl')

In [None]:
test_scanner('factorial.sl')

In [None]:
import ply.yacc as yacc

Below is the grammar for a simple `C`-like language:
```
program
    : 𝜆 
    | stmnt program
    
stmnt 
    : IF '(' bool_expr ')' stmnt                 
    | WHILE '(' bool_expr ')' stmnt
    | '{' program '}' 
    | IDENTIFIER ':=' expr ';'  
    | expr ';'       

bool_expr 
    : expr '==' expr     
    | expr '!=' expr     
    | expr '<=' expr     
    | expr '>=' expr     
    | expr '<'  expr      
    | expr '>'  expr     
 
expr: expr '+' product                 
    | expr '-' product
    | product
              
product
    : product '*' factor               
    | product '/' factor
    | product '%' factor 
    | factor

factor
    : '(' expr ')' 
    | NUMBER
    | IDENTIFIER
    | IDENTIFIER '(' expr_list ')'

expr_list
    : 𝜆 
    | ne_expr_list

ne_expr_list
    : expr
    | expr ',' ne_expr_list
```

The *start variable* of our grammar is `program`.

In [None]:
# The start variable of the grammar.
start = 'program'

An example program that conforms to this grammar is stored in the file `sum.sl`.

In [None]:
!cat sum.sl

A program is a list of statements.

In [None]:
def p_program_lambda(p):
    "program : "
    # The empty program is a statement list without any statements.
    empty_program = ('.',)
    p[0] = empty_program

In [None]:
def p_program_more(p):
    "program : stmnt program"
    # Put the first statement in front of the statements of the remaining
    # program; rest[0] is the tag '.' and is therefore skipped.
    first, rest = p[1], p[2]
    p[0] = ('.', first) + rest[1:]

In [None]:
def p_stmnt_if(p):
    "stmnt : IF '(' bool_expr ')' stmnt"
    # Only the condition and the guarded statement are kept in the tree.
    condition, body = p[3], p[5]
    p[0] = ('if', condition, body)

def p_stmnt_while(p):
    "stmnt : WHILE '(' bool_expr ')' stmnt"
    # Only the loop condition and the loop body are kept in the tree.
    condition, body = p[3], p[5]
    p[0] = ('while', condition, body)
    
def p_stmnt_block(p):
    "stmnt : '{' program '}'"
    # A block is represented by the statement list it encloses.
    enclosed = p[2]
    p[0] = enclosed
    
def p_stmnt_assign(p):
    "stmnt : IDENTIFIER ASSIGN expr ';'"
    # Keep the name of the variable and the expression assigned to it.
    variable, value = p[1], p[3]
    p[0] = (':=', variable, value)

def p_stmnt_expr(p):
    "stmnt : expr ';'"
    # An expression used as a statement is wrapped into an 'expr' node.
    expression = p[1]
    p[0] = ('expr', expression)

In [None]:
def p_bool_expr_eq(p):
    "bool_expr : expr EQ expr"
    # Comparison node: (operator, left operand, right operand).
    left, right = p[1], p[3]
    p[0] = ('==', left, right)

def p_bool_expr_ne(p):
    "bool_expr : expr NE expr"
    # Comparison node: (operator, left operand, right operand).
    left, right = p[1], p[3]
    p[0] = ('!=', left, right)

def p_bool_expr_le(p):
    "bool_expr : expr LE expr"
    # Comparison node: (operator, left operand, right operand).
    left, right = p[1], p[3]
    p[0] = ('<=', left, right)
    
def p_bool_expr_ge(p):
    "bool_expr : expr GE expr"
    # Comparison node: (operator, left operand, right operand).
    left, right = p[1], p[3]
    p[0] = ('>=', left, right)
    
def p_bool_expr_lt(p):
    "bool_expr : expr '<' expr"
    # Comparison node: (operator, left operand, right operand).
    left, right = p[1], p[3]
    p[0] = ('<', left, right)

def p_bool_expr_gt(p):
    "bool_expr : expr '>' expr"
    # Comparison node: (operator, left operand, right operand).
    left, right = p[1], p[3]
    p[0] = ('>', left, right)

In [None]:
def p_expr_plus(p):
    "expr : expr '+' product"
    # Arithmetic node: (operator, left operand, right operand).
    left, right = p[1], p[3]
    p[0] = ('+', left, right)
    
def p_expr_minus(p):
    "expr : expr '-' product"
    # Arithmetic node: (operator, left operand, right operand).
    left, right = p[1], p[3]
    p[0] = ('-', left, right)

def p_expr_product(p):
    "expr : product"
    # Unit production: the tree of the product is passed on unchanged.
    product = p[1]
    p[0] = product
    
def p_product_times(p):
    "product : product '*' factor"
    # Arithmetic node: (operator, left operand, right operand).
    left, right = p[1], p[3]
    p[0] = ('*', left, right)
    
def p_product_divide(p):
    "product : product '/' factor"
    # Arithmetic node: (operator, left operand, right operand).
    left, right = p[1], p[3]
    p[0] = ('/', left, right)

def p_product_modulo(p):
    "product : product '%' factor"
    # Arithmetic node: (operator, left operand, right operand).
    left, right = p[1], p[3]
    p[0] = ('%', left, right)

def p_product_factor(p):
    "product : factor"
    # Unit production: the tree of the factor is passed on unchanged.
    factor = p[1]
    p[0] = factor

def p_factor_paren(p):
    "factor : '(' expr ')'"
    # Parentheses only group; they leave no trace in the tree.
    inner = p[2]
    p[0] = inner

def p_factor_number(p):
    "factor : NUMBER"
    # A number is represented by its own value.
    number = p[1]
    p[0] = number

def p_factor_identifier(p):
    "factor : IDENTIFIER"
    # A variable is represented by its name.
    name = p[1]
    p[0] = name

def p_factor_fct_call(p):
    "factor : IDENTIFIER '(' expr_list ')'"
    # Call node: ('call', function name, argument 1, ..., argument n).
    # The leading tag of the argument list is dropped.
    name = p[1]
    arguments = p[3][1:]
    p[0] = ('call', name) + arguments

In [None]:
def p_expr_list_empty(p):
    "expr_list : "
    # An empty argument list consists of the tag '.' only.
    no_arguments = ('.',)
    p[0] = no_arguments
     
def p_expr_more(p):
    "expr_list : ne_expr_list"
    # Unit production: the non-empty list is passed on unchanged.
    non_empty = p[1]
    p[0] = non_empty

def p_ne_expr_list_one(p):
    "ne_expr_list : expr"
    # A list with exactly one expression.
    only = p[1]
    p[0] = ('.', only)
    
def p_ne_expr_list_more(p):
    "ne_expr_list : expr ',' ne_expr_list"
    # Put the first expression in front of the remaining ones; rest[0] is
    # the tag '.' and is therefore skipped.
    first, rest = p[1], p[3]
    p[0] = ('.', first) + rest[1:]

In [None]:
def p_error(t):
    # Report a syntax error.  `t` is the offending token, or a false value
    # (None) when the error is detected at the end of the input.
    if t:
        # The column may only be computed inside this branch: at the end of
        # the input there is no token whose position could be looked up.
        column = find_column(t)
        print(f'Syntax error at token "{t.value}" in line {t.lineno}, column {column}.')
    else:
        print('Syntax error at end of input.')

In [None]:
# Generate the parser from the grammar rules defined above.
# NOTE(review): according to the PLY documentation, `write_tables=False`
# suppresses the table file and `debug=True` writes a log of the generated
# parser -- confirm against the installed PLY version.
parser = yacc.yacc(write_tables=False, debug=True)

The parser shown above will transform the program `sum.sl` into the *nested tuple* stored in the file `sum.ast`.

In [None]:
%run AST2Dot.ipynb

The function `parse` takes a `file_name` as its sole argument.  The file is read and parsed. 
The resulting parse tree is visualized using `graphviz`.  It is important to reset the
attribute `lineno` of the scanner, for otherwise error messages will not have the correct line numbers.

In [None]:
def parse(file_name):
    """
    Parse the program stored in the file `file_name`.

    The resulting nested tuple is printed and the result of applying
    `tuple2dot` to it is returned.
    """
    lexer.lineno = 1   # start counting lines afresh for this program
    with open(file_name, 'r') as handle:
        source = handle.read()
    tree = yacc.parse(source)
    print(tree)
    return tuple2dot(tree)

In [None]:
parse('sum.sl')

In [None]:
from typing import TypeVar
# `NestedTuple` is bound to a TypeVar first so that the name already exists
# when the recursive definition on the following line refers to it.
NestedTuple = TypeVar('NestedTuple')
# An abstract syntax tree: a number, a name, or a tuple of abstract syntax trees.
NestedTuple = int | str | tuple[NestedTuple, ...]
# The type of the values computed by the interpreter.
Number      = int | float

We need some forward declarations.

In [None]:
def execute(stmnt: NestedTuple, Values: dict[str, Number]) -> None:
    # Forward declaration only: this stub is replaced by the real
    # definition of `execute` further below.
    return None

In [None]:
def evaluate(expr: NestedTuple, Values: dict[str, Number]) -> Number:
    # Forward declaration only: this stub is replaced by the real
    # definition of `evaluate` further below.
    return None # type: ignore

In [None]:
def evaluate_bool(expr: NestedTuple, Values: dict[str, Number]) -> bool:
    # Forward declaration only: this stub is replaced by the real
    # definition of `evaluate_bool` further below.
    return None # type: ignore

The function `execute_tuple` takes two arguments:
- `StatementList` is a list of statements,
- `Values` is a dictionary assigning integer values to variable names.

The function executes the statements in `StatementList` one after the other.  If an assignment statement is executed,
the dictionary `Values` is updated.

In [None]:
def execute_tuple(StatementList: tuple[NestedTuple, ...], Values: dict[str, Number] | None = None) -> None:
    """
    Execute the statements in `StatementList` one after the other.

    `Values` maps variable names to their current values and is updated by
    the executed statements.  If `Values` is omitted, a fresh, empty
    dictionary is used for this call.
    """
    # A `{}` default would be created once and then shared by all calls that
    # omit `Values`, so variables would leak from one run into the next.
    if Values is None:
        Values = {}
    for stmnt in StatementList:
        execute(stmnt, Values)

The function `execute` takes two arguments:
- `stmnt` is a statement,
- `Values` is a dictionary assigning values to variable names.

The function executes the statement `stmnt`.  If an assignment statement is executed,
the dictionary `Values` is updated.

In [None]:
def execute(stmnt: NestedTuple, Values: dict[str, Number]) -> None:
    match stmnt:
        case ('.', *SL):
            execute_tuple(tuple(SL), Values)
        case (':=', var, value):
            Values[var] = evaluate(value, Values)
        case ('expr', expr):
            evaluate(expr, Values)
        case ('if', test, stmnt):
            if evaluate_bool(test, Values):
                execute(stmnt, Values)
        case ('while', test, stmnt):
            while evaluate_bool(test, Values):
                execute(stmnt, Values)
        case _:
            assert False, f'{stmnt} unexpected'

The function `evaluate_bool` takes two arguments:
- `expr` is a boolean expression,
- `Values` is a dictionary assigning integer values to variable names.

The function evaluates the given expression and returns this value.

In [None]:
def evaluate_bool(expr: NestedTuple, Values: dict[str, Number]) -> bool:
    match expr:
        case ('==', lhs, rhs):
            return evaluate(lhs, Values) == evaluate(rhs, Values)
        case ('!=', lhs, rhs):
            return evaluate(lhs, Values) != evaluate(rhs, Values)
        case ('<=', lhs, rhs):
            return evaluate(lhs, Values) <= evaluate(rhs, Values)
        case ('>=', lhs, rhs):
            return evaluate(lhs, Values) >= evaluate(rhs, Values)
        case ('<', lhs, rhs):
            return evaluate(lhs, Values) <  evaluate(rhs, Values)
        case ('>', lhs, rhs):
            return evaluate(lhs, Values) >  evaluate(rhs, Values)
        case _:
            assert False, f'{expr} unexpected'

The function `evaluate` takes two arguments:
- `expr` is an arithmetic expression, which may contain function calls,
- `Values` is a dictionary assigning integer values to variable names.

The function evaluates the given expression and returns this value.

In [None]:
def evaluate(expr: NestedTuple, Values: dict[str, Number]) -> Number:
    match expr:
        case int():
            return expr
        case str():
            return Values[expr] 
        case ('call', 'read'):
            return int(input('Please enter a natural number: '))
        case ('call', 'print', expr):
            print(evaluate(expr, Values))
            return 0;
        case ('+', lhs, rhs):
            return evaluate(lhs, Values) + evaluate(rhs, Values)
        case ('-', lhs, rhs):
            return evaluate(lhs, Values) - evaluate(rhs, Values)
        case ('*', lhs, rhs):
            return evaluate(lhs, Values) * evaluate(rhs, Values)
        case ('/', lhs, rhs):
            return evaluate(lhs, Values) / evaluate(rhs, Values)
        case ('%', lhs, rhs):
            return evaluate(lhs, Values) % evaluate(rhs, Values)
        case _:
            assert False, f'{expr} unexpected'

In [None]:
!cat sum.sl

In [None]:
def main(file):
    """
    Read the program stored in `file`, parse it, print the resulting
    nested tuple and execute it, starting with no variables defined.
    """
    with open(file, 'r') as handle:
        program = handle.read()
    # Reset the line counter of the scanner; otherwise the line numbers in
    # error messages continue from the previously processed program.
    lexer.lineno = 1
    stmnt = yacc.parse(program)
    print(stmnt)
    if stmnt is None:
        # No tree was produced, so there is nothing to execute; passing
        # None on would only fail inside `execute`.
        return
    Values: dict[str, Number] = {}
    execute(stmnt, Values)

In [None]:
main('sum.sl')

In [None]:
!cat factorial.sl

In [None]:
main('factorial.sl')