In [57]:
# Character classes recognised by the lexer. Every class is available both as a
# string and as a list of its single characters; get_next_state tests
# membership against the lists.

letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
operators = "+-*/=<>%"

# list(str) splits a string into its characters without leaving loop variables
# (letter / digit / operator) behind in the notebook namespace.
letters_list = list(letters)
digits_list = list(digits)
operators_list = list(operators)

# Separator characters; "~" is a sentinel that denotes the end of the input.
separators_list = ["(", ")", "{", "}", "[", "]", ",", ".", ":", ";", "~"]

# Reserved words of the language being tokenized.
keywords = (
    "int", "float", "bool", "True", "False",
    "if", "else", "then", "endif", "endelse",
    "while", "whileend", "do", "enddo", "for",
    "STDinput", "and", "or", "not",
)

# DFA states used by the lexer:
#    1 - starting state
#    2 - in identifier
#    3 - end of identifier
#    4 - in integer
#    5 - end of integer
#    6 - in decimal
#    7 - end of decimal
#    8 - in block comment
#    9 - end block comment
#   10 - operator
#   11 - separator
#
# Each row of `table` gives, for one state, the next state for every character
# class. Column order: letter, digit, space, "!", ".", operator, separator.
# "!" opens a block comment from the starting state (1 -> 8) and closes it
# again (8 -> 9).
#
# State 4 moves to state 6 on ".", so a number with a fractional part is
# scanned as a single decimal; without that transition state 6 is unreachable
# and no input can ever be classified as a decimal.
table = {  #  l   d   sp  !   .   op  sep
    1:  (2,  4,  1,  8,  11, 10, 11),
    2:  (2,  2,  3,  3,  3,  3,  3),
    3:  (1,  1,  1,  1,  1,  1,  1),
    4:  (5,  4,  5,  5,  6,  5,  5),
    5:  (1,  1,  1,  1,  1,  1,  1),
    6:  (7,  6,  7,  7,  7,  7,  7),
    7:  (1,  1,  1,  1,  1,  1,  1),
    8:  (8,  8,  8,  9,  8,  8,  8),
    9:  (1,  1,  1,  1,  1,  1,  1),
    10: (1,  1,  1,  1,  1,  1,  1),
    11: (1,  1,  1,  1,  1,  1,  1),
}

# States in which a complete lexeme has been recognised.
accepting_states = (3, 5, 7, 9, 10, 11)

In [58]:
# Known shortcomings:
# 1. Compound operators (e.g. >= or <=) are not recognized as a single token; each character is emitted as a separate OPERATOR token.

In [59]:
# Token name reported for each accepting state of the DFA.
_TOKEN_BY_STATE = {
    3: "IDENTIFIER",
    5: "INTEGER",
    7: "FLOAT",
    9: "BLOCK COMMENT",
    10: "OPERATOR",
    11: "SEPARATOR",
}


def get_token(accepting_state):
    """Return the token name that belongs to ``accepting_state``.

    States 3, 5, 7, 9, 10 and 11 map to a token name; for any other state the
    result is None.
    """
    return _TOKEN_BY_STATE.get(accepting_state)

In [60]:
def get_next_state(curr_state, char):
    """Return the state reached from ``curr_state`` after reading ``char``.

    ``char`` is classified into one of the seven columns of ``table`` (letter,
    digit, whitespace, "!", ".", operator, separator) and the matching entry
    of ``table[curr_state]`` is returned.

    Tab, newline and the other ASCII whitespace characters select the same
    column as a plain space, so multi-line input does not fall through the
    classification. A character that fits no class yields None; callers must
    check for that before using the result as a key into ``table``.
    """
    whitespace = (" ", "\t", "\n", "\r", "\f", "\v")

    # "!" and "." are tested before the operator / separator lists on purpose:
    # "." is also listed in separators_list but owns a dedicated column.
    if char in letters_list:
        column = 0
    elif char in digits_list:
        column = 1
    elif char in whitespace:
        column = 2
    elif char == "!":
        column = 3
    elif char == ".":
        column = 4
    elif char in operators_list:
        column = 5
    elif char in separators_list:
        column = 6
    else:
        return None
    return table[curr_state][column]

In [61]:
def lexer(input_str):
    """Tokenize ``input_str``; print every token and return them all.

    The input is scanned one character at a time with the DFA described by
    ``table``. Each recognised lexeme is printed as ``"<TOKEN>: <lexeme>"``
    followed by a blank line, and collected as a ``(token, lexeme)`` tuple.

    Returns:
        list of ``(token, lexeme)`` tuples in input order; empty for "".

    Raises:
        ValueError: on a character that belongs to no character class
            (outside a block comment), or when the input ends inside an
            unfinished lexeme such as an unterminated block comment.
    """
    start_state = 1
    comment_state = 8
    # The "end of identifier / integer / decimal" states are entered on the
    # character that FOLLOWS the lexeme. That character is not part of the
    # token and must be scanned again from the start state, otherwise the
    # "+" in "a+b" would be glued to "a" and never reported.
    lookahead_states = (3, 5, 7)

    tokens = []

    def emit(accepting_state, text):
        # Report one finished lexeme and remember it for the caller.
        token = get_token(accepting_state)
        print(token + ": " + text + "\n")
        tokens.append((token, text))

    state = start_state
    lexeme = ""
    i = 0
    while i < len(input_str):
        char = input_str[i]
        # Classify every whitespace character (tab, newline, ...) as a space.
        next_state = get_next_state(state, " " if char.isspace() else char)

        if next_state is None:
            # Unclassified character: harmless inside a block comment,
            # an error anywhere else.
            if state != comment_state:
                raise ValueError(
                    "Unrecognized character {!r} at index {}".format(char, i)
                )
            next_state = comment_state

        if next_state in lookahead_states:
            emit(next_state, lexeme)
            lexeme = ""
            state = start_state
            continue  # re-scan the same character; i is deliberately not advanced

        state = next_state
        if state != start_state:
            # Staying in the start state means whitespace between tokens,
            # which does not belong to any lexeme.
            lexeme += char
        if state in accepting_states:
            emit(state, lexeme)
            lexeme = ""
            state = start_state
        i += 1

    # End of input with a pending lexeme: feed the end-of-input sentinel so
    # the automaton can move into the matching accepting state.
    if lexeme != "":
        state = get_next_state(state, "~")
        if state not in accepting_states:
            raise ValueError(
                "Unexpected end of input while reading {!r}".format(lexeme)
            )
        emit(state, lexeme)

    return tokens

In [62]:
# Demo: run the lexer on a short string of space-separated words.
lexer("This is input")

IDENTIFIER: This 

IDENTIFIER: is 

IDENTIFIER: input

