# Lexical Analysis | 词法分析

In [1]:
import re

## Token Codes for Each Kind of Word | 各种单词符号对应的种别码

### Keyword/Reserved Word | 关键字/保留字

In [2]:
# Reserved words of the source language, each mapped to its token code (1-6).
# Insertion order is kept so the displayed key list stays in code order.
KEYWORD = {
    'begin': 1, 'if': 2, 'then': 3,
    'while': 4, 'do': 5, 'end': 6,
}
KEYWORD.keys()

dict_keys(['begin', 'if', 'then', 'while', 'do', 'end'])

### Identifier | 标识符

In [3]:
# Token code that `lexer` assigns to identifiers (a letter followed by letters/digits).
ID = 10

### Integer | 整数

In [4]:
# Token code that `lexer` assigns to all-digit words (integer literals).
INT = 11

### Symbol | 符号

In [5]:
# Operators and delimiters, keyed by their exact spelling, mapped to token codes.
SYMBOL = {
    # arithmetic operators
    '+': 13, '-': 14, '*': 15, '/': 16,
    # colon and the two-character ':=' symbol
    ':': 17, ':=': 18,
    # comparison operators ('=' is presumably equality, since ':=' is a separate symbol)
    '<': 20, '<>': 21, '<=': 22,
    '>': 23, '>=': 24, '=': 25,
    # delimiters
    ';': 26, '(': 27, ')': 28,
    # '#' gets code 0 (it is the last token of the test samples in this notebook)
    '#': 0,
}
SYMBOL.keys()

dict_keys(['+', '-', '*', '/', ':', ':=', '<', '<>', '<=', '>', '>=', '=', ';', '(', ')', '#'])

## Regular Expression | 正则表达式

In [6]:
# Token classes in matching priority order: the combined pattern tries the
# alternatives left to right, so earlier entries win over later ones.
tokens = {
    # Any alphanumeric run; classified later as keyword, identifier, integer or illegal word.
    'WORD': r'[a-zA-Z0-9]+',
    # Two-character symbols are listed before the single-character class so they match first.
    'SYMBOL': r':=|<>|<=|>=|[+\-*/:<>=;()#]',
    'NEW_LINE': r'\n',
    'WHITE_SPACE': r'[ \t]+',
    # Catch-all for any other single character (except newline, which '.' does not match).
    'ILL_CHAR': r'.',
}

# One alternation of named groups, so a match reports which class it belongs to.
any_token = '|'.join(
    '(?P<%s>%s)' % (name, pattern)
    for name, pattern in tokens.items()
)
any_token

'(?P<WORD>[a-zA-Z0-9]+)|(?P<SYMBOL>:=|<>|<=|>=|[+\\-*/:<>=;()#])|(?P<NEW_LINE>\\n)|(?P<WHITE_SPACE>[ \\t]+)|(?P<ILL_CHAR>.)'

## Lexer Function | 词法分析函数

In [7]:
def lexer(src):
    """Lazily split `src` into tokens, yielding one `(code, value)` tuple each.

    `code` is the numeric token code taken from `KEYWORD`, `SYMBOL`, `ID` or
    `INT`. `value` is the matched text, except that all-digit words are
    converted to `int`. WHITE_SPACE and NEW_LINE matches produce no token;
    a NEW_LINE only advances the line counter used in error messages.

    Because this is a generator, errors surface during iteration, after all
    earlier tokens have already been yielded.

    Raises:
        RuntimeError: for a word that is neither a keyword, an identifier nor
            all digits (e.g. digits followed by letters), or for a character
            matched only by the ILL_CHAR catch-all.
    """
    line_no = 1
    for match in re.finditer(any_token, src):
        group_name = match.lastgroup  # name of the alternative in `any_token` that matched
        lexeme = match.group()

        # Layout produces no token.
        if group_name == 'WHITE_SPACE':
            continue
        if group_name == 'NEW_LINE':
            line_no += 1
            continue

        if group_name == 'ILL_CHAR':
            raise RuntimeError('Illegal character \'%c\' on line %s' % (lexeme, line_no))

        if group_name == 'SYMBOL':
            code = SYMBOL[lexeme]
        elif group_name == 'WORD':
            # Keywords are checked first: they would also match the identifier pattern.
            if lexeme in KEYWORD:
                code = KEYWORD[lexeme]
            elif re.fullmatch(r'[a-zA-Z][a-zA-Z0-9]*', lexeme):
                code = ID
            elif re.fullmatch(r'\d+', lexeme):
                code = INT
                lexeme = int(lexeme)  # integer tokens carry their numeric value
            else:
                raise RuntimeError('Illegal word "%s" on line %s' % (lexeme, line_no))

        yield (code, lexeme)

## Test Example | 测试样例

### Valid Sample | 正确样例

In [8]:
# Well-formed program: every word and symbol in it is accepted by `lexer`.
src_sample = '''begin
    x:=999;
    if x>998 then x:=2*x+1/3;
    if x<>9 then x:=(1>=0);
end #'''

# Print one `(code, value)` tuple per token.
for tup in lexer(src_sample):
    print(tup)

(1, 'begin')
(10, 'x')
(18, ':=')
(11, 999)
(26, ';')
(2, 'if')
(10, 'x')
(23, '>')
(11, 998)
(3, 'then')
(10, 'x')
(18, ':=')
(11, 2)
(15, '*')
(10, 'x')
(13, '+')
(11, 1)
(16, '/')
(11, 3)
(26, ';')
(2, 'if')
(10, 'x')
(21, '<>')
(11, 9)
(3, 'then')
(10, 'x')
(18, ':=')
(27, '(')
(11, 1)
(24, '>=')
(11, 0)
(28, ')')
(26, ';')
(6, 'end')
(0, '#')


### Illegal Character | 非法字符

In [9]:
# Program containing characters that belong to no token class ('_', '?', '[', ']').
src_sample_ill_char = '''begin
    x:=999;
    if _x>998 then ?x:=2*x+1/3;
    if []x<>9 then x:=(1>=0);
end #'''

# `lexer` is a generator, so the tokens preceding the first offending character
# are printed before the error surfaces. The expected RuntimeError is caught and
# displayed instead of being left to propagate, so that "Restart & Run All"
# continues on to the following cells.
try:
    for tup in lexer(src_sample_ill_char):
        print(tup)
except RuntimeError as err:
    print('%s: %s' % (type(err).__name__, err))

(1, 'begin')
(10, 'x')
(18, ':=')
(11, 999)
(26, ';')
(2, 'if')


RuntimeError: Illegal character '_' on line 3

### Illegal Word | 非法单词

In [10]:
# Program containing words that start with digits but continue with letters
# ('666x', '888x'): neither keywords, identifiers nor integers.
src_sample_ill_word = '''begin
    x:=999;
    if x>998 then 666x:=2*x+1/3;
    if 888x<>9 then x:=(1>=0);
end #'''

# `lexer` is a generator, so the tokens preceding the first illegal word are
# printed before the error surfaces. The expected RuntimeError is caught and
# displayed instead of being left to propagate, so the notebook still runs
# to completion under "Restart & Run All".
try:
    for tup in lexer(src_sample_ill_word):
        print(tup)
except RuntimeError as err:
    print('%s: %s' % (type(err).__name__, err))

(1, 'begin')
(10, 'x')
(18, ':=')
(11, 999)
(26, ';')
(2, 'if')
(10, 'x')
(23, '>')
(11, 998)
(3, 'then')


RuntimeError: Illegal word "666x" on line 3

## Reference | 参考资料

[re --- 正则表达式操作 — Python 3.7.3 文档](https://docs.python.org/zh-cn/3.7/library/re.html "re --- 正则表达式操作 — Python 3.7.3 文档")