-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexertest.py
171 lines (144 loc) · 5.13 KB
/
lexertest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import re
class Token(object):
""" A simple Token structure. Token type, value and position.
"""
def __init__(self, type, val, pos):
self.type = type
self.val = val
self.pos = pos
def __str__(self):
return '%s(%s) at %s' % (self.type, self.val, self.pos)
class LexerError(Exception):
def __init__(self, pos):
self.pos = pos
class Lexer(object):
""" A simple regex-based lexer/tokenizer.
"""
def __init__(self, rules, skip_whitespace=True):
""" Create a lexer.
rules:
A list of rules. Each rule is a regex, type
pair, where regex is the regular expression used
to recognize the token and type is the type
of the token to return when it's recognized.
skip_whitespace:
If True, whitespace (\s+) will be skipped and not
reported by the lexer. Otherwise, you have to
specify your rules for whitespace, or it will be
flagged as an error.
"""
self.rules = []
for regex, type in rules:
self.rules.append((re.compile(regex), type))
self.skip_whitespace = skip_whitespace
self.re_ws_skip = re.compile('\S')
def input(self, buf):
""" Initialize the lexer with a buffer as input.
"""
self.buf = buf
self.pos = 0
def token(self):
""" Return the next token (a Token object) found in the
input buffer. None is returned if the end of the
buffer was reached.
In case of a lexing error (the current chunk of the
buffer matches no rule), a LexerError is raised with
the position of the error.
"""
if self.pos >= len(self.buf):
return None
if self.skip_whitespace:
m = self.re_ws_skip.search(self.buf, self.pos)
if m:
self.pos = m.start()
else:
return None
for regex, type in self.rules:
m = regex.match(self.buf, self.pos)
if m:
tok = Token(type, m.group(), self.pos)
self.pos = m.end()
return tok
# if we're here, no rule matched
raise LexerError(self.pos)
def tokens(self):
""" Returns an iterator to the tokens found in the buffer.
"""
while 1:
tok = self.token()
if tok is None: break
yield tok
rules = [
('\d+', 'NUMBER'),
('[a-zA-Z_]\w*', 'IDENTIFIER'),
('\+', 'PLUS'),
('\-', 'MINUS'),
('\*', 'MULTIPLY'),
('\/', 'DIVIDE'),
('\(', 'LP'),
('\)', 'RP'),
('=', 'EQUALS'),
]
lx = Lexer(rules, skip_whitespace=True)
lx.input('erw = _abc + 12*(R4-623902) ')
try:
for tok in lx.tokens():
print(tok)
except LexerError as err:
print('LexerError at position %s' % err.pos)
def __init__(self, rules, skip_whitespace=True):
""" Create a lexer.
rules:
A list of rules. Each rule is a regex, type
pair, where regex is the regular expression used
to recognize the token and type is the type
of the token to return when it's recognized.
skip_whitespace:
If True, whitespace (\s+) will be skipped and not
reported by the lexer. Otherwise, you have to
specify your rules for whitespace, or it will be
flagged as an error.
"""
# All the regexes are concatenated into a single one
# with named groups. Since the group names must be valid
# Python identifiers, but the token types used by the
# user are arbitrary strings, we auto-generate the group
# names and map them to token types.
#
idx = 1
regex_parts = []
self.group_type = {}
for regex, type in rules:
groupname = 'GROUP%s' % idx
regex_parts.append('(?P<%s>%s)' % (groupname, regex))
self.group_type[groupname] = type
idx += 1
self.regex = re.compile('|'.join(regex_parts))
self.skip_whitespace = skip_whitespace
self.re_ws_skip = re.compile('\S')
def token(self):
""" Return the next token (a Token object) found in the
input buffer. None is returned if the end of the
buffer was reached.
In case of a lexing error (the current chunk of the
buffer matches no rule), a LexerError is raised with
the position of the error.
"""
if self.pos >= len(self.buf):
return None
else:
if self.skip_whitespace:
m = self.re_ws_skip.search(self.buf, self.pos)
if m:
self.pos = m.start()
else:
return None
m = self.regex.match(self.buf, self.pos)
if m:
groupname = m.lastgroup
tok_type = self.group_type[groupname]
tok = Token(tok_type, m.group(groupname), self.pos)
self.pos = m.end()
return tok
# if we're here, no rule matched
raise LexerError(self.pos)