-
Notifications
You must be signed in to change notification settings - Fork 3
/
sexpr.py
145 lines (114 loc) · 5.22 KB
/
sexpr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Yoav Goldberg <yoavg@cs.bgu.ac.il>
# Steven Bird <sb@csse.unimelb.edu.au> (minor edits)
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
"""
S-Expression Tokenizer
``SExprTokenizer`` is used to find parenthesized expressions in a
string. In particular, it divides a string into a sequence of
substrings that are either parenthesized expressions (including any
nested parenthesized expressions), or other whitespace-separated
tokens.
>>> from nltk.tokenize import SExprTokenizer
>>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
['(a b (c d))', 'e', 'f', '(g)']
By default, `SExprTokenizer` will raise a ``ValueError`` exception if
used to tokenize an expression with non-matching parentheses:
>>> SExprTokenizer().tokenize('c) d) e (f (g')
Traceback (most recent call last):
...
ValueError: Un-matched close paren at char 1
The ``strict`` argument can be set to False to allow for
non-matching parentheses. Any unmatched close parentheses will be
listed as their own s-expression; and the last partial sexpr with
unmatched open parentheses will be listed as its own sexpr:
>>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
['c', ')', 'd', ')', 'e', '(f (g']
The characters used for open and close parentheses may be customized
using the ``parens`` argument to the `SExprTokenizer` constructor:
>>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}')
['{a b {c d}}', 'e', 'f', '{g}']
The s-expression tokenizer is also available as a function:
>>> from nltk.tokenize import sexpr_tokenize
>>> sexpr_tokenize('(a b (c d)) e f (g)')
['(a b (c d))', 'e', 'f', '(g)']
"""
import re
from nltk.tokenize.api import TokenizerI
class SExprTokenizer(TokenizerI):
    """
    A tokenizer that divides strings into s-expressions.

    An s-expression can be either:

      - a parenthesized expression, including any nested parenthesized
        expressions, or
      - a sequence of non-whitespace non-parenthesis characters.

    For example, the string ``(a (b c)) d e (f)`` consists of four
    s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.

    By default, the characters ``(`` and ``)`` are treated as open and
    close parentheses, but alternative strings may be specified.

    :param parens: A two-element sequence specifying the open and close
        parentheses that should be used to find sexprs.  This will
        typically be either a two-character string, or a list of two
        strings.
    :type parens: str or list
    :param strict: If true, then raise an exception when tokenizing an
        ill-formed sexpr.
    :raise ValueError: if ``parens`` does not have exactly two elements.
    """

    def __init__(self, parens='()', strict=True):
        if len(parens) != 2:
            raise ValueError('parens must contain exactly two strings')
        self._strict = strict
        self._open_paren = parens[0]
        self._close_paren = parens[1]
        # Both delimiters are escaped so that characters with a special
        # meaning in regular expressions (such as "(" or "{") are matched
        # literally.
        self._paren_regexp = re.compile('%s|%s' % (re.escape(parens[0]),
                                                   re.escape(parens[1])))

    def tokenize(self, text):
        """
        Return a list of s-expressions extracted from *text*.
        For example:

            >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
            ['(a b (c d))', 'e', 'f', '(g)']

        Text that lies outside of any parentheses is split on
        whitespace, including text that follows the last parenthesized
        expression:

            >>> SExprTokenizer().tokenize('(a b) c d')
            ['(a b)', 'c', 'd']

        All parentheses are assumed to mark s-expressions.
        (No special processing is done to exclude parentheses that occur
        inside strings, or following backslash characters.)

        If the given expression contains non-matching parentheses,
        then the behavior of the tokenizer depends on the ``strict``
        parameter to the constructor.  If ``strict`` is ``True``, then
        raise a ``ValueError``.  If ``strict`` is ``False``, then any
        unmatched close parentheses will be listed as their own
        s-expression; and the last partial s-expression with unmatched open
        parentheses will be listed as its own s-expression:

            >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
            ['c', ')', 'd', ')', 'e', '(f (g']

        :param text: the string to be tokenized
        :type text: str
        :rtype: list(str)
        :raise ValueError: if ``strict`` is true and *text* contains an
            unmatched open or close parenthesis.
        """
        result = []
        pos = 0      # start of the text that has not been emitted yet
        depth = 0    # current parenthesis nesting level
        for m in self._paren_regexp.finditer(text):
            paren = m.group()
            if depth == 0:
                # Everything between the previous token and this paren
                # lies outside any sexpr: emit it as plain tokens.
                result += text[pos:m.start()].split()
                pos = m.start()
            if paren == self._open_paren:
                depth += 1
            # Tested independently of the open-paren check above (not
            # ``elif``), so a delimiter string that equals both the open
            # and the close paren is processed by both branches.
            if paren == self._close_paren:
                if self._strict and depth == 0:
                    raise ValueError('Un-matched close paren at char %d'
                                     % m.start())
                # In non-strict mode a stray close paren must not drive
                # the depth negative.
                depth = max(0, depth-1)
                if depth == 0:
                    result.append(text[pos:m.end()])
                    pos = m.end()
        if self._strict and depth > 0:
            raise ValueError('Un-matched open paren at char %d' % pos)
        if depth == 0:
            # Trailing text outside any parentheses consists of ordinary
            # tokens, so it is split on whitespace exactly like the text
            # found between parenthesized expressions.
            result += text[pos:].split()
        else:
            # Non-strict mode with unclosed parens: the partial sexpr that
            # starts at ``pos`` is emitted as a single token.
            result.append(text[pos:])
        return result
# Module-level convenience function: the bound ``tokenize`` method of a
# single SExprTokenizer instance created with the default arguments.
sexpr_tokenize = SExprTokenizer().tokenize
if __name__ == "__main__":
    # Running the module as a script executes its doctests; whitespace
    # differences between expected and actual output are ignored.
    from doctest import NORMALIZE_WHITESPACE, testmod

    testmod(optionflags=NORMALIZE_WHITESPACE)