# -*- coding: utf-8 -*-
"""
This module provides graphemic and orthographic tokenization using orthography profiles.
"""
# These lines were automatically added by the 3to2-conversion.
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Steven Moran"
__date__ = "2010-12-01"
import os
import logging
import codecs
import unicodedata
# basic lingpy imports
from ..settings import rcParams
from .. import log
from .. import util
try:
import regex as re
except ImportError:
import re
log.missing_module('regex')
class Tokenizer(object):
"""
Class for orthographic parsing using orthography profiles as designed for the QLC project.
Parameters
----------
orthography_profile : string (default = None)
        Filename of a document- or source-specific orthography profile (with an optional rules file).
Notes
-----
The Tokenizer reads in an orthography profile and calls a helper
class to build a tree data structure, which stores the possible Unicode
character combinations that are specified in the orthography profile
and appear in the data source.
For example, an orthography profile might specify that in source X
<uu> is a single grapheme (Unicode parlance: tailored grapheme) and
    therefore it should be chunked as such. Given an orthography profile and
some data to parse, the process would look like this:
input string example: uubo uubo
output string example: uu b o # uu b o
See also the tokenizer examples in lingpy/scripts/tokenize
Additionally, the Tokenizer provides functionality to transform graphemes
into associated character(s) specified in additional columns in the orthography
profile. A dictionary is created that keeps a mapping between source-specific
graphemes and their counterparts (e.g. an IPA column in the orthography profile).
The tokenizer can also be used for pure Unicode character and grapheme
tokenization, i.e. it uses the Unicode standard grapheme parsing rules, as
implemented in the Python regex package by Matthew Barnett, to do basic tokenization
with the "\X" grapheme regular expression match. This grapheme match
    combines one or more Combining Diacritical Marks with their base character.
These are called "Grapheme clusters" in Unicode parlance. With these functions
the Tokenizer is meant to do basic rudimentary parsing for things like getting
an additional unigram model (segments and their counts) in an input data source.
An additional method (in its infancy) called combine_modifiers handles the
case where there are Unicode Spacing Modifier Letters, which are not explicitly
    combined with their base characters in the Unicode Standard. These graphemes
are called "Tailored grapheme clusters" in Unicode. For more information
see the Unicode Standard Annex #29: Unicode Text Segmentation:
http://www.unicode.org/reports/tr29/
    Lastly, the Tokenizer can be used to apply transformations as specified in an
    orthography rules file. These transformations are specified in a separate
    file from the orthography profile (which specifies the document-specific graphemes,
    and possibly their IPA counterparts), and the orthography rules should
    be applied to the output of an OrthographyParser.
    In an orthography rules file, rules are given in order as regular
    expressions, e.g. this rule replaces a vowel followed by an <n>,
    followed by <space>, followed by a second vowel, with: first vowel
    <space> <n> <space> second vowel, e.g.:
    ([aáeéiíoóuú])(n)(\s)([aáeéiíoóuú]), \1 \2 \4
Examples
--------
>>> from lingpy.sequence.tokenizer import *
>>> t = Tokenizer("test.prf")
>>> word = "baach"
>>> print(t.characters(word))
b a a c h
>>> print(t.graphemes(word))
b aa ch
>>> print(t.transform(word, "ipa"))
b aː tʃ
"""
def __init__(self, orthography_profile=None):
if orthography_profile and not os.path.exists(orthography_profile):
raise ValueError("The orthography profile you specified does not exist!")
self.orthography_profile = orthography_profile
self.orthography_profile_rules = None
self.column_labels = None
# orthography profile processing
if self.orthography_profile:
# read in orthography profile and create a tree structure for tokenization
self.root = createTree(self.orthography_profile)
# store column labels from the orthography profile
self.column_labels = []
# look up table of graphemes to other column transforms
self.mappings = {}
# double check that there are no duplicate graphemes in the orthography profile
self.op_graphemes = {}
# process the orthography profiles and rules
self._init_profile(self.orthography_profile)
rules_path = os.path.splitext(self.orthography_profile)[0] + '.rules'
if os.path.isfile(rules_path):
self.orthography_profile_rules = rules_path
self.op_rules = []
self.op_replacements = []
self._init_rules(self.orthography_profile_rules)
else:
try:
import regex as re
except ImportError:
raise ImportError(
"Please install the `regex` module to use Tokenizer without an orthography_profile."
)
            self.grapheme_pattern = re.compile(r"\X", re.UNICODE)
log.debug("Orthography profile: %s" % self.orthography_profile)
log.debug("Orthography rules: %s" % self.orthography_profile_rules)
log.debug("Columns labels: %s" % self.column_labels)
def _init_profile(self, f):
# Process and initialize data structures given an orthography profile.
for line_count, line in enumerate(util.read_config_file(f, normalize='NFD')):
# deal with the columns header -- should always start with "graphemes" as per
# the orthography profiles specification
if line.lower().startswith("graphemes"):
column_tokens = line.split("\t")
# clean the header
for column_token in column_tokens:
self.column_labels.append(column_token.lower().strip())
continue
# split the orthography profile into columns
tokens = line.split("\t")
grapheme = tokens[0].strip()
            # check for duplicate graphemes in the orthography profile (fail on duplicates)
            if grapheme not in self.op_graphemes:
                self.op_graphemes[grapheme] = 1
            else:
                raise Exception("Duplicate grapheme '{0}' in the orthography profile.".format(grapheme))
if len(tokens) == 1:
continue
for i, token in enumerate(tokens):
token = token.strip()
self.mappings[grapheme, self.column_labels[i].lower()] = token
log.debug('%s %s' % (grapheme, self.column_labels[i].lower()))
        # print the tree structure if verbose logging (INFO or DEBUG) is enabled
        if log.get_logger().getEffectiveLevel() <= logging.INFO:
            log.debug("A graphical representation of your orthography profile as a tree ('*' denotes sentinels):\n")
printTree(self.root, "")
print()
def _init_rules(self, f):
# Process the orthography rules file.
for line in util.read_config_file(f, normalize='NFD'):
rule, replacement = line.split("\t")
            # strip any stray leading/trailing whitespace
            rule = rule.strip()
            replacement = replacement.strip()
self.op_rules.append(re.compile(rule))
self.op_replacements.append(replacement)
# check that num rules == num replacements; if not fail
if len(self.op_rules) != len(self.op_replacements):
raise ValueError("Number of inputs does not match number of outputs in the rules file.")
def characters(self, string):
"""
Given a string as input, return a space-delimited string of Unicode characters (code points rendered as glyphs).
Parameters
----------
string : str
            A Unicode string to be parsed into characters.
Returns
-------
result : str
String returned is space-delimited on Unicode characters and contains "#" to mark word boundaries.
The string is in NFD.
        Notes
        -----
        Input is first normalized to Unicode Normalization Form D (NFD).
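
        Examples
        --------
        (Using the hypothetical profile from the class docstring example.)
        >>> t = Tokenizer("test.prf")
        >>> print(t.characters("baach"))
        b a a c h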
"""
        string = string.replace(" ", "#")  # add boundaries between words
        string = unicodedata.normalize("NFD", string)
        return " ".join(string)
def grapheme_clusters(self, string):
"""
Given a string as input, return a space-delimited string of Unicode graphemes using the "\X" regular expression.
Parameters
----------
string : str
A Unicode string to be parsed into graphemes.
Returns
-------
result : str
String returned is space-delimited on Unicode graphemes and contains "#" to mark word boundaries.
The string is in NFD.
        Notes
        -----
        Input is first normalized to Unicode Normalization Form D (NFD).
        See Unicode Standard Annex #29: Unicode Text Segmentation:
        http://www.unicode.org/reports/tr29/
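
        Examples
        --------
        (Illustrative, and requires the `regex` package; note that the
        combining tilde stays attached to its base character.)
        >>> t = Tokenizer()
        >>> print(t.grapheme_clusters("mão"))
        m ã o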
"""
        # match Unicode extended grapheme clusters ("\X") and join them with spaces
return ' '.join(self.grapheme_pattern.findall(
unicodedata.normalize("NFD", string.replace(" ", "#"))))
def graphemes(self, string):
"""
        Tokenizes strings given an orthography profile that specifies graphemes in a source doculect.
Parameters
----------
string : str
The str to be parsed and formatted.
Returns
-------
result : str
            The parsed and QLC-formatted string.
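
        Examples
        --------
        (Using the hypothetical profile from the class docstring example,
        in which <aa> and <ch> are single graphemes.)
        >>> t = Tokenizer("test.prf")
        >>> print(t.graphemes("baach"))
        b aa ch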
"""
string = unicodedata.normalize("NFD", string)
# if no orthography profile is specified, simply return
# Unicode grapheme clusters, regex pattern "\X"
        if self.orthography_profile is None:
return self.grapheme_clusters(string)
parses = []
for word in string.split():
parse = getParse(self.root, word)
# case where the parsing fails
if len(parse) == 0:
# replace characters in string but not in orthography profile with <?>
parse = " "+self.find_missing_characters(self.characters(word))
# write problematic stuff to standard error
log.debug("The string '{0}' does not parse given the specified orthography profile {1}.\n".format(word, self.orthography_profile))
parses.append(parse)
        # remove the outer word boundaries
        result = "".join(parses).replace("##", "#")
        result = result.strip("#")
        return result.strip()
def transform(self, string, column="graphemes"):
"""
Transform a string's graphemes into the mappings given in a different column
in the orthography profile. By default this function returns an orthography
profile grapheme tokenized string.
Parameters
----------
string : str
The input string to be parsed.
        column : str (default = "graphemes")
            The label of the column to transform to. Defaults to tokenizing with the orthography profile.
Returns
-------
result : str
Result of the transformation.
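
        Examples
        --------
        (Using the hypothetical profile from the class docstring example,
        assuming it has an "ipa" column.)
        >>> t = Tokenizer("test.prf")
        >>> print(t.transform("baach", "ipa"))
        b aː tʃ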
"""
# column labels are normalized
column = column.lower()
# This method can't be called unless an orthography profile was specified.
if not self.orthography_profile:
raise Exception("This method only works when an orthography profile is specified.")
if column == "graphemes":
return self.graphemes(string)
# if the column label for conversion doesn't exist, return grapheme tokenization
if column not in self.column_labels:
return self.graphemes(string)
# first tokenize the input string into orthography profile graphemes
tokenized_string = self.graphemes(string)
tokens = tokenized_string.split()
result = []
for token in tokens:
# special cases: word breaks and unparsables
if token == "#":
result.append("#")
continue
if token == "?":
result.append("?")
continue
            # transform given the grapheme and column label; skip NULL
            target = self.mappings[token, column]
            if target != "NULL":
                result.append(target)
return " ".join(result).strip()
def tokenize(self, string, column="graphemes"):
"""
        Tokenize a string, dispatching on which combination of orthography
        profile and orthography rules files is present.
Parameters
----------
string : str
The input string to be tokenized.
column : str (default = "graphemes")
The column label for the transformation, if specified.
Returns
-------
result : str
Result of the tokenization.
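
        Examples
        --------
        (Illustrative; with an orthography profile but no rules file,
        tokenize is equivalent to transform.)
        >>> t = Tokenizer("test.prf")
        >>> print(t.tokenize("baach", "ipa"))
        b aː tʃ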
"""
# column labels are normalized
column = column.lower()
if self.orthography_profile and self.orthography_profile_rules:
return self.rules(self.transform(string, column))
if not self.orthography_profile and not self.orthography_profile_rules:
return self.grapheme_clusters(string)
if self.orthography_profile and not self.orthography_profile_rules:
return self.transform(string, column)
# it's not yet clear what the order for this procedure should be
if not self.orthography_profile and self.orthography_profile_rules:
return self.rules(self.grapheme_clusters(string))
def transform_rules(self, string):
"""
        Convenience function that first tokenizes a string into orthography profile-
        specified graphemes and then applies the orthography profile rules.
Parameters
----------
string : str
The input string to be transformed.
Returns
-------
result : str
Result of the transformation.
"""
return self.rules(self.transform(string))
def rules(self, string):
"""
        Parse the input string and return it with the orthography rules applied.
Parameters
----------
string : str
The input string to be parsed.
Returns
-------
result : str
Result of the orthography rules applied to the input str.
"""
        # if no orthography profile rules file has been specified, simply return the string
        if self.orthography_profile_rules is None:
            return string
result = unicodedata.normalize("NFD", string)
        for i in range(len(self.op_rules)):
            match = self.op_rules[i].search(result)
            if match:
                result = self.op_rules[i].sub(self.op_replacements[i], result)
                log.debug("Input/output:" + "\t" + string + "\t" + result)
                log.debug("Pattern/replacement:" + "\t" + self.op_rules[i].pattern + "\t" + self.op_replacements[i])
        # this is in case someone introduces a non-NFD-ordered sequence of
        # characters in the orthography profile
result = unicodedata.normalize("NFD", result)
return result
def find_missing_characters(self, char_tokenized_string):
"""
        Given a string tokenized into characters, return a character-
        tokenized string where each character missing from the orthography
        profile is replaced with a question mark <?>.
Parameters
----------
        char_tokenized_string : str
            A character-tokenized string.
Returns
-------
result : str
            The input string with missing characters replaced by "?".
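
        Examples
        --------
        (Illustrative; assumes the character <x> does not occur in the profile.)
        >>> t = Tokenizer("test.prf")
        >>> print(t.find_missing_characters("b a x"))
        b a ?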
"""
result = []
chars = char_tokenized_string.split()
for char in chars:
            if char not in self.op_graphemes:
result.append("?")
else:
result.append(char)
return " ".join(result)
def tokenize_ipa(self, string):
# Experimental method for tokenizing IPA.
return self.combine_modifiers(self.grapheme_clusters(string))
def combine_modifiers(self, string):
"""
        Given a string that is space-delimited on Unicode grapheme clusters,
        group Unicode modifier letters with their preceding base characters.
Parameters
----------
string : str
A Unicode string tokenized into grapheme clusters to be tokenized into simple IPA.
        .. todo:: check if we need to apply NFD after the string is parsed
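
        Examples
        --------
        (Illustrative; U+02B0 MODIFIER LETTER SMALL H attaches to the
        preceding base character.)
        >>> t = Tokenizer()
        >>> print(t.combine_modifiers("t ʰ o"))
        tʰ o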
"""
result = []
graphemes = string.split()
temp = ""
count = len(graphemes)
for grapheme in reversed(graphemes):
count -= 1
if len(grapheme) == 1 and unicodedata.category(grapheme) == "Lm":
temp = grapheme + temp
# hack for the cases where a space modifier is the first character in the str
if count == 0:
result[-1] = temp + result[-1]
continue
result.append(grapheme + temp)
temp = ""
        # check for tie bars (U+0361 COMBINING DOUBLE INVERTED BREVE and
        # U+035C COMBINING DOUBLE BREVE BELOW) and join the two tied segments
        segments = result[::-1]
i = 0
r = []
while i < len(segments):
if ord(segments[i][-1]) in [865, 860]:
r.append(segments[i]+segments[i+1])
i = i+2
else:
r.append(segments[i])
i += 1
return " ".join(r)
def exists_multiple_columns(self):
"""
        Return True if multiple columns exist in the orthography profile, e.g. graphemes and IPA and x, etc.
        """
        return len(self.column_labels) > 1
    def remove_spaces(self, string):
        """
        Remove token boundaries from a tokenized string and restore word
        boundaries, e.g. "# b aa ch #" becomes "baach".
        """
        string = string.lstrip("# ")
        string = string.rstrip(" #")
        string = re.sub(r"\s+", "", string)
        return string.replace("#", " ")
# ---------- Tree node --------
class TreeNode(object):
"""
    Private class representing a node in the tree data structure built from the orthography profile for parsing.
"""
def __init__(self, char):
self.char = char
self.children = {}
self.sentinel = False
def isSentinel(self):
return self.sentinel
def getChar(self):
return self.char
def makeSentinel(self):
self.sentinel = True
def addChild(self, char):
child = self.getChild(char)
if not child:
child = TreeNode(char)
self.children[char] = child
return child
    def getChild(self, char):
        return self.children.get(char)
def getChildren(self):
return self.children
# ---------- Util functions ------
def createTree(file_name):
# Internal function to add a multigraph starting at node.
def addMultigraph(node, line):
for char in line:
node = node.addChild(char)
node.makeSentinel()
# Add all multigraphs in each line of file_name. Skip "#" comments and blank lines.
root = TreeNode('')
root.makeSentinel()
    with codecs.open(file_name, "r", "utf-8") as f:
        for line in f:
            line = line.strip()
            # skip any comments and blank lines
            if line.startswith("#") or line == "":
                continue
            # skip the columns header -- it should always start with "graphemes"
            # as per the orthography profiles specification
            if line.lower().startswith("graphemes"):
                continue
            line = unicodedata.normalize("NFD", line)
            # the grapheme is the first tab-separated column
            grapheme = line.split("\t")[0]
            addMultigraph(root, grapheme)
return root
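
# A minimal usage sketch for the tree helpers (assumes a hypothetical
# profile "test.prf" containing, among others, the graphemes <b>, <aa>
# and <ch>, as in the Tokenizer docstring above):
#
#     root = createTree("test.prf")
#     getParse(root, "baach")    # -> '# b aa ch #'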
def printMultigraphs(root, line, result):
# Base (or degenerate..) case.
if len(line) == 0:
result += "#"
return result
# Walk until we run out of either nodes or characters.
curr = 0 # Current index in line.
last = 0 # Index of last character of last-seen multigraph.
node = root
while curr < len(line):
node = node.getChild(line[curr])
if not node:
break
if node.isSentinel():
last = curr
curr += 1
# Print everything up to the last-seen sentinel, and process
# the rest of the line, while there is any remaining.
last = last + 1 # End of span (noninclusive).
result += line[:last]+" "
return printMultigraphs(root, line[last:], result)
def getParse(root, line):
parse = getParseInternal(root, line)
if len(parse) == 0:
return ""
return "# " + parse
def getParseInternal(root, line):
# Base (or degenerate..) case.
if len(line) == 0:
return "#"
parse = ""
curr = 0
node = root
while curr < len(line):
node = node.getChild(line[curr])
curr += 1
if not node:
break
if node.isSentinel():
subparse = getParseInternal(root, line[curr:])
if len(subparse) > 0:
# Always keep the latest valid parse, which will be
# the longest-matched (greedy match) graphemes.
parse = line[:curr] + " " + subparse
# Note that if we've reached EOL, but not end of valid grapheme,
# this will be an empty string.
return parse
def printTree(root, path):
for char, child in root.getChildren().items():
if child.isSentinel():
char += "*"
branch = (" -- " if len(path) > 0 else "")
printTree(child, path + branch + char)
if len(root.getChildren()) == 0:
print(path)
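
# A minimal end-to-end sketch (assumes the hypothetical "test.prf" profile
# from the Tokenizer docstring, with an "ipa" column; if a "test.rules"
# file sits next to it, tokenize() additionally applies those rules):
#
#     t = Tokenizer("test.prf")
#     t.tokenize("baach")          # -> 'b aa ch'
#     t.tokenize("baach", "ipa")   # -> 'b aː tʃ'
#     t.remove_spaces("b aa ch")   # -> 'baach'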