"""
Reynir: Natural language processing for Icelandic
Parser module
Copyright (c) 2015 Vilhjalmur Thorsteinsson
All rights reserved
***
NOTE: A faster version of the Earley parser, written in C++,
is available in fastparser.py/eparser.cpp.
***
This module uses an Earley-Scott parser to transform token sequences
(sentences) into forests of parse trees, with each tree representing a
possible parse of a sentence, according to a given context-free grammar.
An Earley parser handles all valid context-free grammars,
irrespective of ambiguity, recursion (left/middle/right), nullability, etc.
The returned parse trees reflect the original grammar, which does not
need to be normalized or modified in any way. All partial parses are
available in the final parse state table.
For further information see J. Earley, "An efficient context-free parsing algorithm",
Communications of the Association for Computing Machinery, 13:2:94-102, 1970.
The Earley parser used here is the improved version described by Scott & Johnstone,
referencing Tomita. This allows worst-case cubic (O(n^3)) order, where n is the
length of the input sentence, while still returning all possible parse trees
for an ambiguous grammar.
See Elizabeth Scott, Adrian Johnstone:
"Recognition is not parsing — SPPF-style parsing from cubic recognisers"
Science of Computer Programming, Volume 75, Issues 1–2, 1 January 2010, Pages 55–70
"""
from collections import defaultdict
from grammar import Nonterminal, Terminal, Token
#from flask import current_app
#
#def debug():
# # Call this to trigger the Flask debugger on purpose
# assert current_app.debug == False, "Don't panic! You're here by request of debug()"
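# Note on index conventions used throughout this module (cf. grammar.py):
# grammar items are handled as packed integer indices, where nonterminals
# have negative indices, terminals have positive indices, and 0 denotes a
# position past the end of a production (see Parser.PackedProduction.__getitem__()
# below, which returns 0 in that case).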
class Node:
""" Shared Packed Parse Forest (SPPF) node representation.
A node label is a tuple (s, j, i) where s can be
(a) a nonterminal, for completed productions;
(b) a token corresponding to a terminal;
(c) a (nonterminal, dot, prod) tuple, for partially parsed productions.
j and i are the start and end token indices, respectively.
A forest of Nodes can be navigated using a subclass of
ParseForestNavigator.
"""
def __init__(self, parser, label):
""" Initialize a SPPF node with a given label tuple """
# assert isinstance(label, tuple)
if isinstance(label[0], int):
# Convert the label from (nt-index, i, j) to (nt, i, j)
assert label[0] < 0 # Nonterminal indices are negative
label = (parser._nonterminals[label[0]], label[1], label[2])
self._label = label
self._families = None # Families of children
def add_family(self, prod, children):
""" Add a family of children to this node, in parallel with other families """
# Note which production is responsible for this subtree,
# to help navigate the tree in case of ambiguity
pc_tuple = (prod, children)
if self._families is None:
self._families = [ pc_tuple ]
return
if pc_tuple not in self._families:
self._families.append(pc_tuple)
@property
def label(self):
""" Return the node label """
return self._label
@property
def start(self):
""" Return the start token index """
return self._label[1]
@property
def end(self):
""" Return the end token index """
return self._label[2]
@property
def head(self):
""" Return the 'head' of this node, i.e. a top-level readable name for it """
return self._label[0]
@property
def is_ambiguous(self):
""" Return True if this node has more than one family of children """
return self._families and len(self._families) >= 2
@property
def is_interior(self):
""" Returns True if this is an interior node (partially parsed production) """
return isinstance(self._label[0], tuple)
@property
def is_completed(self):
""" Returns True if this is a node corresponding to a completed nonterminal """
return isinstance(self._label[0], Nonterminal)
@property
def is_token(self):
""" Returns True if this is a token node """
return isinstance(self._label[0], Token)
@property
def has_children(self):
""" Return True if there are any families of children of this node """
return bool(self._families)
@property
def is_empty(self):
""" Return True if there is only a single empty family of this node """
if not self._families:
return True
return len(self._families) == 1 and self._families[0][1] is None
def enum_children(self):
""" Enumerate families of children """
if self._families:
for prod, c in self._families:
yield (prod, c)
def reduce_to(self, child_ix):
""" Eliminate all child families except the given one """
if not self._families or child_ix >= len(self._families):
raise IndexError("Child index out of range")
f = self._families[child_ix] # The survivor
# Collapse the list to one option
self._families = [ f ]
def __eq__(self, other):
""" Nodes are considered equal if their labels are equal """
if not isinstance(other, Node):
return False
return self._label == other._label
def __hash__(self):
""" Make this node hashable """
return id(self).__hash__()
def __repr__(self):
""" Create a reasonably nice text representation of this node
and its families of children, if any """
label_rep = self._label.__repr__()
families_rep = ""
if self._families:
if len(self._families) == 1:
families_rep = self._families[0].__repr__()
else:
families_rep = "<" + self._families.__repr__() + ">"
return label_rep + ((": " + families_rep + "\n") if families_rep else "")
def to_str(self, grammar):
""" Return a string representation of this node """
h = self.head
if isinstance(h, tuple):
# Interior node: return a readable rep of the associated nonterminal
assert isinstance(h[0], int)
assert h[0] < 0
h = grammar.nonterminals_by_ix[h[0]]
return str(h)
class ParseForestNavigator(object):
""" Base class for navigating parse forests. Override the underscored
methods to perform actions at the corresponding points of navigation. """
def __init__(self, grammar, visit_all = False):
""" If visit_all is False, we only visit each packed node once.
If True, we visit the entire tree in order. """
self._grammar = grammar
self._visit_all = visit_all
def _visit_epsilon(self, level):
""" At Epsilon node """
return None
def _visit_token(self, level, node, terminal):
""" At token node """
return None
def _visit_nonterminal(self, level, node):
""" At nonterminal node """
# Return object to collect results
return None
def _visit_family(self, results, level, node, ix, prod):
""" At a family of children """
return
# noinspection PyMethodMayBeStatic
def _add_result(self, results, ix, r):
""" Append a single result object r to the result object """
return
# noinspection PyMethodMayBeStatic
def _process_results(self, results, node):
""" Process results after visiting children.
The results list typically contains tuples (ix, r) where ix is
the family index and r is the child result """
return None
def go(self, root_node):
""" Navigate the forest from the root node """
visited = dict()
def _nav_helper(w, index, level, parent):
""" Navigate from w """
if w is None:
# Epsilon node
return self._visit_epsilon(level)
if w.is_token:
p = parent.production[index] # Note that index may be (and often is) negative
assert isinstance(p, Terminal)
# Return the score of this terminal option
return self._visit_token(level, w, p)
if not self._visit_all and w.label in visited:
# Already seen: return the previously calculated result
return visited[w.label]
# Init container for child results
results = self._visit_nonterminal(level, w)
# noinspection PyNoneFunctionAssignment
if results is NotImplemented:
# If _visit_nonterminal() returns NotImplemented,
# don't bother visiting children or processing
# results; instead, _nav_helper() returns NotImplemented
v = results
else:
if w.is_interior:
child_level = level
else:
child_level = level + 1
for ix, pc in enumerate(w.enum_children()):
prod, f = pc
self._visit_family(results, level, w, ix, prod)
if w.is_completed:
# Completed nonterminal: restart children index
child_ix = -1
else:
child_ix = index
if isinstance(f, tuple):
child_ix -= 1
for child in range(2):
self._add_result(results, ix,
_nav_helper(f[child], child_ix + child, child_level, prod))
else:
self._add_result(results, ix, _nav_helper(f, child_ix, child_level, prod))
v = self._process_results(results, w)
if not self._visit_all:
# Mark the node as visited and store its result
visited[w.label] = v
return v
return _nav_helper(root_node, 0, 0, None)
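# A minimal sketch of a ParseForestNavigator subclass (illustrative only;
# nothing in this module uses it). It counts the token nodes encountered while
# walking a forest; the class name _TokenCounter is hypothetical. Since it
# navigates with visit_all = True, a token reached through more than one
# derivation path in an ambiguous forest will be counted more than once.

class _TokenCounter(ParseForestNavigator):
    """ Count the token (terminal) nodes seen while navigating a parse forest """
    def __init__(self, grammar):
        super().__init__(grammar, visit_all = True)
        self._count = 0
    def _visit_token(self, level, node, terminal):
        # Invoked once for every token node reached during navigation
        self._count += 1
        return None
    def count(self, root_node):
        """ Navigate the forest from root_node and return the number of token nodes seen """
        self._count = 0
        self.go(root_node)
        return self._count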
class ParseError(Exception):
""" Exception class for parser errors """
def __init__(self, txt, token_index = None, info = None):
""" Store an information object with the exception,
containing the parser state immediately before the error """
Exception.__init__(self, txt)
self._info = info
self._token_index = token_index
@property
def info(self):
""" Return the parser state information object """
return self._info
@property
def token_index(self):
""" Return the 0-based index of the token where the parser ran out of options """
return self._token_index
class Parser:
""" Parses a sequence of tokens according to a given grammar and
a root nonterminal within that grammar, returning a forest of
possible parses. The parser uses an optimized Earley algorithm.
"""
# Parser version - change when logic changes so that output is affected
_VERSION = "1.0"
class EarleyColumn:
""" Container for the (unique) states in a single Earley column.
Each token in the input sentence has an associated column,
and a final column corresponds to the end-of-sentence marker.
This class stores the states both in a list for easy indexed
access, and in a set to enable a quick check for whether
a state is already present. It also stores a dictionary
of all states keyed by the particular nonterminal at the
'dot' in the production that the state refers to. This greatly
speeds up the Earley completion phase. """
def __init__(self, token):
""" Maintain a list and a set in parallel """
self._token = token
if token is None:
# If no token is associated with this column
# (because it's the last column), rebind
# self.matches to self.matches_none so that
# the (rare) no-token condition doesn't have to be
# checked on every call to self.matches()
self.matches = self.matches_none
self._states = []
self._numstates = 0
self._set = set()
# Set of nonterminals already added to this column
self._nt_set = set ()
# Dictionary of states keyed by nonterminal at prod[dot]
self._nt_dict = defaultdict(list)
# Cache of terminal matches
self._matches = dict()
def add(self, newstate):
""" Add a new state to this column if it is not already there """
if newstate not in self._set:
self._states.append(newstate)
self._set.add(newstate)
_, dot, prod, _, _ = newstate
# prod is a tuple of terminal (>0) and nonterminal (<0) indexes
nt = 0 if dot >= len(prod) else prod[dot]
if nt < 0:
# The state is at a nonterminal: add its index to our dict
# defaultdict automatically creates empty list if no entry for nt
self._nt_dict[nt].append(self._numstates)
self._numstates += 1
def matches(self, terminal):
""" Check whether the token in this column matches the given terminal """
# Cache lookup
m = self._matches.get(terminal)
if m is None:
# Not found in cache: do the actual match and cache it
self._matches[terminal] = m = self._token.matches(terminal)
return m
# noinspection PyMethodMayBeStatic
def matches_none(self, terminal):
""" Shadow function for matches() that is called if there is no token in this column """
return False
def already_has(self, nt):
""" Return False if this nonterminal has already been added to the column.
Otherwise, add it and return True. """
if nt in self._nt_set:
return True
self._nt_set.add(nt)
return False
def enum_nt(self, nt):
""" Enumerate all states where prod[dot] is nt """
st_list = self._nt_dict.get(nt)
if st_list:
for ix in st_list:
yield self._states[ix]
def cleanup(self):
""" Get rid of temporary data once the parser has moved past this column """
self._set = None
self._matches = None
# Keep the states themselves for diagnostics and debugging
def __len__(self):
""" Return the number of states in the column """
return self._numstates
def __getitem__(self, index):
""" Return the state at the given index position """
return self._states[index]
def __iter__(self):
""" Return an iterator over the state list """
return iter(self._states)
def info(self, parser):
""" Return a list of the parser states within this column in a 'readable' format """
def readable(s):
""" Return a 'readable' form of parser state s where
item indices have been converted to object references """
nt, dot, prod, i, w = s
return (parser._lookup(nt), dot, [parser._lookup(t) for t in prod], i)
return [readable(s) for s in self._states if s[1] > 0] # Skip states with the dot at the beginning
class PackedProduction:
""" A container for a packed production, i.e. a grammar Production
where the component terminals and nonterminals have been packed
into a list of integer indices """
def __init__(self, priority, production):
# Store the relative priority of this production within its nonterminal
self._priority = priority
# Keep a reference to the original production
self._production = production
# Store the packed list of indices
self._ix_list = production.prod
# Cache the length
self._len = len(self._ix_list)
# Cache the hash
self._hash = production.__hash__()
@property
def production(self):
return self._production
@property
def priority(self):
return self._priority
def __hash__(self):
return self._hash
def __getitem__(self, index):
return self._ix_list[index] if index < self._len else 0
def __len__(self):
return self._len
def __eq__(self, other):
return id(self) == id(other)
def __iter__(self):
return iter(self._ix_list)
def __init__(self, g):
""" Initialize a parser for a given grammar """
nt_d = g.nt_dict
r = g.root
assert nt_d is not None
assert r is not None
assert r in nt_d
# Convert the grammar to integer index representation for speed
self._root = r.index
# Make new grammar dictionary, keyed by nonterminal index and
# containing packed productions with integer indices
self._nt_dict = { }
for nt, plist in nt_d.items():
self._nt_dict[nt.index] = None if plist is None else \
[ Parser.PackedProduction(prio, p) for prio, p in plist ]
self._nonterminals = g.nonterminals_by_ix
self._terminals = g.terminals_by_ix
@classmethod
def for_grammar(cls, g):
""" Create a Parser for the Grammar in g """
return cls(g)
def go(self, tokens):
""" Parse the token stream and return a forest of nodes using
the Earley algorithm as improved by Scott (referencing Tomita).
The parser handles ambiguity, returning alternative options within
a single packed tree.
Comments refer to the EARLEY_PARSER pseudocode given in the
Scott/Johnstone paper, cf. the reference at the top of this module.
***
NOTE that this function is overridden in fastparser.py. It is no longer
used for parsing within Reynir.
***
"""
def _make_node(nt_B, dot, prod, j, i, w, v, V):
""" MAKE_NODE(B ::= αx · β, j, i, w, v, V) """
len_prod = len(prod)
if dot == 1 and len_prod >= 2:
# α is empty and β is nonempty: return v
return v
# Create a label for the new node
if dot >= len_prod:
# β is empty (i.e. the nonterminal B is complete)
s = nt_B
else:
# Intermediate position within production of B
s = (nt_B, dot, prod)
# If there is no node y ∈ V labelled (s, j, i),
# create one and add it to V
label = (s, j, i)
if label in V:
y = V[label]
else:
V[label] = y = Node(self, label)
# assert v is not None
if w is None:
y.add_family(prod, v)
else:
# w is an already built subtree that we're putting a new
# node on top of
y.add_family(prod, (w, v)) # The code breaks if this is modified!
return y
def _push(newstate, _E, _Q):
""" Append a new state to an Earley column (_E) and a look-ahead set (_Q), as appropriate """
# (N ::= α·δ, h, y)
# newstate = (N, dot, prod, h, y)
dot, prod = newstate[1], newstate[2]
item = prod[dot] # Returns 0 if dot >= len(prod)
if item <= 0:
# Nonterminal or epsilon
# δ ∈ ΣN
_E.add(newstate)
elif _E.matches(self._terminals[item]):
# Terminal matching the current token
_Q.append(newstate)
if not tokens:
raise ParseError("No tokens to parse")
# V = ∅
V = { }
n = len(tokens)
# Initialize the Earley columns
# We create one for each token, plus a final (sentinel) column
E = [ Parser.EarleyColumn(t) for t in tokens ] + [ Parser.EarleyColumn(None) ]
E0 = E[0]
Q0 = [ ]
# Populate column 0 (E0) with start states and Q0 with lookaheads
for root_p in self._nt_dict[self._root]:
# Go through root productions
newstate = (self._root, 0, root_p, 0, None)
# add (S ::= ·α, 0, null) to E0 and Q0
_push(newstate, E0, Q0)
# Step through the columns
for i, Ei in enumerate(E):
# The agenda set R is Ei[j..len(Ei)]
if not Ei and not Q0:
# Parse options exhausted, nothing to do
raise ParseError("No parse available at token {0} ({1})"
.format(i, tokens[i-1]), i-1, E[i-1].info(self)) # Token index is 1-based
j = 0
H = defaultdict(list)
Q = Q0
Q0 = [ ]
try:
#while j < len(Ei):
while True:
# Remove an element, Λ say, from R
# Λ = state
state = Ei[j]
j += 1
nt_B, dot, prod, h, w = state
nt_C = prod[dot]
# if Λ = (B ::= α · Cβ, h, w):
if nt_C < 0: # Nonterminal
# Earley predictor
# for all (C ::= δ) ∈ P:
# Go through all right hand sides of non-terminal nt_C
if not Ei.already_has(nt_C):
for p in self._nt_dict[nt_C]:
# if δ ∈ ΣN and (C ::= ·δ, i, null) !∈ Ei:
newstate = (nt_C, 0, p, i, None)
_push(newstate, Ei, Q)
# if ((C, v) ∈ H):
for v in H.get(nt_C, []):
# y = MAKE_NODE(B ::= αC · β, h, i, w, v, V)
y = _make_node(nt_B, dot + 1, prod, h, i, w, v, V)
newstate = (nt_B, dot + 1, prod, h, y)
_push(newstate, Ei, Q)
# if Λ = (D ::= α·, h, w):
elif nt_C == 0: # dot >= len(prod)
# Earley completer
if not w:
label = (nt_B, i, i)
if label in V:
w = V[label]
else:
w = V[label] = Node(self, label)
w.add_family(prod, None) # Add e (empty production) as a family
if h == i:
# Empty production satisfied
H[nt_B].append(w) # defaultdict automatically creates an empty list
# for all (A ::= τ · Dδ, k, z) in Eh:
for st0 in E[h].enum_nt(nt_B):
nt_A, dot0, prod0, k, z = st0
y = _make_node(nt_A, dot0 + 1, prod0, k, i, z, w, V)
newstate = (nt_A, dot0 + 1, prod0, k, y)
_push(newstate, Ei, Q)
except IndexError:
# The loop terminates naturally on an IndexError
# when j becomes >= len(Ei)
assert j >= len(Ei)
V = { }
if Q:
label = (tokens[i], i, i + 1)
v = Node(self, label)
while Q:
# Earley scanner
# Remove an element, Λ = (B ::= α · ai+1β, h, w) say, from Q
state = Q.pop()
nt_B, dot, prod, h, w = state
# assert isinstance(prod[dot], Terminal)
# assert tokens[i].matches(prod[dot])
# y = MAKE_NODE(B ::= αai+1 · β, h, i + 1, w, v, V)
# noinspection PyUnboundLocalVariable
y = _make_node(nt_B, dot + 1, prod, h, i + 1, w, v, V)
newstate = (nt_B, dot + 1, prod, h, y)
_push(newstate, E[i + 1], Q0)
# Discard unnecessary cache stuff from memory
Ei.cleanup()
# if (S ::= τ ·, 0, w) ∈ En: return w
for nt, dot, prod, k, w in E[n]:
if nt == self._root and dot >= len(prod) and k == 0:
# Completed production that spans the entire chart: we're done
return w
# No parse at last token
raise ParseError("No parse available at token {0} ({1})"
.format(n, tokens[n-1]), n - 1, E[n].info(self)) # Token index is 1-based
def go_no_exc(self, tokens):
""" Simple version of go() that returns None instead of throwing ParseError """
try:
return self.go(tokens)
except ParseError:
return None
def _lookup(self, ix):
""" Convert a production item from an index to an object reference """
assert ix != 0
return self._nonterminals[ix] if ix < 0 else self._terminals[ix]
@staticmethod
def num_combinations(w):
""" Count the number of possible parse trees in the given forest """
if w is None or w.is_token:
# Empty (epsilon) node or token node
return 1
comb = 0
for _, f in w.enum_children():
if isinstance(f, tuple):
cnt = 1
for c in f:
cnt *= Parser.num_combinations(c)
comb += cnt
else:
comb += Parser.num_combinations(f)
return comb if comb > 0 else 1
@staticmethod
def make_schema(w):
""" Create a flattened parse schema from the forest w """
class CC:
""" Manages choice coordinates """
stack = []
level = 0
@classmethod
def push(cls, option):
""" Identify each option subtree with a different root index """
cls.level += 1
while len(cls.stack) < cls.level:
cls.stack.append(0)
r = cls.stack[cls.level - 1]
cls.stack[cls.level - 1] += 1
return r
@classmethod
def pop(cls):
""" Maintain one level count above the now current level """
while len(cls.stack) > cls.level:
cls.stack.pop()
cls.level -= 1
@classmethod
def coord(cls):
return tuple(cls.stack[0:cls.level])
def _part(w, level, index, parent, suffix):
""" Return a tuple (colheading + options, start_token, end_token, partlist, info)
where the partlist is again a list of the component schemas - or a terminal
matching a single token - or None if empty """
if w is None:
# Epsilon node: return empty list
return None
if w.is_token:
p = parent.production[index]
assert isinstance(p, Terminal)
return ([ level ] + suffix, w.start, w.end, None, (p, w.head.text))
# Interior nodes are not returned
# and do not increment the indentation level
if not w.is_interior:
level += 1
# Accumulate the resulting parts
plist = [ ]
ambig = w.is_ambiguous
add_suffix = [ ]
for ix, pc in enumerate(w.enum_children()):
prod, f = pc
if ambig:
# Uniquely identify the available parse options with a coordinate
add_suffix = [ ix ]
if w.is_completed:
# Completed nonterminal: start counting children from the last one
child_ix = -1
# parent = w
else:
# Interior node: continue the indexing from where we left off
child_ix = index
def add_part(p):
""" Add a subtuple p to the part list plist """
if p:
if p[0] is None:
# p describes an interior node
plist.extend(p[3])
elif p[2] > p[1]:
# Only include subtrees that actually contain terminals
plist.append(p)
if isinstance(f, tuple):
# len(f) is always 2
child_ix -= 1
add_part(_part(f[0], level, child_ix, prod, suffix + add_suffix))
add_part(_part(f[1], level, child_ix + 1, prod, suffix + add_suffix))
else:
add_part(_part(f, level, child_ix, prod, suffix + add_suffix))
if w.is_interior:
# Interior node: relay plist up the tree
# noinspection PyRedundantParentheses
return (None, 0, 0, plist, None)
# Completed nonterminal
assert w.is_completed
assert isinstance(w.head, Nonterminal)
return ([level - 1] + suffix, w.start, w.end, plist, w.head)
if w is None:
return None
return _part(w, 0, 0, None, [ ])
@staticmethod
def make_grid(w):
""" Make a 2d grid from a flattened parse schema """
if w is None:
return None
schema = Parser.make_schema(w)
assert schema[1] == 0
cols = [] # The columns to be populated
NULL_TUPLE = tuple()
def _traverse(p):
""" Traverse a schema subtree and insert the nodes into their
respective grid columns """
# p[0] is the coordinate of this subtree (level + suffix)
# p[1] is the start column of this subtree
# p[2] is the end column of this subtree
# p[3] is the subpart list
# p[4] is the nonterminal or terminal/token at the head of this subtree
col, option = p[0][0], p[0][1:] # Level of this subtree and option
if not option:
# No option: use a 'clean key' of NULL_TUPLE
option = NULL_TUPLE
else:
# Convert list to a frozen (hashable) tuple
option = tuple(option)
while len(cols) <= col:
# Add empty columns as required to reach this level
cols.append(dict())
# Add a tuple describing the rows spanned and the node info
assert isinstance(p[4], Nonterminal) or isinstance(p[4], tuple)
if option not in cols[col]:
# Put in a dictionary entry for this option
cols[col][option] = []
cols[col][option].append((p[1], p[2], p[4]))
# Navigate into subparts, if any
if p[3]:
for subpart in p[3]:
_traverse(subpart)
_traverse(schema)
# Return a tuple with the grid and the number of tokens
return (cols, schema[2])
class ParseForestPrinter(ParseForestNavigator):
""" Print a parse forest to stdout or a file """
def __init__(self, grammar, detailed = False, file = None):
super().__init__(grammar, visit_all = True) # Visit all nodes
self._detailed = detailed
self._file = file
def _visit_epsilon(self, level):
indent = " " * level # Two spaces per indent level
print(indent + "(empty)", file = self._file)
return None
def _visit_token(self, level, w, terminal):
indent = " " * level # Two spaces per indent level
print(indent + "{0}: {1}".format(terminal, w), file = self._file)
return None
def _visit_nonterminal(self, level, w):
# Interior nodes are not printed
# and do not increment the indentation level
if self._detailed or not w.is_interior:
h = w.to_str(self._grammar)
if not self._detailed:
if (h.endswith("?") or h.endswith("*")) and w.is_empty:
# Skip printing optional nodes that don't contain anything
return NotImplemented # Don't visit child nodes
indent = " " * level # Two spaces per indent level
print(indent + h, file = self._file)
return None # No results required, but visit children
def _visit_family(self, results, level, w, ix, prod):
if w.is_ambiguous:
indent = " " * level # Two spaces per indent level
print(indent + "Option " + str(ix + 1) + ":", file = self._file)
@classmethod
def print_forest(cls, grammar, root_node, detailed = False, file = None):
""" Print a parse forest to the given file, or stdout if none """
cls(grammar, detailed, file).go(root_node)
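# A minimal usage sketch (illustrative only; nothing in this module calls it).
# It assumes the caller already has a Grammar instance g from grammar.py and a
# list of Token instances; the helper name _example_parse is hypothetical and
# not part of the Reynir code base.

def _example_parse(g, tokens, verbose = False):
    """ Parse a token list with the given grammar and return the resulting
        SPPF forest root, or None if no parse was found """
    parser = Parser.for_grammar(g)
    try:
        forest = parser.go(tokens)
    except ParseError as e:
        # e.token_index is the 0-based index of the token where
        # the parser ran out of options
        print("No parse available at token index {0}".format(e.token_index))
        return None
    if verbose:
        # Dump the packed parse forest and the number of parse trees it encodes
        ParseForestPrinter.print_forest(g, forest)
        print("Number of parse tree combinations: {0}".format(Parser.num_combinations(forest)))
    return forest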