In [2]:
#| default_exp cfg
from nbdev import *
from nbdev.showdoc import *

# Earley Parser

> Parses it earley style

In [3]:
#| exporti
from pprint import pprint
from typing import NamedTuple
from functools import cache

In [52]:
class State(NamedTuple):
    symbol: str
    rule: tuple
    startidx: int
    position: int
    predecessor: object
    creator: object
       
    def __repr__(self):
        pos = self.position
        return f"{self.symbol:<3}: {str(self.rule[:pos]):<15} • {str(self.rule[pos:]):<30} {'idx('}{self.startidx}) {'Done' if self.is_complete() else ''})"
    
    def is_complete(self):
        return self.position == len(self.rule)
    
    def nextsym(self):
        return False if self.is_complete() else self.rule[self.position]
    # def __eq__(self, other):
    #     return all(x==y for x, y in zip(self, other))
        

def parse(grammar, message, start, verbose=False):
    def scan(column, parsedsym, advancedstate):
        # we are scanning the states in a column to check if we can advance based on the parsed symbol
        advancedstates = []
        for state in column:
            if not state.is_complete() and state.nextsym() == parsedsym:
                symbol, rule, startidx, position, predecessor, creator = state
                position += 1
                advancedstates.append(State(symbol, rule, startidx, position, state, advancedstate))
        if verbose: print('parsed symbol:', message[colidx-1], ':we have advances on', [s['symbol'] for s in advancedstates])
        return advancedstates

    @cache
    def predict(colidx, symbol, advancedstate):
        seen = {symbol} # otherwise you can have infinite loop when recursively adding
        toadd = {(symbol, advancedstate)}
        predicted = set()
        while toadd:
            cur, curstate = toadd.pop()
            for rule in grammar.get(cur, []):
                newstate = State(cur, rule, colidx, 0, False, 'creator')
                predicted.add(newstate)
                if (recursivesymbol := newstate.rule[0]) not in seen:
                    toadd.add((recursivesymbol, newstate))
                    seen.add(recursivesymbol)
        return frozenset(predicted)
    
    def completer(cols, advancedstates):
        # check if any of the states are completed, if yes do scan again (loop untill nothing is completed)
        while advancedstates:
            advancedstate = advancedstates.pop()
            cols[colidx].add(advancedstate)
            if advancedstate.is_complete(): # do scan again, looking to advance other states based on the symbol of the completed rule
                if verbose: print('completion:', advancedstate['symbol'])
                advancedstates += scan(cols[advancedstate.startidx], advancedstate.symbol, advancedstate)
            else: # predict new states based on the next expected symbol in the rule
                cols[colidx] |= predict(colidx, advancedstate.nextsym(), advancedstate)

        
        
    def isvalid(cols):
        for state in cols[colidx]:
            if state.is_complete() and state.startidx == 0 and state.symbol == start:
                return True #('valid')
        else:
            return False

    def __init__():        
        cols = [set() for _ in range(len(message)+1)]
        cols[0] |= predict(0, start, 'starter')
        if verbose: pprint(cols[0])
        return cols
    
    #check input
    assert all(isinstance(v, tuple) for v in grammar.values()), 'a symbol should contain a tuple of 1 or more production rules'
    assert all(isinstance(option, tuple) for v in grammar.values() for option in v), 'rules in the grammar should be tuples'
    assert all(isinstance(ch, str) for v in grammar.values() for option in v for ch in option), 'symbols in the grammar should be strings'

    cols = __init__()
    for colidx in range(1, len(message)+1): # we just populated the 0th col, now the first character has colidx 1
        if verbose: print(colidx, message[colidx-1], len(cols[colidx-1]))
        advancedstates = scan(cols[colidx-1], message[colidx-1], 'scanner')
        completer(cols, advancedstates)
    # return cols
    return (cols)

In [53]:
cols = parse(grammar, messages[0],s)
cols

[{0  : ()              • ('8', '11')                    idx(0) ),
  108: ()              • ('57', '95')                   idx(0) ),
  108: ()              • ('83', '83')                   idx(0) ),
  131: ()              • ('20', '83')                   idx(0) ),
  131: ()              • ('74', '57')                   idx(0) ),
  133: ()              • ('57', '79')                   idx(0) ),
  133: ()              • ('83', '13')                   idx(0) ),
  14 : ()              • ('57', '124')                  idx(0) ),
  14 : ()              • ('83', '138')                  idx(0) ),
  15 : ()              • ('35', '83')                   idx(0) ),
  15 : ()              • ('53', '57')                   idx(0) ),
  20 : ()              • ('57', '127')                  idx(0) ),
  20 : ()              • ('83', '80')                   idx(0) ),
  30 : ()              • ('57', '70')                   idx(0) ),
  30 : ()              • ('83', '7')                    idx(0) ),
  35 : () 

In [74]:
from collections import deque
ans = deque([])

def find(cols, ans, endstate=False, start = False):
    if start:
        for state in cols[-1]:
            if state.is_complete() and state.startidx == 0 and state.symbol == '0':
                ans.appendleft(state)
    elif not isinstance(endstate, State):
        return ans
    else:
        ans.appendleft(endstate)
    cur = ans[0]          
    while isinstance(cur, State) and isinstance(cur.predecessor, State):
        if isinstance(cur, State):
            ans = find(cols, ans, cur.creator)
        ans.appendleft(cur.predecessor)
        cur = ans[0]
    
    return ans

def printparse(ans):
    # currently only print the begin of a rule and items descendents idented
    ident = -2
    prev = None
    for idx, state in enumerate(ans):
        if state.position == 0:
            ident += 2
            print('.'*ident, state)
        
        if state.is_complete():
            ident -= 2
        prev = state
ans = find(cols, deque([]), endstate = False, start=True)
printparse(ans)



 0  : ()              • ('8', '11')                    idx(0) )
.. 8  : ()              • ('42',)                        idx(0) )
.... 42 : ()              • ('15', '83')                   idx(0) )
...... 15 : ()              • ('35', '83')                   idx(0) )
........ 35 : ()              • ('57', '89')                   idx(0) )
.......... 57 : ()              • ('b',)                         idx(0) )
.......... 89 : ()              • ('83', '40')                   idx(1) )
............ 83 : ()              • ('a',)                         idx(1) )
............ 40 : ()              • ('57', '101')                  idx(2) )
.............. 57 : ()              • ('b',)                         idx(2) )
.............. 101: ()              • ('57', '72')                   idx(3) )
................ 57 : ()              • ('b',)                         idx(3) )
................ 72 : ()              • ('83', '83')                   idx(4) )
.................. 83 : ()              • ('

In [66]:
for state in ans:
    print(state, state.is_complete())

0  : ()              • ('8', '11')                    idx(0) ) False
8  : ()              • ('42',)                        idx(0) ) False
42 : ()              • ('15', '83')                   idx(0) ) False
15 : ()              • ('35', '83')                   idx(0) ) False
35 : ()              • ('57', '89')                   idx(0) ) False
57 : ()              • ('b',)                         idx(0) ) False
57 : ('b',)          • ()                             idx(0) Done) True
35 : ('57',)         • ('89',)                        idx(0) ) False
89 : ()              • ('83', '40')                   idx(1) ) False
83 : ()              • ('a',)                         idx(1) ) False
83 : ('a',)          • ()                             idx(1) Done) True
89 : ('83',)         • ('40',)                        idx(1) ) False
40 : ()              • ('57', '101')                  idx(2) ) False
57 : ()              • ('b',)                         idx(2) ) False
57 : ('b',)          • ()   

In [6]:
# https://adventofcodecom/2020/day/19
rules, messages = open('cfg.txt').read().split('\n\n')
grammar = {}

for rule in rules.split('\n'):
    num, makefrom = rule.split(': ')
    makefrom = makefrom.replace('"', '')
    makefrom = tuple(makefrom.split(' | '))
    makefrom = tuple(tuple(option.split()) for option in makefrom)
    grammar[num] = makefrom
    
messages = messages.split('\n')
s = '0'
ans = 0
for m in messages:
    ans += parse(grammar, m,s)
    print(m, parse(grammar, m,s))
ans

TypeError: unsupported operand type(s) for +=: 'int' and 'list'

In [8]:
cols = parse(grammar, messages[0],s)