In [79]:
#| default_exp cfg
from nbdev import *
from nbdev.showdoc import *

# Earley Parser

> Parses it earley style

In [80]:
#| exporti
from collections import defaultdict
from itertools import product

In [81]:
class State(object):
    def __init__(self, label, rules, dot_idx, start_idx, end_idx, idx, made_from, producer):
        self.label = label
        self.rules = rules
        self.dot_idx = dot_idx
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.idx = idx
        self.made_from = made_from
        self.producer = producer

    def next(self):
        """Returns the tag after the dot"""
        return self.rules[self.dot_idx]

    def complete(self):
        return len(self.rules) == self.dot_idx

    def __eq__(self, other):
        return (self.label == other.label and
                self.rules == other.rules and
                self.dot_idx == other.dot_idx and
                self.start_idx == other.start_idx and
                self.end_idx == other.end_idx)

    def __str__(self):
        rule_string = ''
        for i, rule in enumerate(self.rules):
            if i == self.dot_idx:
                rule_string += '\\bullet '
            rule_string += rule + ' '
        if self.dot_idx == len(self.rules):
            rule_string += '\\bullet'
        return 'S%d %s -> %s [%d, %d] %s %s' % (self.idx, self.label, rule_string, self.start_idx, 
                                                self.end_idx, self.made_from, self.producer)

class Earley:
    def __init__(self, words, grammar, terminals):
        self.chart = [[] for _ in range(len(words) + 1)]
        self.current_id = 0
        self.words = words
        self.grammar = grammar
        self.terminals = terminals

    def get_new_id(self):
        self.current_id += 1
        return self.current_id - 1

    def is_terminal(self, tag):
        return tag in self.terminals

    def is_complete(self, state):
        return len(state.rules) == state.dot_idx

    def enqueue(self, state, chart_entry):
        if state not in self.chart[chart_entry]:
            self.chart[chart_entry].append(state)
        else:
            self.current_id -= 1

    def predictor(self, state):
        for production in self.grammar[state.next()]:
            self.enqueue(State(state.next(), production, 0, state.end_idx, state.end_idx, self.get_new_id(), [], 'predictor'), state.end_idx)

    def scanner(self, state):
        if self.words[state.end_idx] in self.grammar[state.next()]:
            self.enqueue(State(state.next(), [self.words[state.end_idx]], 1, state.end_idx, state.end_idx + 1, self.get_new_id(), [], 'scanner'), state.end_idx + 1)

    def completer(self, state):
        for s in self.chart[state.start_idx]:
            if not s.complete() and s.next() == state.label and s.end_idx == state.start_idx and s.label != 'gamma':
                self.enqueue(State(s.label, s.rules, s.dot_idx + 1, s.start_idx, state.end_idx, self.get_new_id(), s.made_from + [state.idx], 'completer'), state.end_idx)

    def parse(self):
        self.enqueue(State('gamma', ['S'], 0, 0, 0, self.get_new_id(), [], 'dummy start state'), 0)
        
        for i in range(len(self.words) + 1):
            for state in self.chart[i]:
                if not state.complete() and not self.is_terminal(state.next()):
                    self.predictor(state)
                elif i != len(self.words) and not state.complete() and self.is_terminal(state.next()):
                    self.scanner(state)
                else:
                    self.completer(state)

    def __str__(self):
        res = ''
        
        for i, chart in enumerate(self.chart):
            res += '\nChart[%d]\n' % i
            for state in chart:
                res += str(state) + '\n'

        return res


def test():
    grammar = {
        'S':           [['NP', 'VP'], ['Aux', 'NP', 'VP'], ['VP']],
        'NP':          [['Det', 'Nominal'], ['Proper-Noun']],
        'Nominal':     [['Noun'], ['Noun', 'Nominal']],
        'VP':          [['Verb'], ['Verb', 'NP']],
        'Det':         ['that', 'this', 'a'],
        'Noun':        ['book', 'flight', 'meal', 'money'],
        'Verb':        ['book', 'include', 'prever'],
        'Aux':         ['does'],
        'Prep':        ['from', 'to', 'on'],
        'Proper-Noun': ['Houston', 'TWA']
    }
    terminals = ['Det', 'Noun', 'Verb', 'Aux', 'Prep', 'Proper-Noun']

    earley = Earley(['book', 'that', 'flight'], grammar, terminals)
    earley.parse()
    print(earley)
test()


Chart[0]
S0 gamma -> \bullet S  [0, 0] [] dummy start state
S1 S -> \bullet NP VP  [0, 0] [] predictor
S2 S -> \bullet Aux NP VP  [0, 0] [] predictor
S3 S -> \bullet VP  [0, 0] [] predictor
S4 NP -> \bullet Det Nominal  [0, 0] [] predictor
S5 NP -> \bullet Proper-Noun  [0, 0] [] predictor
S6 VP -> \bullet Verb  [0, 0] [] predictor
S7 VP -> \bullet Verb NP  [0, 0] [] predictor

Chart[1]
S8 Verb -> book \bullet [0, 1] [] scanner
S9 VP -> Verb \bullet [0, 1] [8] completer
S10 VP -> Verb \bullet NP  [0, 1] [8] completer
S11 S -> VP \bullet [0, 1] [9] completer
S12 NP -> \bullet Det Nominal  [1, 1] [] predictor
S13 NP -> \bullet Proper-Noun  [1, 1] [] predictor

Chart[2]
S14 Det -> that \bullet [1, 2] [] scanner
S15 NP -> Det \bullet Nominal  [1, 2] [14] completer
S16 Nominal -> \bullet Noun  [2, 2] [] predictor
S17 Nominal -> \bullet Noun Nominal  [2, 2] [] predictor

Chart[3]
S18 Noun -> flight \bullet [2, 3] [] scanner
S19 Nominal -> Noun \bullet [2, 3] [18] completer
S20 Nominal -> Nou

In [82]:
from pprint import pprint
from collections import UserDict
class StateDict(UserDict):
    def __repr__(self):
        pos = self.get('position')
        output = f"symbol {self.get('symbol'):<4}: {str(self.get('rule')[:pos]):<15} • {str(self.get('rule')[pos:]):<30} {'idx('}{self.get('startidx')}) {'Done' if self.get('completed') else ''})"
        return output
    def __hash__(self):
        return hash(''.join([str(v) for k,v in self.data.items()]))

def genstate(symbol, rule, idx):
    return StateDict({'symbol': symbol,
            'rule': rule,
            'startidx': idx,
            'position': 0,
            'steps': len(rule),
            'completed': False})
            # 'steps': len(rule)}

def predict(column, colidx, symbol):
    seen = {symbol} # otherwise you can have infinite loop when recursively adding
    toadd = {symbol}
    while toadd:
        cur = toadd.pop()
        
        for rule in grammar.get(cur, []):
            newstate = genstate(cur, rule, colidx)
            column.add(newstate)
            if (recursivesymbol := newstate['rule'][0]) not in seen:
                toadd.add(recursivesymbol)

def scan(column, parsedchar):
    # we are scanning the states in a column to check if we can advance
    advancedstates = []
    for state in column:
        if not state['completed']:
            # print(state, parsedchar, state['rule'][0])
            if state['rule'][state['position']] == parsedchar:
                state = state.copy() # nasty bug, make sure to copy the state, otherwise you change the old state inplace and have the possibility to advance multiple positions during 1 character!
                state['position'] += 1
                if state['position'] == state['steps']:
                    state['completed'] = True
                advancedstates.append(state)
    return advancedstates




# pprint(cols[0])
def parse(grammar, message, start):
    cols = [set() for _ in range(len(message)+1)]
    predict(cols[0], 0, start)
    for colidx in range(1, len(message)+1): # we just populated the 0th col, now the first character has colidx 1
        # print(colidx, message[colidx-1])
        advancedstates = scan(cols[colidx-1], message[colidx-1])
        # print('characted:', message[colidx-1], ':we have advanced on', [s['symbol'] for s in advancedstates])
        
        # check if any of the states are completed, if yes do scan again (loop untill nothing is completed)
        while advancedstates:
            advancedstate = advancedstates.pop()
            cols[colidx].add(advancedstate)
            if advancedstate['completed']:
                # print('we have completion', advancedstate['symbol'])
                # do scan again, looking to advance other states based on the symbol of the completed rule
                toadd = scan(cols[advancedstate['startidx']], advancedstate['symbol'])
                for newadvancedstate in toadd:
                    advancedstates.append(newadvancedstate)
            else:
                # predict new states based on the next expected symbol in the rule
                predict(cols[colidx], colidx, advancedstate['rule'][advancedstate['position']])
        # pprint(cols[colidx])

    for state in cols[colidx]:
        if state['completed'] and state['startidx'] == 0 and state['symbol'] == start:
            return True #('valid')
    else:
        return False
# messages = ['ababbb', 'bababa', 'abbbab', 'aaabbb', 'aaaabbb']
# message = messages[2]
m = 'babbaaaabbbbbbabaaaaabbb'
# message = 'babbaaaa'
# message = 'aaaabb'
# # message = 'aab'
# start = '60'
# # start = '16'


In [83]:
# https://adventofcode.com/2020/day/19
rules, messages = open('cfgloop.txt').read().split('\n\n')
grammar = {}

for rule in rules.split('\n'):
    num, makefrom = rule.split(': ')
    makefrom = makefrom.replace('"', '')
    makefrom = tuple(makefrom.split(' | '))
    makefrom = tuple(tuple(option.split()) for option in makefrom)
    grammar[num] = makefrom
    
messages = messages.split('\n')
s = '0'
ans = 0
for m in messages:
    ans += parse(grammar, m,s)
ans

243

In [84]:
import re
# https://adventofcode.com/2015/day/19
rules, mol = open('cfg2.txt', 'r').read().split('\n\n')
newrules = defaultdict(set)
counter = 1
for line in rules.splitlines():
    first, second = line.split(' => ')
    molecules = tuple(re.findall('[A-Z][^A-Z]*', second))
    newrules[first].add(molecules)

mollie = []
prev = ''
for ch in mol:
    if ch.islower():
        prev += ch
    else:
        if prev:
            mollie.append(prev)
        prev = ch
    
mollie.append(prev)
mollie

['C',
 'Rn',
 'Ca',
 'Ca',
 'Ca',
 'Si',
 'Rn',
 'B',
 'P',
 'Ti',
 'Mg',
 'Ar',
 'Si',
 'Rn',
 'Si',
 'Rn',
 'Mg',
 'Ar',
 'Si',
 'Rn',
 'Ca',
 'F',
 'Ar',
 'Ti',
 'Ti',
 'B',
 'Si',
 'Th',
 'F',
 'Y',
 'Ca',
 'F',
 'Ar',
 'Ca',
 'Ca',
 'Si',
 'Th',
 'Ca',
 'P',
 'B',
 'Si',
 'Th',
 'Si',
 'Th',
 'Ca',
 'Ca',
 'P',
 'Ti',
 'Rn',
 'P',
 'B',
 'Si',
 'Th',
 'Rn',
 'F',
 'Ar',
 'Ar',
 'Ca',
 'Ca',
 'Si',
 'Th',
 'Ca',
 'Si',
 'Th',
 'Si',
 'Rn',
 'Mg',
 'Ar',
 'Ca',
 'P',
 'Ti',
 'B',
 'P',
 'Rn',
 'F',
 'Ar',
 'Si',
 'Th',
 'Ca',
 'Si',
 'Rn',
 'F',
 'Ar',
 'B',
 'Ca',
 'Si',
 'Rn',
 'Ca',
 'P',
 'Rn',
 'F',
 'Ar',
 'P',
 'Mg',
 'Y',
 'Ca',
 'F',
 'Ar',
 'Ca',
 'P',
 'Ti',
 'Ti',
 'Ti',
 'B',
 'P',
 'B',
 'Si',
 'Th',
 'Ca',
 'P',
 'Ti',
 'B',
 'P',
 'B',
 'Si',
 'Rn',
 'F',
 'Ar',
 'B',
 'P',
 'B',
 'Si',
 'Rn',
 'Ca',
 'F',
 'Ar',
 'B',
 'P',
 'Rn',
 'Si',
 'Rn',
 'F',
 'Ar',
 'Rn',
 'Si',
 'Rn',
 'B',
 'F',
 'Ar',
 'Ca',
 'F',
 'Ar',
 'Ca',
 'Ca',
 'Ca',
 'Si',
 'Th',
 'Si',
 'Th',
 

In [85]:
parse(newrules, mollie, 'e')

False