In [5]:
#| default_exp cfg
from nbdev import *
from nbdev.showdoc import *

# Context free grammar

> Takes a grammar, converts it into Chomsky Normal Form (CNF)

In [6]:
#| exporti
from collections import defaultdict
from itertools import product

In [64]:
#| export

class CFG():
    """Takes a grammer as dict with tuple of options as values. Terminal values should not be in a tuple but as a string
    Usage:
        cfg = CFG(grammar_dict)
            reverse as optional parameter when k,v are reversed
            converts the grammar to Chomsky Normal form by taking care of options, unit productions and triplets
        cfg.solve(messages_list) returns dict of substrings with possible rules to make them
    """
    def __init__(self, grammar, terminals = None, reverse = True):
        self.outcomes = defaultdict(set)
        self.extra = 1
        if not terminals:
            print('no terminals specified!')
            sys.exit()
        self.terminals = terminals
        self.grammar = grammar
        # convert grammar to CNF and add terminals to outcomes
        self.grammar_to_cnf(reverse)
        assert all(isinstance(v, set) for k,v in self.grammar.items()), 'not all set'
        assert all(isinstance(option, tuple) for k,v in self.grammar.items() for option in v), 'not all tuples'
        assert all(isinstance(el, str) for k,v in self.grammar.items() for option in v for el in option), 'not all strings'
        assert  all(1 <= len(option) <=2 for k,v in self.grammar.items() for option in v), 'len not 1 or 2'
                    
        self.finalgrammar = defaultdict(set)
        for k,v in self.grammar.items():
            for option in v:
                self.finalgrammar[option].add(k)

        for t in self.terminals:
            if (t,) in self.finalgrammar:
                self.outcomes[t] = self.finalgrammar[(t,)]
        print('outcomes after grammar', self.outcomes)
                


    def grammar_to_cnf(self, reverse):
        self.grammar = self.to_cnf_remove_options(self.grammar, reverse)
        # eliminate unit rules
        self.grammar = self.to_cnf_remove_triplets(self.grammar)
        self.grammar = self.to_cnf_remove_unit_productions(self.grammar)

    def to_cnf_remove_options(self, grammar, reverse):
        # if reverse change from X : AB to AB : {X}
        # if there are options, these are given a separate entry, e.g.
        # X : (AB, CD) --> X: AB and X: CD
        new_grammar = defaultdict(set)
        
        for k,v in grammar.items():
            for option in v:
                print(k,v, 'option', option)
                new_grammar[k].add(option)
        return new_grammar
        
    def to_cnf_remove_triplets(self, grammar):
        # reduces triplets or larger to pairs
        # changes X : ABC to
        # X: AY, Y = BC
        new_grammar = defaultdict(set)
        for k,v in grammar.items():
            for option in v:
                if len(option) > 2:
                    option = list(option)
                    while len(option) > 2:
                        new_grammar['extra' + str(self.extra)].add(tuple(option[1:3]))
                        option[1:3] = ['extra' + str(self.extra)]
                        self.extra += 1
                    new_grammar[k].add(tuple(option))                    
                else:
                    new_grammar[k].add(option)
        return new_grammar
           
    
    def to_cnf_remove_unit_productions(self,grammar):
        # step to get to Chomsky Normal Form
        # if X : A, duplicate all A : Y with X : Y
        found = True
        while found:
            found = False
            for k,v in grammar.items():
                to_remove = set()
                singulars = {option[0] for option in v if len(option) == 1 and option[0] not in self.terminals}
                if singulars:
                    found = True
                grammar[k] = {option for option in grammar[k] if len(option) != 1 or option[0] not in singulars}
                for singular in singulars:
                    grammar[k] |= grammar[singular]
                        
        return grammar

    def pieces(self, test,l):
        # gets all possibilities of len l out of a string
        assert isinstance(test, str)
        return {test[i:i+l] for i in range(len(test)-l+1) if test[i:i+l] not in self.outcomes}

    def splitter(self,option):
        # splits string into all options of two substrings
        assert isinstance(option, str)
        return {(option[:i], option[i:]) for i in range(1,len(option))}

    def check_possible_option(self, option):
        first = self.outcomes.get(option[0],set())
        second = self.outcomes.get(option[1],set())
        res = set()
        for potential in product(first,second):
            if potential in self.finalgrammar:
                res |= self.finalgrammar[potential]

        return res

    def solve(self, messages):
        # takes a list of messages and returns all possibilities for the substrings of m
        # print(self.outcomes)
        for num, m in enumerate(messages):
            # print(m, len(m))
            if num % 100 == 0: print(num*10, 'messages done')
            for i in range(1,len(m)+1):
                # print(i)
                for j in self.pieces(m, i):
                    # print('  ' + j)
                    for k in self.splitter(j):
                        # print('    ', k)
                        res = self.check_possible_option(k)
                        if res:
                            # print('      yes')
                            self.outcomes[j] |= res # this was a bug
                            
        print('finished all messages, returning dict')
        return self.outcomes
          



In [65]:
grammar = {'0': (('4', '1', '5'),),
 '1': (('2', '3'), ('3', '2')),
 '2': (('4', '4'), ('5', '5')),
 '3': (('4', '5'), ('5', '4')),
 '4': (('a',),),
 '5': (('b',),)}

messages = ['ababbb', 'bababa', 'abbbab', 'aaabbb', 'aaaabbb']
cfg = CFG(grammar, terminals = {'a', 'b'})
out = cfg.solve(messages)
assert sum([1 for m in messages if (m in out) and ('0' in out[m])]) == 2

0 (('4', '1', '5'),) option ('4', '1', '5')
1 (('2', '3'), ('3', '2')) option ('2', '3')
1 (('2', '3'), ('3', '2')) option ('3', '2')
2 (('4', '4'), ('5', '5')) option ('4', '4')
2 (('4', '4'), ('5', '5')) option ('5', '5')
3 (('4', '5'), ('5', '4')) option ('4', '5')
3 (('4', '5'), ('5', '4')) option ('5', '4')
4 (('a',),) option ('a',)
5 (('b',),) option ('b',)
outcomes after grammar defaultdict(<class 'set'>, {'a': {'4'}, 'b': {'5'}})
0 messages done
finished all messages, returning dict


In [66]:
# https://adventofcode.com/2020/day/19
rules, messages = open('cfgloop.txt').read().split('\n\n')
grammar = {}

for rule in rules.split('\n'):
    num, makefrom = rule.split(': ')
    makefrom = makefrom.replace('"', '')
    makefrom = tuple(makefrom.split(' | '))
    makefrom = tuple(tuple(option.split()) for option in makefrom)
    grammar[num] = makefrom
    
messages = messages.split('\n')
# messages = ['babbaaaabbbbbbabaaaaabbb']
cfg = CFG(grammar, terminals = {'a', 'b'})
res = cfg.solve(messages)
sum('0' in v and k in messages for k,v in cfg.outcomes.items())

97 (('138', '57'), ('12', '83')) option ('138', '57')
97 (('138', '57'), ('12', '83')) option ('12', '83')
131 (('20', '83'), ('74', '57')) option ('20', '83')
131 (('20', '83'), ('74', '57')) option ('74', '57')
7 (('57', '110'), ('83', '51')) option ('57', '110')
7 (('57', '110'), ('83', '51')) option ('83', '51')
48 (('17', '83'), ('56', '57')) option ('17', '83')
48 (('17', '83'), ('56', '57')) option ('56', '57')
2 (('83', '57'),) option ('83', '57')
40 (('57', '101'), ('83', '93')) option ('57', '101')
40 (('57', '101'), ('83', '93')) option ('83', '93')
16 (('12', '83'), ('47', '57')) option ('12', '83')
16 (('12', '83'), ('47', '57')) option ('47', '57')
42 (('15', '83'), ('66', '57')) option ('15', '83')
42 (('15', '83'), ('66', '57')) option ('66', '57')
62 (('83', '134'), ('57', '18')) option ('83', '134')
62 (('83', '134'), ('57', '18')) option ('57', '18')
55 (('124', '57'), ('45', '83')) option ('124', '57')
55 (('124', '57'), ('45', '83')) option ('45', '83')
1 (('57', '

In [68]:
import re
# https://adventofcode.com/2015/day/19
rules, mol = open('cfg2.txt', 'r').read().split('\n\n')
newrules = defaultdict(set)
for line in rules.splitlines():
    first, second = line.split(' => ')
    newrules[first].add(tuple(re.findall('[A-Z][^A-Z]*', second)))
    
terminals = set()
mollie = []
prev = ''
for ch in mol:
    if ch.isupper():
        if prev:
            terminals.add(prev)
            mollie.append(prev)
        prev = ch
    else:
        prev += ch
    terminals.add(prev)
    mollie.append(prev)
print(terminals)
terminals -= set(newrules)
cfg = CFG(newrules, terminals)
cfg.solve(mollie)

{'Si', 'R', 'P', 'S', 'Ti', 'Ar', 'Mg', 'T', 'Th', 'Al', 'Y', 'Ca', 'Rn', 'B', 'A', 'M', 'F', 'C'}
Al {('Th', 'F'), ('Th', 'Rn', 'F', 'Ar')} option ('Th', 'F')
Al {('Th', 'F'), ('Th', 'Rn', 'F', 'Ar')} option ('Th', 'Rn', 'F', 'Ar')
B {('Ti', 'B'), ('B', 'Ca'), ('Ti', 'Rn', 'F', 'Ar')} option ('Ti', 'B')
B {('Ti', 'B'), ('B', 'Ca'), ('Ti', 'Rn', 'F', 'Ar')} option ('B', 'Ca')
B {('Ti', 'B'), ('B', 'Ca'), ('Ti', 'Rn', 'F', 'Ar')} option ('Ti', 'Rn', 'F', 'Ar')
Ca {('Si', 'Rn', 'Mg', 'Ar'), ('Si', 'Th'), ('P', 'B'), ('P', 'Rn', 'F', 'Ar'), ('Ca', 'Ca'), ('Si', 'Rn', 'F', 'Y', 'F', 'Ar')} option ('Si', 'Rn', 'Mg', 'Ar')
Ca {('Si', 'Rn', 'Mg', 'Ar'), ('Si', 'Th'), ('P', 'B'), ('P', 'Rn', 'F', 'Ar'), ('Ca', 'Ca'), ('Si', 'Rn', 'F', 'Y', 'F', 'Ar')} option ('Si', 'Th')
Ca {('Si', 'Rn', 'Mg', 'Ar'), ('Si', 'Th'), ('P', 'B'), ('P', 'Rn', 'F', 'Ar'), ('Ca', 'Ca'), ('Si', 'Rn', 'F', 'Y', 'F', 'Ar')} option ('P', 'B')
Ca {('Si', 'Rn', 'Mg', 'Ar'), ('Si', 'Th'), ('P', 'B'), ('P', 'Rn', 'F', 'Ar'),

defaultdict(set, {})

In [69]:
possible = set()
pnt = 0
for pnt in range(len(mol)):
    if mol[pnt].isupper():
        right = pnt + 1
        if pnt != len(mol) -1 and mol[pnt+1].islower():
            right += 1
        for option in newrules[mol[pnt:right]]:
            antwoord = mol[:pnt] + ''.join(option) + mol[right:]
            possible.add(antwoord)
len(possible)

535