In [None]:
#| default_exp special
from nbdev import *
from nbdev.showdoc import *

# Context-free grammar

> Takes a grammar, converts it into Chomsky Normal Form (CNF), and finds which rules can produce each substring of a list of messages

In [None]:
#| exporti
from collections import defaultdict
from itertools import product

In [None]:
#| export

class CFG():
    """Recognise substrings of messages with a context-free grammar (CYK-style, bottom-up).

    The grammar is passed as a dict. With ``reverse=True`` (the default) it maps a rule
    name to a tuple of alternatives; with ``reverse=False`` it maps a tuple of
    alternatives to the rule name. An alternative is either a tuple of symbols
    (a sequence of rule names) or a plain string (a terminal), e.g.

        {'0': (('4', '1', '5'),), '1': (('2', '3'), ('3', '2')), '4': ('a',), '5': ('b',)}

    Usage:
        cfg = CFG(grammar_dict)
            converts the grammar to Chomsky Normal Form by taking care of options,
            triplets (or longer sequences) and unit productions
        cfg.solve(messages_list)
            returns a dict of substrings with the set of rules that can make them

    Attributes:
        grammar:  dict mapping a right-hand side (string terminal or tuple of symbols)
                  to the set of rule names that produce it. Empty when no grammar is given.
        outcomes: defaultdict(set) mapping a substring to the set of rule names that can
                  produce it. Seeded with the terminals and filled in by ``solve``.
    """
    def __init__(self, grammar = None, reverse = True):
        self.outcomes = defaultdict(set)
        # always define the attribute so an instance built without a grammar is usable
        self.grammar = defaultdict(set)
        if grammar:
            # convert grammar to CNF and add terminals to outcomes
            self.grammar = self.grammar_to_cnf(grammar, reverse)
            # copy the sets so the lookup table never aliases the grammar's own sets
            self.outcomes.update({k: set(v) for k, v in self.grammar.items() if isinstance(k, str)})


    def grammar_to_cnf(self, grammar, reverse):
        """Run the three conversion steps in order and return the converted grammar."""
        grammar = self.to_cnf_remove_options(grammar, reverse)
        grammar = self.to_cnf_remove_triplets(grammar)
        return self.to_cnf_remove_unit_productions(grammar)

    def to_cnf_remove_options(self, grammar, reverse):
        """Flatten alternatives into one entry per right-hand side.

        if reverse change from X : AB to AB : {X}
        if there are options, these are given a separate entry, e.g.
        X : (AB, CD) --> AB : {X} and CD : {X}

        Returns a defaultdict(set) mapping each right-hand side to the set of rules
        that produce it.
        """
        new_grammar = defaultdict(set)
        if reverse:
            for head, options in grammar.items():
                for option in options:
                    new_grammar[option].add(head)
        else:
            for options, head in grammar.items():
                for option in options:
                    new_grammar[option].add(head)
        return new_grammar

    @staticmethod
    def _helper_name(tail):
        # Name of the helper variable that derives exactly the symbol sequence `tail`.
        # It depends only on the sequence itself, so identical tails share one helper
        # and different tails can never be confused with each other.
        return '<' + ' '.join(map(repr, tail)) + '>'

    def to_cnf_remove_triplets(self, grammar):
        """Reduce right-hand sides of three or more symbols to chains of pairs.

        changes X : ABC to
        X : AY, Y : BC
        where Y is a helper variable named after the sequence it derives.
        String keys (terminals) and right-hand sides of at most two symbols are
        copied unchanged.
        """
        new_grammar = defaultdict(set)
        for k, v in grammar.items():
            # a terminal is a string, whatever its length: never split it into symbols
            if isinstance(k, str) or len(k) <= 2:
                new_grammar[k] |= v
                continue
            # X -> k[0] <k[1:]>
            new_grammar[k[0], self._helper_name(k[1:])] |= v
            # <k[i:]> -> k[i] <k[i+1:]> for the symbols in the middle
            for i in range(1, len(k) - 2):
                new_grammar[k[i], self._helper_name(k[i+1:])].add(self._helper_name(k[i:]))
            # the last two symbols form an ordinary pair derived by the last helper
            new_grammar[k[-2:]].add(self._helper_name(k[-2:]))
        return new_grammar


    def to_cnf_remove_unit_productions(self, grammar):
        """Resolve unit productions; the grammar is updated in place and returned.

        step to get to Chomsky Normal Form
        if X : A, everything that produces A also produces X. All rules with a unit
        production to A are taken into account, and chains (X : A, A : B) are
        followed until nothing changes any more.
        """
        # A -> {X, ...} for every unit production X : A (a tuple key holding one symbol)
        unit_heads = defaultdict(set)
        for k, v in grammar.items():
            if not isinstance(k, str) and len(k) == 1:
                unit_heads[k[0]] |= v
        if not unit_heads:
            return grammar

        changed = True
        while changed:  # terminates: sets only grow and the symbol set is finite
            changed = False
            for producers in grammar.values():
                for symbol in list(producers):
                    missing = unit_heads.get(symbol, set()) - producers
                    if missing:
                        producers |= missing
                        changed = True
        return grammar

    def pieces(self, test, l):
        """Return the distinct substrings of `test` of length `l` without an entry in `self.outcomes`."""
        assert isinstance(test, str)
        return {test[i:i+l] for i in range(len(test)-l+1) if test[i:i+l] not in self.outcomes}

    def splitter(self, option):
        """Return every way to split the string `option` into two non-empty parts."""
        assert isinstance(option, str)
        return {(option[:i], option[i:]) for i in range(1, len(option))}

    def check_possible_option(self, option):
        """Return the rules that can produce the split `option` = (left, right).

        Every combination of a rule known for the left part and a rule known for the
        right part is looked up in the grammar; the producing rules are collected.
        """
        # Lookups go through the defaultdict, so a substring that was never derived gets
        # an empty entry here. `pieces` then skips it, i.e. the empty entry records
        # "already examined, nothing produces it".
        first = self.outcomes[option[0]]
        second = self.outcomes[option[1]]
        res = set()
        for potential in product(first, second):
            if potential in self.grammar: res |= self.grammar[potential]
        return res

    def solve(self, messages):
        """Fill `self.outcomes` for all substrings of the messages and return it.

        Substrings are handled from short to long, so the results for both halves of
        every split are available when a longer substring is examined. Progress is
        printed every 100 messages.
        """
        for num, message in enumerate(messages):
            # report the real number of messages handled so far
            if num % 100 == 0: print(num, 'messages done')
            for length in range(2, len(message)+1):
                for piece in self.pieces(message, length):
                    for split in self.splitter(piece):
                        res = self.check_possible_option(split)
                        if res:
                            self.outcomes[piece] |= res
        print('finished all messages, returning dict')
        return self.outcomes
          



In [None]:
#|echo: false
# Render the documentation entries for a selection of CFG methods
show_doc(CFG.to_cnf_remove_options)
show_doc(CFG.solve)
show_doc(CFG.pieces)

<h4 id="CFG.to_cnf_remove_options" class="doc_header"><code>CFG.to_cnf_remove_options</code><a href="__main__.py#L24" class="source_link" style="float:right">[source]</a></h4>

> <code>CFG.to_cnf_remove_options</code>(**`grammar`**, **`reverse`**)



<h4 id="CFG.solve" class="doc_header"><code>CFG.solve</code><a href="__main__.py#L87" class="source_link" style="float:right">[source]</a></h4>

> <code>CFG.solve</code>(**`messages`**)



<h4 id="CFG.pieces" class="doc_header"><code>CFG.pieces</code><a href="__main__.py#L69" class="source_link" style="float:right">[source]</a></h4>

> <code>CFG.pieces</code>(**`test`**, **`l`**)



In [None]:
# The string helpers can be checked on an instance that has no grammar at all
cfg = CFG()
windows_of_three = cfg.pieces('abcde', 3)
two_part_splits = cfg.splitter('abcd')
assert windows_of_three == {'abc', 'bcd', 'cde'}
assert two_part_splits == {('a', 'bcd'), ('ab', 'cd'), ('abc', 'd')}

In [None]:
# Worked example: rule '0' has to be able to produce exactly two of the five messages
grammar = {
    '0': (('4', '1', '5'),),
    '1': (('2', '3'), ('3', '2')),
    '2': (('4', '4'), ('5', '5')),
    '3': (('4', '5'), ('5', '4')),
    '4': ('a',),
    '5': ('b',),
}
messages = ['ababbb', 'bababa', 'abbbab', 'aaabbb', 'aaaabbb']

cfg = CFG(grammar)
out = cfg.solve(messages)

# .get avoids inserting new keys into the returned defaultdict
matched = [m for m in messages if '0' in out.get(m, ())]
assert len(matched) == 2

0 messages done
finished all messages, returning dict


0

In [None]:
# Show the set of producing rules stored for every right-hand side of the converted grammar
cfg.grammar.values()

dict_values([{'0'}, {"{'0'}_0"}, {'1'}, {'1'}, {'2'}, {'2'}, {'3'}, {'3'}, {'4'}, {'5'}])

In [None]:
# Read the input file: a block of rules, a blank line, then one message per line
with open('test.txt') as f:
    rules, messages = f.read().split('\n\n')

grammar = {}
for rule in rules.split('\n'):
    num, makefrom = rule.split(': ')
    terminal = makefrom.strip('"')
    if terminal in ('a', 'b'):
        # terminal rule: CFG expects the alternative to be a plain string
        grammar[num] = (terminal,)
    else:
        # every alternative is its own tuple of symbols, also when there is only one;
        # a flat tuple would be read by CFG as several single-symbol alternatives
        grammar[num] = tuple(tuple(option.split()) for option in makefrom.split(' | '))

messages = messages.split('\n')
cfg = CFG(grammar)
res = cfg.solve(messages)

0 messages done
1000 messages done
2000 messages done
3000 messages done
finished all messages, returning dict
