In [None]:
# Read the file ../../style.css and wrap its contents in an IPython HTML
# display object.
from IPython.core.display import HTML
with open('../../style.css', 'r') as file:
    css = file.read()
# NOTE(review): as the last expression of the cell, this object is presumably
# what Jupyter shows as the cell output, which applies the stylesheet.
HTML(css)

# Implementing an Earley Parser

## A Grammar for Grammars

Our grammar is stored in the file `Grammar.g4`.  This grammar describes the lexical structure of the grammars for the language 
`C` that is contained in the file `c-grammar.g`.

In [None]:
!cat -n Grammar.g4

In [None]:
!cat simple.g

We start by generating both scanner and parser.  

In [None]:
!antlr4 -Dlanguage=Python3 Grammar.g4

In [None]:
from GrammarLexer  import GrammarLexer
from GrammarParser import GrammarParser
import antlr4

The function `parse_grammar` takes a `filename` as its argument and returns the grammar that is stored in the given file.

In [None]:
def parse_grammar(filename):
    """Parse the grammar stored in the file `filename`.

    The file is read with ANTLR's `FileStream`, tokenized by `GrammarLexer`
    and parsed by `GrammarParser`, starting at the parser rule `start`.

    Returns the attribute `g` of the context object produced by `start`.
    NOTE(review): the rest of this notebook indexes the result as a sequence
    of rules whose first element is the head -- confirm against Grammar.g4.
    """
    characters = antlr4.FileStream(filename)
    tokens     = antlr4.CommonTokenStream(GrammarLexer(characters))
    context    = GrammarParser(tokens).start()
    return context.g

In [None]:
parse_grammar('simple.g')

In [None]:
class EarleyItem:
    """An Earley item of the form <A -> alpha • beta, k>.

    Attributes (stored exactly as passed to the constructor):
        mVariable: the variable A on the left-hand side of the rule.
        mAlpha:    the sequence of symbols (strings) in front of the dot.
        mBeta:     the sequence of symbols (strings) following the dot.
        mIndex:    the index k stored with the item.

    Items compare equal iff all four attributes are equal, and they are
    hashable, so they can be stored in sets.
    """
    def __init__(self, variable, alpha, beta, index):
        self.mVariable = variable
        self.mAlpha    = alpha
        self.mBeta     = beta
        self.mIndex    = index

    def __repr__(self):
        """Return the item as '<A -> alpha • beta, k>'.

        The symbols of alpha and of beta are concatenated without separator.
        """
        alphaStr = ''.join(self.mAlpha)
        betaStr  = ''.join(self.mBeta)
        # The dot is written as an escape so that the output does not depend
        # on the encoding in which this source file is stored.
        return f'<{self.mVariable} -> {alphaStr} \u2022 {betaStr}, {self.mIndex}>'

    def __eq__(self, other):
        """Two items are equal iff all four of their attributes are equal."""
        if isinstance(other, EarleyItem):
            return self.mVariable == other.mVariable and \
                   self.mAlpha    == other.mAlpha    and \
                   self.mBeta     == other.mBeta     and \
                   self.mIndex    == other.mIndex
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Hash the same four attributes that __eq__ compares.  The symbol
        # sequences are converted to tuples so that list-valued sequences
        # are hashable, too.
        return hash((self.mVariable,
                     tuple(self.mAlpha),
                     tuple(self.mBeta),
                     self.mIndex))

In [None]:
def isComplete(self):
    """Return True iff no symbol follows the dot, i.e. `mBeta` is empty.

    The test is done on the length of `mBeta`, as in `sameVar`, `scan` and
    `nextVar`, so an empty body is recognized whether it is stored as a
    tuple or as a list.
    """
    return len(self.mBeta) == 0

EarleyItem.isComplete = isComplete
del isComplete

The function `sameVar`$(C)$ checks whether the symbol following the dot is the same as the variable $C$
that is given as argument.

In [None]:
def sameVar(self, C):
    """Return True iff a symbol follows the dot and that symbol equals `C`."""
    if len(self.mBeta) == 0:
        # Nothing follows the dot, hence nothing can match.
        return False
    return self.mBeta[0] == C

EarleyItem.sameVar = sameVar
del sameVar

The function `scan` checks whether the symbol following the dot matches the token $t$
that is given as argument.  The symbol matches if it is either $t$ itself or $t$ enclosed in single quotes.

In [None]:
def scan(self, t):
    """Return True iff the symbol following the dot matches the token `t`.

    The symbol matches if it is equal to `t` or if it is `t` enclosed in
    single quotes.  If no symbol follows the dot, the result is False.
    """
    if len(self.mBeta) == 0:
        return False
    symbol = self.mBeta[0]
    return symbol == t or symbol == "'" + t + "'"

EarleyItem.scan = scan
del scan

Return the name of the variable following the dot.  If there is no variable 
following the dot, return `None`.

In [None]:
def nextVar(self):
    """Return the variable following the dot, or None if there is none.

    The symbol following the dot counts as a variable if it does not start
    with a single quote and `str.islower` holds for it.
    """
    if len(self.mBeta) == 0:
        return None
    symbol = self.mBeta[0]
    quoted = symbol[0] == "'"
    if quoted or not symbol.islower():
        return None
    return symbol

EarleyItem.nextVar = nextVar
del nextVar

In [None]:
def moveDot(self):
    """Return a new item in which the dot has moved over the next symbol.

    The first symbol of `mBeta` is appended to `mAlpha`; the variable and
    the index are taken over unchanged.  This item itself is not modified.
    Raises an IndexError if no symbol follows the dot.
    """
    crossed   = self.mBeta[0]
    remaining = self.mBeta[1:]
    return EarleyItem(self.mVariable, self.mAlpha + (crossed,), remaining, self.mIndex)

EarleyItem.moveDot = moveDot
del moveDot

In [None]:
class Grammar:
    """A grammar, represented by its rules.

    `mRules` holds the object passed as `Rules`.  The other parts of this
    notebook treat it as a sequence of rules, where the first element of a
    rule is its head and the remaining elements form its body.
    """
    def __init__(self, Rules):
        self.mRules = Rules

In [None]:
def startItem(self):
    """Return the item <Start -> • s, 0>, where s is the start variable."""
    start_symbol = self.startVar()
    return EarleyItem('Start', (), (start_symbol,), 0)

Grammar.startItem = startItem
del startItem

In [None]:
def finishItem(self):
    """Return the item <Start -> s •, 0>, where s is the start variable."""
    start_symbol = self.startVar()
    return EarleyItem('Start', (start_symbol,), (), 0)

Grammar.finishItem = finishItem
del finishItem

The first rule has to start with the start variable.

In [None]:
def startVar(self):
    """Return the start variable, i.e. the head of the first grammar rule."""
    first_rule = self.mRules[0]
    return first_rule[0]

Grammar.startVar = startVar
del startVar

In [None]:
def toString(self):
    """Return the grammar as a string with one line 'head: body;' per rule.

    The body is formatted as the Python list of the symbols of the rule
    that follow the head.
    """
    return ''.join(f'{head}: {body};\n' for head, *body in self.mRules)

Grammar.__str__ = toString
del toString

In [None]:
class EarleyParser:
    """State of an Earley parser for a given grammar and a given token list.

    Attributes:
        mGrammar:   the grammar that was passed to the constructor.
        mString:    the token list with a leading None, so that the first
                    token is stored at index 1.
        mStateList: a list of len(TokenList) + 1 sets of items; the set at
                    index 0 initially contains the start item of the
                    grammar, all other sets are initially empty.
    """
    def __init__(self, grammar, TokenList):
        """Store grammar and tokens, print both, and initialize the states."""
        self.mGrammar = grammar
        # The leading None shifts the tokens by one position.
        self.mString = [None] + TokenList
        self.mStateList = [set() for _ in range(len(TokenList) + 1)]
        print('Grammar:\n')
        print(self.mGrammar)
        print(f'Input: {self.mString}\n')
        self.mStateList[0] = {self.mGrammar.startItem()}

This method implements Earley's algorithm.  For all states $Q_i$ we 
apply the completion operation followed by the prediction operation.
This is done until no more items are added to $Q_i$.  The inner `while`
loop is not necessary if the grammar does not contain $\varepsilon$-rules.  
Finally, the scanning operation is applied to $Q_i$.

In [None]:
def parse(self):
    """Run Earley's algorithm and print a trace of the computation.

    For every index i of `mString`, completion and prediction are applied
    to state i until neither of them reports a change; then the scanning
    operation is applied.  After each of these steps all states are
    printed.  Finally, 'Parsing successful!' is printed if the finish item
    of the grammar is a member of the last state, and 'Parsing failed!'
    otherwise.  Nothing is returned.
    """
    length = len(self.mString)
    ruler  = '_' * 80
    for i in range(length):
        if i + 1 < length:
            print(ruler)
            print(f'next token = {self.mString[i+1]}')
            print(ruler)
        # Both operations are executed in every round, even if the first
        # one has already reported a change.
        while True:
            completed = self.complete(i)
            predicted = self.predict(i)
            if not (predicted or completed):
                break
        self.scan(i)
        # Print all states; a separate index keeps the step index i intact.
        for k in range(length):
            print(f'\nQ{k}:')
            for item in self.mStateList[k]:
                print(item)
    accepted = self.mGrammar.finishItem() in self.mStateList[-1]
    print('Parsing successful!' if accepted else 'Parsing failed!')

EarleyParser.parse = parse
del parse

Apply the completion operation on the state $Q_i$.  The parameter $i$
is the index of the state.

In [None]:
def complete(self, i):
    """Apply the completion operation to the state with index `i`.

    For every complete item <C -> gamma •, j> in state i and every item in
    state j that has the variable C following the dot, the item with the
    dot moved over C is added to state i.  This is repeated until a pass
    adds no new item, so items that become complete through a completion
    are processed in the same call.  The new items of every pass are
    printed.

    Returns True if at least one item has been added to state i, and
    False otherwise.
    """
    change = False
    added  = True
    Qi     = self.mStateList[i]
    while added:
        added = False
        # Collect the new items in a separate set, since Qi (and Qj, which
        # is the same set when j == i) must not change during iteration.
        newQi = set()
        for item in Qi:
            if item.isComplete():
                C  = item.mVariable
                Qj = self.mStateList[item.mIndex]
                for newItem in Qj:
                    if newItem.sameVar(C):
                        newQi.add(newItem.moveDot())
        if not (newQi <= Qi):
            change = True
            added  = True
            print("completion:")
            for newItem in newQi:
                if newItem not in Qi:
                    print(newItem)
            self.mStateList[i] |= newQi
            Qi = self.mStateList[i]
    # The result is returned only after the loop has reached its fixpoint.
    return change
    
EarleyParser.complete = complete
del complete

Apply the prediction operation to the state $Q_i$.  The parameter $i$
is the index of the state.

In [None]:
def predict(self, i):
    """Apply the prediction operation to the state with index `i`.

    For every item in state i that has a variable c following the dot and
    every grammar rule with head c, the item <c -> • body, i> is added to
    state i.  This is repeated until a pass adds no new item.  The new
    items of every pass are printed.

    Returns True if at least one item has been added to state i, and
    False otherwise.
    """
    changed = False
    while True:
        state     = self.mStateList[i]
        predicted = set()
        for item in state:
            var = item.nextVar()
            if var is None:
                continue
            for rule in self.mGrammar.mRules:
                if var == rule[0]:
                    predicted.add(EarleyItem(var, (), rule[1:], i))
        if predicted <= state:
            # Nothing new has been predicted: the fixpoint is reached.
            return changed
        changed = True
        print("prediction:")
        for fresh in predicted:
            if fresh not in state:
                print(fresh)
        self.mStateList[i] |= predicted

EarleyParser.predict = predict
del predict

Apply the scanning operation on the state $Q_i$.

In [None]:
def scan(self, i):
    """Apply the scanning operation to the state with index `i`.

    If there is a token at position i + 1 of `mString`, then for every item
    in state i whose `scan` method accepts this token, the item with the
    dot moved over the token is added to state i + 1 and printed.
    If there is no such token, nothing happens.
    """
    current = self.mStateList[i]
    if i + 1 >= len(self.mString):
        return
    token = self.mString[i+1]
    for item in current:
        if not item.scan(token):
            continue
        advanced = item.moveDot()
        self.mStateList[i+1].add(advanced)
        print('scanning:')
        print(advanced)

EarleyParser.scan = scan
del scan

In [None]:
import re

In [None]:
def tokenize(s):
    """Split the string `s` into a list of tokens.

    `s` is expected to be an arithmetic expression.  Blanks and tabs are
    dropped, every number yields the token 'NUMBER', parentheses and the
    operators +, -, *, / yield themselves, and any other character c that
    is matched yields the token 'ERROR(c)'.
    """
    lexSpec = r'''([ \t]+)        |  # blanks and tabs
                  ([1-9][0-9]*|0) |  # number
                  ([()])          |  # parentheses 
                  ([-+*/])        |  # arithmetical operators
                  (.)                # unrecognized character
               '''
    tokens = []
    # Every match is a 5-tuple in which exactly one group is non-empty.
    for blank, num, paren, op, bad in re.findall(lexSpec, s, re.VERBOSE):
        if blank:
            continue
        if num:
            tokens.append('NUMBER')
        elif paren:
            tokens.append(paren)
        elif op:
            tokens.append(op)
        else:
            tokens.append(f'ERROR({bad})')
    return tokens

In [None]:
tokenize('1 + 2 * 3')

In [None]:
def test(file, word):
    """Parse the string `word` with the grammar stored in the file `file`.

    The grammar is read with `parse_grammar`, `word` is split into tokens
    with `tokenize`, and an `EarleyParser` built from both is run.  The
    trace and the result are printed by the parser; nothing is returned.
    """
    grammar = Grammar(parse_grammar(file))
    parser  = EarleyParser(grammar, tokenize(word))
    parser.parse()

In [None]:
test('simple.g', '1 + 2 * 3')

The commands below clean the directory.  If you are running Windows, you have to replace `rm` with `del`.

In [None]:
!rm GrammarLexer.* GrammarParser.* Grammar.tokens GrammarListener.py Grammar.interp
!rm -r __pycache__

In [None]:
!ls