In [None]:
from IPython.core.display import HTML
# Read the stylesheet text and wrap it in an IPython HTML object; as the last
# expression of the cell, that object is the cell's result.
with open('../../style.css', 'r') as file:
    css = file.read()
HTML(css)

# Implementing an Earley Parser

## A Grammar for Grammars

Earley's algorithm has two inputs:
- a grammar $G$ and
- a string $s$.

It then checks whether the string $s$ can be parsed with the given grammar.

In order to input the grammar in a natural way, we first have to develop a parser for grammars.
An example grammar that we want to parse is stored in the file `simple.g`.

In [None]:
!cat simple.g

We use <span style="font-variant:small-caps;">Antlr</span> to develop a parser for this Grammar.  
The pure grammar to parse this type of grammar is stored in
the file `Pure.g4`.

In [None]:
!cat Pure.g4

The annotated grammar is stored in the file `Grammar.g4`. 

In [None]:
!cat -n Grammar.g4

We start by generating both scanner and parser.  

In [None]:
!antlr4 -Dlanguage=Python3 Grammar.g4

In [None]:
from GrammarLexer  import GrammarLexer
from GrammarParser import GrammarParser
import antlr4

The function `parse_grammar` takes a `filename` as its argument and returns the grammar that is stored in the given file.  The grammar is represented as a list of rules.  Each rule is represented as a tuple.  The example below will clarify this structure.

In [None]:
def parse_grammar(filename):
    """Parse the grammar stored in the file `filename`.

    The file is read as a character stream, tokenized by `GrammarLexer` and
    parsed by `GrammarParser` starting at the rule `start`.  The attribute `g`
    of the resulting parse context is returned.
    """
    chars  = antlr4.FileStream(filename)
    tokens = antlr4.CommonTokenStream(GrammarLexer(chars))
    tree   = GrammarParser(tokens).start()
    return tree.g

In [None]:
# Example: parse the grammar stored in the file simple.g.
parse_grammar('simple.g')

## Earley's Algorithm

Given a context-free grammar $G = \langle V, \Sigma, R, S \rangle$ and a string $s = x_1x_2 \cdots x_n \in \Sigma^*$ of length $n$, 
an *Earley item* is a pair of the form
$$\langle A \rightarrow \alpha \bullet \beta, k \rangle$$
such that 
- $(A \rightarrow \alpha \beta) \in R\quad$  and
- $k \in \{0,1,\cdots,n\}$. 

The class `EarleyItem` represents a single *Earley item*.  
- `mVariable` is the variable $A$,
- `mAlpha` is $\alpha$,
- `mBeta` is $\beta$, and
- `mIndex` is $k$.

Since we later have to store objects of class `EarleyItem` in sets, we have to implement the functions
- `__eq__`,
- `__ne__`,
- `__hash__`.

It is easiest to implement `__hash__` by first converting the object into a string.  Hence we also
implement the function `__repr__`, that converts an `EarleyItem` into a string.

In [None]:
class EarleyItem():
    """An Earley item <A → α • β, k>.

    mVariable holds A, mAlpha the symbols α in front of the dot, mBeta the
    symbols β following the dot and mIndex the index k.  Items compare equal
    iff all four components are equal, and they are hashable so that they can
    be stored in sets.
    """
    def __init__(self, variable, alpha, beta, index):
        self.mVariable, self.mAlpha = variable, alpha
        self.mBeta, self.mIndex     = beta, index

    def __eq__(self, other):
        # Anything that is not an EarleyItem is never equal to an item.
        if not isinstance(other, EarleyItem):
            return False
        mine   = (self.mVariable,  self.mAlpha,  self.mBeta,  self.mIndex)
        theirs = (other.mVariable, other.mAlpha, other.mBeta, other.mIndex)
        return mine == theirs

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Equal items have equal string representations, hence equal hashes.
        return hash(repr(self))

    def __repr__(self):
        before = ' '.join(self.mAlpha)
        after  = ' '.join(self.mBeta)
        return f'<{self.mVariable} → {before} • {after}, {self.mIndex}>'

Given an Earley item `self`, the function `isComplete` checks, whether the Earley item `self` has the form
$$\langle A \rightarrow \alpha \bullet, k \rangle,$$
i.e. whether the $\bullet$ is at the end of the grammar rule.

In [None]:
def isComplete(self):
    """Return True iff the dot is at the end of the rule, i.e. β is empty.

    Emptiness is tested via the length of `mBeta`, exactly as `sameVar`,
    `scan` and `nextVar` do, so the check does not depend on `mBeta` being
    a tuple: any empty sequence counts as "nothing left to recognize".
    """
    return len(self.mBeta) == 0

# Install the function as a method of EarleyItem, then drop the module-level name.
EarleyItem.isComplete = isComplete
del isComplete

The function `sameVar(self, C)` checks, whether the item following the dot is the same as the variable 
given as argument, i.e. `sameVar(self, C)` returns `True` if `self` is an Earley item of the form
$$\langle A \rightarrow \alpha \bullet C\beta, k \rangle.$$

In [None]:
def sameVar(self, C):
    """Return True iff the symbol directly following the dot is equal to `C`.

    An item whose dot is at the end of the rule never matches.
    """
    if not self.mBeta:
        return False
    return self.mBeta[0] == C

# Install the function as a method of EarleyItem, then drop the module-level name.
EarleyItem.sameVar = sameVar
del sameVar

The function `scan(self, t)` checks, whether the item following the dot matches the token `t`, 
i.e. `scan(self, t)` returns `True` if `self` is an Earley item of the form
$$\langle A \rightarrow \alpha \bullet t\beta, k \rangle.$$
The argument $t$ can either be the name of a token or a literal.

In [None]:
def scan(self, t):
    """Return True iff the symbol directly following the dot matches `t`.

    `t` matches either as a token name (the symbol is `t` itself) or as a
    literal (the symbol is `t` enclosed in single quotes).  An item whose dot
    is at the end of the rule never matches.
    """
    if not self.mBeta:
        return False
    symbol = self.mBeta[0]
    return symbol == t or symbol == "'" + t + "'"

# Install the function as a method of EarleyItem, then drop the module-level name.
EarleyItem.scan = scan
del scan

Given an Earley item, the function `nextVar` returns the name of the variable following the dot.  If the symbol following the dot is not a variable, or if there is no symbol following the dot, the function returns `None`.  The function can distinguish variables from token names because variable names consist only of lower case letters.

In [None]:
def nextVar(self):
    """Return the variable directly following the dot, or None.

    None is returned if the dot is at the end of the rule, if the next symbol
    is a literal (it starts with a single quote) or if the next symbol is not
    written in lower case (i.e. it is a token name).
    """
    if not self.mBeta:
        return None
    symbol     = self.mBeta[0]
    is_literal = symbol[0] == "'"
    if is_literal or not symbol.islower():
        return None
    return symbol

# Install the function as a method of EarleyItem, then drop the module-level name.
EarleyItem.nextVar = nextVar
del nextVar

The function `moveDot(self)` moves the $\bullet$ in the Earley item `self`, where `self` has the form 
$$\langle A \rightarrow \alpha \bullet \beta, k \rangle$$
over the next variable, token, or literal in $\beta$.  It assumes that $\beta$ is not empty.

In [None]:
def moveDot(self):
    """Return a new item in which the dot has been moved over the next symbol.

    The first symbol of β is appended to α; variable and index are kept.
    β must not be empty.
    """
    symbol    = self.mBeta[0]
    seen      = self.mAlpha + (symbol,)
    remaining = self.mBeta[1:]
    return EarleyItem(self.mVariable, seen, remaining, self.mIndex)

# Install the function as a method of EarleyItem, then drop the module-level name.
EarleyItem.moveDot = moveDot
del moveDot

The class `Grammar` represents a context free grammar.  It stores a list of the rules of the grammar.
Each grammar rule of the form
$$ a \rightarrow \beta $$
is stored as the tuple $(a,) + \beta$.  The start symbol is assumed to be the variable on the left hand side of
the first rule. To distinguish syntactical variables from tokens, variables contain only lower case letters,
while tokens either contain only upper case letters or they start and end with a single quote character "`'`".

In [None]:
class Grammar():
    """A context-free grammar, represented by its rules."""
    def __init__(self, Rules):
        # The rules are stored exactly as given; the other methods index them
        # as sequences whose first element is the left hand side of the rule.
        self.mRules = Rules   

The function `startItem` returns the Earley item
$$ \langle\hat{S} \rightarrow \bullet S, 0\rangle $$
where $S$ is the start variable of the given grammar and $\hat{S}$ is a new variable.

In [None]:
def startItem(self):
    """Return the item <Start → • S, 0>, where S is the start variable."""
    start_symbol = self.startVar()
    return EarleyItem('Start', (), (start_symbol,), 0)

# Install the function as a method of Grammar, then drop the module-level name.
Grammar.startItem = startItem
del startItem

The function `finishItem` returns the Earley item
$$ \langle\hat{S} \rightarrow S \bullet, 0\rangle $$
where $S$ is the start variable of the given grammar and $\hat{S}$ is a new variable.

In [None]:
def finishItem(self):
    """Return the item <Start → S •, 0>, where S is the start variable."""
    start_symbol = self.startVar()
    return EarleyItem('Start', (start_symbol,), (), 0)

# Install the function as a method of Grammar, then drop the module-level name.
Grammar.finishItem = finishItem
del finishItem

The function `startVar` returns the start variable of the grammar.  It is assumed that
the left hand side of the first rule of the grammar is the start variable.

In [None]:
def startVar(self):
    """Return the start variable, i.e. the left hand side of the first rule."""
    first_rule = self.mRules[0]
    return first_rule[0]

# Install the function as a method of Grammar, then drop the module-level name.
Grammar.startVar = startVar
del startVar

The function `toString` creates a readable presentation of the grammar rules.

In [None]:
def toString(self):
    """Return a textual listing of the rules, one line per rule.

    Every line has the form `head: body;`, where `body` is the list of the
    symbols on the right hand side of the rule.
    """
    return ''.join(f'{head}: {body};\n' for head, *body in self.mRules)

# Use the function as the string conversion of Grammar, then drop the module-level name.
Grammar.__str__ = toString
del toString

The class `EarleyParser` implements the [parsing algorithm of Jay Earley](https://en.wikipedia.org/wiki/Earley_parser).
The class maintains the following member variables:
- `mGrammar` is the grammar that is used to parse the given token string.
- `mString` is the list of tokens and literals that has to be parsed.

   As a hack, the first element of this list is `None`.  
   Therefore, `mString[i]` is the `i`th token.
- `mStateList` is a list of sets of *Earley items*.  If $n$ is the length of the given token string
  (excluding the first element `None`), then $Q_i = \texttt{mStateList}[i]$ for $i \in \{0, 1, \cdots, n\}$. 
  The idea is that the set $Q_i$ is the set of those *Earley items* that the parser could be in 
  when it has read the tokens `mString[1]`, $\cdots$,  `mString[i]`.  $Q_0$ is initialized as follows:
  $$ Q_0 = \bigl\{\langle\hat{S} \rightarrow \bullet S, 0\rangle\bigr\}. $$
  
The *Earley items* are interpreted as follows: If we have
$$ \langle C \rightarrow \alpha \bullet \beta, k\rangle \in Q_i, $$
then we know the following:
- After having read the tokens `mString[:k+1]` the parser tries to parse the variable $C$
  in the token string `mString[k+1:]`.
- After having read the token string `mString[k+1:i+1]` the parser has already recognized $\alpha$
  and now needs to recognize $\beta$ in the token string `mString[i+1:]` in order to parse the variable $C$.

In [None]:
class EarleyParser():
    """State of an Earley parser for a fixed grammar and a fixed token list.

    mGrammar is the grammar, mString the token list preceded by None, and
    mStateList a list of item sets with one set per input position.
    """
    def __init__(self, grammar, TokenList):
        self.mGrammar   = grammar
        # Pad with None so that mString[1] is the first token.
        self.mString    = [None] + TokenList
        # One initially empty set of items for each of the positions 0, ..., n.
        self.mStateList = [set() for _ in range(1 + len(TokenList))]
        print('Grammar:\n')
        print(grammar)
        print(f'Input: {self.mString}\n')
        # Q0 starts out containing only the start item of the grammar.
        self.mStateList[0].add(grammar.startItem())

The method `parse` implements Earley's algorithm.  For all states 
$Q_0$, $\cdots$, $Q_n$ we proceed as follows:
- We apply the completion operation followed by the prediction operation.
  This is done until no more items are added to $Q_i$.  
  
  (The inner `while` loop is not necessary if the grammar does not contain $\varepsilon$-rules.)
- Finally, the scanning operation is applied to $Q_i$.

After $Q_i$ has been computed, we proceed to compute $Q_{i+1}$.
Parsing is successful iff
$$ \langle\hat{S} \rightarrow S \bullet, 0\rangle \in Q_n $$

In [None]:
def parse(self):
    """Run Earley's algorithm on the stored token list and print a trace.

    For every position i = 0, ..., n completion and prediction are applied to
    Q_i until neither of them adds an item; then the scanning operation is
    applied.  After each position the sets Q_i and (if it exists) Q_{i+1} are
    printed.  Finally a message states whether the finish item of the grammar
    is contained in the last set.
    """
    separator = '_' * 80
    last      = len(self.mString) - 1  # mString[0] is the padding element None
    for i in range(last + 1):
        has_next   = i < last
        next_token = self.mString[i + 1] if has_next else 'EOF'
        print(separator)
        print(f'next token = {next_token}')
        print(separator)
        # Both operations are always run; stop once neither reports a change.
        while True:
            completed = self.complete(i)
            predicted = self.predict(i)
            if not (completed or predicted):
                break
        self.scan(i)
        # Show the current set and the set that scanning has filled.
        print(f'\nQ{i}:')
        for item in self.mStateList[i]:
            print(item)
        if has_next:
            print(f'\nQ{i+1}:')
            for item in self.mStateList[i + 1]:
                print(item)
    accepted = self.mGrammar.finishItem() in self.mStateList[-1]
    print('Parsing successful!' if accepted else 'Parsing failed!')

# Install the function as a method of EarleyParser, then drop the module-level name.
EarleyParser.parse = parse
del parse

The method `complete(self, i)` applies the completion operation to the state $Q_i$:
If we have
- $\langle C \rightarrow \gamma \bullet, j\rangle \in Q_i$ and
- $\langle A \rightarrow \beta \bullet C \delta, k\rangle \in Q_j$,
then the parser tried to parse the variable $C$ after having read `mString[:j+1]`
and we know that 
$$ C \Rightarrow^* \texttt{mString[j+1:i+1]}, $$
i.e. the parser has recognized $C$ after having read `mString[j+1:i+1]`.
Therefore the parser should proceed to recognize $\delta$ in state $Q_i$.
Therefore we add the *Earley item* $\langle A \rightarrow \beta C \bullet \delta,k\rangle$ to the set $Q_i$:
$$\langle C \rightarrow \gamma \bullet, j\rangle \in Q_i \wedge
  \langle A \rightarrow \beta \bullet C \delta, k\rangle \in Q_j \;\rightarrow\;
          Q_i := Q_i \cup \bigl\{ \langle A \rightarrow \beta C \bullet \delta, k\rangle \bigr\}
$$



In [None]:
def complete(self, i):
    """Apply the completion operation to the state Q_i until a fixpoint is reached.

    For every complete item <C → γ •, j> in Q_i and every item
    <A → β • C δ, k> in Q_j the item <A → β C • δ, k> is added to Q_i.
    Items added this way may be complete themselves, therefore the step is
    repeated until a pass produces nothing new.  Every pass that adds items
    is reported on standard output.

    Returns True if at least one item has been added to Q_i, else False.
    """
    change = False
    Qi     = self.mStateList[i]
    while True:
        newQi = set()
        for item in Qi:
            if item.isComplete():
                C  = item.mVariable
                Qj = self.mStateList[item.mIndex]
                for candidate in Qj:
                    if candidate.sameVar(C):
                        newQi.add(candidate.moveDot())
        if newQi <= Qi:
            # Fixpoint: this pass produced no new item.  This is the only exit,
            # so the function never returns before the iteration has converged.
            return change
        change = True
        print("completion:")
        for newItem in newQi:
            if newItem not in Qi:
                print(f'{newItem} added to Q{i}')
        self.mStateList[i] |= newQi
        Qi = self.mStateList[i]
    
# Install the function as a method of EarleyParser, then drop the module-level name.
EarleyParser.complete = complete
del complete

The method `self.predict(i)` applies the prediction operation to the state $Q_i$: 
If $\langle A \rightarrow \beta \bullet C \delta, k \rangle \in Q_j$, then
the parser tries to recognize $C\delta$ after having read `mString[:j+1]`.  To this end
it has to parse $C$ in the string `mString[j+1:]`.
Therefore, if $C \rightarrow \gamma$ is a rule of our grammar,
we add the *Earley item* $\langle C \rightarrow \bullet \gamma, j\rangle$ to the set $Q_j$:
$$ \langle A \rightarrow \beta \bullet C \delta, k\rangle \in Q_j 
       \wedge (C \rightarrow \gamma) \in R 
       \;\rightarrow\;
       Q_j := Q_j \cup\bigl\{ \langle C \rightarrow \bullet\gamma, j\rangle\bigr\}.
$$
As the right hand side $\gamma$ might start with a variable, the function uses a fix point iteration
until no more *Earley items* are added to $Q_j$.

In [None]:
def predict(self, i):
    """Apply the prediction operation to the state Q_i until a fixpoint is reached.

    For every item in Q_i that has a variable C directly after the dot and
    for every rule C → γ of the grammar, the item <C → • γ, i> is added to
    Q_i.  As γ may start with a variable itself, the step is repeated until a
    pass produces nothing new.  Every pass that adds items is reported on
    standard output.

    Returns True if at least one item has been added to Q_i, else False.
    """
    rules   = self.mGrammar.mRules
    states  = self.mStateList[i]
    changed = False
    while True:
        # The variables that are expected next by some item of Q_i.
        wanted = {item.nextVar() for item in states}
        wanted.discard(None)
        predicted = set()
        for variable in wanted:
            predicted.update(EarleyItem(variable, (), rule[1:], i)
                             for rule in rules if rule[0] == variable)
        if predicted <= states:
            return changed
        changed = True
        print("prediction:")
        for candidate in predicted:
            if candidate not in states:
                print(f'{candidate} added to Q{i}')
        states |= predicted  # in-place update of the set stored in mStateList[i]

# Install the function as a method of EarleyParser, then drop the module-level name.
EarleyParser.predict = predict
del predict

The function `self.scan(i)` applies the scanning operation to the state $Q_i$.

If $\langle A \rightarrow \beta \bullet a \gamma, k\rangle \in Q_i$ and $a$ is a token,
then the parser tries to recognize the right hand side of the grammar rule
$$ A \rightarrow \beta a \gamma$$ 
and after having read `mString[k+1:i+1]` it has already recognized  $\beta$.
If we now have `mString[i+1] == a`, then the parser still has to recognize $\gamma$ in `mString[i+2:]`.
Therefore, the *Earley item* $\langle A \rightarrow \beta a \bullet \gamma, k\rangle$ is added to
the set $Q_{i+1}$:
$$\langle A \rightarrow \beta \bullet a \gamma, k\rangle \in Q_i \wedge x_{i+1} = a
       \;\rightarrow\;
       Q_{i+1} := Q_{i+1} \cup \bigl\{ \langle A \rightarrow \beta a \bullet \gamma, k\rangle \bigr\}
$$

In [None]:
def scan(self, i):
    """Apply the scanning operation to the state Q_i.

    If there is a token at position i+1, every item of Q_i whose symbol after
    the dot matches that token is copied, with the dot moved over the token,
    into Q_{i+1}.  Every added item is reported on standard output.  Nothing
    happens if position i is the end of the input.
    """
    last = len(self.mString) - 1  # mString[0] is the padding element None
    if i >= last:
        return
    token  = self.mString[i + 1]
    target = self.mStateList[i + 1]
    for item in self.mStateList[i]:
        if not item.scan(token):
            continue
        advanced = item.moveDot()
        target.add(advanced)
        print('scanning:')
        print(f'{advanced} added to Q{i+1}')

# Install the function as a method of EarleyParser, then drop the module-level name.
EarleyParser.scan = scan
del scan

In [None]:
import re

The function `tokenize` transforms the string `s` into a list of tokens. See below for an example.

In [None]:
def tokenize(s):
    '''Split the string s, which is expected to hold an arithmetic
       expression, into a list of tokens.

       Blanks and tabs are dropped, every number becomes the token 'NUMBER',
       parentheses and arithmetical operators stand for themselves and any
       other matched character c becomes 'ERROR(c)'.
    '''
    pattern = r'''([ \t]+)        |  # blanks and tabs
                  ([1-9][0-9]*|0) |  # number
                  ([()])          |  # parentheses 
                  ([-+*/])        |  # arithmetical operators
                  (.)                # unrecognized character
               '''
    tokens = []
    for blanks, number, parenthesis, operator, error in re.findall(pattern, s, re.VERBOSE):
        if blanks:
            continue
        if number:
            tokens.append('NUMBER')
        elif parenthesis or operator:
            # Exactly one of the two groups has matched; it is the token itself.
            tokens.append(parenthesis or operator)
        else:
            tokens.append(f'ERROR({error})')
    return tokens

In [None]:
# Example: tokenize a small arithmetic expression.
tokenize('1 + 2 * 3')

The function `test` takes two arguments.
- `file` is the name of a file containing a grammar,
- `word` is a string that should be parsed.

`word` is first tokenized.  Then the resulting token list is parsed using *Earley's algorithm*.

In [None]:
def test(file, word):
    """Parse `word` with the grammar stored in `file` and print the trace.

    The grammar is read from `file`, `word` is tokenized and the resulting
    token list is handed to an `EarleyParser`, whose `parse` method is run.
    """
    grammar = Grammar(parse_grammar(file))
    tokens  = tokenize(word)
    EarleyParser(grammar, tokens).parse()

In [None]:
# Example: parse the expression '1 + 2 * 3' with the grammar stored in simple.g.
test('simple.g', '1 + 2 * 3')

The command below cleans the directory.  If you are running Windows, you have to replace `rm` with `del`.

In [None]:
!rm GrammarLexer.* GrammarParser.* Grammar.tokens GrammarListener.py Grammar.interp
!rm -r __pycache__

In [None]:
!ls