In [None]:
from IPython.display import HTML
# Load the notebook's style sheet and hand it to IPython for rendering.
# The file is read inside a `with` block so that the handle is closed
# immediately instead of being left open until garbage collection.
with open('../style.css') as style_file:
    style_sheet = style_file.read()
HTML(style_sheet)

# Implementing an Earley Parser

In [None]:
!cat simple.g

```
grammar
    : rule
    | rule grammar
    ;

rule
    : VARIABLE ':' item_list ';' 
    ;

item_list
    : λ
    | item item_list
    ;

item 
    : VARIABLE 
    | TERMINAL   
    | LITERAL
    ;
```

In [None]:
import ply.lex as lex

In [None]:
# Names of the token types the scanner can produce.  PLY looks this list up
# by the name `tokens`.  The regular expression that defines each token type
# is repeated as a trailing comment.
tokens = [ 'VARIABLE',  # r'[a-z][a-z0-9_]*'
           'TERMINAL',  # r'[A-Z][A-Z0-9_]*'
           'LITERAL',   # r"'.'"
         ]

In [None]:
# String rules for PLY: a name of the form t_<TOKEN> holds the regular
# expression of that token type.
# VARIABLE: starts with a lower case letter, continues with lower case letters,
#           digits, or underscores.
# TERMINAL: starts with an upper case letter, continues with upper case letters,
#           digits, or underscores.
t_VARIABLE = r'[a-z][a-z0-9_]*'
t_TERMINAL = r'[A-Z][A-Z0-9_]*'

A literal is any character that is not the newline character and is enclosed in quotes.

In [None]:
# Scanner rule for the token type LITERAL.  PLY takes the regular expression
# of a function rule from the function's docstring, so the docstring below is
# part of the program and must not be edited as documentation.
def t_LITERAL(t):
    r"'.'"
    t.value = t.value[1]  # keep only the character between the two quotes
    return t

In [None]:
# Single characters that PLY returns as tokens of their own; the grammar rules
# below refer to them as ':' and ';'.
literals = [':', ';']

In [None]:
# Characters the scanner skips between tokens: blank, tab, and carriage return.
# Line breaks are not listed here because they are counted by `t_newline`.
t_ignore = ' \t\r'

In [None]:
# Scanner rule for line breaks.  The docstring is the regular expression that
# PLY matches; it is code, not documentation.
def t_newline(t):
    r'\n'
    t.lexer.lineno += 1  # keep the line counter up to date for error messages
    return               # nothing is returned, hence no token is produced

In [None]:
def find_column(token):
    """Return the column in which `token` starts, counting from 1.

    The column is the distance between the start of the token and the last
    line break in front of it.  `token` has to provide `lexpos` (offset of the
    token in the input) and `lexer.lexdata` (the complete input string).
    """
    text   = token.lexer.lexdata
    offset = token.lexpos
    # rfind yields -1 if the token is on the first line, which turns the
    # difference below into offset + 1, i.e. a column counted from 1.
    last_newline = text.rfind('\n', 0, offset)
    return offset - last_newline

In [None]:
def t_error(t):
    # Error rule of the scanner: report the first character that could not be
    # matched, together with its line and column, and then continue scanning
    # behind that character.
    column  = find_column(t)
    message = f"Illegal character '{t.value[0]}' in line {t.lineno}, column {column}."
    print(message)
    t.lexer.skip(1)

In [None]:
# NOTE(review): presumably needed because a notebook does not define `__file__`
# while PLY asks for the file name of the calling module -- confirm.
__file__ = 'main'

In [None]:
# Build the scanner from the token definitions given so far.
lexer = lex.lex()

In [None]:
def test_scanner(file_name):
    """Print the content of the file `file_name`, followed by all its tokens.

    The tokens are printed one per line in the order in which the scanner
    produces them.
    """
    with open(file_name, 'r') as source:
        text = source.read()
    print(text)
    lexer.input(text)
    lexer.lineno = 1  # start counting lines from 1 again for this input
    # lexer.token() returns None once the input is exhausted.
    for token in iter(lexer.token, None):
        print(token)

In [None]:
test_scanner('simple.g')

## Implementing the Parser

We use the following grammar to describe context free grammars:
```
    grammar
        : rule
        | rule grammar
        ;

    rule
        : VARIABLE ':' item_list ';'
        ;

    item_list
        : λ
        | item item_list
        ;

    item
        : VARIABLE
        | TERMINAL
        | LITERAL
        ;
```

In [None]:
import ply.yacc as yacc

In [None]:
# Name of the start symbol of the grammar that the parser generator uses.
start = 'grammar'

A grammar is represented as a list of grammar rules.

In [None]:
# The docstrings of the p_ functions are the grammar rules read by PLY.
# p[k] is the value of the k-th symbol on the right hand side of the rule,
# p[0] receives the value of the left hand side.

# A grammar consisting of one rule is the list containing just this rule.
def p_grammar_one(p):
    "grammar : rule"
    only_rule = p[1]
    p[0] = [only_rule]

# Otherwise the first rule is put in front of the list of the remaining rules.
def p_grammar_more(p):
    "grammar : rule grammar"
    first_rule, remaining_rules = p[1], p[2]
    p[0] = [first_rule, *remaining_rules]

A grammar rule is represented as a list of items.

In [None]:
# A rule is the list that starts with the variable on the left hand side and
# continues with the items of the right hand side.  The docstring is the
# grammar rule read by PLY.
def p_rule(p):
    "rule : VARIABLE ':' item_list ';'"
    head, body = p[1], p[3]
    p[0] = [head, *body]

In [None]:
# The empty right hand side yields the empty list of items.  The docstrings
# are the grammar rules read by PLY.
def p_item_list_zero(p):
    "item_list : "
    p[0] = list()

# A non-empty item list is its first item followed by the remaining items.
def p_item_list_more(p):
    "item_list : item item_list"
    first_item, remaining_items = p[1], p[2]
    p[0] = [first_item, *remaining_items]

Items are strings.

In [None]:
# An item is represented by the value of its token, whatever the kind of the
# item.  The docstrings are the grammar rules read by PLY.
def p_item_variable(p):
    "item : VARIABLE"
    p[0] = p[1]  # name of the variable

def p_item_terminal(p):
    "item : TERMINAL"
    p[0] = p[1]  # name of the token

def p_item_literal(p):
    "item : LITERAL"
    p[0] = p[1]  # value of the LITERAL token as set by the scanner

In [None]:
def p_error(t):
    """Report a syntax error found by the parser.

    `t` is the token at which the error was detected.  If the input ended
    prematurely there is no such token and `t` is `None`.  The column is
    therefore computed only when a token is present: `find_column` reads
    attributes of the token and would raise an `AttributeError` for `None`.
    """
    if not t:
        print('Syntax error at end of input.')
        return
    column = find_column(t)
    print(f'Syntax error at token "{t.value}" in line {t.lineno}, column {column}.')

In [None]:
# Generate the parser from the grammar rules defined so far.
# write_tables=False: the parse tables are not written to a file.
# debug=True: PLY's debugging output is requested.
yacc.yacc(write_tables=False, debug=True)

In [None]:
def parse(file):
    """Read the grammar stored in `file`, print it, and return the parse result.

    According to the grammar rules defined before, the result is a list of
    rules, each rule being a list of strings.
    """
    lexer.lineno = 1  # start counting lines from 1 again for this file
    with open(file, 'r') as source:
        text = source.read()
    print(text)
    return yacc.parse(text)

In [None]:
parse('simple.g')

In [None]:
%load_ext nb_mypy

## Earley's Algorithm

Given a context-free grammar $G = \langle V, \Sigma, R, s \rangle$ and a string $w = x_1x_2 \cdots x_n \in \Sigma^*$ of length $n$, 
an *Earley item* is a pair of the form
$$\langle a \rightarrow \alpha \bullet \beta, k \rangle$$
such that 
- $(a \rightarrow \alpha \beta) \in R\quad$  and
- $k \in \{0,1,\cdots,n\}$. 

The class `EarleyItem` represents a single *Earley item*.  
- `mVariable` is the variable $a$,
- `mAlpha` is $\alpha$,
- `mBeta` is $\beta$, and
- `mIndex` is $k$.

Since we later have to store objects of class `EarleyItem` in sets, we have to implement the functions
- `__eq__`,
- `__ne__`,
- `__hash__`.

It is easiest to implement `__hash__` by first converting the object into a string.  Hence we also
implement the function `__repr__`, that converts an `EarleyItem` into a string.

The next import is needed for the type checker, because the class `EarleyItem` is used in the signature of `moveDot` and at this point the class `EarleyItem` is not yet defined.

In [None]:
from __future__ import annotations

In [None]:
class EarleyItem():
    """An Earley item <variable → alpha • beta, index>.

    mVariable: the variable on the left hand side of the rule,
    mAlpha:    the part of the right hand side in front of the dot,
    mBeta:     the part of the right hand side behind the dot,
    mIndex:    the index that is the second component of the item.

    Items are compared by these four components and hashed via their string
    representation, so they can be stored in sets.
    """
    def __init__(self, variable: str, alpha: tuple[str, ...], beta: tuple[str, ...], index: int) -> None:
        self.mVariable = variable
        self.mAlpha    = alpha
        self.mBeta     = beta
        self.mIndex    = index

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, EarleyItem):
            return False
        mine   = (self.mVariable,  self.mAlpha,  self.mBeta,  self.mIndex)
        theirs = (other.mVariable, other.mAlpha, other.mBeta, other.mIndex)
        return mine == theirs

    def __ne__(self, other: object):
        equal = self.__eq__(other)
        return not equal

    def __hash__(self):
        # Equal items have equal string representations, hence equal hashes.
        text = self.__repr__()
        return hash(text)

    def __repr__(self):
        alphaStr, betaStr = ' '.join(self.mAlpha), ' '.join(self.mBeta)
        return f'<{self.mVariable} → {alphaStr} • {betaStr}, {self.mIndex}>'

    # The methods below are placeholders for the type checker.  The real
    # implementations are attached to the class after its definition.
    def isComplete(self) -> bool:
        """Placeholder, returns None until the implementation is attached."""
        return None # type: ignore

    def sameVar(self, c: str) -> bool:
        """Placeholder, returns None until the implementation is attached."""
        return None # type: ignore

    def scan(self, T: str) -> bool:
        """Placeholder, returns None until the implementation is attached."""
        return None # type: ignore

    def nextVar(self) -> str | None:
        """Placeholder, returns None until the implementation is attached."""
        return None # type: ignore

    def moveDot(self) -> EarleyItem: # type: ignore
        """Placeholder, returns None until the implementation is attached."""
        return None # type: ignore

Given an Earley item `self`, the function `isComplete` checks, whether the Earley item `self` has the form
$$\langle A \rightarrow \alpha \bullet, k \rangle,$$
i.e. whether the $\bullet$ is at the end of the grammar rule.

In [None]:
def isComplete(self: EarleyItem) -> bool:
    """Return True iff the dot is at the end of the rule, i.e. nothing follows it."""
    symbols_after_dot = len(self.mBeta)
    return symbols_after_dot == 0

# Attach the function as the method `isComplete` of the class `EarleyItem`
# and remove its module-level name.
EarleyItem.isComplete = isComplete # type: ignore
del isComplete

The function `sameVar(self, c)` checks, whether the item following the dot is the same as the variable 
given as argument, i.e. `sameVar(self, c)` returns `True` if `self` is an Earley item of the form
$$\langle a \rightarrow \alpha \bullet c\beta, k \rangle.$$

In [None]:
def sameVar(self, c: str) -> bool:
    """Return True iff the symbol directly behind the dot is equal to `c`."""
    if not self.mBeta:  # the dot is at the end, nothing follows it
        return False
    return self.mBeta[0] == c

# Attach the function as the method `sameVar` of the class `EarleyItem`
# and remove its module-level name.
EarleyItem.sameVar = sameVar # type: ignore
del sameVar

The function `scan(self, T)` checks, whether the item following the dot matches the token `T`, 
i.e. `scan(self, T)` returns `True` if `self` is an Earley item of the form
$$\langle a \rightarrow \alpha \bullet T\beta, k \rangle.$$
The argument $T$ can either be the name of a token or a literal.

In [None]:
def scan(self, T: str) -> bool:
    """Return True iff the symbol directly behind the dot matches the token `T`.

    The symbol matches if it is equal to `T` itself or to `T` enclosed in
    single quotes.
    """
    if len(self.mBeta) == 0:
        return False
    symbol = self.mBeta[0]
    return symbol == T or symbol == "'" + T + "'"

# Attach the function as the method `scan` of the class `EarleyItem`
# and remove its module-level name.
EarleyItem.scan = scan # type: ignore
del scan

Given an Earley item, this function returns the name of the variable following the dot.  If there is no variable following the dot, the function returns `None`.  The function can distinguish variables from token names because variable names consist only of lower case letters.

In [None]:
def nextVar(self) -> str | None:
    """Return the variable directly behind the dot, or None if there is none.

    The symbol behind the dot counts as a variable if it does not start with
    a single quote and `str.islower` holds for it.
    """
    if len(self.mBeta) == 0:
        return None
    var = self.mBeta[0]
    is_variable = var[0] != "'" and var.islower()
    return var if is_variable else None

# Attach the function as the method `nextVar` of the class `EarleyItem`
# and remove its module-level name.
EarleyItem.nextVar = nextVar # type: ignore
del nextVar

The function `moveDot(self)` moves the $\bullet$ in the Earley item `self`, where `self` has the form 
$$\langle a \rightarrow \alpha \bullet \beta, k \rangle$$
over the next variable, token, or literal in $\beta$.  It assumes that $\beta$ is not empty.

In [None]:
def moveDot(self) -> EarleyItem:
    """Return a new item in which the dot has been moved over the next symbol.

    The item `self` is not changed.  There has to be a symbol behind the dot.
    """
    crossed   = self.mBeta[0]
    new_alpha = self.mAlpha + (crossed,)
    new_beta  = self.mBeta[1:]
    return EarleyItem(self.mVariable, new_alpha, new_beta, self.mIndex)

# Attach the function as the method `moveDot` of the class `EarleyItem`
# and remove its module-level name.
EarleyItem.moveDot = moveDot # type: ignore
del moveDot

The class `Grammar` represents a context free grammar.  It stores a list of the rules of the grammar.
Each grammar rule of the form
$$ a \rightarrow \beta $$
is stored as the tuple $(a,) + \beta$.  The start symbol is assumed to be the variable on the left hand side of
the first rule. To distinguish syntactical variables from tokens, variables contain only lower case letters,
while tokens either contain only upper case letters or they start and end with a single quote character "`'`".

In [None]:
class Grammar():
    """A context free grammar, stored as the list of its rules.

    mRules: list of rules; every rule is a list of strings whose first
            element is the variable on the left hand side.
    """
    def __init__(self, Rules: list[list[str]]):
        self.mRules = Rules

    # The methods below are placeholders for the type checker.  The real
    # implementations are attached to the class after its definition.
    def startItem(self) -> EarleyItem:
        """Placeholder, returns None until the implementation is attached."""
        return None # type: ignore

    def finishItem(self) -> EarleyItem:
        """Placeholder, returns None until the implementation is attached."""
        return None # type: ignore

    def startVar(self) -> str:
        """Placeholder, returns None until the implementation is attached."""
        return None # type: ignore

    def toString(self) -> str:
        """Placeholder, returns None."""
        return None # type: ignore

The function `startItem` returns the Earley item
$$ \langle\hat{s} \rightarrow \bullet s, 0\rangle $$
where $s$ is the start variable of the given grammar and $\hat{s}$ is the new variable `Start`.

In [None]:
def startItem(self) -> EarleyItem:
    """Return the item <Start → • s, 0>, where s is the start variable of the grammar."""
    start_symbol = self.startVar()
    return EarleyItem('Start', (), (start_symbol,), 0)

# Attach the function as the method `startItem` of the class `Grammar`
# and remove its module-level name.
Grammar.startItem = startItem # type: ignore
del startItem

The function `finishItem` returns the Earley item
$$ \langle\hat{s} \rightarrow s \bullet, 0\rangle $$
where $s$ is the start variable of the given grammar and $\hat{s}$ is a new variable.

In [None]:
def finishItem(self) -> EarleyItem:
    """Return the item <Start → s •, 0>, where s is the start variable of the grammar."""
    start_symbol = self.startVar()
    return EarleyItem('Start', (start_symbol,), (), 0)

# Attach the function as the method `finishItem` of the class `Grammar`
# and remove its module-level name.
Grammar.finishItem = finishItem # type: ignore
del finishItem

The function `startVar` returns the start variable of the grammar.  It is assumed that
the left hand side of the first grammar rule is the start variable of the grammar.

In [None]:
def startVar(self) -> str:
    """Return the start variable, i.e. the left hand side of the first rule."""
    first_rule = self.mRules[0]
    return first_rule[0]

# Attach the function as the method `startVar` of the class `Grammar`
# and remove its module-level name.
Grammar.startVar = startVar # type: ignore
del startVar

The function `toString` creates a readable presentation of the grammar rules.

In [None]:
def toString(self) -> str:
    """Return the rules of the grammar as text, one line per rule.

    Every line has the form `head: body;` where `body` is the list of the
    symbols on the right hand side of the rule.
    """
    lines = [f'{head}: {body};\n' for head, *body in self.mRules]
    return ''.join(lines)

# Install the function as `__str__` of the class `Grammar`, so that `str` and
# `print` use it, and remove its module-level name.  Note that the method
# `Grammar.toString` itself is not replaced by this assignment.
Grammar.__str__ = toString # type: ignore
del toString

The class `EarleyParser` implements the [parsing algorithm of Jay Earley](https://en.wikipedia.org/wiki/Earley_parser).
The class maintains the following member variables:
- `mGrammar` is the grammar that is used to parse the given token string.
- `mString` is the list of tokens and literals that has to be parsed.

   As a hack, the first element of this list is `None`.  
   Therefore, `mString[i]` is the $i^\textrm{th}$ token.
- `mStateList` is a list of sets of *Earley items*.  If $n$ is the length of the given token string
  (excluding the first element `None`), then $Q_i = \texttt{mStateList}[i]$. 
  The idea is that the set $Q_i$ is the set of those *Earley items* that the parser could be in 
  when it has read the tokens `mString[1]`, $\cdots$,  `mString[i]`.  $Q_0$ is initialized as follows:
  $$ Q_0 = \bigl\{\langle\hat{s} \rightarrow \bullet s, 0\rangle\bigr\}. $$
  
The *Earley items* are interpreted as follows: If we have
$$ \langle c \rightarrow \alpha \bullet \beta, k\rangle \in Q_i, $$
then we know the following:
- After having read the tokens `mString[:k+1]` the parser tries to parse the variable $c$
  in the token string `mString[k+1:]`.
- After having read the token string `mString[k+1:i+1]` the parser has already recognized $\alpha$
  and now needs to recognize $\beta$ in the token string `mString[i+1:]` in order to parse the variable $c$.

In [None]:
class EarleyParser():
    """Parser implementing Earley's algorithm.

    mGrammar:   the grammar used for parsing,
    mString:    the token list, preceded by None so that mString[1] is the
                first token,
    mStateList: list of sets of Earley items, one set for every position
                0, ..., len(TokenList).

    The constructor prints the grammar and the input and puts the start item
    of the grammar into the first set.
    """
    def __init__(self, grammar, TokenList):
        self.mGrammar   = grammar
        self.mString    = [None] + TokenList
        state_count     = len(TokenList) + 1
        self.mStateList = [set() for _ in range(state_count)]
        print('Grammar:\n')
        print(self.mGrammar)
        print(f'Input: {self.mString}\n')
        self.mStateList[0] = { self.mGrammar.startItem() }

    # The methods below are placeholders for the type checker.  The real
    # implementations are attached to the class after its definition.
    def parse(self) -> None:
        """Placeholder, does nothing until the implementation is attached."""
        return None

    def complete(self, i: int) -> bool:
        """Placeholder, returns None until the implementation is attached."""
        return None # type: ignore

    def predict(self, i: int) -> bool:
        """Placeholder, returns None until the implementation is attached."""
        return None # type: ignore

    def scan(self, i: int) -> None:
        """Placeholder, does nothing until the implementation is attached."""
        return None

The method `parse` implements Earley's algorithm.  For all states 
$Q_0$, $Q_1$, $\cdots$, $Q_n$ we proceed as follows:
- We apply the *completion* operation followed by the *prediction* operation.
  This is done until no more items are added to $Q_i$.  
  
  (The inner `while` loop is not necessary if the grammar does not contain $\varepsilon$-rules.)
- Finally, the *scanning* operation is applied to $Q_i$.  This operation adds
  items to the set $Q_{i+1}$.

After $Q_i$ has been computed, we proceed to process $Q_{i+1}$.
Parsing is successful iff
$$ \langle\hat{s} \rightarrow s \bullet, 0\rangle \in Q_n $$

In [None]:
def earley_parse(self) -> None:
    """Run Earley's algorithm on the token list and print a trace.

    The sets mStateList[0], ..., mStateList[n] are processed in this order,
    where n is the number of tokens.  For every set, completion and
    prediction are repeated until neither of them adds an item; then the
    scanning operation is applied.  Finally it is reported whether the
    finish item of the grammar is a member of the last set.
    """
    print("starting...")
    separator  = '_' * 80
    last_index = len(self.mString) - 1  # mString[0] is the None placeholder
    for i in range(last_index + 1):
        has_next   = i < last_index
        next_token = self.mString[i+1] if has_next else 'EOF'
        print(separator)
        print(f'next token = {next_token}')
        print(separator)
        # Both operations are executed in every round; the loop ends after
        # the first round in which none of them reports a change.
        while True:
            completed = self.complete(i)
            predicted = self.predict(i)
            if not (predicted or completed):
                break
        self.scan(i)
        # show the current set and, if there is one, the set filled by scan
        print(f'\nQ{i}:')
        for item in self.mStateList[i]:
            print(item)
        if has_next:
            print(f'\nQ{i+1}:')
            for item in self.mStateList[i+1]:
                print(item)
    success = self.mGrammar.finishItem() in self.mStateList[-1]
    if success:
        print('Parsing successful!')
    else:
        print('Parsing failed!')

# Attach the function as the method `parse` of the class `EarleyParser`
# and remove its module-level name.
EarleyParser.parse = earley_parse # type: ignore
del earley_parse

The method `complete(self, i)` applies the completion operation to the state $Q_i$:
If we have
- $\langle c \rightarrow \gamma \bullet, j\rangle \in Q_i$ and
- $\langle a \rightarrow \beta \bullet c \delta, k\rangle \in Q_j$,
then the parser tried to parse the variable $c$ after having read `mString[:j+1]`
and we know that 
$$ c \Rightarrow^* \texttt{mString[j+1:i+1]}, $$
i.e. the parser has recognized $c$ after having read `mString[j+1:i+1]`.
Therefore the parser should proceed to recognize $\delta$ in state $Q_i$.
Therefore we add the *Earley item* $\langle a \rightarrow \beta c \bullet \delta,k\rangle$ to the set $Q_i$:
$$\langle c \rightarrow \gamma \bullet, j\rangle \in Q_i \wedge
  \langle a \rightarrow \beta \bullet c \delta, k\rangle \in Q_j \;\rightarrow\;
          Q_i := Q_i \cup \bigl\{ \langle a \rightarrow \beta c \bullet \delta, k\rangle \bigr\}
$$



In [None]:
def complete(self, i: int) -> bool:
    """Apply the completion operation to the set mStateList[i].

    For every complete item in the set, with variable c and index j, the dot
    is moved over c in all items of mStateList[j] that have c directly behind
    the dot.  The resulting items are added to mStateList[i].  This is
    repeated until no new item shows up.  Every added item is printed.

    Returns True iff at least one item has been added.
    """
    Qi      = self.mStateList[i]
    changed = False
    while True:
        candidates = set()
        for finished in Qi:
            if not finished.isComplete():
                continue
            waiting = self.mStateList[finished.mIndex]
            candidates.update(
                item.moveDot() for item in waiting if item.sameVar(finished.mVariable)
            )
        if candidates <= Qi:  # nothing new: the fixed point has been reached
            return changed
        changed = True
        print("completion:")
        for newItem in candidates:
            if newItem not in Qi:
                print(f'{newItem} added to Q{i}')
        # in-place union, so Qi keeps referring to the updated set
        self.mStateList[i] |= candidates
    
# Attach the function as the method `complete` of the class `EarleyParser`
# and remove its module-level name.
EarleyParser.complete = complete # type: ignore
del complete

The method `self.predict(i)` applies the prediction operation to the state $Q_i$: 
If $\langle a \rightarrow \beta \bullet c \delta, k \rangle \in Q_j$, then
the parser tries to recognize $c\delta$ after having read `mString[:j+1]`.  To this end
it has to parse $c$ in the string `mString[j+1:]`.
Therefore, if $c \rightarrow \gamma$ is a rule of our grammar,
we add the *Earley item* $\langle c \rightarrow \bullet \gamma, j\rangle$ to the set $Q_j$:
$$ \langle a \rightarrow \beta \bullet c \delta, k\rangle \in Q_j 
       \wedge (c \rightarrow \gamma) \in R 
       \;\rightarrow\;
       Q_j := Q_j \cup\bigl\{ \langle c \rightarrow \bullet\gamma, j\rangle\bigr\}.
$$
As the right hand side $\gamma$ might start with a variable, the function uses a fix point iteration
until no more *Earley items* are added to $Q_j$.

In [None]:
def predict(self, i: int) -> bool:
    """Apply the prediction operation to the set mStateList[i].

    For every item of the set that has a variable c directly behind the dot
    and for every grammar rule c → gamma, the item <c → • gamma, i> is added
    to the set.  As gamma may itself start with a variable, this is repeated
    until no new item shows up.  Every added item is printed.

    Returns True iff at least one item has been added.
    """
    change = False
    added  = True
    Qi     = self.mStateList[i]
    while added:
        added = False
        newQi = set()
        for item in Qi:
            c = item.nextVar()
            if c is not None:  # identity test, the idiomatic check for None
                for rule in self.mGrammar.mRules:
                    if c == rule[0]:
                        newQi.add(EarleyItem(c, (), tuple(rule[1:]), i))
        if not (newQi <= Qi):
            change = True
            added  = True
            print("prediction:")
            for newItem in newQi:
                if newItem not in Qi:
                    print(f'{newItem} added to Q{i}')
            # in-place union, so Qi keeps referring to the updated set
            self.mStateList[i] |= newQi
            Qi = self.mStateList[i]
    return change

# Attach the function as the method `predict` of the class `EarleyParser`
# and remove its module-level name.
EarleyParser.predict = predict # type: ignore
del predict

The function `self.scan(i)` applies the scanning operation to the state $Q_i$.

If $\langle a \rightarrow \beta \bullet T \gamma, k\rangle \in Q_i$ and $T$ is a token,
then the parser tries to recognize the right hand side of the grammar rule
$$ a \rightarrow \beta T \gamma$$ 
and after having read `mString[k+1:i+1]` it has already recognized  $\beta$.
If we now have `mString[i+1] == T`, then the parser still has to recognize $\gamma$ in `mString[i+2:]`.
Therefore, the *Earley item* $\langle a \rightarrow \beta T \bullet \gamma, k\rangle$ is added to
the set $Q_{i+1}$:
$$\langle a \rightarrow \beta \bullet T \gamma, k\rangle \in Q_i \wedge x_{i+1} = T
       \;\rightarrow\;
       Q_{i+1} := Q_{i+1} \cup \bigl\{ \langle a \rightarrow \beta T \bullet \gamma, k\rangle \bigr\}
$$

In [None]:
def scan(self, i: int) -> None:
    """Apply the scanning operation to the set mStateList[i].

    If there is a token at position i+1, then every item of the set whose
    symbol behind the dot matches this token is copied, with the dot moved
    over that symbol, into mStateList[i+1].  Every added item is printed.
    Nothing happens if the input has no token at position i+1.
    """
    Qi = self.mStateList[i]
    n  = len(self.mString) - 1  # remember mString[0] is None
    if i + 1 > n:
        return
    a = self.mString[i+1]
    for item in Qi:
        if item.scan(a):
            moved = item.moveDot()  # computed once, used for the set and the trace
            self.mStateList[i+1].add(moved)
            print('scanning:')
            print(f'{moved} added to Q{i+1}')

# Attach the function as the method `scan` of the class `EarleyParser`
# and remove its module-level name.
EarleyParser.scan = scan # type: ignore
del scan

In [None]:
import re

The function `tokenize` transforms the string `s` that is to be parsed into a list of tokens. See below for an example.

In [None]:
def tokenize(s: str) -> list[str]:
    '''Split the string s, which is supposed to be an arithmetic expression,
       into a list of tokens.

       Blanks and tabs are dropped.  Every number becomes the token 'NUMBER',
       parentheses and arithmetical operators stand for themselves, and any
       other character x matched by the last alternative becomes 'ERROR(x)'.
    '''
    lexSpec = r'''([ \t]+)        |  # blanks and tabs
                  ([1-9][0-9]*|0) |  # number
                  ([()])          |  # parentheses 
                  ([-+*/])        |  # arithmetical operators
                  (.)                # unrecognized character
               '''
    result = []
    # Each match is a 5-tuple in which exactly one group is non-empty.
    for ws, number, parenthesis, operator, error in re.findall(lexSpec, s, re.VERBOSE):
        if ws:
            continue
        if number:
            result.append('NUMBER')
        elif parenthesis:
            result.append(parenthesis)
        elif operator:
            result.append(operator)
        else:
            result.append(f'ERROR({error})')
    return result

In [None]:
tokenize('1 + 2 * 3')

The function `test` takes two arguments.
- `file` is the name of a file containing a grammar,
- `word` is a string that should be parsed.

`word` is first tokenized.  Then the resulting token list is parsed using *Earley's algorithm*.

In [None]:
def test(file: str, word: str) -> None: 
    """Parse the string `word` with the grammar stored in the file `file`.

    The grammar file is read, printed, and turned into a `Grammar`.  Then
    `word` is tokenized and the resulting token list is processed by an
    `EarleyParser`, which prints its trace and the outcome.
    """
    lexer.lineno = 1 # type: ignore
    with open(file, 'r') as source:
        grammarStr = source.read()
    print(grammarStr)
    rule_list = yacc.parse(grammarStr) # type: ignore
    parser = EarleyParser(Grammar(rule_list), tokenize(word))
    parser.parse()

In [None]:
test('simple.g', '1 + 2 * 3')