% recce.ltx

% Copyright 2013 Jeffrey Kegler
% This document is licensed under
% a Creative Commons Attribution-NoDerivs 3.0 United States License.
\documentclass[12pt]{amsart}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{url}

% This is now a "paper", but may be a chapter
% or something else someday
% This command will make any such change easier.
\newcommand{\doc}{paper}

% Author's to-do marker: sets "Todo: <text>" in large bold type
% as a paragraph of its own.
\newcommand{\todo}[1]{\par{\large\bf Todo: #1}\par}
% Typeset #1 in typewriter type, wrapped in \mathop so that it gets
% math-operator spacing.  The named functions below are built on this.
\newcommand{\mymathop}[1]{\mathop{\texttt{#1}}}

% For a type name, when it occurs in text
\newcommand{\type}[1]{\ensuremath{#1}}

% "Is defined as": an equivalence sign with "def" set underneath it.
\newcommand{\defined}{\underset{\text{def}}{\equiv}}
% A term at the point where it is defined, set in bold.
\newcommand{\dfn}[1]{{\bf #1}}
% A vertical bar with thin space on either side.
\newcommand{\sep}{\,\mid\,}
% The slightly raised bullet that marks the position in a dotted rule.
% Switches to math mode itself inside a \raisebox.
\newcommand{\mydot}{\raisebox{.05em}{$\,\bullet\,$}}
% Explicit concatenation operator: a period with thin space on either side.
\newcommand{\cat}{\,.\,}
% |#1| with auto-sized (\left/\right) bars.
\newcommand{\size}[1]{\ensuremath{\left | {#1} \right |}}
% |#1| with fixed \big-sized bars.
\newcommand{\bigsize}[1]{\ensuremath{\bigl| {#1} \bigr|}}
% Order notation: a calligraphic O applied to #1.
\newcommand{\order}[1]{\ensuremath{{\mathcal O}(#1)}}
% Shorthands for O(1) and O(n).  \On uses \var, which is defined further
% down; that is fine because macros are expanded at use, not at definition.
\newcommand{\Oc}{\order{1}}
\newcommand{\On}{\order{\var{n}}}
% Inference rule: #1 set above and #2 set below a 1pt horizontal rule.
\newcommand{\inference}[2]{\genfrac{}{}{1pt}{}{#1}{#2}}

% I use hyphens in variable names,
% so I need to ensure that subtraction is
% clearly distinguished by the typography
% \subtract is a minus sign with thin space on either side.
\newcommand{\subtract}{\,-\,}

% A variable name, set in typewriter type.  \ensuremath lets it be used
% both in running text and inside math.
\newcommand{\var}[1]{\ensuremath{\texttt{#1}}}

% The abbreviation "CFG" as a macro.
\newcommand{\cfg}{CFG}

% \de is the arrow between the LHS and RHS of a rule;
% \derives is the double arrow of the derivation relation.
\newcommand{\de}{\rightarrow}
\newcommand{\derives}{\Rightarrow}
% Double arrow with an asterisk stacked above it, spaced as a relation.
\newcommand{\destar}
    {\mathrel{\mbox{$\:\stackrel{\!{\ast}}{\Rightarrow\!}\:$}}}
% Double arrow with a plus sign stacked above it, spaced as a relation.
\newcommand{\deplus}
    {\mathrel{\mbox{$\:\stackrel{\!{+}}{\Rightarrow\!}\:$}}}
% NOTE(review): \derivg takes one argument but its body never uses #1;
% it always typesets a plain double arrow.  Presumably the argument names
% a grammar -- confirm whether dropping it is intended.
\newcommand{\derivg}[1]{\mathrel{\mbox{$\:\Rightarrow\:$}}}
% Double arrow with #1 stacked above it.
% NOTE(review): the second argument (#2) is accepted but never used.
\newcommand{\derivrg}[2]{\mathrel{\mbox{$\:\stackrel{\!{#1}}%
        {\Rightarrow\!}\:$}}}

% Set braces around #1: auto-sized, \big-sized and \Big-sized variants.
% Each definition ends with a space token after the closing group.
\newcommand{\set}[1]{{\left\lbrace #1 \right\rbrace} }
\newcommand{\bigset}[1]{{\bigl\lbrace #1 \bigr\rbrace} }
\newcommand{\Bigset}[1]{{\Bigl\lbrace #1 \Bigr\rbrace} }
% Type-subscript macros.  Each attaches a type name as a subscript.
% Conventions visible in the definitions below:
%   * lower-case names (\ah, \dr, \eim, \sym, \term, \token) subscript
%     #1 as given and contain no \ensuremath of their own;
%   * "V" names wrap #1 in \var first, so the argument is a variable name;
%   * "E" names (\Ees, \Eloc) take an arbitrary expression;
%   * "...set" names use a braced type, i.e. "set of <type>".
% The text of the paper introduces the types SYM (symbol), STR (string),
% RULE, DR (dotted rule), LOC (location), ORIG (origin), ES (Earley set),
% EIMT/EIM (traditional/Marpa Earley item), LIMT/LIM (traditional/Marpa
% Leo item) and AH (AHFA state).  BOOL, TERM and TOK are presumably
% boolean, terminal and token -- confirm against later sections.
\newcommand{\ah}[1]{#1_{AH}}
\newcommand{\Vah}[1]{\ensuremath{\var{#1}_{AH}}}
\newcommand{\bool}[1]{\var{#1}_{BOOL}}
\newcommand{\Vbool}[1]{\ensuremath{\bool{#1}}}
\newcommand{\dr}[1]{#1_{DR}}
\newcommand{\Vdr}[1]{\ensuremath{\var{#1}_{DR}}}
\newcommand{\Vdrset}[1]{\ensuremath{\var{#1}_{\set{DR}}}}
\newcommand{\eim}[1]{#1_{EIM}}
\newcommand{\Veim}[1]{\ensuremath{\var{#1}_{EIM}}}
\newcommand{\Veimt}[1]{\ensuremath{\var{#1}_{EIMT}}}
\newcommand{\Veimset}[1]{\ensuremath{\var{#1}_{\set{EIM}}}}
\newcommand{\Veimtset}[1]{\ensuremath{\var{#1}_{\set{EIMT}}}}
\newcommand{\Ees}[1]{\ensuremath{#1_{ES}}}
\newcommand{\Vlim}[1]{\ensuremath{\var{#1}_{LIM}}}
\newcommand{\Vlimt}[1]{\ensuremath{\var{#1}_{LIMT}}}
\newcommand{\Eloc}[1]{\ensuremath{{#1}_{LOC}}}
\newcommand{\Vloc}[1]{\Eloc{\var{#1}}}
\newcommand{\Ves}[1]{\Ees{\var{#1}}}
\newcommand{\Vrule}[1]{\ensuremath{\var{#1}_{RULE}}}
\newcommand{\Vruleset}[1]{\ensuremath{\var{#1}_{\set{RULE}}}}
% |#1| where #1 is a variable name.
\newcommand{\Vsize}[1]{\ensuremath{\size{\var{#1}}}}
\newcommand{\Vstr}[1]{\ensuremath{\var{#1}_{STR}}}
\newcommand{\sym}[1]{#1_{SYM}}
\newcommand{\Vsym}[1]{\ensuremath{\var{#1}_{SYM}}}
\newcommand{\Vorig}[1]{\ensuremath{\var{#1}_{ORIG}}}
% NOTE(review): \symset writes its braces with \lbrace/\rbrace directly,
% whereas \Vsymset and the other "...set" macros go through \set.
\newcommand{\symset}[1]{#1_{\lbrace SYM \rbrace} }
\newcommand{\Vsymset}[1]{\ensuremath{\var{#1}_{\set{SYM}}}}
\newcommand{\term}[1]{#1_{TERM}}
\newcommand{\token}[1]{#1_{TOK}}

% Name of an algorithm or recognizer, set in small caps; usable in text
% and in math thanks to \ensuremath.
\newcommand{\alg}[1]{\ensuremath{\textsc{#1}}}
% The four recognizers discussed in the paper.
\newcommand{\AH}{\ensuremath{\alg{AH}}}
\newcommand{\Earley}{\ensuremath{\alg{Earley}}}
\newcommand{\Leo}{\ensuremath{\alg{Leo}}}
\newcommand{\Marpa}{\ensuremath{\alg{Marpa}}}

% Fixed ("constant") names, set as variables: fa, g (the grammar),
% w (the input) and rules.
\newcommand{\Cfa}{\var{fa}}
\newcommand{\Cg}{\var{g}}
\newcommand{\Cw}{\var{w}}
% Element #1 of the input: w[#1] with a SYM subscript.
\newcommand{\CVw}[1]{\ensuremath{\sym{\Cw[\var{#1}]}}}
\newcommand{\Crules}{\var{rules}}
% Named functions, built on \mymathop.  \GOTO takes no argument; the
% others typeset "Name(#1)".  None of them enters math mode on its own.
\newcommand{\GOTO}{\mymathop{GOTO}}
\newcommand{\Next}[1]{\mymathop{Next}(#1)}
\newcommand{\Predict}[1]{\mymathop{Predict}(#1)}
\newcommand{\Postdot}[1]{\mymathop{Postdot}(#1)}
\newcommand{\Penult}[1]{\mymathop{Penult}(#1)}
\newcommand{\LHS}[1]{\mymathop{LHS}(#1)}
\newcommand{\RHS}[1]{\mymathop{RHS}(#1)}
\newcommand{\RightRecursive}[1]{\mymathop{Right-Recursive}(#1)}
\newcommand{\RightNN}[1]{\mymathop{Right-NN}(#1)}
\newcommand{\LeoEligible}[1]{\mymathop{Leo-Eligible}(#1)}
\newcommand{\LeoUnique}[1]{\mymathop{Leo-Unique}(#1)}
\newcommand{\ID}[1]{\mymathop{ID}(#1)}
% PSL with two bracketed indices: PSL[#1][#2].
\newcommand{\PSL}[2]{\mymathop{PSL}[#1][#2]}
\newcommand{\myL}[1]{\mymathop{L}(#1)}
% Earley-table notation.  \Etable is table[#1]; \bigEtable is the same
% with \big-sized brackets.
\newcommand\Etable[1]{\ensuremath{\mymathop{table}[#1]}}
\newcommand\bigEtable[1]{\ensuremath{\mymathop{table}\bigl[#1\bigr]}}
% \Rtable has the same body as \Etable; in the text its argument is a
% recognizer rather than a location.
\newcommand\Rtable[1]{\ensuremath{\mymathop{table}[#1]}}
% |table[#1]| with \big-sized bars.
\newcommand\Rtablesize[1]{\ensuremath{\bigl| \mymathop{table}[#1] \bigr|}}
% table[#1] where #1 is a variable name.
\newcommand\Vtable[1]{\Etable{\var{#1}}}
% Two-index form table[#1,#2]; \EVtable wraps the second index in \var.
\newcommand\EEtable[2]{\ensuremath{\mymathop{table}[#1,#2]}}
\newcommand\EVtable[2]{\EEtable{#1}{\var{#2}}}

% I want to use 'call' outside of pseudocode
% \call{name}{args} typesets the procedure name with \textproc and, only
% when #2 is non-empty, appends "(#2)".
% NOTE(review): \textproc, \ifthenelse and \equal are not defined in this
% file; presumably they come from algpseudocode (and ifthen via
% algorithmicx) -- confirm.
\newcommand\call[2]{\textproc{#1}\ifthenelse{\equal{#2}{}}{}{(#2)}}%

% Theorem-like environments.
% Theorems are numbered within sections; lemmas share the theorem counter.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}

% Definitions are unnumbered (starred form) and use the "definition" style.
\theoremstyle{definition}
\newtheorem*{definition}{Definition}

% Remarks are unnumbered; observations share the theorem counter.
% Both use the "remark" style.
\theoremstyle{remark}
\newtheorem*{remark}{Remark}
\newtheorem{observation}[theorem]{Observation}

% Extra hyphenation points for words that recur in the text.
\hyphenation{oper-and oper-ands}
\hyphenation{look-ahead}
\hyphenation{memo-ization}

\begin{document}

\date{\today}

\title{Marpa, a practical general parser: the recognizer}

\author{Jeffrey Kegler}
\thanks{%
Copyright \copyright\ 2013 Jeffrey Kegler.
}
\thanks{%
This document is licensed under
a Creative Commons Attribution-NoDerivs 3.0 United States License.
}

\begin{abstract}
The Marpa recognizer is described.
Marpa is
a practical and fully implemented
algorithm for the recognition,
parsing and evaluation of context-free grammars.
The Marpa recognizer is the first
to unite the improvements
to Earley's algorithm found in
Joop Leo's 1991 paper
to those in Aycock and Horspool's 2002 paper.
Marpa tracks the full state of the parse,
as it proceeds,
in a form convenient for the application.
This greatly improves error detection
and enables event-driven parsing.
One such technique is
``Ruby Slippers'' parsing,
in which
the input is altered in response
to the parser's expectations.
\end{abstract}

\maketitle

\section{Introduction}

Despite the promise of general context-free parsing,
and the strong academic literature behind it,
it has never been incorporated into a highly available tool
like those that exist for LALR\cite{Johnson} or
regular expressions.
The Marpa project was intended
to take the best results from the literature
on Earley parsing off the pages
of the journals and bring them
to a wider audience.
Marpa::XS\cite{Marpa-XS},
a stable version of this tool,
was uploaded to the CPAN Perl archive
on Solstice Day in 2011.
This paper describes the algorithm of Marpa::R2\cite{Marpa-R2},
a later version.

As implemented,
Marpa parses,
without exception,
all context-free grammars.
Time bounds are the best of Leo\cite{Leo1991}
and Earley\cite{Earley1970}.
The Leo bound,
\On{} for LR-regular grammars,
is especially relevant to
Marpa's goal of being a practical parser:
If a grammar is in a class of grammar currently in practical use,
Marpa parses it in linear time.

Error-detection properties,
extremely important,
have been overlooked in the past.
Marpa breaks new ground in this respect.
Marpa has the immediate error detection property,
but goes well beyond that:
it is fully aware of the state of the parse,
and can report this to the user while tokens are
being scanned.

Marpa allows the lexer to check its list
of acceptable tokens before a token is scanned.
Because rejection of tokens is easily and
efficiently recoverable,
the lexer is also free to take an event-driven
approach.
Error detection is no longer
an act of desperation,
but a parsing technique in its own right.
If a token is rejected,
the lexer is free to create a new token
in the light of the parser's expectations.
This approach can be described
as making the parser's
``wishes'' come true,
and we have called this
``Ruby Slippers Parsing''.

One use of the Ruby Slippers technique is to
parse with a clean
but oversimplified grammar,
programming the lexical analyzer to make up for the grammar's
shortcomings on the fly.
The author has implemented an HTML parser\cite{Marpa-HTML},
based on a grammar that assumes that all start
and end tags are present.
Such an HTML grammar is too simple even to describe perfectly
standard-conformant HTML,
but the lexical analyzer is
programmed to supply start and end tags as requested by the parser.
The result is a very simply and cleanly designed parser
that parses very liberal HTML
and accepts all input files,
in the worst case
treating them as highly defective HTML.

Section
\ref{s:preliminaries} describes the notation and conventions
of this \doc.
Section \ref{s:rewrite} deals with Marpa's
grammar rewrites.
Sections \ref{s:earley} and \ref{s:earley-ops}
introduce Earley's algorithm.
Section \ref{s:leo} describes Leo's modification
to Earley's algorithm.
Section \ref{s:AHFA} describes the modifications
proposed by Aycock and Horspool.
Section \ref{s:pseudocode} presents the pseudocode
for Marpa's recognizer.
Section
\ref{s:proof-preliminaries}
describes notation and deals with other
preliminaries
to the theoretical results.
Section
\ref{s:correct}
contains a proof of Marpa's correctness,
while Section \ref{s:complexity} contains
its complexity results.
Finally,
Section \ref{s:input}
generalizes Marpa's input model.

\section{Preliminaries}
\label{s:preliminaries}
\label{s:start-prelim}

We assume familiarity with the theory of parsing,
as well as Earley's algorithm.
This \doc{} will
use subscripts to indicate commonly occurring types.
\begin{center}
\begin{tabular}{ll}
$\var{X}_T$ & The variable \var{X} of type $T$ \\
$\var{set-one}_\set{T}$ & The variable \var{set-one} of type set of $T$ \\
$SYM$ & The type for a symbol \\
\Vsym{a} & The variable \var{a} of type $SYM$ \\
\Vsymset{set-two} & The variable \var{set-two}, a set of symbols \\
\end{tabular}
\end{center}
Subscripts may be omitted when the type
is obvious from the context.
The notation for
constants is the same as that for variables.
Multi-character variable names will be common,
and operations will never be implicit.
\begin{center}
\begin{tabular}{ll}
Multiplication &  $\var{a} \times \var{b}$ \\
Concatenation & $\var{a} \cat \var{b}$ \\
Subtraction & $\var{symbol-count} \subtract \var{terminal-count}$ \\
\end{tabular}
\end{center}
Type names are often used in the text
as a convenient way to refer to
their type.

Where \Vsymset{vocab} is a non-empty set of symbols,
let $\var{vocab}^\ast$ be the set of all strings
(type \type{STR}) formed
from those symbols.
Where \Vstr{s} is a string,
let \size{\Vstr{s}} be its length, counted in symbols.
Let $\var{vocab}^+$ be
\begin{equation*}
\bigl\{ \Vstr{x}
\bigm| \Vstr{x} \in \var{vocab}^\ast \land \size{\Vstr{x}} > 0
\bigr\}.
\end{equation*}

In this \doc{} we use,
without loss of generality,
the grammar \Cg{},
where \Cg{} is the 3-tuple
\begin{equation*}
    (\Vsymset{vocab}, \var{rules}, \Vsym{accept}).
\end{equation*}
Here $\Vsym{accept} \in \var{vocab}$.
Call the language of \var{g}, $\myL{\Cg}$,
where $\myL{\Cg} \subseteq \var{vocab}^\ast$.

\Vruleset{rules} is a set of rules (type \type{RULE}),
where a rule is a duple
of the form $[\Vsym{lhs} \de \Vstr{rhs}]$,
such that
\begin{equation*}
\Vsym{lhs} \in \var{vocab} \quad \text{and}
\quad \Vstr{rhs} \in \var{vocab}^+.
\end{equation*}
\Vsym{lhs} is referred to as the left hand side (LHS)
of \Vrule{r}.
\Vstr{rhs} is referred to as the right hand side (RHS)
of \Vrule{r}.
The LHS and RHS of \Vrule{r} may also be
referred to as 
$\LHS{\Vrule{r}}$ and $\RHS{\Vrule{r}}$, respectively.
This definition follows \cite{AH2002},
which departs from tradition by disallowing an empty RHS.

The rules imply the traditional rewriting system,
in which $\Vstr{x} \derives \Vstr{y}$
states that \Vstr{x} derives \Vstr{y} in exactly one step;
$\Vstr{x} \deplus \Vstr{y}$
states that \Vstr{x} derives \Vstr{y} in one or more steps;
and $\Vstr{x} \destar \Vstr{y}$
states that \Vstr{x} derives \Vstr{y} in zero or more steps.

We say that symbol \Vsym{x} is \dfn{nullable} if and only if
$\Vsym{x} \destar \epsilon$.
\Vsym{x} is \dfn{nonnull} if and only if it is not nullable.
Following Aycock and Horspool\cite{AH2002},
all nullable symbols in grammar \Cg{} are nulling -- every symbol
which can derive the null string always derives the null string.
It is shown in \cite{AH2002} how to do this without losing generality
or the ability to efficiently evaluate a semantics that is
defined in terms of an original grammar that includes symbols which
are both nullable and non-nulling,
empty rules, etc.

Also without loss of generality,
it is assumed
that there is a dedicated acceptance rule, \Vrule{accept}
and a dedicated acceptance symbol, $\Vsym{accept} = \LHS{\Vrule{accept}}$,
such that
for all \Vrule{x},
\begin{equation*}
\begin{split}
& \Vsym{accept} \notin \RHS{\Vrule{x}} \\
\land \quad & (\Vsym{accept} = \LHS{\Vrule{x}} \implies \Vrule{accept} = \Vrule{x}).
\end{split}
\end{equation*}


We define the rightmost non-null symbol of a string
as
\begin{equation*}
\begin{split}
& \RightNN{\Vstr{x}} \defined \Vsym{rnn} \quad \text{such that} \quad
 \exists \, \Vstr{pre}, \Vstr{post} \mid \\
& \qquad \qquad \Vstr{x} = \Vstr{pre} \cat \Vsym{rnn} \cat \Vstr{post} \\
& \qquad \qquad \land \Vstr{post} \destar \epsilon \\
& \qquad \qquad \land \neg (\Vsym{rnn} \destar \epsilon).
\end{split}
\end{equation*}
We define the rightmost non-null symbol of a rule as
\begin{equation*}
\RightNN{[\Vsym{lhs} \de \Vstr{rhs}]} \defined \RightNN{\Vstr{rhs}}.
\end{equation*}
A rule \Vrule{x} is \dfn{directly right-recursive}
if and only if
\begin{equation*}
\LHS{\Vrule{x}} = \RightNN{\Vrule{x}}.
\end{equation*}
\Vrule{x} is \dfn{indirectly right-recursive}
if and only if 
\begin{equation*}
\exists \, \Vstr{y} \mid \RightNN{\Vrule{x}} \deplus \Vstr{y} \land \RightNN{\Vstr{y}} = \LHS{\Vrule{x}}.
\end{equation*}
\Vrule{x} is \dfn{right recursive},
$\RightRecursive{\Vrule{x}}$,
if and only if it is either directly or indirectly right-recursive.

The definition of \Cg{} does not sharply distinguish terminals
from non-terminals.
\Marpa{}'s implementations allow terminals to be the LHS
of rules,
and every symbol except \Vsym{accept} can be a terminal.
The implementations have options that allow
the user to reinstate
the traditional restrictions,
in part or in whole.
Note that,
as a result of these definitions,
sentential forms will be of type \type{STR}.

Let the input to
the parse be \Cw{} such that $\Cw \in \var{vocab}^+$.
Locations in the input will be of type \type{LOC}.
Let \Vsize{w} be the length of the input, counted in symbols.
When we state our complexity results later,
they will often be in terms of $\var{n}$,
where $\var{n} = \Vsize{w}$.
Let \CVw{i} be character \var{i}
of the input,
$0 \le \Vloc{i} < \Vsize{w}$.

The alert reader may have noticed that the previous definition
of \Cw{} did not allow zero-length inputs.
To simplify the mathematics, we exclude null parses
and trivial grammars from consideration.
In its implementations,
the Marpa parser
deals with null parses and trivial grammars as special cases.
(Trivial grammars are those that recognize only the null string.)

In this \doc{},
\Earley{} will refer to Earley's original
recognizer\cite{Earley1970}.
\Leo{} will refer to Leo's revision of \Earley{}
as described in~\cite{Leo1991}.
\AH{} will refer to Aycock and Horspool's revision
of \Earley{}
as described in~\cite{AH2002}.
\Marpa{} will refer to the parser described in
this \doc{}.
Where $\alg{Recce}$ is a recognizer,
$\myL{\alg{Recce},\Cg}$ will be the language accepted by $\alg{Recce}$
when parsing \Cg{}.

\section{Rewriting the grammar}
\label{s:rewrite}

We have already noted
that no rules of \Cg{}
have a zero-length RHS,
and that all symbols must be either nulling or non-nullable.
These restrictions follow Aycock and Horspool\cite{AH2002}.
The elimination of empty rules and proper nullables
is done by rewriting the grammar.
\cite{AH2002} shows how to do this
without loss of generality.

Because Marpa claims to be a practical parser,
it is important to emphasize
that all grammar rewrites in this \doc{}
are done in such a way that the semantics
of the original grammar can be reconstructed
simply and efficiently at evaluation time.
As one example,
when a rewrite involves the introduction of a new rule,
semantics for the new rule can be defined to pass its operands
up to a parent rule as a list.
Where needed, the original semantics
of a pre-existing parent rule can
be ``wrapped'' to reassemble these lists
into operands that are properly formed
for that original semantics.

As implemented,
the Marpa parser allows users to associate
semantics with an original grammar
that has none of the restrictions imposed
on grammars in this \doc{}.
The user of a Marpa parser
may specify any context-free grammar,
including one with properly nullable symbols,
empty rules, etc.
The user specifies his semantics in terms
of this original, ``free-form'', grammar.
Marpa implements the rewrites,
and performs evaluation,
in such a way as to keep them invisible to
the user.
From the user's point of view,
the ``free-form'' of his grammar is the
one being used for the parse,
and the one to which
his semantics are applied.

\section{Earley's algorithm}
\label{s:earley}

Let $\Vrule{r} \in \Crules$
be a rule,
and $\Vsize{r}$ the length of its RHS.
A dotted rule (type \type{DR}) is a duple, $[\Vrule{r}, \var{pos}]$,
where $0 \le \var{pos} \le \size{\Vrule{r}}$.
The position, \var{pos}, indicates the extent to which
the rule has been recognized,
and is represented with a large raised dot,
so that if
\begin{equation*}
[\Vsym{A} \de \Vsym{X} \cat \Vsym{Y} \cat \Vsym{Z}]
\end{equation*}
is a rule,
\begin{equation*}
[\Vsym{A} \de \var{X} \cat \var{Y} \mydot \var{Z}]
\end{equation*}
is the dotted rule with the dot at
$\var{pos} = 2$,
between \Vsym{Y} and \Vsym{Z}.

If we let \Vdr{x} be a dotted rule, such that
\begin{equation*}
\Vdr{x} =
\bigl[ [\Vsym{A} \de \Vstr{pre} \cat \Vsym{next} \cat \Vstr{post}],
    \var{pos} \bigr],
\end{equation*}
then
%
\begin{gather*}
%
\LHS{\Vdr{x}} \defined \Vsym{A} \\
%
\Postdot{\Vdr{x}} \defined
\begin{cases}
\Vsym{next}, \quad \text{if $\var{x} = [\var{A} \de \var{pre} \mydot \var{next} \cat \var{post}]$} \\
\Lambda, \quad \text{if $\var{x} = [\var{A} \de \var{pre} \cat \var{next} \cat \var{post} \mydot]$}
\end{cases} \\
%
\Next{\Vdr{x}} \defined
\begin{cases}
[\var{A} \de \var{pre} \cat \var{next} \mydot \var{post}],  \\
\qquad \text{if $\Postdot{\Vdr{x}} = \var{next}$} \\
\text{$\Lambda$, otherwise}
\end{cases} \\
%
\Penult{\Vdr{x}} \defined
\begin{cases}
\Vsym{next}, \quad \text{if} \\
\qquad \Postdot{\Vdr{x}} = \var{next} \\
\qquad \land \quad \Vstr{post} \destar \epsilon \\
\qquad \land \quad \neg (\Vsym{next} \destar \epsilon) \\
\Lambda, \quad \text{otherwise}
\end{cases}
%
\end{gather*}

A \dfn{penult} is a dotted rule \Vdr{d} such that $\Penult{\var{d}} \neq \Lambda$.
Note that $\Penult{\Vdr{x}}$
is never a nullable symbol.
The \dfn{initial dotted rule} is
\begin{equation*}
\Vdr{initial} = [\Vsym{accept} \de \mydot \Vsym{start} ].
\end{equation*}
A \dfn{predicted dotted rule} is a dotted rule,
other than the initial dotted rule,
with a dot position of zero,
for example,
\begin{equation*}
\Vdr{predicted} = [\Vsym{A} \de \mydot \Vstr{alpha} ].
\end{equation*}
A \dfn{confirmed dotted rule}
is the initial dotted rule,
or a dotted rule
with a dot position greater than zero.
A \dfn{completed dotted rule} is a dotted rule with its dot
position after the end of its RHS,
for example,
\begin{equation*}
\Vdr{completed} = [\Vsym{A} \de \Vstr{alpha} \mydot ].
\end{equation*}
Predicted, confirmed and completed dotted rules
are also called, respectively,
\dfn{predictions}, \dfn{confirmations} and \dfn{completions}.

A traditional Earley item (type \type{EIMT}) is a duple
\[
    [\Vdr{dotted-rule}, \Vorig{x}]
\]
of dotted rule and origin.
(The origin is the location where recognition of the rule
started.
It is sometimes called the ``parent''.)
For convenience, the type \type{ORIG} will be a synonym
for \type{LOC}, indicating that the variable designates
the origin element of an Earley item.

An Earley parser builds a table of Earley sets,
\begin{equation*}
\EVtable{\Earley}{i},
\quad \text{where} \quad
0 \le \Vloc{i} \le \size{\Cw}.
\end{equation*}
Earley sets are of type \type{ES}.
Earley sets are often named by their location,
so that \Ves{i} means the Earley set at \Vloc{i}.
The type designator \type{ES} is often omitted to avoid clutter,
especially in cases where the Earley set is not
named by location.

At points,
we will need to compare the Earley sets
produced by the different recognizers.
\EVtable{\alg{Recce}}{i} will be
the Earley set at \Vloc{i}
in the table of Earley sets of
the \alg{Recce} recognizer.
For example,
\EVtable{\Marpa}{j} will be Earley set \Vloc{j}
in \Marpa's table of Earley sets.
In contexts where it is clear which recognizer is
intended,
\Vtable{k}, or \Ves{k}, will symbolize Earley set \Vloc{k}
in that recognizer's table of Earley sets.
If \Ees{\var{working}} is an Earley set,
$\size{\Ees{\var{working}}}$ is the number of Earley items
in \Ees{\var{working}}.

\Rtablesize{\alg{Recce}} is the total number
of Earley items in all Earley sets for \alg{Recce},
\begin{equation*}
\Rtablesize{\alg{Recce}} =
     \sum\limits_{\Vloc{i}=0}^{\size{\Cw}}
	{\bigsize{\EVtable{\alg{Recce}}{i}}}.
\end{equation*}
For example,
\Rtablesize{\Marpa} is the total number
of Earley items in all the Earley sets of
a \Marpa{} parse.

Recall that
there was a unique acceptance symbol,
\Vsym{accept}, in \Cg{}.
The input \Cw{} is accepted if and only if,
for some \Vstr{rhs},
\begin{equation*}
\bigl[[\Vsym{accept} \de \Vstr{rhs} \mydot], 0\bigr] \in \bigEtable{\Vsize{\Cw}}.
\end{equation*}

\section{Operations of the Earley algorithm}
\label{s:earley-ops}

In this section,
each Earley operation is shown in the form of
an inference rule,
the conclusion of which
is the set of the Earley items
that is that operation's \dfn{result}.
The Earley sets correspond to parse locations,
and for any Earley operation there is a
current parse location, \Vloc{current}, and
a current Earley set, \Ves{current}.

Each location starts with an empty Earley set.
For the purposes of this description of
\Earley{}, the order of the
Earley operations
when building an
Earley set is non-deterministic.
After each Earley operation is performed,
its result is unioned with the
current Earley set.
When no more Earley items
can be added, the Earley set is
complete.
The Earley sets are built in
order from 0 to \Vsize{w}.

\subsection{Initialization}
\label{d:initial}
\begin{equation*}
\inference{
   \Vloc{current} = 0
}{
    \Bigset{\bigl[ [ \Vsym{accept} \de \mydot \Vsym{start} ], 0 \bigr]}
}
\end{equation*}
Earley {\bf initialization} has no operands
and only takes
place in Earley set 0.

\subsection{Scanning}
\label{d:scan}
\begin{equation*}
\inference{
    \begin{array}{c}
	\Vloc{current} > 0 \\
	\Vloc{previous} = \Vloc{current} \subtract 1
	\\[3pt]
	\Vsym{token} = \Cw\bigl[\Vloc{previous}\bigr]
	\\[3pt]
	\Veimt{predecessor} = [ \Vdr{before}, \Vorig{predecessor} ] \\
	\Veimt{predecessor} \in \Ves{previous} \\
	\Postdot{\Vdr{before}} = \Vsym{token}
    \end{array}
}{
    \bigset{ [ \Next{\Vdr{before}}, \Vorig{predecessor} ] }
}
\end{equation*}
\Veimt{predecessor}
and \Vsym{token} are the operands of an Earley scan.
\Veimt{predecessor} is called the predecessor of the scanning operation.
The token, \Vsym{token}, is the transition symbol
of the scanning operation.

\subsection{Reduction}
\label{d:reduction}
\begin{equation*}
\inference{
    \begin{array}{c}
    \Veimt{component} = \bigl[
     [ \Vsym{lhs} \de \Vstr{rhs} \mydot ]
    , \Vloc{component-orig} \bigr] \\
    \Veimt{component} \in \Ves{current} \\[3pt]
    \Veimt{predecessor} = [ \Vdr{before}, \Vorig{predecessor} ] \\
    \Veimt{predecessor} \in \Ves{component-orig} \\
    \Postdot{\Vdr{before}} = \Vsym{lhs} \\
    \end{array}
}{
    \bigset{ [ \Next{\Vdr{before}}, \Vorig{predecessor} ] }
}
\end{equation*}

\Veimt{component} is called
the component of the reduction operation.\footnote{
The term ``component'' comes from Irons \cite{Irons}.
}
\Veimt{predecessor} is the predecessor
of the reduction operation.
\Vsym{lhs} is the transition symbol
of the reduction operation.

\Veimt{predecessor} and
\Veimt{component} are the operands of the reduction
operation.
In some contexts, it is convenient to treat the transition
symbol
as an operand,
so that the operands
are \Veimt{predecessor} and \Vsym{lhs}.

\subsection{Prediction}
\label{d:prediction}
\begin{equation*}
\inference{
    \begin{array}{c}
	\Veimt{predecessor} = [ \Vdr{predecessor}, \Vorig{predecessor} ] \\
	\Veimt{predecessor} \in \Ves{current}
    \end{array}
}{
  \left\{
  \begin{aligned}
  & \Bigl[ \bigl[ \Vsym{L} \de \mydot \Vstr{rh} \bigr],
  \Vloc{current}
  \Bigr]
  \quad \text{such that} \\
  & \qquad \bigl[ \Vsym{L} \de \Vstr{rh} \bigr] \in \Crules  \\
  & \qquad \land \bigl( \exists \, \Vstr{z} \mid
  \Postdot{\Vdr{predecessor}} \destar \Vsym{L} \cat \Vstr{z} \bigr)
  \end{aligned}
  \right\}
}
\end{equation*}
A prediction operation can add several Earley items
to Earley set \Vloc{current}.
\Veimt{predecessor} is called the predecessor of the prediction operation,
and is its only operand.

\subsection{Causation}
An operand of an operation is also called a \dfn{cause}
of that operation,
and the set of operands for an Earley operation
is its \dfn{causation}.

\section{The Leo algorithm}
\label{s:leo}

In \cite{Leo1991}, Joop Leo presented a method for
dealing with right recursion in \On{} time.
Leo shows that,
with his modification, Earley's algorithm
is \On{} for all LR-regular grammars.
(LR-regular is LR where lookahead
is infinite length, but restricted to
distinguishing between regular expressions.)

Summarizing Leo's method,
it consists of spotting potential right recursions
and memoizing them.
Leo restricts the memoization to situations where
the right recursion is unambiguous.
Potential right recursions are memoized by
Earley set, using what Leo called
``transitive items''.
In this \doc{} Leo's ``transitive items''
will be called Leo items.
Leo items in the form that Marpa uses
will be type \type{LIM}.
``Traditional'' Leo items,
that is, those of the form used in Leo's paper\cite{Leo1991},
will be type \type{LIMT}.

In each Earley set, there is at most one Leo item per symbol.
A traditional Leo item (LIMT) is the triple
\begin{equation*}
[ \Vdr{top}, \Vsym{transition}, \Vorig{top} ]
\end{equation*}
where \Vsym{transition} is the transition symbol,
and
\begin{equation*}
\Veimt{top} = [\Vdr{top}, \Vorig{top}]
\end{equation*}
is the Earley item to be added on reductions over
\Vsym{transition}.

Leo items memoize what would otherwise be sequences
of Earley items.
Leo items only memoize unambiguous (or
deterministic) sequences,
so that the top of the sequence can represent
the entire sequence --
the only role the other EIMT's in the sequence
play in the parse is to derive the top EIMT.
We will call these memoized sequences, Leo sequences.

To guarantee that a Leo sequence is deterministic,
\Leo{} enforced \dfn{Leo uniqueness}.
Define containment of a dotted rule in an Earley set
of EIMT's as
\begin{equation*}
\begin{split}
& \mymathop{Contains}(\Ves{i}, \Vdr{d}) \defined \exists \, \Veimt{b}, \Vorig{j} \mid  \\
& \qquad \Veimt{b} = [ \Vdr{d}, \Vorig{j} ] 
  \land \Veimt{b} \in \Ves{i}.
\end{split}
\end{equation*}
A dotted rule \Vdr{d} is \dfn{Leo unique} in the Earley set
at \Ves{i}
if and only if
\begin{equation*}
\begin{split}
&    \Penult{\Vdr{d}} \neq \Lambda \\
&    \land \forall \, \Vdr{d2} \bigl( \mymathop{Contains}(\Ves{i}, \Vdr{d2}) \implies \\
&    \qquad \Postdot{\Vdr{d}} = \Postdot{\Vdr{d2}} \implies \Vdr{d} = \Vdr{d2} \bigr).
\end{split}
\end{equation*}
If \Vdr{d} is Leo unique, then the symbol $\Postdot{\Vdr{d}}$ is
also said to be \dfn{Leo unique}.
In cases where a symbol \Vsym{transition} is Leo unique in \Ves{i},
we can speak of the dotted rule for \Vsym{transition},
and the rule for \Vsym{transition},
because there can be only one of each.
In the previous definitions,
it is important to emphasize that \Vdr{d2} ranges over all the dotted
rules of Earley set \Ves{i},
even those which are ineligible for Leo memoization.

Let \var{n} be the length of a Leo sequence.
In \Earley, each such sequence would be expanded in every
Earley set that is the origin of an EIMT included in the
sequence, and the total number of EIMT's would be
\order{\var{n}^2}.

With Leo memoization, a single EIMT stands in for the sequence.
There are \Oc{} Leo items per Earley set,
so the cost of the sequence is \Oc{} per Earley set,
or \On{} for the entire sequence.
If, at evaluation time,
it is desirable to expand the Leo sequence,
only those items actually involved in the parse
need to be expanded.
All the EIMT's of a potential right-recursion
will be in one Earley set and the number of EIMT's
will be \On{},
so that even including expansion of the Leo sequence
for evaluation, the time and space complexity of
the sequence remains \On{}.

\begin{sloppypar}
Recall that we
call a dotted rule \Vdr{d} a \dfn{penult} if $\Penult{\var{d}} \neq \Lambda$.
In Leo's original algorithm, any penult
was treated as a potential right-recursion.
\Marpa{} applies the Leo memoizations in more restricted circumstances.
For \Marpa{} to consider a dotted rule
\begin{equation*}
\Vdr{candidate} = [\Vrule{candidate}, \var{i}]
\end{equation*}
for Leo memoization,
\Vdr{candidate} must be a penult and
\Vrule{candidate} must be right-recursive.
\end{sloppypar}

By restricting Leo memoization to right-recursive rules,
\Marpa{} incurs the cost of Leo memoization only in cases
where Leo sequences could be infinitely
long.
This more careful targeting of the memoization is for efficiency reasons.
If all penults are memoized,
many memoizations will be performed where
the longest potential Leo sequence is short,
and the payoff is therefore very limited.
One future extension might be to identify
non-right-recursive rules
which generate Leo sequences long enough to
justify inclusion in the Leo memoizations.
Such cases are unusual, but may occur.

Omission of a memoization does not affect correctness,
so \Marpa{}'s restriction of Leo memoization
preserves the correctness as shown in Leo\cite{Leo1991}.
Later in this \doc{} we will
show that this change also leaves
the complexity results of 
Leo\cite{Leo1991} intact.

Implementing the Leo logic requires
adding Leo reduction as a new basic operation,
adding a new premise to the Earley reduction
operation,
and extending the Earley sets to memoize Earley
items as LIMT's.

\subsection{Leo reduction}

\begin{equation*}
\inference{
    \begin{array}{c}
    \Veimt{component} = \bigl[
     [ \Vsym{lhs} \de \Vstr{rhs} \mydot ]
    , \Vloc{component-orig} \bigr] \\
    \Veimt{component} \in \Ves{current} \\[3pt]
	\Vlimt{predecessor} = [ \Vdr{top}, \Vsym{lhs}, \Vorig{top} ] \\
	\Vlimt{predecessor} \in \Ves{component-orig} \\
    \end{array}
}{
    \bigset{ [ \Vdr{top}, \Vorig{top} ] }
}
\end{equation*}
The new Leo reduction operation resembles the Earley reduction
operation, except that it looks for an LIMT,
instead of a predecessor EIMT.
\Vlimt{predecessor} and
\Veimt{component} are the operands of the Leo reduction
operation.
\Vsym{lhs} is the transition symbol
of the Leo reduction.
As with Earley reduction,
it may be convenient to treat the transition
symbol as an operand,
so that the operands
are \Vlimt{predecessor} and \Vsym{lhs}.

\subsection{Changes to Earley reduction}

Earley reduction still applies, with an additional premise:
\begin{multline*}
\neg \exists \, \Vlimt{x} \; \mid \; \Vlimt{x} \in \Ves{component-orig} \\
	\land \Vlimt{x} = [ \Vdr{x}, \Vsym{lhs}, \Vorig{x} ]
\end{multline*}
The additional premise
prevents Earley reduction from being applied
where there is an LIMT with \Vsym{lhs} as its transition symbol.
This reflects the fact that
Leo reduction replaces Earley reduction if and only if
there is a Leo memoization.

\subsection{Leo memoization}

We define uniqueness of a penult in an Earley set as
\begin{equation*}
\begin{split}
& \mymathop{Penult-Unique}(\Vsym{penult},\Ves{i}) \defined \\
& \qquad \forall \, \Vdr{x}, \Vdr{y} \bigl( \mymathop{Contains}(\Ves{i}, \Vdr{x})
\land \mymathop{Contains}(\Ves{i}, \Vdr{y}) \\
& \qquad \qquad \land \Vsym{penult} = \Penult{\Vdr{x}}
\land \Vsym{penult} = \Penult{\Vdr{y}} \bigr) \\
& \qquad \qquad \implies \Vdr{x} = \Vdr{y}.
\end{split}
\end{equation*}
We define
Leo uniqueness as
\begin{equation*}
\begin{split}
& \LeoUnique{\Vdr{x},\Vloc{current}} \defined \mymathop{Contains}(\Ves{current}, \Vdr{x})  \\
& \qquad \land \Penult{\Vdr{x}} \neq \Lambda \\
& \qquad \land \mymathop{Penult-Unique}({\Penult{\Vdr{x}}}, \Ves{current})
\end{split}
\end{equation*}
and Leo eligibility as
\begin{equation*}
\begin{split}
& \LeoEligible{\Vdr{x},\Vloc{current}} \defined \\
& \qquad \exists \, \Vrule{x}, \var{pos} \; \mid \; \Vdr{x} = [ \Vrule{x}, \var{pos} ] \\
& \qquad \land \RightRecursive{\Vrule{x}} \\
& \qquad \land \LeoUnique{\Vdr{x},\Vloc{current}}.
\end{split}
\end{equation*}
For convenience, we define a relation that is
true if \Vlimt{pred} is the LIMT predecessor of
an EIMT, and false otherwise:
\begin{equation*}
\begin{split}
& \mymathop{LIMT-Predecessor}({\Vlimt{pred},\Veimt{bottom}}) \defined \\
& \quad \exists \Ves{bottom-origin}, \Vdr{bottom}, \Vdr{pred}, \\
& \quad \qquad \Vloc{pred-origin}, \Vloc{bottom-origin} \quad \text{such that}  \\
& \quad \qquad \qquad \Veimt{bottom} = [ \Vdr{bottom}, \Vloc{bottom-origin} ] \\
& \quad \qquad \qquad \land \Vlimt{pred} = [ \Vdr{pred}, \LHS{\Vdr{bottom}}, \Vloc{pred-origin} ] \\
& \quad \qquad \qquad \land \Vlimt{pred} \in \Ves{bottom-origin} 
\end{split}
\end{equation*}

We are now ready to define an inference rule which holds if
an LIMT predecessor can be found for an EIMT \Veimt{bottom},
\begin{equation*}
\inference{
    \begin{array}{c}
    \mymathop{LIMT-Predecessor}({\Vlimt{pred},\Veimt{bottom}}) \\
    \Vlimt{pred} = [ \Vdr{pred}, \LHS{\Vdr{bottom}}, \Vorig{pred} ] \\
    \Veimt{bottom} = [ \Vdr{bottom}, \Vloc{bottom} ] \\
    \Veimt{bottom} \in \Ves{current} \\
    \LeoEligible{\Vdr{bottom},\Vloc{current}}
    \end{array}
}{
    \left \lbrace [ \Vdr{pred}, \Penult{\Vdr{bottom}}, \Vorig{pred} ] \right \rbrace
}
\end{equation*}
and another, which holds if
\Veimt{bottom} has no predecessor LIMT,
\begin{equation*}
\inference{
    \begin{array}{c}
    \neg \mymathop{LIMT-Predecessor}({\Vlimt{pred},\Veimt{bottom}}) \\
    \Veimt{bottom} = [ \Vdr{bottom}, \Vorig{bottom} ] \\
    \Veimt{bottom} \in \Ves{current} \\
    \LeoEligible{\Vdr{bottom},\Vloc{current}}
    \end{array}
}{
    \bigset{ [ \Next{\Vdr{bottom}}, \Penult{\Vdr{bottom}}, \Vorig{bottom} ] }
}
\end{equation*}

\section{The Aycock-Horspool finite automaton}
\label{s:AHFA}
\label{s:end-prelim}

In this \doc{} a
``split LR(0) $\epsilon$-DFA''
as described by Aycock and Horspool\cite{AH2002},
will be called an Aycock-Horspool Finite Automaton,
or AHFA.
This section will
summarize the ideas
from \cite{AH2002}
that are central to Marpa.

Aycock and Horspool based their AHFA's
on a few observations.
\begin{itemize}
\item
In practice, Earley items sharing the same origin,
but having different dotted rules,
often appear together in the same Earley set.
\item
There is in the literature a method
for associating groups of dotted rules that often appear together
when parsing.
This method is the LR(0) DFA used in the much-studied
LALR and LR parsers.
\item
The LR(0) items that are the components of LR(0)
states are, exactly, dotted rules.
\item
By taking into account symbols that derive the
null string, the LR(0) DFA could be turned into an
LR(0) $\epsilon$-DFA,
which would be even more effective
at grouping dotted rules that often occur together
into a single DFA state.
\end{itemize}

AHFA states are, in effect,
a shorthand
for groups of dotted rules that occur together frequently.
Aycock and Horspool realized that,
by changing Earley items to track AHFA states
instead of individual dotted rules,
the size of Earley sets could be reduced,
and Earley's algorithm made faster in practice.

As a reminder,
the original Earley items (EIMT's)
were duples, $[\Vdr{x}, \Vorig{x}]$,
where \Vdr{x} is a dotted rule.
An Aycock-Horspool Earley item is a duple
\begin{equation*}
[\Vah{y}, \Vorig{y}],
\end{equation*}
where $\Vah{y}$ is an AHFA state.

\Marpa{} uses
Earley items of the form
created by Aycock and Horspool.
A Marpa Earley item has type \type{EIM},
and a Marpa Earley item is often referred to as an EIM.

\begin{sloppypar}
Aycock and Horspool did not consider
Leo's modifications,
but \Marpa{} incorporates them,
and \Marpa{} also changes its Leo items to use AHFA states.
Marpa's Leo items (LIM's) are triples
of the form
\end{sloppypar}
\begin{equation*}
[\Vah{top}, \Vsym{transition}, \Vorig{top}],
\end{equation*}
where \Vsym{transition} and \Vorig{top}
are as in the traditional Leo items,
and \Vah{top} is an AHFA state.
A Marpa Leo item has type \type{LIM}.

\cite{AH2002} also defines
a partial transition function for
pairs of AHFA state and symbol,
\begin{equation*}
\GOTO: \Cfa, (\epsilon \cup \var{vocab}) \mapsto \Cfa.
\end{equation*}
$\GOTO(\Vah{from}, \epsilon)$ is a
\dfn{null transition}.
(AHFA's are not fully deterministic.)
If \Vah{predicted} is the result of a null transition,
it is called a \dfn{predicted} AHFA state.
If an AHFA state is not a \dfn{predicted} AHFA state,
it is called a \dfn{confirmed} AHFA state.
The initial AHFA state is a confirmed AHFA state.
(In \cite{AH2002} confirmed states are called ``kernel states'',
and predicted states are called ``non-kernel states''.)

The states of an AHFA
are not a partition of the dotted
rules --
a single dotted rule can occur
in more than one AHFA state.
In combining
the improvements of Leo~\cite{Leo1991} and
Aycock and Horspool~\cite{AH2002},
the following theorem is crucial.

\begin{theorem}\label{t:leo-singleton}
If a Marpa Earley item (EIM) is the result of a
Leo reduction,
then its AHFA state contains only one dotted rule.
\end{theorem}

\begin{proof}
Let the EIM that is the result of the Leo
reduction be
\begin{equation*}
\Veim{result} = [\Vah{result}, \Vorig{result}]
\end{equation*}
Let the Earley set that contains \Veim{result} be
\Ves{i}.
Since \Veim{result} is the result of a Leo reduction
we know, from the definition of a Leo reduction,
that
\begin{equation*}
\Vdr{complete} \in \Vah{result}
\end{equation*}
where
\Vdr{complete} is a completed rule.
Let \Vrule{c} be the rule of \Vdr{complete},
and \var{cp} its dot position,
\begin{equation*}
\Vdr{complete} = [ \Vrule{c}, \var{cp} ].
\end{equation*}
$\var{cp} > 0$ because, in \Marpa{}
grammars, completions are never
predictions.

Suppose, for a reduction to absurdity,
that the AHFA state contains another dotted rule,
\Vdr{other}, that is, that
\begin{equation*}
\Vdr{other} \in \Vah{result},
\end{equation*}
where $\Vdr{complete} \neq \Vdr{other}$.
Let \Vrule{o} be the rule of \Vdr{other},
and \var{op} its dot position,
\begin{equation*}
\Vdr{other} = [ \Vrule{o}, \var{op} ].
\end{equation*}
AHFA construction never places a prediction in the same
AHFA state as a completion, so
\Vdr{other} is not a prediction.
Therefore, $\var{op} > 0$.
To show this outer reduction to absurdity, we first prove
by a first inner reductio that
$\Vrule{c} \neq \Vrule{o}$,
then by a second inner reductio that
$\Vrule{c} = \Vrule{o}$.

Assume, for the first inner reductio,
that
$\Vrule{c} = \Vrule{o}$.
By the construction of an AHFA
state,
both \Vdr{complete} and \Vdr{other}
resulted from the same series
of transitions.
But the same series of transitions over the
same rule would result in the same dot position,
$\var{cp} = \var{op}$,
so that if $\Vrule{c} = \Vrule{o}$,
$\Vdr{complete} = \Vdr{other}$,
which is contrary to the assumption for the outer reductio.
This shows the first inner reductio.

Next, we assume for the second inner reductio that
$\Vrule{c} \ne \Vrule{o}$.
Since both \Vdr{complete} and \Vdr{other}
are in the same EIM
and neither is a prediction,
both must result from transitions,
and their transitions must have been from the same Earley set.
Since they are in the same AHFA state,
by the AHFA construction,
that transition must have been
over the same transition symbol,
call it \Vsym{transition}.
But Leo uniqueness applies to \Vdr{complete},
and requires that the transition
over \Vsym{transition} be unique in \Ves{i}.

But if $\Vrule{c} \ne \Vrule{o}$,
\Vsym{transition} was the transition symbol
of two different dotted rules,
and the Leo uniqueness requirement does not hold.
The conclusion that the Leo uniqueness requirement
both does and does not hold
is a contradiction,
which shows the second inner reductio.
Since the assumption for
the second inner reductio was that
$\Vrule{c} \ne \Vrule{o}$,
we conclude that
$\Vrule{c} = \Vrule{o}$.

By the two inner reductio's,
we have both
$\Vrule{c} \neq \Vrule{o}$
and $\Vrule{c} = \Vrule{o}$,
which completes the outer reduction to absurdity.
For the outer reductio, we assumed that
\Vdr{other}
was a second dotted rule in \Vah{result},
such that
$\Vdr{other} \neq \Vdr{complete}$.
We can therefore conclude that
\begin{equation*}
\Vdr{other} \in \Vah{result} \implies \Vdr{other} = \Vdr{complete}.
\end{equation*}
If \Vdr{complete} is a dotted rule
in the AHFA state of a Leo reduction EIM,
then it must be the only dotted rule in that AHFA state.
\end{proof}

\section{The Marpa Recognizer}
\label{s:recce}
\label{s:pseudocode}

\subsection{Complexity}

Alongside the pseudocode of this section
are observations about its space and time complexity.
In what follows,
we will charge all time and space resources
to Earley items,
or to attempts to add Earley items.
We will show that,
to each Earley item actually added,
or to each attempt to add a duplicate Earley item,
we can charge amortized \Oc{} time and space.

At points, it will not be immediately
convenient to speak of
charging a resource
to an Earley item
or to an attempt to add a duplicate
Earley item.
In those circumstances,
we speak of charging time and space
\begin{itemize}
\item to the parse; or
\item to the Earley set; or
\item to the current procedure's caller.
\end{itemize}

We can charge time and space to the parse itself,
as long as the total time and space charged is \Oc.
Afterwards, this resource can be re-charged to
the initial Earley item, which is present in all parses.
Soft and hard failures of the recognizer use
worst-case \Oc{} resource,
and are charged to the parse.

We can charge resources to the Earley set,
as long as the time or space is \Oc.
Afterwards,
the resource charged to the Earley set can be
re-charged to an arbitrary member of the Earley set,
for example, the first.
If an Earley set is empty,
the parse must fail,
and the time can be charged to the parse.

In a procedure,
resource can be ``caller-included''.
Caller-included resource is not accounted for in
the current procedure,
but passed upward to the procedure's caller,
to be accounted for there.
A procedure to which caller-included resource is passed will
sometimes pass the resource upward to its own caller,
although of course the top-level procedure does not do this.

For each procedure, we will state whether
the time and space we are charging is inclusive or exclusive.
The exclusive time or space of a procedure is that
which it uses directly,
ignoring resource charges passed up from called procedures.
Inclusive time or space includes
resource passed upward to the
current procedure from called procedures.

Earley sets may be represented by \Ves{i},
where \var{i} is the Earley set's location \Vloc{i}.
The two notations should be regarded as interchangeable.
The actual implementation of either
should be the equivalent of a pointer to 
a data structure containing,
at a minimum,
the Earley items,
a memoization of the Earley set's location as an integer,
and a per-set-list.
Per-set-lists will be described in Section \ref{s:per-set-lists}.

\begin{algorithm}[h]
\caption{Marpa Top-level}
\begin{algorithmic}[1]
\Procedure{Main}{}
\State \Call{Initial}{}
\For{ $\var{i}, 0 \le \var{i} \le \Vsize{w}$ }
\State \Comment At this point, $\Ves{x}$ is complete, for $0 \le \var{x} < \var{i}$
\State \Call{Scan pass}{$\var{i}, \var{w}[\var{i} \subtract 1]$}
\If{$\size{\Ves{i}} = 0$}
\State reject \Cw{} and return
\EndIf
\State \Call{Reduction pass}{\var{i}}
\EndFor
\For{every $[\Vah{x}, 0] \in \Etable{\Vsize{w}}$}
\If{$\Vdr{accept} \in \Vah{x}$}
\State accept \Cw{} and return
\EndIf
\EndFor
\State reject \Cw{}
\EndProcedure
\end{algorithmic}
\end{algorithm}

\subsection{Top-level code}

Exclusive time and space for the loop over the Earley sets
is charged to the Earley sets.
Inclusive time and space for the final loop to
check for \Vdr{accept} is charged to
the Earley items at location \size{\Cw}.
Overhead is charged to the parse.
All these resource charges are obviously \Oc.

\subsection{Ruby Slippers parsing}
This top-level code represents a significant change
from \AH{}.
\call{Scan pass}{} and \call{Reduction pass}{}
are separated.
As a result,
when the scanning of tokens that start at location \Vloc{i} begins,
the Earley sets for all locations prior to \Vloc{i} are complete.
This means that the scanning operation has available, in
the Earley sets,
full information about the current state of the parse,
including which tokens are acceptable during the scanning phase.


\begin{algorithm}[h]
\caption{Initialization}
\begin{algorithmic}[1]
\Procedure{Initial}{}
\State \Call{Add EIM pair}{$0_{ES}, \ah{start}, 0$}
\EndProcedure
\end{algorithmic}
\end{algorithm}

\subsection{Initialization}
\label{p:initial-op}

Inclusive time and space is \Oc{}
and is charged to the parse.

\begin{algorithm}[h]
\caption{Marpa Scan pass}
\begin{algorithmic}[1]
\Procedure{Scan pass}{$\Vloc{i},\Vsym{a}$}
\State Note: Each pass through this loop is an EIM attempt
\For{each $\Veim{predecessor} \in \var{transitions}((\var{i} \subtract 1),\var{a})$}
\State $[\Vah{from}, \Vloc{origin}] \gets \Veim{predecessor}$
\State $\Vah{to} \gets \GOTO(\Vah{from}, \Vsym{a})$
\State \Call{Add EIM pair}{$\Ves{i}, \Vah{to}, \Vloc{origin}$}
\EndFor
\EndProcedure
\end{algorithmic}
\end{algorithm}

\subsection{Scan pass}
\label{p:scan-op}

\var{transitions} is a set of tables, one per Earley set.
The tables in the set are indexed by symbol.
Symbol indexing is \Oc, since the number of symbols
is a constant, but
since the number of Earley sets grows with
the length of the parse,
it cannot be assumed that Earley sets can be indexed by location
in \Oc{} time.
For the operation $\var{transitions}(\Vloc{l}, \Vsym{s})$
to be in \Oc{} time,
\Vloc{l} must represent a link directly to the Earley set.
In the case of scanning,
the lookup is always in the previous Earley set,
which can easily be tracked in \Oc{} space
and retrieved in \Oc{} time.
Inclusive time and space can be charged to the
\Veim{predecessor}.
Overhead is charged to the Earley set at \Vloc{i}.

\begin{algorithm}[h]
\caption{Reduction pass}
\begin{algorithmic}[1]
\Procedure{Reduction pass}{\Vloc{i}}
\State Note: \Vtable{i} may include EIM's added by
\State \hspace{2.5em} \Call{Reduce one LHS}{} and
\State \hspace{2.5em} the loop must traverse these
\For{each Earley item $\Veim{work} \in \Vtable{i}$}
\State $[\Vah{work}, \Vloc{origin}] \gets \Veim{work}$
\State $\Vsymset{lh-sides} \gets$ a set containing the LHS
\State \hspace\algorithmicindent of every completed rule in \Veim{work}
\For{each $\Vsym{lhs} \in \Vsymset{lh-sides}$}
\State \Call{Reduce one LHS}{\Vloc{i}, \Vloc{origin}, \Vsym{lhs}}
\EndFor
\EndFor
\State \Call{Memoize transitions}{\Vloc{i}}
\EndProcedure
\end{algorithmic}
\end{algorithm}

\subsection{Reduction pass}

The loop over \Vtable{i} must also include
any items added by \call{Reduce one LHS}{}.
This can be done by implementing \Vtable{i} as an ordered
set and adding new items at the end.

Exclusive time is clearly \Oc{} per
\Veim{work},
and is charged to the \Veim{work}.
Additionally,
some of the time required by
\call{Reduce one LHS}{} is caller-included,
and therefore charged to this procedure.
Inclusive time from \call{Reduce one LHS}{}
is \Oc{} per call,
as will be seen in section \ref{p:reduce-one-lhs},
and is charged to the \Veim{work}
that is current
during that call to \call{Reduce one LHS}{}.
Overhead may be charged to the Earley set at \Vloc{i}.

\begin{algorithm}[h]
\caption{Memoize transitions}
\begin{algorithmic}[1]
\Procedure{Memoize transitions}{\Vloc{i}}
\For{every \Vsym{postdot}, a postdot symbol of $\Ves{i}$}
\State Note: \Vsym{postdot} is ``Leo eligible'' if it is
\State \hspace\algorithmicindent  Leo unique and its rule is right recursive
\If{\Vsym{postdot} is Leo eligible}
\State Set $\var{transitions}(\Vloc{i},\Vsym{postdot})$
\State \hspace\algorithmicindent to a LIM
\Else
\State Set $\var{transitions}(\Vloc{i},\Vsym{postdot})$
\State \hspace\algorithmicindent to the set of EIM's that have
\State \hspace\algorithmicindent \Vsym{postdot} as their postdot symbol
\EndIf
\EndFor
\EndProcedure
\end{algorithmic}
\end{algorithm}

\subsection{Memoize transitions}

The \var{transitions} table for \Ves{i}
is built once all EIMs have been
added to \Ves{i}.
We first look at the resource,
excluding the processing of Leo items.
The non-Leo processing can be done in
a single pass over \Ves{i},
in \Oc{} time per EIM.
Inclusive time and space are charged to the
Earley items being examined.
Overhead is charged to \Ves{i}.

We now look at the resource used in the Leo processing.
A transition symbol \Vsym{transition}
is Leo eligible if it is Leo unique
and its rule is right recursive.
(If \Vsym{transition} is Leo unique in \Ves{i}, it will be the
postdot symbol of only one rule in \Ves{i}.)
All but one of the determinations needed to decide
if \Vsym{transition} is Leo eligible can be precomputed
from the grammar,
and the resource to do this is charged to the parse.
The precomputation, for example,
for every rule, determines if it is right recursive.

One part of the test for
Leo eligibility cannot be done as a precomputation.
This is the determination whether there is only one dotted
rule in \Ves{i} whose postdot symbol is
\Vsym{transition}.
This can be done
in a single pass over the EIM's of \Ves{i}
that notes the postdot symbols as they are encountered
and whether any is encountered twice.
The time and space,
including that for the creation of a LIM if necessary,
will be \Oc{} time per EIM examined,
and can be charged to the EIM being examined.

\begin{algorithm}[h]
\caption{Reduce one LHS symbol}
\begin{algorithmic}[1]
\Procedure{Reduce one LHS}{\Vloc{i}, \Vloc{origin}, \Vsym{lhs}}
\State Note: Each pass through this loop is an EIM attempt
\For{each $\var{pim} \in \var{transitions}(\Vloc{origin},\Vsym{lhs})$}
\State \Comment \var{pim} is a ``postdot item'', either a LIM or an EIM
\If{\var{pim} is a LIM, \Vlim{pim}}
\State Perform a \Call{Leo reduction operation}{}
\State \hspace\algorithmicindent for operands \Vloc{i}, \Vlim{pim}
\Else
\State Perform an \Call{Earley reduction operation}{}
\State \hspace\algorithmicindent for operands \Vloc{i}, \Veim{pim}, \Vsym{lhs}
\EndIf
\EndFor
\EndProcedure
\end{algorithmic}
\end{algorithm}

\subsection{Reduce one LHS}
\label{p:reduce-one-lhs}

To show that
\begin{equation*}
\var{transitions}(\Vloc{origin},\Vsym{lhs})
\end{equation*}
can be traversed in \Oc{} time,
we note
that the number of symbols is a constant
and assume that \Vloc{origin} is implemented
as a link back to the Earley set,
rather than as an integer index.
This requires that \Veim{work}
in \call{Reduction pass}{}
carry a link
back to its origin.
As implemented, Marpa's
Earley items have such links.

Inclusive time
for the loop over the EIM attempts
is charged to each EIM attempt.
Overhead is \Oc{} and caller-included.

\begin{algorithm}[h]
\caption{Earley reduction operation}
\begin{algorithmic}[1]
\Procedure{Earley reduction operation}{\Vloc{i}, \Veim{from}, \Vsym{trans}}
\State $[\Vah{from}, \Vloc{origin}] \gets \Veim{from}$
\State $\Vah{to} \gets \GOTO(\Vah{from}, \Vsym{trans})$
\State \Call{Add EIM pair}{\Ves{i}, \Vah{to}, \Vloc{origin}}
\EndProcedure
\end{algorithmic}
\end{algorithm}

\subsection{Earley reduction operation}
\label{p:reduction-op}

\begin{sloppypar}
Exclusive time and space is clearly \Oc.
\call{Earley reduction operation}{} is always
called as part of an EIM attempt,
and inclusive time and space is charged to the EIM
attempt.
\end{sloppypar}

\begin{algorithm}[h]
\caption{Leo reduction operation}
\begin{algorithmic}[1]
\Procedure{Leo reduction operation}{\Vloc{i}, \Vlim{from}}
\State $[\Vah{from}, \Vsym{trans}, \Vloc{origin}] \gets \Vlim{from}$
\State $\Vah{to} \gets \GOTO(\Vah{from}, \Vsym{trans})$
\State \Call{Add EIM pair}{\Ves{i}, \Vah{to}, \Vloc{origin}}
\EndProcedure
\end{algorithmic}
\end{algorithm}

\subsection{Leo reduction operation}
\label{p:leo-op}

Exclusive time and space is clearly \Oc.
\call{Leo reduction operation}{} is always
called as part of an EIM attempt,
and inclusive time and space is charged to the EIM
attempt.

\begin{algorithm}[h]
\caption{Add EIM pair}\label{a:pair}
\begin{algorithmic}[1]
\Procedure{Add EIM pair}{$\Ves{i},\Vah{confirmed}, \Vloc{origin}$}
\State $\Veim{confirmed} \gets [\Vah{confirmed}, \Vloc{origin}]$
\State $\Vah{predicted} \gets \GOTO(\Vah{confirmed}, \epsilon)$
\If{\Veim{confirmed} is new}
\State Add \Veim{confirmed} to \Vtable{i}
\EndIf
\If{$\Vah{predicted} \neq \Lambda$}
\State $\Veim{predicted} \gets [\Vah{predicted}, \Vloc{i}]$
\If{\Veim{predicted} is new}
\State Add \Veim{predicted} to \Vtable{i}
\EndIf
\EndIf
\EndProcedure
\end{algorithmic}
\end{algorithm}

\subsection{Adding a pair of Earley items}
\label{p:add-eim-pair}

This operation adds a confirmed EIM
and, if it exists, the EIM for
its null-transition.
Inclusive time and space is charged to the
calling procedure.
Trivially, the space is \Oc{} per call.

We show that time is also \Oc{}
by singling out the two non-trivial cases:
checking that an Earley item is new,
and adding it to the Earley set.
\Marpa{} checks whether an Earley item is new
in \Oc{} time
by using a data structure called a PSL.
PSL's are the subject of Section \ref{s:per-set-lists}.
An Earley item can be added to the current
set in \Oc{} time
if the Earley set is seen as a linked
list, to the head of which the new Earley item is added.

The resource used by \call{Add EIM pair}{}
is always caller-included.
No time or space is ever charged
to a predicted Earley item.
At most one attempt to add a \Veim{predicted} will
be made per attempt to add a \Veim{confirmed},
so that the total resource charged
remains \Oc.

\subsection{Per-set lists}
\label{s:per-set-lists}

In the general case,
where \var{x} is an arbitrary datum,
it is not possible 
to use a duple $[\Ves{i}, \var{x}]$
as a search key and expect the search to use
\Oc{} time.
Within \Marpa, however, there are specific cases
where it is desirable to do exactly that.
This is accomplished by
taking advantage of special properties of the search.

If it can be arranged that there is
a link direct to the Earley set \Ves{i},
and that $0 \leq \var{x} < \var{c}$,
where \var{c} is a constant of reasonable size,
then a search can be made in \Oc{} time,
using a data structure called a PSL.
Data structures identical to or very similar to PSL's are
briefly outlined in both
\cite[p.~97]{Earley1970} and
\cite[Vol.~1, pages 326--327]{AU1972}.
But neither source gives them a name.
The term PSL
(``per-Earley set list'')
is new
with this \doc{}.

A PSL is a fixed-length array of
integers, indexed by an integer,
and kept as part of each Earley set.
While \Marpa{} is building a new Earley set,
\Ves{j},
the PSL for every previous Earley set, \Vloc{i},
tracks the Earley items in \Ves{j} that have \Vloc{i}
as their origin.
The maximum number of Earley items that must be tracked
in each PSL is
the number of AHFA states,
\Vsize{\Cfa},
which is a constant of reasonable size
that depends on \Cg{}.

It would take more than \Oc{} time
to clear and rebuild the PSL's each time
that a new Earley set is started.
This overhead is avoided by ``time-stamping'' each PSL
entry with the Earley set
that was current when that PSL
entry was last updated.

As before,
where \Ves{i} is an Earley set,
let \Vloc{i} be its location,
and vice versa.
\Vloc{i} is an integer which is
assigned as Earley sets are created.
Let $\ID{\Vah{x}}$ be the integer ID of an AHFA state.
Numbering the AHFA states from 0 on up as they are created
is an easy way to create $\ID{\Vah{x}}$.
Let $\PSL{\Ves{x}}{\var{y}}$
be the entry for integer \var{y} in the PSL in
the Earley set at \Vloc{x}.

Consider the case where Marpa is building \Ves{j}
and wants to check whether Earley item \Veim{x} is new,
where $\Veim{x} = [ \Vah{x}, \Vorig{x} ]$.
To check if \Veim{x} is new,
Marpa checks
\begin{equation*}
\var{time-stamp} = \PSL{\Ves{x}}{\ID{\Vah{x}}}
\end{equation*}
If the entry has never been used,
we assume that $\var{time-stamp} = \Lambda$.
If $\var{time-stamp} \ne \Lambda \land \var{time-stamp} = \Vloc{j}$,
then \Veim{x} is not new,
and will not be added to the Earley set.

If $\var{time-stamp} = \Lambda \lor \var{time-stamp} \ne \Vloc{j}$,
then \Veim{x} is new.
\Veim{x} is added to the Earley set,
and a new time-stamp is set, as follows:
\begin{equation*}
\PSL{\Ves{x}}{\ID{\Vah{x}}} \gets \Vloc{j}.
\end{equation*}

\subsection{Complexity summary}

For convenience, we collect and summarize here
some of the observations of this section.

\begin{observation}
The time and space charged to an Earley item
which is actually added to the Earley sets
is \Oc.
\end{observation}

\begin{observation}
The time charged to an attempt
to add a duplicate Earley item to the Earley sets
is \Oc.
\end{observation}

For evaluation purposes, \Marpa{} adds a link to
each EIM that records each attempt to
add that EIM,
whether originally or as a duplicate.
Traditionally, complexity results treat parsers
as recognizers, and such costs are ignored.
This will be an issue when the space complexity
for unambiguous grammars is considered.

\begin{observation}
The space charged to an attempt
to add a duplicate Earley item to the Earley sets
is \Oc{} if links are included,
zero otherwise.
\end{observation}

As noted in Section \ref{p:add-eim-pair},
the time and space used by predicted Earley items
and attempts to add them is charged elsewhere.

\begin{observation}
No space or time is charged to predicted Earley items,
or to attempts to add predicted Earley items.
\end{observation}

\section{Preliminaries to the theoretical results}
\label{s:proof-preliminaries}

\subsection{Nulling symbols}
\label{s:nulling}

Recall that Marpa grammars,
without loss of generality,
contain neither empty rules nor
properly nullable symbols.
This corresponds directly
to a grammar rewrite in the \Marpa{} implementation,
and its reversal during \Marpa's evaluation phase.
For the correctness and complexity proofs in this \doc{},
we assume an additional rewrite,
this time to eliminate nulling symbols.

Elimination of nulling symbols is also
without loss of generality, as can be seen
if we assume that a history
of the rewrite is kept,
and that the rewrite is reversed
after the parse.
Clearly, whether a grammar \Cg{} accepts
an input \Cw{}
will not depend on the nulling symbols in its rules.

In its implementation,
\Marpa{} does not directly rewrite the grammar
to eliminate nulling symbols.
But nulling symbols are ignored in
creating the AHFA states,
and must be restored during \Marpa's evaluation phase,
so that the implementation and
this simplification for theory purposes
track each other closely.

\subsection{Comparing Earley items}

\begin{definition}
A Marpa Earley item \dfn{corresponds}
to a traditional Earley item
$\Veimt{x} = [\Vdr{x}, \Vorig{x}]$
if and only if the Marpa Earley item is a
$\Veim{y} = [\Vah{y}, \Vorig{x}]$
such that $\Vdr{x} \in \Vah{y}$.
A traditional Earley item, \Veimt{x}, corresponds to a
Marpa Earley item, \Veim{y}, if and only if
\Veim{y} corresponds to \Veimt{x}.
\end{definition}

\begin{definition}
A set of EIM's is \dfn{consistent} with respect to
a set of EIMT's,
if and only if each of the EIM's in the first set
corresponds to at least one of the
EIMT's in the second set.
A Marpa Earley set \EVtable{\Marpa}{i}
is \dfn{consistent} if and only if
all of its EIM's correspond to
EIMT's in
\EVtable{\Leo}{i}.
\end{definition}

\begin{sloppypar}
\begin{definition}
A set of EIM's is \dfn{complete} with respect to
a set of EIMT's,
if and only if for every EIMT in the second set,
there is a corresponding EIM in the first set.
A Marpa Earley set \EVtable{\Marpa}{i}
is \dfn{complete} if and only if for every
traditional Earley item in \EVtable{\Leo}{i}
there is a corresponding Earley item in
\EVtable{\Marpa}{i}.
\end{definition}
\end{sloppypar}

\begin{definition}
A Marpa Earley set is \dfn{correct}
if and only if that Marpa Earley set is complete
and consistent.
\end{definition}

\subsection{About AHFA states}

Several facts from \cite{AH2002}
will be heavily used in the following proofs.
For convenience, they are restated here.

\begin{observation}
Every dotted rule is an element of one
or more AHFA states, that is,
\begin{equation*}
\forall \, \Vdr{x} \, \exists \, \Vah{y} \; \mid \; \Vdr{x} \in \Vah{y}.
\end{equation*}
\end{observation}

\begin{observation}
\label{o:confirmed-AHFA-consistent}
AHFA confirmation is consistent with respect to the dotted rules.
That is,
for all \Vah{from}, \Vsym{t}, \Vah{to}, \Vdr{to} such that
\begin{equation*}
\begin{split}
& \GOTO(\Vah{from}, \Vsym{t}) = \Vah{to} \\
\qquad \qquad \land \quad & \Vdr{to} \in \Vah{to}, \\
\intertext{there exists \Vdr{from} such that}
& \Vdr{from} \in \Vah{from} \\
\qquad \qquad \land \quad & \Vsym{t} = \Postdot{\Vdr{from}} \\
\qquad \qquad \land \quad & \Next{\Vdr{from}} = \Vdr{to}. \\
\end{split}
\end{equation*}
\end{observation}

\begin{observation}
\label{o:confirmed-AHFA-complete}
AHFA confirmation is complete with respect to the dotted rules.
That is,
for all \Vah{from}, \Vsym{t}, \Vdr{from}, \Vdr{to} if
\begin{equation*}
\begin{split}
& \Vdr{from} \in \Vah{from} \\
\qquad \land \quad & \Postdot{\Vdr{from}} = \Vsym{t}, \\
\qquad \land \quad & \Next{\Vdr{from}} = \Vdr{to} \\
\intertext{then there exists \Vah{to} such that }
& \GOTO(\Vah{from}, \Vsym{t}) = \Vah{to}  \\
\qquad \land \quad & \Vdr{to} \in \Vah{to}. \\
\end{split}
\end{equation*}
\end{observation}

\begin{observation}
\label{o:predicted-AHFA-consistent}
AHFA prediction is consistent with respect to the dotted rules.
That is,
for all \Vah{from}, \Vah{to}, \Vdr{to} such that
\begin{equation*}
 \GOTO(\Vah{from}, \epsilon) = \Vah{to} 
\, \land  \,
 \Vdr{to} \in \Vah{to}, 
\end{equation*}
there exists \Vdr{from} such that
\begin{equation*}
 \Vdr{from} \in \Vah{from} 
\, \land  \,
 \Vdr{to} \in \Predict{\Vdr{from}}.
\end{equation*}
\end{observation}

\begin{observation}
\label{o:predicted-AHFA-complete}
AHFA prediction is complete with respect to the dotted rules.
That is,
for all \Vah{from}, \Vdr{from}, \Vdr{to}, if
\begin{equation*}
 \Vdr{from} \in \Vah{from} 
\, \land \,
\Vdr{to} \in \Predict{\Vdr{from}},
\end{equation*}
then there exists \Vah{to} such that
\begin{equation*}
\Vdr{to} \in \Vah{to}
\, \land  \,
\GOTO(\Vah{from}, \epsilon) = \Vah{to}
\end{equation*}
\end{observation}

\section{Marpa is correct}
\label{s:correct}

\subsection{Marpa's Earley sets grow at worst linearly}

\begin{theorem}\label{t:es-count}
For a context-free grammar,
and a parse location \Vloc{i},
\begin{equation*}
\textup{
    $\bigsize{\EVtable{\Marpa}{i}} = \order{\var{i}}$.
}
\end{equation*}
\end{theorem}

\begin{proof}
EIM's have the form $[\Vah{x}, \Vorig{x}]$.
\Vorig{x} is the origin of the EIM,
which in Marpa cannot be after the current
Earley set  at \Vloc{i},
so that
\begin{equation*}
0 \le \Vorig{x} \le \Vloc{i}.
\end{equation*}
The possibilities for \Vah{x} are finite,
since the number of AHFA states is a constant,
$\size{\Cfa}$,
which depends on \Cg{}.
Since duplicate EIM's are never added to an Earley set,
the maximum size of Earley set \Vloc{i} is therefore
\begin{equation*}
(\Vloc{i} + 1) \times \size{\Cfa} = \order{\Vloc{i}}.\qedhere
\end{equation*}
\end{proof}

\subsection{Marpa's Earley sets are correct}

\begin{theorem}\label{t:table-correct}
Marpa's Earley sets are correct.
\end{theorem}

The proof
is by triple induction,
that is, induction with a depth down to 3 levels.
We number the levels of induction
0, 1 and 2,
starting with the outermost.
The level 0 induction is usually called the outer induction.
The level 1 induction is usually called the inner induction.
Level 2 induction is referred to by number.

The outer induction is on the Earley sets.
The outer induction hypothesis is that all Earley sets
\EVtable{\Marpa}{i},
$0 \le \Vloc{i} \le \Vloc{n}$,
are complete and consistent,
and therefore correct.
We leave it as an exercise to show, as the
basis of the induction, that
\EEtable{\Marpa}{0} is complete and consistent.

To show the outer induction step, we show first
consistency, then completeness.
We show consistency by
an inner induction on the Marpa operations.
The inner induction hypothesis is that
\EVtable{\Marpa}{i},
as so far built,
is consistent with respect to
\EVtable{\Leo}{i}.

As the basis of the inner induction,
an empty Marpa Earley set is
consistent, trivially.
We show the step of the inner induction by cases:
\begin{itemize}
\item \Marpa{} scanning operations;
\item \Marpa{} reductions when there are no Leo reductions; and
\item \Marpa{}'s Leo reductions.
\end{itemize}

\subsubsection{Marpa scanning is consistent}
\label{s:scan-consistent}

For Marpa's scanning operation, we know
that the predecessor EIM is correct
by the outer induction hypothesis,
and that the token is correct
by the definitions in the preliminaries.
We know, from Section \ref{p:scan-op},
that at most two EIM's will be added.
We now examine them in detail.

Let
\begin{equation*}
    \Vah{confirmed} = \GOTO(\Vah{predecessor}, \Vsym{token})
\end{equation*}
If $\Vah{confirmed} = \Lambda$,
the pseudocode of Section \ref{p:scan-op} shows
that we do nothing.
If we do nothing,
since \EVtable{\Marpa}{i} is consistent by the inner
induction hypothesis,
it remains consistent, trivially.

Otherwise, let
$\Veim{confirmed} = [\Vah{confirmed}, \Vorig{predecessor}]$,
where \Vorig{predecessor} is the origin of the predecessor EIM.
We see that \Veim{confirmed} is consistent with respect
to \EVtable{\Leo}{i},
by the definition of Earley scanning (Section~\ref{d:scan})
and Observation~\ref{o:confirmed-AHFA-consistent}.
Consistency is invariant under union,
and since \EVtable{\Marpa}{i} is consistent by the inner induction,
\EVtable{\Marpa}{i} remains consistent after
\Veim{confirmed} is added.

For predictions,
if $\Vah{confirmed} \ne \Lambda$, let
\begin{equation*}
\Vah{predicted} = \GOTO(\Vah{confirmed}, \epsilon)
\end{equation*}
If $\Vah{predicted} = \Lambda$,
the pseudocode of Section \ref{p:add-eim-pair} shows
that we do nothing.
If we do nothing,
since \EVtable{\Marpa}{i} is consistent by the inner
induction hypothesis,
it remains consistent, trivially.
Otherwise, let
\begin{equation*}
\Veim{predicted} = [\Vah{predicted}, \Vloc{i}].
\end{equation*}
We see that \Veim{predicted} is consistent with respect
to \EVtable{\Leo}{i},
by the definition of Earley prediction (Section~\ref{d:prediction}) and
Observation~\ref{o:predicted-AHFA-consistent}.
Consistency is invariant under union and,
since \EVtable{\Marpa}{i} is consistent by the inner induction,
\EVtable{\Marpa}{i} remains consistent after
\Veim{predicted} is added.

\subsubsection{Earley reduction is consistent}
\label{s:reduction-consistent}

\begin{sloppypar}
Next,
we show that \Marpa{}'s reduction operation
is consistent,
in the case where there is no Leo reduction.
There will be two cause EIM's, \Veim{predecessor}
and \Veim{component}.
\Veim{predecessor} will be correct by the outer induction
hypothesis
and \Veim{component}
will be consistent by the inner induction hypothesis.
From \Veim{component}, we will find zero or more transition
symbols, \Vsym{lhs}.
From this point,  the argument is very similar to
that for the case of the scanning operation.
\end{sloppypar}

Let
\begin{equation*}
\Vah{confirmed} = \GOTO(\Vah{predecessor}, \Vsym{lhs})
\end{equation*}
If $\Vah{confirmed} = \Lambda$, we do nothing,
and \EVtable{\Marpa}{i} remains consistent, trivially.
Otherwise, let
\begin{equation*}
\Veim{confirmed} = [\Vah{confirmed}, \Vorig{predecessor}],
\end{equation*}
where \Vorig{predecessor} is the origin of \Veim{predecessor}.
We see that \Veim{confirmed} is consistent with respect
to \EVtable{\Leo}{i}
by the definition of Earley reduction (Section~\ref{d:reduction}),
and Observation~\ref{o:confirmed-AHFA-consistent}.
By the invariance of consistency under union,
\EVtable{\Marpa}{i} remains consistent after
\Veim{confirmed} is added.

For predictions, the argument exactly repeats that of
Section \ref{s:scan-consistent}.
\EVtable{\Marpa}{i} remains consistent,
whether or not a \Veim{predicted} is added.

\subsubsection{Leo reduction is consistent}
\label{s:leo-consistent}

\begin{sloppypar}
We now show consistency for \Marpa{}'s
reduction operation,
in the case where there is a Leo reduction.
If there is a Leo reduction, it is signaled by the
presence of \Vlim{predecessor},
\begin{equation*}
\Vlim{predecessor} = [ \Vah{top}, \Vsym{lhs}, \Vorig{top} ]
\end{equation*}
in the Earley set where we would look
for the \Veim{predecessor}.
We treat
the logic to create \Vlim{predecessor} as a matter of memoization
of the previous Earley sets,
and its correctness follows from
the outer induction hypothesis.
\end{sloppypar}

As the result of a Leo reduction,
\Leo{} will add
$[\Vdr{top}, \Vorig{top}]$
to \EVtable{\Leo}{j}.
Because the \Marpa{} LIM is correct,
using Observations \ref{o:confirmed-AHFA-consistent}
and \ref{o:confirmed-AHFA-complete}
and Theorem \ref{t:leo-singleton},
we see that \Vah{top} is the singleton set
$\set{ \Vdr{top} }$.
From Section \ref{p:leo-op}, we see
that, as the result of the Leo reduction,
\Marpa{} will add
\begin{equation*}
\Veim{leo} = [\Vah{top}, \Vorig{top}]
\end{equation*}
to \EVtable{\Marpa}{i}.
The consistency of \Veim{leo} follows from the definition
of EIM consistency.
The consistency of
\EVtable{\Marpa}{i},
once \Veim{leo} is added,
follows by the invariance
of consistency under union.

\subsubsection{Marpa's Earley sets are consistent}
\label{s:sets-consistent}

Sections
\ref{s:scan-consistent},
\ref{s:reduction-consistent}
and
\ref{s:leo-consistent}
show the cases for the step of the inner induction,
which shows the induction.
It was the purpose of the inner induction to show
that consistency of \EVtable{\Marpa}{i} is invariant
under Marpa's operations.

\subsubsection{The inner induction for completeness}

It remains to show that,
when Marpa's operations are run as described
in the pseudocode of Section \ref{s:pseudocode},
\EVtable{\Marpa}{i} is complete.
To do this,
we show that
at least one EIM in \EVtable{\Marpa}{i}
corresponds to every EIMT in
\EVtable{\Leo}{i}.
We will proceed by cases,
where the cases are \Leo{} operations.
For every operation that \Leo{} would perform,
we show that
\Marpa{} performs an operation that
produces a corresponding Earley item.
Our cases for the operations of \Leo{} are
Earley scanning operations;
Earley reductions;
Leo reductions;
and Earley predictions.

\subsubsection{Scanning is complete}
\label{s:scan-complete}

For scanning, the Marpa pseudocode shows
that a scan is attempted for every
pair
\begin{equation*}
[\Veim{predecessor}, \Vsym{token}],
\end{equation*}
where \Veim{predecessor} is an EIM in the previous
Earley set,
and \Vsym{token} is the token scanned at \Vloc{i}.
(The pseudocode actually finds
\Veim{predecessor} in a set
returned by $\mymathop{transitions}()$.
This is a memoization for efficiency
and we will ignore it.)

By the preliminary definitions, we know that \Vsym{token}
is the same in both \Earley{} and \Leo.
By the outer induction hypothesis we know that,
for every traditional Earley item in the previous
Earley set,
there is at least one corresponding Marpa Earley item.
Therefore, \Marpa{} performs its scan operation on a complete set
of correct operands.

Comparing the Marpa pseudocode (Section \ref{p:scan-op})
with the Earley scanning operation (Section \ref{d:scan})
and using
Observations~\ref{o:confirmed-AHFA-complete}
and \ref{o:predicted-AHFA-complete},
we see that an Earley item will be added to
\EVtable{\Marpa}{i} corresponding to every scanned Earley item
of \EVtable{\Leo}{i}.
We also see,
from the pseudocode of Section \ref{p:add-eim-pair},
that the \Marpa{} scanning operation will
add to \EVtable{\Marpa}{i}
an Earley item for
every prediction that results from
a scanned Earley item in \EVtable{\Leo}{i}.

\subsubsection{Earley reduction is complete}
\label{s:reduction-complete}

We now examine Earley reduction,
under the assumption that there is
no Leo transition.
The Marpa pseudocode shows that the Earley items
in \EVtable{\Marpa}{i}
are traversed in a single pass for reduction.

To show that we traverse a complete and consistent
series of component Earley items,
we stipulate that
the Earley set is an ordered set,
and that new Earley items are added at the end.
From Theorem \ref{t:es-count}, we know
that
the number of Earley items is finite,
so a traversal of them must terminate.

Consider, for the purposes of the level 2 induction,
the reductions of \Leo{} to occur in generations.
Let the scanned Earley items be generation 0.
An EIMT produced by a reduction is generation $\var{n} + 1$
if its component Earley item was generation \var{n}.
Predicted Earley items do not need to be assigned generations.
In Marpa grammars they can never contain completions,
and therefore can never act as the component of a reduction.

The induction hypothesis for the level 2 induction
is that for some \var{n},
the Earley items of \EVtable{\Marpa}{i} for generations 0 through \var{n}
are complete and consistent.
From Section \ref{s:sets-consistent},
we know that all Earley items in Marpa's sets are consistent.
In Section \ref{s:scan-complete},
we showed that generation 0 is complete --
it contains Earley items
corresponding to all of the generation 0 EIMT's of \Leo.
This is the basis of the level 2 induction.

Since we stipulated that \Marpa{} adds Earley items
at the end of each set,
we know that they occur in generation order.
Therefore \Marpa{},
when creating Earley items of generation $\var{n}+1$
while traversing \EVtable{\Marpa}{i},
can rely
on the level 2 induction hypothesis for
the completeness of Earley items
in generation \var{n}.

Let
$\Veim{working} \in \Ves{i}$
be the Earley item
currently being considered as a potential component for
an Earley reduction operation.
From the pseudocode, we see
that reductions are attempted for every
pair \Veim{predecessor}, \Veim{working}.
(Again, $\mymathop{transitions}()$ is ignored
as a memoization.)
By the outer induction hypothesis we know that,
for every traditional Earley item in the previous
Earley set,
there is at least one corresponding Marpa Earley item.
We see from the pseudocode, therefore,
that for each \Veim{working}
\Marpa{} performs its reduction operation on a complete set
of correct predecessors.
Therefore \Marpa{} performs its reduction operations on a
complete set of operand pairs.

Comparing the Marpa pseudocode (Section \ref{p:reduction-op})
with the Earley reduction operation (Section \ref{d:reduction})
and using
Observations~\ref{o:confirmed-AHFA-complete}
and \ref{o:predicted-AHFA-complete},
we see that an Earley reduction result of
generation $\var{n}+1$
will be added to
\EVtable{\Marpa}{i} corresponding to every Earley reduction result
in generation $\var{n}+1$
of \EVtable{\Leo}{i},
as well as one corresponding
to every prediction that results from
an Earley reduction result
of generation $\var{n}+1$ in \EVtable{\Leo}{i}.
This shows the level 2 induction
and the case of reduction completeness.

\subsubsection{Leo reduction is complete}
\label{s:leo-complete}

\begin{sloppypar}
We now show completeness for \Marpa{}'s reduction operation,
in the case where there is a Leo reduction.
In Section \ref{s:leo-consistent},
we found that where \Leo{} would create
the EIMT $[\Vdr{top}, \Vorig{top}]$,
Marpa adds
$[\Vah{top}, \Vorig{top}]$
such that $\Vdr{top} \in \Vah{top}$.
Since \Vdr{top} is a completed rule,
there are no predictions.
This shows the case immediately,
by the definition of completeness.
\end{sloppypar}

\subsubsection{Prediction is complete}
\label{s:prediction-complete}

\begin{sloppypar}
Predictions result only from items in the same Earley set.
In Sections \ref{s:scan-complete},
\ref{s:reduction-complete}
and \ref{s:leo-complete},
we showed that,
for every prediction that would result
from an item added to \EVtable{\Leo}{i},
a corresponding prediction
was added to \EVtable{\Marpa}{i}.
\end{sloppypar}

\subsubsection{Finishing the proof}
Having shown the cases in Sections
\ref{s:scan-complete},
\ref{s:reduction-complete},
\ref{s:leo-complete} and
\ref{s:prediction-complete},
we know that Earley set
\EVtable{\Marpa}{i} is complete.
In Section \ref{s:sets-consistent}
we showed that \EVtable{\Marpa}{i} is consistent.
It follows that \EVtable{\Marpa}{i} is correct,
which is the step of the outer induction.
Having shown its step, we have the outer induction,
and the theorem.
\qedsymbol

\subsection{Marpa is correct}

We are now in a position to show that Marpa is correct.
\begin{theorem}
\textup{ $\myL{\Marpa,\Cg} = \myL{\Cg}$ }
\end{theorem}

\begin{proof}
From Theorem \ref{t:table-correct},
we know that
\begin{equation*}
[\Vdr{accept},0] \in \EVtable{\Leo}{\Vsize{w}}
\end{equation*}
if and only if there is a
\begin{equation*}
[\Vah{accept},0] \in \EVtable{\Marpa}{\Vsize{w}}
\end{equation*}
such that $\Vdr{accept} \in \Vah{accept}$.
From the acceptance criteria in the \Leo{} definitions
and the \Marpa{} pseudocode,
it follows that
\begin{equation*}
\myL{\Marpa,\Cg} = \myL{\Leo,\Cg}.
\end{equation*}
By Theorem 4.1 in \cite{Leo1991}, we know that
\begin{equation*}
\myL{\Leo,\Cg} = \myL{\Cg}.
\end{equation*}
The theorem follows from
the previous two equalities.
\end{proof}

\section{Marpa recognizer complexity}
\label{s:complexity}

\subsection{Complexity of each Earley item}

For the complexity proofs,
we consider only Marpa grammars without nulling
symbols.
We showed that this rewrite
is without loss of generality
in Section \ref{s:nulling},
when we examined correctness.
For complexity we must also show that
the rewrite and its reversal can be done
in amortized \Oc{} time and space
per Earley item.

\begin{lemma}\label{l:nulling-rewrite}
All time and space required
to rewrite the grammar to eliminate nulling
symbols, and to restore those rules afterwards
in the Earley sets,
can be allocated
to the Earley items
in such a way that each Earley item
requires \Oc{} time and space.
\end{lemma}

\begin{proof}
The time and space used in the rewrite is a constant
that depends on the grammar,
and is charged to the parse.
The reversal of the rewrite can be
done in a loop over the Earley items,
which will have time and space costs
per Earley item,
plus a fixed overhead.
The fixed overhead is \Oc{}
and is charged to the parse.
The time and space per Earley item
is \Oc{}
because the number of
rules into which another rule must be rewritten,
and therefore the number of Earley items
into which another Earley item must be rewritten,
is a constant that depends
on the grammar.
\end{proof}

\begin{theorem}\label{t:O1-time-per-eim}
All time in \Marpa{} can be allocated
to the Earley items,
in such a way that each Earley item,
and each attempt to
add a duplicate Earley item,
requires \Oc{} time.
\end{theorem}

\begin{theorem}\label{t:O1-space-per-eim}
All space in \Marpa{} can be allocated
to the Earley items,
in such a way that each Earley item
requires \Oc{} space and,
if links are not considered,
each attempt to add a duplicate
Earley item adds no additional space.
\end{theorem}

\begin{theorem}\label{t:O1-links-per-eim}
If links are considered,
all space in \Marpa{} can be allocated
to the Earley items
in such a way that each Earley item
and each attempt to
add a duplicate Earley item
requires \Oc{} space.
\end{theorem}

\begin{proof}[Proof of Theorems
\ref{t:O1-time-per-eim},
\ref{t:O1-space-per-eim},
and \ref{t:O1-links-per-eim}]
These theorems follow from the observations
in Section \ref{s:pseudocode}
and from Lemma \ref{l:nulling-rewrite}.
\end{proof}

\subsection{Duplicate dotted rules}

The same complexity results apply to \Marpa{} as to \Leo,
and the proofs are very similar.
\Leo's complexity results\cite{Leo1991}
are based on charging
resources to Earley items,
as were the results
in Earley's paper\cite{Earley1970}.
But both assume that there is one dotted rule
per Earley item,
which is not the case with \Marpa.

\Marpa's Earley items group dotted rules into AHFA
states, but this is not a partitioning in the strict
sense -- dotted rules can fall into more than one AHFA
state.
This is an optimization,
in that it allows dotted rules,
if they often occur together,
to be grouped together aggressively.
But it opens up the possibility
that, in cases where \Earley{} and \Leo{} disposed
of a dotted rule once and for all,
\Marpa{} might have to deal with it multiple times.

From an efficiency perspective,
\Marpa's duplicate rules
are, by all the evidence, a plus.
And they do not change the complexity results,
although the price of showing this is the
theoretical apparatus of this section.

\begin{theorem}\label{t:marpa-O-leo}
\begin{equation*}
\textup{
    $\Rtablesize{\Marpa} < \var{c} \times \Rtablesize{\Leo}$,
}
\end{equation*}
where \var{c} is a constant that depends on the grammar.
\end{theorem}

\begin{proof}
We know from Theorem \ref{t:table-correct}
that every Marpa Earley item corresponds to one of
\Leo's traditional Earley items.
If an EIM corresponds to an EIMT,
the AHFA state of the EIM contains the
EIMT's dotted rule,
while their origins are identical.
Even in the worst case, a dotted rule cannot
appear in every AHFA state,
so that
the number of Marpa items corresponding to a single
traditional Earley item must be less
than $\size{\Cfa}$.
Therefore,
\begin{equation*}
    \Rtablesize{\Marpa} < \size{\Cfa} \times \Rtablesize{\Leo}\qedhere
\end{equation*}
\end{proof}

Earley\cite{Earley1970} shows that,
for unambiguous grammars,
every attempt to add
an Earley item will actually add one.
In other words, there will be no attempts to
add duplicate Earley items.
Earley's proof shows that for each attempt
to add a duplicate,
the causation must be different --
that the EIMT's causing the attempt
differ in either their dotted
rules or their origin.
Multiple causations for an Earley item
would mean multiple derivations
for the sentential form that it represents.
That in turn would mean that
the grammar is ambiguous,
contrary to assumption.

In \Marpa, there is a slight complication.
A dotted rule can occur in more than one AHFA
state.
Because of that,
it is possible that two of \Marpa's
operations to add an EIM
will represent identical Earley causations,
and therefore will be
consistent with an unambiguous grammar.
Dealing with this complication requires us
to prove a result that is weaker than that of \cite{Earley1970},
but that is
still sufficient to produce the same complexity results.

\begin{theorem}\label{t:tries-O-eims}
For an unambiguous grammar,
the number of attempts to add
Earley items will be less than or equal to
\begin{equation*}
\textup{
    $\var{c} \times \Rtablesize{\Marpa}$,
}
\end{equation*}
where \var{c} is a constant
that depends on the grammar.
\end{theorem}

\begin{proof}
Let \var{initial-tries} be the number of attempts to add the initial item to
the Earley sets.
For Earley set 0, it is clear from the pseudocode
that there will be no attempts to add duplicate EIM's:
\begin{equation*}
\var{initial-tries} = \bigsize{\Vtable{0}}
\end{equation*}

Let \var{leo-tries} be the number of attempted Leo reductions in
Earley set \Vloc{j}.
For Leo reduction,
we note that by its definition,
duplicate attempts at Leo reduction cannot occur.
Let \var{max-AHFA} be the maximum number of
dotted rules in any AHFA state.
From the pseudo-code of Sections \ref{p:reduce-one-lhs}
and \ref{p:leo-op},
we know there will be at most one Leo reduction for
each dotted rule in the current Earley set,
\Vloc{j}.
\begin{equation*}
\var{leo-tries} \le \var{max-AHFA} \times \bigsize{\Vtable{j}}
\end{equation*}

Let \var{scan-tries} be the number of attempted scan operations in
Earley set \Vloc{j}.
Marpa attempts a scan operation,
in the worst case,
once for every EIM in the Earley set
at $\Vloc{j} \subtract 1$.
Therefore, the number of attempts
to add scans
must be less than or equal to \bigsize{\Etable{\var{j} \subtract 1}},
the number
of actual Earley items at
$\Vloc{j} \subtract 1$.
\begin{equation*}
\var{scan-tries} \le \bigsize{\Etable{\var{j} \subtract 1}}
\end{equation*}

Let \var{predict-tries} be the number of attempted predictions in
Earley set \Vloc{j}.
\Marpa{} includes prediction
in its scan and reduction operations,
and the number of attempts to add duplicate predicted EIM's
must be less than or equal
to the number of attempts
to add duplicate confirmed EIM's
in the scan and reduction operations.
\begin{equation*}
\var{predict-tries} \le \var{reduction-tries} + \var{scan-tries}
\end{equation*}

The final and most complicated case is Earley reduction.
Recall that \Ves{j} is the current Earley set.
Consider the number of reductions attempted.
\Marpa{} attempts to add an Earley reduction result
once for every triple
\begin{equation*}
[\Veim{predecessor}, \Vsym{transition}, \Veim{component}].
\end{equation*}
where
\begin{equation*}
\begin{split}
& \Veim{component} = [ \Vah{component}, \Vloc{component-origin} ]  \\
\land \quad & \Vdr{component} \in \Vah{component} \\
 \land \quad & \Vsym{transition} = \LHS{\Vdr{component}}. \\
\end{split}
\end{equation*}

We now put an upper bound on the number of possible values of this triple.
The number of possibilities for \Vsym{transition} is clearly at most
\size{\var{symbols}},
the number of symbols in \Cg{}.
We have $\Veim{component} \in \Ves{j}$,
and therefore there are at most 
$\bigsize{\Ves{j}}$ choices for \Veim{component}.

\begin{sloppypar}
We can show that the number of possible choices of 
\Veim{predecessor} is at most
the number of AHFA states, \Vsize{fa}, by a reductio.
Suppose, for the reductio,
there were more than \Vsize{fa} possible choices of \Veim{predecessor}.
Then there are two possible choices of \Veim{predecessor} with
the same AHFA state.
Call these \Veim{choice1} and \Veim{choice2}.
We know, by the definition of Earley reduction, that
$\Veim{predecessor} \in \Ves{component-origin}$,
and therefore we have
$\Veim{choice1} \in \Ves{component-origin}$ and
$\Veim{choice2} \in \Ves{component-origin}$.
Since all EIM's in an Earley set must differ,
and
\Veim{choice1} and \Veim{choice2} both have the same
AHFA state,
they must differ in their origin.
But two different origins would produce two different derivations for the
reduction, which would mean that the parse was ambiguous.
This is contrary to the assumption for the theorem
that the grammar is unambiguous.
This shows the reductio
and that the number of choices for \Veim{predecessor},
compatible with \Vorig{component}, is at most \Vsize{fa}.
\end{sloppypar}

\begin{sloppypar}
Collecting the results we see that the possibilities for 
each \Veim{component} are
\begin{equation*}
\begin{alignedat}{2}
& \Vsize{fa} &&
\qquad \text{choices of \Veim{predecessor}} \\
\times \; & \Vsize{symbols} &&
\qquad \text{choices of \Vsym{transition}} \\
\times \; & \size{\Ves{j}} &&
\qquad \text{choices of \Veim{component}} \\
\end{alignedat}
\end{equation*}
\end{sloppypar}

The number of reduction attempts will therefore be at most
\begin{equation*}
\var{reduction-tries} \leq \Vsize{fa} \times \Vsize{symbols} \times \bigsize{\Ves{j}}.
\end{equation*}

Summing
\begin{multline*}
\var{tries} =
\var{scan-tries} +
\var{leo-tries} + \\
\var{predict-tries} +
\var{reduction-tries} +
\var{initial-tries},
\end{multline*}
we have,
where $\var{n} = \Vsize{\Cw}$,
the size of the input,
\begin{equation*}
\begin{alignedat}{2}
& \bigsize{\Vtable{0}} & \quad &
\qquad \text{initial EIM's} \\
+ \; & \sum\limits_{\Vloc{j}=0}^{n}{
\var{max-AHFA} \times \bigsize{\Vtable{j}}
} &&
\qquad \text{LIM's} \\
+ \; & 2 \times \sum\limits_{\Vloc{j}=1}^{n}{
\bigsize{\Etable{\var{j} \subtract 1}}
} &&
\qquad \text{scanned EIM's} \\
+ \; & 2 \times \sum\limits_{\Vloc{j}=0}^{n}{\Vsize{fa} \times \Vsize{symbols} \times \bigsize{\Ves{j}}} &&
\qquad \text{reduction EIM's}.
\end{alignedat}
\end{equation*}
In this summation,
\var{predict-tries} was accounted for by counting the scanned and reduction
EIM attempts twice.
Since \var{max-AHFA}, \Vsize{fa} and \Vsize{symbols} are all constants
that depend only on \Cg{},
if we collect the terms of the summation,
we will find a constant \var{c}
such that
\begin{equation*}
\var{tries} \leq \var{c} \times \sum\limits_{\Vloc{j}=0}^{n}{\bigsize{\Vtable{j}}},
\end{equation*}
and
\begin{equation*}
\var{tries} \leq \var{c} \times \Rtablesize{\Marpa},
\end{equation*}
where \var{c} is a constant that depends on \Cg{}.\qedhere
\end{proof}

As a reminder,
we follow tradition by
stating complexity results in terms of \var{n},
setting $\var{n} = \Vsize{\Cw}$,
the length of the input.

\begin{theorem}\label{t:eim-count}
For a context-free grammar,
\begin{equation*}
\textup{
    $\Rtablesize{\Marpa} = \order{\var{n}^2}$.
}
\end{equation*}
\end{theorem}

\begin{proof}
By Theorem \ref{t:es-count},
the size of the Earley set at \Vloc{i}
is $\order{\var{i}}$.
Summing over the length of the input,
$\Vsize{\Cw} = \var{n}$,
the number of EIM's in all of \Marpa's Earley sets
is
\begin{equation*}
\sum\limits_{\Vloc{i}=0}^{\var{n}}{\order{\var{i}}}
= \order{\var{n}^2}.\qedhere
\end{equation*}
\end{proof}

\begin{theorem}\label{t:ambiguous-tries}
For a context-free grammar,
the number of attempts to add
Earley items is $\order{\var{n}^3}$.
\end{theorem}

\begin{proof}
Reexamining the proof of Theorem \ref{t:tries-O-eims},
we see that the only bound that required
the assumption that \Cg{} was unambiguous
was \var{reduction-tries},
the count of the number of attempts to
add Earley reductions.
Let \var{other-tries}
be attempts to add EIM's other than
as the result of Earley reductions.
By Theorem \ref{t:eim-count},
\begin{equation*}
\Rtablesize{\Marpa} = \order{\var{n}^2},
\end{equation*}
and by Theorem \ref{t:tries-O-eims},
\begin{equation*}
\var{other-tries} \le \var{c} \times \Rtablesize{\Marpa},
\end{equation*}
so that
$\var{other-tries} = \order{\var{n}^2}$.

\begin{sloppypar}
Looking again at \var{reduction-tries}
for the case of ambiguous grammars,
we need to look again at the triple
\begin{equation*}
[\Veim{predecessor}, \Vsym{transition}, \Veim{component}].
\end{equation*}
We did not use the fact that the grammar was unambiguous in counting
the possibilities for \Vsym{transition} or \Veim{component}, but
we did make use of it in determining the count of possibilities
for \Veim{predecessor}.
We still know that
\begin{equation*}
\Veim{predecessor} \in \Ves{component-origin},
\end{equation*}
where 
\Vloc{component-origin} is the origin of \Veim{component}.
Worst case, every EIM in \Ves{component-origin} is a possible
match, so that
the number of possibilities for \Veim{predecessor} now grows to
\size{\Ves{component-origin}}, and
\begin{equation*}
\var{reduction-tries} =
\bigsize{\Ves{component-origin}} \times \Vsize{symbols} \times \bigsize{\Ves{j}}.
\end{equation*}
\end{sloppypar}

In the worst case $\var{component-origin} \simeq \var{j}$,
so that by Theorem \ref{t:es-count},
\begin{equation*}
\size{\Ves{component-origin}} \times \size{\Ves{j}} = \order{\var{j}^2}.
\end{equation*}
Adding \var{other-tries}
and summing over the Earley sets,
we have
\begin{equation*}
\order{\var{n}^2} +
\! \sum\limits_{\Vloc{j}=0}^{n}{\order{\var{j}^2}} = \order{\var{n}^3}.
\qedhere
\end{equation*}
\end{proof}

\begin{theorem}\label{t:leo-right-recursion}
Either
a right derivation has a step
that uses a right recursive rule,
or its length is at most \var{c},
where \var{c} is a constant which depends
on the grammar.
\end{theorem}

\begin{proof}
Let the constant \var{c} be the number
of symbols.
Assume, for a reductio, that a right derivation
expands to a 
Leo sequence of length
$\var{c}+1$, but that none of its steps uses a right recursive rule.

Because it is of length $\var{c}+1$,
the same symbol must appear twice as the rightmost symbol of
a derivation step.
(Since for the purposes of these
complexity results we ignore nulling symbols,
the rightmost symbol of a string will also be its rightmost
non-nulling symbol.)
So part of the rightmost derivation must take the form
\begin{equation*}
\Vstr{earlier-prefix} \cat \Vsym{A} \deplus \Vstr{later-prefix} \cat \Vsym{A}.
\end{equation*}
But the first step of this derivation sequence must use a rule of the
form
\begin{equation*}
\Vsym{A} \de \Vstr{rhs-prefix} \cat \Vsym{rightmost},
\end{equation*}
where $\Vsym{rightmost} \deplus \Vsym{A}$.
Such a rule is right recursive by definition.
This is contrary to the assumption for the reductio.
We therefore conclude that the length of a right derivation
must be less than or equal to \var{c},
unless at least one step of that derivation uses a right recursive rule.
\end{proof}

\subsection{The complexity results}
We are now in a position to show
specific time and space complexity results.

\begin{theorem}
For every LR-regular grammar,
\Marpa{} runs in $\On{}$ time and space.
\end{theorem}

\begin{proof}
By Theorem 4.6 in \cite[p. 173]{Leo1991},
the number of traditional Earley items produced by
\Leo{} when parsing input \Cw{} with an LR-regular grammar \Cg{} is
\begin{equation*}
\order{\Vsize{\Cw}} = \order{\var{n}}.
\end{equation*}
\Marpa{} may produce more Earley items than \Leo{}
for two reasons:
First, \Marpa{} does not apply Leo memoization to Leo sequences
which do not contain right recursion.
Second, \Marpa{}'s Earley items group dotted rules into
states and this has the potential to increase the number
of Earley items.

By Theorem \ref{t:leo-singleton},
the definition of an EIMT,
and the construction of a Leo sequence,
it can be seen that a Leo sequence
corresponds step-for-step with a
right derivation.
It can therefore be seen that 
the number of EIMT's in the Leo sequence
and the number of right derivation steps
in its corresponding right derivation
will be the same.

Consider one EIMT that is memoized in \Leo{}.
By Theorem \ref{t:leo-singleton} it corresponds to
a single dotted rule, and therefore a single rule.
If not memoized because it is not a right recursion,
this EIMT will be expanded to a sequence
of EIMT's.
How long will this sequence of non-memoized EIMT's
be, if we still continue to memoize EIMT's
which correspond to right recursive rules?
The EIMT sequence, which was formerly a memoized Leo sequence,
will correspond to a right
derivation that does not include
any steps that use right recursive rules.
By Theorem \ref{t:leo-right-recursion},
such a 
right derivation can be
of length at most \var{c1},
where \var{c1} is a constant that depends on \Cg{}.
As noted, this right derivation has
the same length as its corresponding EIMT sequence,
so that each EIMT not memoized in \Marpa{} will expand
to at most \var{c1} EIMT's.

By Theorem \ref{t:marpa-O-leo},
when EIMT's are replaced with EIM's,
the number of EIM's \Marpa{} requires is, at worst,
$\var{c2}$ times the number of EIMT's,
where \var{c2} is a constant that depends on \Cg{}.
Therefore the number of EIM's per Earley set
for an LR-regular grammar in a \Marpa{} parse
is less than
\begin{equation*}
    \var{c1} \times \var{c2} \times \order{\var{n}} = \order{\var{n}}.
\end{equation*}

LR-regular grammars are unambiguous, so that
by Theorem \ref{t:tries-O-eims},
the number of attempts that \Marpa{} will make to add
EIM's is less than or equal to
\var{c3} times the number of EIM's,
where \var{c3} is a constant that depends on \Cg{}.
Therefore,
by Theorems \ref{t:O1-time-per-eim}
and \ref{t:O1-links-per-eim},
the time and space complexity of \Marpa{} for LR-regular
grammars is
\begin{equation*}
    \var{c3} \times \order{\var{n}}
    = \order{\var{n}}.\qedhere
\end{equation*}
\end{proof}

\begin{theorem}
For every unambiguous grammar,
\Marpa{} runs in $\order{\var{n}^2}$ time and space.
\end{theorem}

\begin{proof}
By assumption, \Cg{} is unambiguous, so that
by Theorem \ref{t:tries-O-eims},
and Theorem \ref{t:eim-count},
the number of attempts that \Marpa{} will make to add
EIM's is
\begin{equation*}
\var{c} \times \order{\var{n}^2},
\end{equation*}
where \var{c} is a constant that depends on \Cg{}.
Therefore,
by Theorems \ref{t:O1-time-per-eim}
and \ref{t:O1-links-per-eim},
the time and space complexity of \Marpa{}
for unambiguous grammars is \order{\var{n}^2}.
\end{proof}

\begin{theorem}
For every context-free grammar,
\Marpa{} runs in $\order{\var{n}^3}$ time.
\end{theorem}

\begin{proof}
By Theorem \ref{t:O1-time-per-eim},
and Theorem \ref{t:ambiguous-tries}.
\end{proof}

\begin{theorem}\label{t:cfg-space}
For every context-free grammar,
\Marpa{} runs in $\order{\var{n}^2}$ space,
if it does not track links.
\end{theorem}

\begin{proof}
By Theorem \ref{t:O1-space-per-eim}
and Theorem \ref{t:eim-count}.
\end{proof}

Traditionally, the only space result stated for a parsing algorithm
is that
without links, as in Theorem \ref{t:cfg-space}.
This is sufficiently relevant
if the parser is only used as a recognizer.
In practice, however, 
algorithms like \Marpa{}
are typically used in anticipation
of an evaluation phase,
for which links are necessary.

\begin{theorem}
For every context-free grammar,
\Marpa{} runs in $\order{\var{n}^3}$ space,
including the space for tracking links.
\end{theorem}

\begin{proof}
By Theorem \ref{t:O1-links-per-eim},
and Theorem \ref{t:ambiguous-tries}.
\end{proof}

\section{The Marpa Input Model}
\label{s:input}

In this \doc{},
up to this point,
the traditional input stream model
has been assumed.
As implemented,
Marpa generalizes the idea of
input streams beyond the traditional
model.

Marpa's generalized input model
replaces the input \Cw{}
with a set of tokens,
\var{tokens},
whose elements are triples of symbol,
start location and length:
\begin{equation*}
    [\Vsym{t}, \Vloc{start}, \var{length}]
\end{equation*}
such that
$\var{length} \ge 1$
and
$\Vloc{start} \ge 0$.
The size of the input, \size{\Cw},
is the maximum over
\var{tokens} of $\Vloc{start}+\var{length}$.

Multiple tokens can start at a single location.
(This is how \Marpa{} supports ambiguous tokens.)
The variable-length,
ambiguous and overlapping tokens
of \Marpa{}
bend the conceptual framework of ``parse location''
beyond its breaking point,
and a new term for parse location is needed.
Start and end of tokens are described in terms
of \dfn{earleme} locations,
or simply \dfn{earlemes}.
Token length is also measured in earlemes.

Like standard parse locations, earlemes start at 0,
and run up to \size{\Cw}.
Unlike standard parse locations,
there is not necessarily a token ``at'' any particular earleme.
(A token is considered to be ``at an earleme'' if it ends there,
so that there is never a token ``at'' earleme 0.)
In fact,
there may be earlemes at which no token either starts or ends,
although for the parse to succeed, such an earleme would have to be
properly inside at least one token.
Here ``properly inside'' means after the token's start earleme
and before the token's end earleme.

In the Marpa input stream, tokens
may interweave and overlap freely,
but gaps are not allowed.
That is, for all \Vloc{i} such
that $0 \le \Vloc{i} < \size{\Cw}$,
there must exist
\begin{equation*}
	 \var{token} = [\Vsym{t}, \Vloc{start}, \var{length}]
\end{equation*}
such that
\begin{gather*}
	 \var{token} \in \var{tokens} \quad \text{and} \\
	 \Vloc{start} \le \Vloc{i} < \Vloc{start}+\var{length}.
\end{gather*}

The intent of Marpa's generalized input model is to allow
users to define alternative input models for special
applications.
An example that arises in current practice is natural
language, features of which are most
naturally expressed with ambiguous tokens.
The traditional input stream can be seen as the special case of
the Marpa input model where
for all \Vsym{x}, \Vsym{y}, \Vloc{x}, \Vloc{y},
\var{xlength}, \var{ylength},
if we have both of
\begin{align*}
    [\Vsym{x}, \Vloc{x}, \var{xlength}] & \in \var{tokens} \quad \text{and} \\
    [\Vsym{y}, \Vloc{y}, \var{ylength}] & \in \var{tokens},
\end{align*}
then we have both of
\begin{gather*}
\var{xlength} = \var{ylength} = 1 \quad \text{and} \\
     \Vloc{x} = \Vloc{y} \implies \Vsym{x} = \Vsym{y}.
\end{gather*}

The correctness results hold for Marpa input streams,
but to preserve the time complexity bounds,
restrictions must be imposed.
In stating them,
let it be understood that
\begin{equation*}
	\token{[ \Vsym{x}, \Vloc{x}, \var{length} ]} \in \var{tokens}.
\end{equation*}
We require,
for some constant \var{c},
possibly dependent on the grammar \Cg{},
that every token length be less than \var{c},
\begin{equation}
\label{e:restriction1}
\forall \, \token{[\Vsym{x}, \Vloc{x}, \var{length}]},
\; \var{length} < \var{c},
\end{equation}
and that
the cardinality of the set of tokens starting at any
one location
be less than \var{c},
\begin{equation}
\label{e:restriction2}
 \forall \Vloc{i}, \;
 \Bigl|
 \bigl \lbrace
	\token{[ \Vsym{x}, \Vloc{x}, \var{length} ]} \bigm|
	\Vloc{x} = \Vloc{i}
  \bigr \rbrace
  \Bigr| < \var{c}.
\end{equation}
Restrictions~\eqref{e:restriction1}
and~\eqref{e:restriction2}
impose little or no obstacle
to the practical use
of Marpa's generalized input model.
And with them,
the complexity results for \Marpa{} stand.

\bibliographystyle{plain}

\begin{thebibliography}{10}

\bibitem{AU1972}
Alfred H.~Aho and Jeffrey D.~Ullman.
\newblock {\em The Theory of Parsing, Translation, and Compiling}.
\newblock Prentice-Hall, Englewood Cliffs, N.J., 1972.

\bibitem{AH2002}
John~Aycock and R.~Nigel~Horspool.
\newblock Practical Earley Parsing.
\newblock {\em The Computer Journal},
    Vol.~45, No.~6, 2002, pp.~620--630.

\bibitem{Earley1970}
J.~Earley.
\newblock An efficient context-free parsing algorithm.
\newblock {\em Communications of the Association for Computing Machinery},
  13(2):94--102, 1970.

\bibitem{GJ2008}
Dirk~Grune and Ceriel~J.~H. Jacobs.
\newblock {\em Parsing Techniques: A Practical Guide}.
\newblock Springer, Amsterdam, 2008.

\bibitem{Irons}
Edgar~T.~Irons.
\newblock A syntax-directed compiler for ALGOL 60.
\newblock {\em Communications of the Association for Computing Machinery},
 4(1):51--55, Jan.\ 1961.

\bibitem{Johnson}
Stephen~C. Johnson.
\newblock Yacc: Yet another compiler-compiler.
\newblock In {\em Unix Programmer's Manual Supplementary Documents 1}. 1986.

\bibitem{Marpa-HTML}
Jeffrey~Kegler, 2011: Marpa-HTML.
\newblock \url{http://search.cpan.org/dist/Marpa-HTML/}.

\bibitem{Marpa-R2}
Jeffrey~Kegler, 2012: Marpa-R2.
\newblock \url{http://search.cpan.org/dist/Marpa-R2/}.

\bibitem{Marpa-XS}
Jeffrey~Kegler, 2011: Marpa-XS-1.002000.
\newblock \url{http://search.cpan.org/dist/Marpa-XS/}.

\bibitem{Leo1991}
J.~M. I.~M. Leo.
\newblock A general context-free parsing algorithm running in linear time on
  every {LR($k$)} grammar without using lookahead.
\newblock {\em Theoretical Computer Science}, 82:165--176, 1991.

\end{thebibliography}

\clearpage
\tableofcontents

\end{document}