% Copyright 2014 Jeffrey Kegler
% This document is licensed under
% a Creative Commons Attribution-NoDerivs 3.0 United States License.
\documentclass[12pt]{amsart}
\usepackage{amssymb}
\usepackage{algorithm, algpseudocode}
\usepackage{url}
\usepackage{ifthen}% for \ifthenelse, used in \call below
\usepackage{amsfonts}% to get the \mathbb alphabet
\makeindex
\newcommand{\field}[1]{\mathbb{#1}}
\newcommand{\TP}{\field{TP}}
% This is now a "paper", but may be a chapter
% or something else someday
% This command might make any such change easier.
\newcommand{\doc}{paper}
% TODO -- these comments mark stuff to be worked on
\newcommand{\todo}[1]{\par{\large\bf Todo: #1}\par}
\newcommand{\mymathop}[1]{\mathop{\texttt{#1}}}
% For a type name, when it occurs in text
\newcommand{\type}[1]{\ensuremath{#1}}
\newcommand{\defined}{\underset{\text{def}}{\equiv}}
\newcommand{\dfn}[1]{{\bf #1}}
\newcommand{\sep}{\,\mid\,}
\newcommand{\mydot}{\raisebox{.05em}{$\,\bullet\,$}}
\newcommand{\cat}{\,.\,}
\newcommand{\size}[1]{\ensuremath{\left | {#1} \right |}}
\newcommand{\bigsize}[1]{\ensuremath{\bigl| {#1} \bigr|}}
\newcommand{\order}[1]{\ensuremath{{\mathcal O}(#1)}}
\newcommand{\Oc}{\order{1}}
\newcommand{\On}{\order{\var{n}}}
% I use hyphens in variable names,
% so I need to ensure that subtraction is
% clearly distinguished by the typography
\newcommand{\subtract}{\,-\,}
\newcommand{\var}[1]{\ensuremath{\texttt{#1}}}
\newcommand{\de}{\rightarrow}
\newcommand{\derives}{\Rightarrow}
\newcommand{\destar}
{\mathrel{\mbox{$\:\stackrel{\!{\ast}}{\Rightarrow\!}\:$}}}
\newcommand{\deplus}
{\mathrel{\mbox{$\:\stackrel{\!{+}}{\Rightarrow\!}\:$}}}
\newcommand{\deR}
{\mathrel{\mbox{$\:\stackrel{\!{R}}{\Rightarrow\!}\:$}}}
\newcommand{\deRstar}
{\mathrel{\mbox{$\:\stackrel{\!{R\ast}}{\Rightarrow\!}\:$}}}
\newcommand{\deRplus}
{\mathrel{\mbox{$\:\stackrel{\!{R+}}{\Rightarrow\!}\:$}}}
\newcommand{\derivg}[1]{\mathrel{\mbox{$\:\Rightarrow\:$}}}
\newcommand{\derivrg}[2]{\mathrel{\mbox{$\:\stackrel{\!{#1}}%
{\Rightarrow\!}\:$}}}
\newcommand{\set}[1]{{\left\lbrace #1 \right\rbrace} }
\newcommand{\bigset}[1]{{\bigl\lbrace #1 \bigr\rbrace} }
\newcommand{\Bigset}[1]{{\Bigl\lbrace #1 \Bigr\rbrace} }
\newcommand{\typeSubscript}[1]{
\mbox{\scriptsize\normalfont\textsc{#1}}
}
\newcommand{\typed}[2]{\var{#1}_{\typeSubscript{#2}}}
\newcommand{\setTyped}[2]{\var{#1}_{\set{\typeSubscript{#2}}}}
\newcommand{\typedPrime}[3]{\ensuremath{\var{#1}^{#2}_{\typeSubscript{#3}}}}
\newcommand{\dr}[1]{\typed{#1}{dr}}
\newcommand{\Vdr}[1]{\ensuremath{\dr{#1}}}
\newcommand{\eim}[1]{\typed{#1}{eim}}
\newcommand{\Veim}[1]{\ensuremath{\eim{#1}}}
\newcommand{\es}[1]{\typed{#1}{es}}
\newcommand{\Ves}[1]{\ensuremath{\es{#1}}}
\newcommand{\mylim}[1]{\typed{#1}{lim}}
\newcommand{\Vlim}[1]{\ensuremath{\mylim{#1}}}
\newcommand{\loc}[1]{\typed{#1}{loc}}
\newcommand{\Vloc}[1]{\ensuremath{\loc{#1}}}
\newcommand{\myrule}[1]{\typed{#1}{rule}}
\newcommand{\Vrule}[1]{\ensuremath{\myrule{#1}}}
\newcommand{\mysf}[1]{\typed{#1}{sf}}
\newcommand{\Vsf}[1]{\ensuremath{\mysf{#1}}}
\newcommand{\str}[1]{\typed{#1}{str}}
\newcommand{\Vstr}[1]{\ensuremath{\str{#1}}}
\newcommand{\sym}[1]{\typed{#1}{sym}}
\newcommand{\Vsym}[1]{\ensuremath{\sym{#1}}}
\newcommand{\ts}[1]{\typed{#1}{ts}}
\newcommand{\Vts}[1]{\ensuremath{\ts{#1}}}
\newcommand{\orig}[1]{\typed{#1}{orig}}
\newcommand{\Vorig}[1]{\ensuremath{\orig{#1}}}
\newcommand{\token}[1]{\typed{#1}{token}}
\newcommand{\Vtoken}[1]{\ensuremath{\token{#1}}}
\newcommand{\tp}[1]{\typed{#1}{tp}}
\newcommand{\Vtp}[1]{\ensuremath{\tp{#1}}}
\newcommand{\Vtpm}[1]{\ensuremath{\overrightarrow{\var{#1}}}}
\newcommand{\Vtpoffset}[2]{\ensuremath{
\xrightarrow[\mbox{\normalsize\texttt{#1}}]
{#2}
} }
\newcommand{\ws}[1]{\typed{#1}{ws}}
\newcommand{\Vws}[1]{\ensuremath{\ws{#1}}}
\newcommand{\eimset}[1]{\setTyped{#1}{eim}}
\newcommand{\Veimset}[1]{\ensuremath{\eimset{#1}}}
\newcommand{\ruleset}[1]{\setTyped{#1}{rule}}
\newcommand{\Vruleset}[1]{\ensuremath{\ruleset{#1}}}
\newcommand{\symset}[1]{\setTyped{#1}{sym}}
\newcommand{\Vsymset}[1]{\ensuremath{\symset{#1}}}
\newcommand{\Vsize}[1]{\ensuremath{\size{\var{#1}}}}
\newcommand{\alg}[1]{\ensuremath{\textsc{#1}}}
\newcommand{\Earley}{\ensuremath{\alg{Earley}}}
\newcommand{\Leo}{\ensuremath{\alg{Leo}}}
\newcommand{\Marpa}{\ensuremath{\alg{Marpa}}}
\newcommand{\Cdr}{\var{dr}}
\newcommand{\Cg}{\var{g}}
\newcommand{\Crules}{\var{P}}
\newcommand{\Cw}{\var{w}}
\newcommand{\Crange}[3]{\ensuremath{#1[#2 \dots #3]}}
\newcommand{\Cwrange}[2]{\Crange{\Cw}{#1}{#2}}
\newcommand{\CVstr}[2]{\ensuremath{{#1[#2]}}}
\newcommand{\CVwstr}[1]{\CVstr{w}{#1}}
\newcommand{\CVw}[1]{\ensuremath{\sym{\Cw[\var{#1}]}}}
\newcommand{\GOTO}{\mymathop{GOTO}}
\newcommand{\Next}[1]{\mymathop{Next}(#1)}
\newcommand{\Predict}[1]{\mymathop{Predict}(#1)}
\newcommand{\Postdot}[1]{\mymathop{Postdot}(#1)}
\newcommand{\Penult}[1]{\mymathop{Penult}(#1)}
\newcommand{\LHS}[1]{\mymathop{LHS}(#1)}
\newcommand{\RHS}[1]{\mymathop{RHS}(#1)}
\newcommand{\ID}[1]{\mymathop{ID}(#1)}
\newcommand{\PSL}[2]{\mymathop{PSL}[#1][#2]}
\newcommand{\myL}[1]{\mymathop{Lang}(#1)}
\newcommand\Etable[1]{\ensuremath{\mymathop{table}[#1]}}
\newcommand\bigEtable[1]{\ensuremath{\mymathop{table}\bigl[#1\bigr]}}
\newcommand\Rtable[1]{\ensuremath{\mymathop{table}[#1]}}
\newcommand\Rtablesize[1]{\ensuremath{\bigl| \mymathop{table}[#1] \bigr|}}
\newcommand\Vtable[1]{\Etable{\var{#1}}}
\newcommand\EEtable[2]{\ensuremath{\mymathop{table}[#1,#2]}}
\newcommand\EVtable[2]{\EEtable{#1}{\var{#2}}}
% I want to use 'call' outside of pseudocode
\newcommand\call[2]{\textproc{#1}\ifthenelse{\equal{#2}{}}{}{(#2)}}%
\newcommand{\myendTheorem}{\ensuremath{\blacklozenge}}
% I don't like to put whole paragraphs in italics,
% so I make this simple variation on the "plain" theoremstyle
\newtheoremstyle{myplain}
{10pt} % ABOVESPACE
{10pt} % BELOWSPACE
{\normalfont} % BODYFONT
{0pt} % INDENT (empty value is the same as 0pt)
{\bfseries} % HEADFONT
{.} % HEADPUNCT
{5pt plus 1pt minus 1pt} % HEADSPACE
{} % CUSTOM-HEAD-SPEC
\theoremstyle{myplain}
\newtheorem{theorem}{Theorem}[section]
\newenvironment{mytheorem}{
\begin{theorem}
}{
\myendTheorem
\end{theorem}
}
\newtheorem{construction}[theorem]{Construction}
\newtheorem{lemma}[theorem]{Lemma}
\index{Lemmas|see{Theorems}}
\newtheorem{definition}[theorem]{Definition}
\newenvironment{mydefinition}{
\begin{definition}
}{
\myendTheorem
\end{definition}
}
\newtheorem{observation}[theorem]{Observation}
\newenvironment{myobservation}{
\begin{observation}
}{
\myendTheorem
\end{observation}
}
\newtheorem{step}[theorem]{Step}
% \theoremstyle{remark}
% \newtheorem*{remark}{Remark}
\hyphenation{oper-and oper-ands}
\hyphenation{look-ahead}
\hyphenation{memo-ization}
\begin{document}
\date{\today}
\title{Marpa, a practical general parser: the recognizer}
\author{Jeffrey Kegler}
\thanks{%
Copyright \copyright\ 2014 Jeffrey Kegler.
}
\thanks{%
This document is licensed under
a Creative Commons Attribution-NoDerivs 3.0 United States License.
}
\begin{abstract}
The recognizer of the
Marpa parser is described.\linebreak
Marpa is a practical and fully implemented
system for the recognition, parsing
and evaluation of context-free grammars.
The Marpa recognizer is based on Jay Earley's algorithm,
as modified by Joop Leo.
Leo's algorithm, and therefore Marpa,
is shown to have \On{} time complexity
for any finite union of LR-regular grammars (LRR).
Individual LRR grammars are unambiguous, but the class of their
finite unions includes many ambiguous grammars.
The Marpa parsing strategy is primarily syntax-driven
but, unlike previous Earley-based parse engines,
the Marpa parse engine
will allow the parse to be interrupted
for application-specific logic.
Events can be scheduled
to trigger when symbols
are recognized or predicted,
pausing the parse at those points.
Using nulled symbols, the application can in effect
schedule events that trigger at any point in a rule.
While a parse is paused,
an application has available to it full information
on the state of the parse.
Marpa can also
pause the parse on rejection of a token as unacceptable.
This can be very useful,
since rejection is based on
exact knowledge of those tokens which allow the parse to continue,
and Marpa allows parsing to be retried with another token.
\end{abstract}
\maketitle
\section{Introduction}
The first parsers to be described were left parsers\cite{Irons}.
They were flexible, but efficient only for a confining
subset of the context-free grammars.
During the 1960's the effort to replace left parsing
pitted Knuth's LR-parsing against
Earley's parsing algorithm.
The discovery of LALR parsing, which adapted LR parsing to the severe hardware constraints
of the 1970's, made LR parsing the victor.
For the next three decades,
LALR parsing was respected
by practitioners as the state-of-the-art,
and the theory of LR parsing dominated
the textbooks.
But over the past decade,
practitioners concluded that the modest gains
in power that LALR offers
over left parsing were not adequate
compensation for LALR's problems with
error detection, recovery and reporting
and its life-cycle maintenance costs.
LALR is being abandoned
in favor of the left parsing of the 1960's.
At this writing,
this reversion to the technology of over half
a century ago is the major trend in parsing.
It took five decades for
the practice of parsing to make
the round trip
from left parsing to LR, then back again.
These were not decades during which
the underlying
computer technology remained unchanged.
For most of today's computer programmers,
the hardware constraints that led to the verdict
against Earley's are not even a memory.
In the meantime,
the theory of Earley parsing has not stood still.
A major improvement to Earley's was discovered by
Joop Leo in 1991.
Leo's algorithm, without resorting to lookahead,
runs in \On{} time for a vast class of grammars,
including the LR-regular grammars and their finite unions.
Despite this,
neither Earley's original algorithm,
nor Leo's improvement to it,
has previously been incorporated into a highly available tool,
while several such tools exist for LALR\cite{Johnson} and
tools for regular expressions are numerous and widespread.
The Marpa project was intended to remedy this situation.
The first stable version\cite{Marpa-XS} was uploaded to the CPAN Perl archive
on Solstice Day in 2011.
This \doc{} describes Marpa::R2\cite{Marpa-R2},
the current stable version.
The next section of this paper,
Section \ref{s:capabilities},
summarizes the advantages
of Marpa, prior to immersion into the details of
implementation and its supporting proofs.
Section
\ref{s:notation} describes
the basic notation and conventions
of this \doc.
Section \ref{s:rewrite} deals with the rewrites Marpa
uses to put its grammars into Marpa internal form.
Sections \ref{s:earley},
\ref{s:earley-approach},
and \ref{s:earley-ops}
describe Earley's algorithm.
Sections
\ref{s:leo-memoization}
and \ref{s:leo} describe Leo's modification
to Earley's algorithm.
Section \ref{s:pseudocode} presents the pseudocode
for Marpa's recognizer.
Section
\ref{s:correct}
contains a proof of Marpa's correctness,
while Section \ref{s:complexity} contains
its complexity results.
Finally,
Section \ref{s:input}
generalizes Marpa's input model.
\section{A summary of Marpa's capabilities}
\label{s:capabilities}
\subsection{Time and space complexity}
As implemented,
Marpa parses,
without exception,
all context-free grammars.
Time bounds are the best of Leo\cite{Leo1991}
and Earley\cite{Earley1970}.
The bound which Leo claimed,
\On{} for LR-regular (LRR) grammars,
is especially relevant to
Marpa's goal of being a practical parser:
if a grammar is in a class of grammars currently in practical use,
Marpa parses it in linear time.
In this \doc{}
we will show that Marpa runs in \On{} space
and time for any grammar that is
the union of a finite number of LRR grammars.
While all LRR grammars are unambiguous,
the class of their finite unions includes
ambiguous grammars.
Ambiguous grammars of practical interest
are especially likely to be a finite union of LRR grammars.
\subsection{Error detection and recovery}
Marpa breaks new ground with respect to error-detection.
Marpa has the immediate error detection property,
but goes well beyond that:
it is fully aware of the state of the parse,
and can report this to the user while tokens are
being scanned.
Marpa allows the lexer to check its list
of acceptable tokens before a token is scanned.
Because rejection of tokens is easily and
efficiently recoverable,
the lexer is also free to take an event-driven
approach.
Rejection and recovery are quite efficient,
and no longer
need to be seen as acts of desperation,
to be used for error detection only.
Token rejection can be used as
the basis for parsing techniques.
\subsection{Pausing and resuming parsing}
Recursive descent
is the dominant practical parsing technique
as of this writing.
Recursive descent
forces the programmer to specify the parsing
logic step by step,
but this is a necessity that practitioners have come to see
as an advantage.
Recursive descent makes it easy for the
programmer to insert
application-specific logic at
any point in the parse.
What may prove anything but easy
is maintaining
all this case-specific coding once
it is widely-dispersed throughout and intertwined
with the logic that addresses more general
structure.
Here Marpa allows the programmer the best of both worlds.
Marpa's approach to parsing is primarily syntax-driven.
But the syntax-driven parsing can be interrupted at any point
in favor of application-specific logic.
In the Marpa context, application-specific logic has
advantages that it does not have in a recursive descent
framework.
Code called in a Marpa framework can query Marpa about the
state of the parse.
From Marpa, an application can find out
which rules have been predicted, completed or partially recognized,
and at which locations in the input.
Marpa can also determine which tokens are currently acceptable.
Marpa's idea of ``acceptability'' is very precise.
Marpa will report a token as acceptable if and only if
for some possible continuation of the input,
the parse could succeed.
The application is free to use this information to
guide the creation and reading of tokens.
To help the application to decide efficiently when
to switch from syntax-driven parsing
to application-specific logic,
Marpa offers an ``event'' mechanism.
Events can be scheduled to trigger
when a non-nulled symbol is predicted;
when a non-nulled symbol is recognized;
or when a nulled symbol occurs.
Nulled symbols,
whose occurrence is simultaneously a prediction and a recognition,
are especially useful for events.
A nulling symbol can be placed anywhere, which means
that events can be set up to trigger
at any position inside a Marpa rule.
\subsection{Ruby Slippers parsing}
If a token is rejected,
the lexer is free to create a new token
in the light of the parser's expectations.
This approach can be described
as making the parser's
``wishes'' come true,
and we have called this
``Ruby Slippers Parsing''.
One use of the Ruby Slippers technique is to
parse with a clean
but oversimplified grammar,
programming the lexical analyzer to make up for the grammar's
shortcomings on the fly.
The author has implemented an HTML parser\cite{Marpa-HTML},
based on a grammar that assumes that all start
and end tags are present.
Such an HTML grammar is too simple even to describe perfectly
standard-conformant HTML,
but the lexical analyzer is
programmed to supply start and end tags as requested by the parser.
The result is a very simply and cleanly designed parser
that parses very liberal HTML
and accepts all input files,
in the worst case
treating them as highly defective HTML.
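The control flow of a Ruby Slippers lexer can be sketched
in a few lines of code.
The sketch below is schematic only: it is not the Marpa::R2 API,
and the names \texttt{read}, \texttt{expected\_tokens}
and \texttt{synthesize} are hypothetical.
\begin{verbatim}
# A schematic Ruby Slippers loop -- not the Marpa::R2 API.
# 'recognizer' is assumed to have two methods:
#   expected_tokens() -- tokens acceptable at the current location
#   read(token)       -- attempt to read a token; False on rejection
def ruby_slippers_read(recognizer, token, synthesize):
    # Keep supplying "wished-for" tokens until the actual
    # token is accepted.
    while not recognizer.read(token):
        expected = recognizer.expected_tokens()
        fill_in = synthesize(expected)  # e.g. a missing end tag
        if fill_in is None:
            raise ValueError("parse cannot continue")
        recognizer.read(fill_in)  # succeeds: fill_in was expected
\end{verbatim}
In the HTML parser mentioned above, \texttt{synthesize} would
choose a missing start or end tag from among the expected tokens.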
\section{Notation}
\label{s:notation}
We assume familiarity with the theory of parsing,
as well as Earley's algorithm.
This \doc{} will
use subscripts to indicate commonly occurring types.
\begin{center}
\begin{tabular}{ll}
$\var{X}_T$ & The variable \var{X} of type $T$ \\
$\setTyped{set-one}{t}$ & The variable \var{set-one} of type set of $T$ \\
$SYM$ & The type for a symbol \\
\Vsym{a} & The variable \var{a} of type $SYM$ \\
\Vsymset{set-two} & The variable \var{set-two}, a set of symbols \\
\end{tabular}
\end{center}
Type is often clear from the context,
and subscripts to indicate type are often omitted.
The notation for
constants is the same as that for variables.
Multi-character variable names will be common,
and operations will never be implicit.
\begin{center}
\begin{tabular}{ll}
Multiplication & $\var{a} \times \var{b}$ \\
Concatenation & $\var{a} \cat \var{b}$ \\
Subtraction & $\var{symbol-count} \subtract \var{terminal-count}$ \\
\end{tabular}
\end{center}
Type names are often used in the text
as a convenient way to refer to
their type.
In this paper, unless indicated otherwise,
\Vsymset{T} will be a non-empty finite set of terminal symbols,
called the alphabet.
\Vsymset{L} will be another non-empty set of symbols, called LHS symbols.
\var{L} and \var{T} will not necessarily be disjoint.
\Vsymset{V}, where $\var{V} = \var{T} \cup \var{L}$,
will be the vocabulary.
Where \Vsymset{sym-set} is a non-empty set of symbols,
$\var{sym-set}^\ast$ is the set of all sequences
formed from those symbols.
Call the empty string $\epsilon$.
Then $\Vsymset{sym-set}^+ = \var{sym-set}^\ast \backslash \epsilon$.
If $\var{string} \in \var{T}^\ast$, \Vstr{string}
is of type \type{STR}.
Where \Vstr{s} is a string,
\size{\Vstr{s}} is its length, counted in symbols.
Where \var{s1} and \var{s2} are strings,
whether of type \type{STR} or \type{SF},
we will write $\var{s1} \cat \var{s2}$ for the
concatenation of \var{s1} and \var{s2},
and $\var{s1} \times \var{n}$ for the concatenation of \var{n} copies
of \var{s1}.
The \var{x}'th character of a string \var{s} is written as \CVstr{\var{s}}{\var{x}}.
The span of characters from \var{x} to \var{y}, inclusive,
is written as
\Crange{\var{s}}{\var{x}}{\var{y}}.
Since the range is inclusive,
\begin{equation*}
\Crange{\var{s}}{\var{x}}{\var{x}} = \CVstr{\var{s}}{\var{x}}
\quad \text{and} \quad
\bigl| \Crange{\var{s}}{\var{x}}{\var{x}} \bigr| = 1.
\end{equation*}
For a zero-length string starting at location \var{x},
we write $\Crange{\var{s}} {\var{x}} {(\var{x}\subtract 1)}$.
In this case,
$\var{x}\subtract 1$ need not be an actual location.
For example, the empty substring at the start of \var{s}
would be written $\Crange{\var{s}}{0}{(-1)}$.
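As a brief example of this notation, suppose that \Vstr{s}
is a string and $\Vsize{s} = 3$.
Then
\begin{equation*}
\Crange{\var{s}}{0}{2} = \var{s}, \quad
\Crange{\var{s}}{1}{1} = \CVstr{\var{s}}{1}
\quad \text{and} \quad
\bigsize{\Crange{\var{s}}{1}{0}} = 0,
\end{equation*}
the last being the empty substring between the first
and second symbols of \var{s}.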
Where \var{ss} is a set of strings,
we will write $\var{ss}^\ast$ for
the set of strings
that is the closure of concatenation on \var{ss},
and
$\var{ss}^+$ for $\var{ss}^\ast \backslash \epsilon$.
Where \var{ss1} and \var{ss2} are sets of strings,
\begin{gather}
\var{ss1} \cat \var{ss2} \defined
\bigl\lbrace \var{s1} \cat \var{s2}
\bigm| \var{s1} \in \var{ss1}, \var{s2} \in \var{ss2}
\bigr\rbrace, \\
\var{ss1} \times \var{n} \defined
\bigl\lbrace \var{s}[0] \cat \var{s}[1] \dots \var{s}[\var{n}\subtract 1]
\bigm| \forall \var{i} : \var{s}[\var{i}] \in \var{ss1} \bigr\rbrace, \\
\var{ss1}^\ast \defined \bigcup_{\var{n} \ge 0}{ \var{ss1} \times \var{n} } \quad \text{and} \\
\var{ss1}^+ \defined \var{ss1}^\ast \backslash \epsilon
\end{gather}
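For example, if $\var{ss1} = \set{\texttt{a}, \texttt{b}}$
and $\var{ss2} = \set{\texttt{c}}$, then
\begin{equation*}
\var{ss1} \cat \var{ss2} = \set{\texttt{ac}, \texttt{bc}}
\quad \text{and} \quad
\var{ss1} \times 2 = \set{\texttt{aa}, \texttt{ab}, \texttt{ba}, \texttt{bb}}.
\end{equation*}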
Let \Vruleset{rules} be a set of rules (type \type{RULE}),
where a rule \Vrule{r} is a duple
of the form $[\Vsym{lhs} \de \Vsf{rhs}]$,
such that
\begin{equation*}
\Vsym{lhs} \in \var{L} \quad \text{and}
\quad \Vsf{rhs} \in \var{V}^+.
\end{equation*}
\Vsym{lhs} is referred to as the left hand side (LHS)
of \Vrule{r}.
\Vsf{rhs} is referred to as the right hand side (RHS)
of \Vrule{r}.
The LHS and RHS of \Vrule{r} may also be
referred to as
$\LHS{\var{r}}$ and $\RHS{\var{r}}$, respectively.
We will represent grammars as 4-tuples
of
\begin{equation*}
[\Vsymset{vocab}, \Vsymset{lhs-syms}, \var{productions}, \Vsym{start}].
\end{equation*}
Here \Vsym{start}, $\var{start} \in \var{lhs-syms}$,
is a distinguished start symbol.
The grammar's set of terminals will
be $\var{T} = \Vsymset{vocab} \backslash \Vsymset{lhs-syms}$,
unless indicated otherwise.
In most contexts of this \doc{},
only one grammar will be under discussion:
\begin{equation*}
\Cg = [\Vsymset{V}, \Vsymset{L}, \Crules, \Vsym{S}].
\end{equation*}
In those contexts where
a string is formed from
the symbols in the vocabulary of some grammar,
\var{vocab}, we will say that a string
is of type \type{SF}
if it potentially contains non-terminals.
We will reserve
\type{STR} for those strings which
must be formed entirely from terminals.
The rules imply the traditional rewriting system,
in which $\Vsf{x} \derives \Vsf{y}$
states that \Vsf{x} derives \Vsf{y} in exactly one step;
$\Vsf{x} \deplus \Vsf{y}$
states that \Vsf{x} derives \Vsf{y} in one or more steps;
and $\Vsf{x} \destar \Vsf{y}$
states that \Vsf{x} derives \Vsf{y} in zero or more steps.
We say that symbol \Vsym{x} is \dfn{nullable} if and only if
$\Vsym{x} \destar \epsilon$.
\Vsym{x} is \dfn{nonnull} if and only if it is not nullable.
In general,
the input to
the parse will be \Cw{} such that $\Cw \in \var{T}^+$.
Locations in the input will be of type \type{LOC}.
Let \Vsize{w} be the length of the input, counted in symbols.
When we state our complexity results later,
they will often be in terms of $\var{n}$,
where $\var{n} = \Vsize{w}$.
% TODO -- Move these to where used? Or eliminate them?
In this \doc{},
\Earley{} will refer to the Earley's original
recognizer\cite{Earley1970}.
\Leo{} will refer to Leo's revision of \Earley{}
as described in~\cite{Leo1991}.
\Marpa{} will refer to the parser described in
this \doc{}.
The language of \Cg{} is $\myL{\Cg}$.
Where $\alg{Recce}$ is a recognizer,
$\myL{\alg{Recce},\Cg}$ will be the language accepted by $\alg{Recce}$
when parsing \Cg{}.
\section{Rewriting the grammar}
\label{s:rewrite}
Marpa runs on fully general BNF.
To do this, it rewrites the grammar before calling
the recognizer,
then undoes the rewrite after the recognizer has run.
The original grammar is called the {\bf external grammar},
because in the Marpa implementation, that is the one which
the user sees.
The grammar at the completion of the rewrite is called
the {\bf internal grammar}.
The Marpa parse engine actually runs on the internal grammar.
It is important that the two rewrites,
from external to internal,
and from internal to external,
be such that they can be done efficiently,
while preserving the semantics.
Marpa claims to be a practical parser,
and semantics are essential in practical parsing.
In fact, as implemented, Marpa often translates
symbols, rules and positions in rules back and forth
between internal and external
while the recognizer is still running.
This is done, for example, in implementing events.
Applications can specify events to trigger when certain symbols
are either predicted or recognized, and the application
needs to be able to specify these events
in terms of the symbols of the external grammar.
Marpa is implemented so that it can translate from external symbol to
internal and back again very quickly.
We now describe the rewrite.
This description is conceptual --
the actual implementation is optimized.
The rewrite takes place as if the following steps were executed:
\begin{step}
Determine if the grammar is trivial.
A grammar \Cg{} is trivial if and only if
$\myL{\Cg} = \set{ \epsilon }$.
Trivial grammars are easily dealt with, without using the methods
described in this paper.
If the grammar is trivial,
ignore the remaining steps of this rewrite.
In the Marpa implementation,
trivial grammars are special cased.
\end{step}
\begin{sloppypar}
\begin{step}
Reduce the grammar, eliminating inaccessible and
un\-productive symbols.
Inaccessible symbols are those which cannot be reached from the start symbol.
Unproductive symbols are those which can never derive a sentence in the language.
\end{step}
\end{sloppypar}
\begin{step}
Augment the grammar.
Rename symbols so that the current start symbol is named \Vsym{old-start}.
Add a new dedicated start symbol, \Vsym{S}.
Add a new dedicated start rule $[\var{S} \de \Vsym{old-start}]$.
\end{step}
\begin{step}
\label{step:divide-rules}
\label{step:first-CHAF}
Rewrite the grammar so that no RHS contains more than two properly
nullable symbols.
A properly nullable symbol is a nullable symbol which is not a
nulling symbol.
This rewrite can be done by splitting the offending rules into
several rules.
\end{step}
\begin{step}
For each properly nullable symbol, create two aliases,
one a nulling symbol and one a non-nullable symbol.
\end{step}
\begin{step}
\label{step:factor-rules}
Using the aliases created in the previous step,
eliminate all properly nullable symbols.
This is done by
``factoring'' every rule with one or more proper nullable
into several rules,
so that the new set of rules represents every possible combination
of nulling and non-nullable symbols.
Since there are two aliases and at most two properly nullable symbols
in a rule, the maximum number of ``factors'' is $2^2 = 4$.
\end{step}
\begin{step}
\label{step:last-CHAF}
Every rule and symbol
will now be nulling or non-nullable.
Determine which symbols are nulling,
and which rules are nulling.
Discard all the nulling rules, treating their LHS symbol
as a nulling symbol.
\end{step}
\begin{step}
Remove all nulling symbols from the rules, marking their
location for future reference.
\end{step}
This rewrite of Steps
\ref{step:first-CHAF} through \ref{step:last-CHAF}
is based on that of Aycock and Horspool\cite{AH2002},
and we call it Chomsky-Horspool-Aycock Form (CHAF).
The major difference between this rewrite and \cite{AH2002}
is Step \ref{step:divide-rules}.
Without Step \ref{step:divide-rules},
rules with many optional symbols on their RHS could come into
Step \ref{step:factor-rules}.
Where \var{x} is the number of proper nullables, the number of
factors would be $2^\var{x}$, so that
factoring would be exponential in the length of the rule.
Although \var{x} is a constant that depends on the grammar,
we see the exponential explosion
as a potential problem in practice.
SQL's select statement is one example of a rule
whose rewrite would be exponential
without
Step \ref{step:divide-rules}.
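As an aid to intuition, the factoring of
Steps \ref{step:first-CHAF} through \ref{step:last-CHAF}
can be sketched in Python.
This is a conceptual sketch only, not Marpa's optimized
implementation; the representation of the aliases is invented
for illustration.
\begin{verbatim}
from itertools import product

def factor_rule(lhs, rhs, proper_nullables):
    # Replace each properly nullable symbol on the RHS with
    # its nulling alias or its non-nullable alias, in every
    # combination.  After the rule-splitting step, a RHS has
    # at most 2 proper nullables, so at most 2**2 = 4 factors
    # are produced.
    spots = [i for i, sym in enumerate(rhs)
             if sym in proper_nullables]
    assert len(spots) <= 2
    factors = []
    for choice in product(("null", "nonnull"), repeat=len(spots)):
        new_rhs = list(rhs)
        for spot, alias in zip(spots, choice):
            new_rhs[spot] = (rhs[spot], alias)  # aliased symbol
        factors.append((lhs, tuple(new_rhs)))
    return factors
\end{verbatim}
The nulling aliases are discarded from the factored rules in the
final steps of the rewrite, after their locations are noted.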
From this point in this \doc{} on,
unless otherwise noted,
the grammar \Cg{} will refer to a Marpa internal grammar.
Summarizing the result of this rewrite:
\begin{itemize}
\item \Cg{} will not be a trivial grammar -- it will accept
at least one non-null input.
\item \Cg{} will not be nullable -- it will accept no null
inputs. (In the implementation, trivial
grammars and null inputs are special-cased.)
\item \Cg{} will contain no inaccessible symbols.
\item \Cg{} will contain no unproductive symbols.
\item \Cg{} will contain no nullable symbols.
\item \Cg{} will contain no nullable rules.
\item \Cg{} will have a dedicated start symbol, \Vsym{S}, which
occurs only as the LHS of the start rule.
\item \Cg{} will have a dedicated start rule,
whose RHS is of length one, so that it takes the form
\begin{equation*}
[\Vsym{S} \de \Vsym{old-start}].
\end{equation*}
\end{itemize}
Note that if a grammar in Marpa external form
contains cycles, it will still contain cycles
in Marpa internal form.
Nothing in the rewrite eliminates cycles.
\section{Earley's algorithm}
\label{s:earley}
Let $\Vrule{r} \in \Crules$
be a rule,
and $\Vsize{r}$ the length of its RHS.
A dotted rule (type \type{DR}) is a duple, $[\Vrule{r}, \var{pos}]$,
where $0 \le \var{pos} \le \size{\Vrule{r}}$.
The position, \var{pos}, indicates the extent to which
the rule has been recognized,
and is represented with a large raised dot,
so that if
\begin{equation*}
[\Vsym{A} \de \Vsym{X} \cat \Vsym{Y} \cat \Vsym{Z}]
\end{equation*}
is a rule,
\begin{equation*}
[\Vsym{A} \de \var{X} \cat \var{Y} \mydot \var{Z}]
\end{equation*}
is the dotted rule with the dot at
$\var{pos} = 2$,
between \Vsym{Y} and \Vsym{Z}.
If we let \Vdr{x} be a dotted rule, such that
\begin{equation*}
\Vdr{x} =
\bigl[ [\Vsym{A} \de \Vstr{pre} \cat \Vsym{next} \cat \Vstr{post}],
\var{pos} \bigr],
\end{equation*}
then
%
\begin{gather*}
%
\LHS{\Vdr{x}} \defined \Vsym{A} \\
%
\Postdot{\Vdr{x}} \defined
\begin{cases}
\Vsym{next}, \quad \text{if $\var{x} = [\var{A} \de \var{pre} \mydot \var{next} \cat \var{post}]$} \\
\Lambda, \quad \text{if $\var{x} = [\var{A} \de \var{pre} \cat \var{next} \cat \var{post} \mydot]$}
\end{cases} \\
%
\Next{\Vdr{x}} \defined
\begin{cases}
[\var{A} \de \var{pre} \cat \var{next} \mydot \var{post}], \\
\qquad \text{if $\Postdot{\Vdr{x}} = \var{next}$} \\
\text{$\Lambda$, otherwise}
\end{cases} \\
%
\Penult{\Vdr{x}} \defined
\begin{cases}
\Vsym{next}, \quad \text{if} \\
\qquad \Postdot{\Vdr{x}} = \var{next} \\
\qquad \land \quad \Vstr{post} \destar \epsilon \\
\qquad \land \quad \neg (\Vsym{next} \destar \epsilon) \\
\Lambda, \quad \text{otherwise}
\end{cases}
%
\end{gather*}
A \dfn{penult} is a dotted rule \Vdr{d} such that $\Penult{\var{d}} \neq \Lambda$.
Note that $\Penult{\Vdr{x}}$
is never a nullable symbol.
The \dfn{initial dotted rule} is
\begin{equation*}
\Vdr{initial} = [\Vsym{accept} \de \mydot \Vsym{start} ].
\end{equation*}
A \dfn{predicted dotted rule} is a dotted rule,
other than the initial dotted rule,
with a dot position of zero,
for example,
\begin{equation*}
\Vdr{predicted} = [\Vsym{A} \de \mydot \Vstr{alpha} ].
\end{equation*}
A \dfn{confirmed dotted rule}
is the initial dotted rule,
or a dotted rule
with a dot position greater than zero.
A \dfn{completed dotted rule} is a dotted rule with its dot
position after the end of its RHS,
for example,
\begin{equation*}
\Vdr{completed} = [\Vsym{A} \de \Vstr{alpha} \mydot ].
\end{equation*}
Predicted, confirmed and completed dotted rules
are also called, respectively,
\dfn{predictions}, \dfn{confirmations} and \dfn{completions}.
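These operations on dotted rules translate directly into code.
The following Python sketch is an illustration of the definitions,
not Marpa's implementation;
\texttt{None} stands in for $\Lambda$.
\begin{verbatim}
from dataclasses import dataclass
from typing import Optional, Tuple

@dataclass(frozen=True)
class DottedRule:
    lhs: str
    rhs: Tuple[str, ...]
    pos: int  # 0 <= pos <= len(rhs)

    def postdot(self) -> Optional[str]:
        # The symbol after the dot; None (Lambda) for a completion.
        return self.rhs[self.pos] if self.pos < len(self.rhs) else None

    def next(self) -> Optional["DottedRule"]:
        # The dotted rule with the dot moved one symbol right.
        if self.pos >= len(self.rhs):
            return None
        return DottedRule(self.lhs, self.rhs, self.pos + 1)

    def penult(self, nullable) -> Optional[str]:
        # The postdot symbol, if it is non-nullable and every
        # symbol after it is nullable; otherwise None.
        nxt = self.postdot()
        post = self.rhs[self.pos + 1:]
        if (nxt is not None and nxt not in nullable
                and all(s in nullable for s in post)):
            return nxt
        return None
\end{verbatim}
In a Marpa internal grammar no symbol is nullable,
so \texttt{penult} is non-trivial only when the dot is
immediately before the last symbol of the RHS.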
\section{Tree predicates}
\label{s:tree-predicates}
A tree predicate is a triple
\begin{equation*}
[\Vdr{dotted-rule}, \Vloc{origin}, \Vloc{dot}].
\end{equation*}
Let
\begin{equation*}
\Vdr{dotted-rule} = [\Vsym{LHS} \de \Vsf{predot} \mydot \Vsf{postdot}].
\end{equation*}
\begin{mydefinition}
\label{d:valid-tp}
In the context of a grammar
\begin{equation}
\var{g} = [ \var{V}, \var{N}, \var{P}, \var{S} ],
\end{equation}
we say that tree predicate
\begin{equation}
\bigl[
\label{e:tree-predicate-1} [\Vsym{LHS} \de \Vsf{predot} \mydot \Vsf{postdot}]
, \Vloc{origin}, \Vloc{dot} \bigr]
\end{equation}
is \dfn{valid} for symbol \Vsym{A} at location \Vloc{base}
if and only if,
for some \Vsf{after},
\begin{gather}
\label{e:tree-predicate-2}
[\var{LHS} \de \var{predot} \cat \var{postdot}] \in \var{P}, \\
\label{e:tree-predicate-3}
\Vsym{A} \destar
\Cwrange{\var{base}}{\var{origin}-1} \cat \var{LHS} \cat \var{after} \quad \text{and} \\
\label{e:tree-predicate-4}
\var{predot} \destar \Cwrange{\var{origin}}{\Vloc{dot}-1}.
\end{gather}
We write the set of valid tree predicates
for grammar \var{g},
symbol \Vsym{A}, base location \var{b}, and input \Vstr{w}
as $\var{Valid-Predicates}(\var{g}, \var{A}, \var{b}, \var{w})$.
\end{mydefinition}
\begin{mydefinition}
A tree predicate
is \dfn{valid} for grammar \var{g}
and input \Vstr{w},
if and only if it is valid for its start symbol at location 0.
We write the set of valid tree predicates
for grammar \var{g} and input \var{w}
as $\var{Valid}(\var{g}, \var{w})$.
\end{mydefinition}
This definition means that
\begin{equation}
\var{Valid}(\var{g}, \var{w})
= \var{Valid-Predicates}(\var{g}, \var{S}, 0, \var{w}),
\end{equation}
where \Vsym{S} is the start symbol of \var{g}.
We will find it convenient to overload the \var{Valid} predicate.
We will write the set of valid tree predicates for \var{g} and input \var{w} as $\var{Valid}(\var{w})$,
in contexts where \var{g} is understood.
The set of tree predicates valid at a location \Vloc{i} is
\begin{multline}
\var{Valid}(\var{g}, \var{w}) [ \var{i} ] = \\
\Bigl\lbrace \var{tp}
\Bigm|
\var{tp} = \bigl[ \Vdr{dotted}, \Vorig{start}, \Vloc{i} \bigr]
\land \var{tp} \in \var{Valid}(\var{w})
\Bigr\rbrace .
\end{multline}
Where \var{g} is understood, this is written
$\var{Valid}(\var{w}) [ \var{i} ]$.
\begin{definition}
For a fused tree predicate of the form,
\begin{equation}
\Vtp{tp} = \bigl[ [ \Vsym{lhs} \de \Vsf{rhs1} \Vsym{transition} \mydot \Vsf{rhs2} ], \Vorig{i}, \Vloc{k} \bigr],
\end{equation}
a fusion proof is a pair of valid tree predicates of the form
\begin{gather}
\bigl[ [ \Vsym{lhs} \de \Vsf{rhs1} \mydot \Vsym{transition} \Vsf{rhs2} ], \Vorig{i}, \Vloc{j} \bigr] \\
\bigl[ [ \Vsym{transition} \de \Vsf{transition-rhs} \mydot ], \Vorig{j}, \Vloc{k} \bigr].
\end{gather}
\end{definition}
\begin{definition}
For a scanned tree predicate of the form,
\begin{equation}
\Vtp{tp} = \bigl[ [ \Vsym{lhs} \de \Vsf{rhs1} \Vsym{transition} \mydot \Vsf{rhs2} ], \Vorig{i}, \Vloc{k} \bigr],
\end{equation}
a scanning proof is a duple consisting of a tree predicate of the form
\begin{equation}
\bigl[ [ \Vsym{lhs} \de \Vsf{rhs1} \mydot \Vsym{transition} \Vsf{rhs2} ], \Vorig{i}, (\var{k}\subtract 1) \bigr]
\end{equation}
and the input symbol
\begin{equation}
\CVw{\var{k}\subtract 1} = \Vsym{transition}.
\end{equation}
\end{definition}
Every valid fused tree predicate will have at least
one fusion proof.\footnote{
Fusion proofs are one of the elegant and efficient notational
ideas in~\cite{Wich2005}, where they are called ``completer proofs''.
}
Every valid scanned tree predicate will have at least one scanning proof.
Together, fusion proofs and scanning proofs are called ``costed proofs''.
\index{costed proof|emph}
The set of costed proofs for a tree predicate \Vtp{tp} found in
a set of tree predicates \var{tp-set}, is written
$\var{Costed}(\var{tp-set}, \var{tp})$.
Often, \var{tp-set} is $\var{Valid}(\var{g}, \var{w})$.
In contexts where \var{tp-set} is understood,
it may be written
$\var{Costed}(\var{tp})$.
A tree predicate \Vtp{tp}
is
\dfn{actually right recursive}
\index{actually right recursive|emph},
with respect to a set of tree predicates \var{tp-set},
$\var{RR}(\var{tp-set}, \var{tp})$,
\index{notation!RR@$\var{RR}(\var{tp-set}, \Vtp{tp})$|emph}
if and only if it is of the form
\begin{multline*}
\var{tp} = \bigl[ [ \Vsym{lhs} \de \Vsf{before} \mydot \Vsym{recurse} ], \var{i}, \var{j} \bigr] \\
\text{where} \quad \bigl[ [ \Vsym{lhs} \de \Vsf{before} \mydot \Vsym{recurse} ],
\var{i2}, \var{j} \bigr] \in \var{tp-set} \\
\text{and} \quad \var{i2} < \var{i}.
\end{multline*}
A tree predicate map is a function that maps strings to sets of tree predicates.
More precisely, where \var{tp-set} is a set of tree predicates,
and \var{T} is the set of terminals they imply,
a tree predicate map is a function \Vtpm{tp-map} such that
\begin{equation*}
\Vtpm{tp-map} : \var{T}^+ \mapsto 2^\var{tp-set}.
\end{equation*}
$\overrightarrow{\var{Valid}(\var{g})}$ (also written as \Vtpm{Valid}) is a tree predicate map,
as is
$\overrightarrow{\var{Valid}(\var{tps})}$ where \var{tps} is any set of tree predicates.
If the type is clear in context,
\Vtpm{tpm} may be written without the over-arrow:
\var{tpm}.
The union of a set of tree predicate maps
is a tree predicate map.
Call the union \Vtpm{union},
and the set of tree predicate maps, \var{tpm-set}.
For every
\begin{gather}
\Vstr{w} \in
\bigcup_{\var{tpm} \in \var{tpm-set}}
{\left\lbrace \var{x} \;
\middle| \; \Vtpm{tpm} :\var{x} \mapsto \var{y}
\right\rbrace}, \\
\Vtpm{union}(\Vstr{w}) =
\bigcup_{
\var{tpm}
\in \var{tpm-set}
}
{\Vtpm{tpm}(\var{w})}.
\end{gather}
We will often want to relocate tree predicate maps to an offset in
an input stream.
\begin{mydefinition}
Let $\var{tp} = [ \var{dr}, \var{i}, \var{j} ]$
be a tree predicate.
The \dfn{offset} of \var{tp} by \var{n},
written $\var{tp}+\var{n}$,
is
\begin{equation}
[ \var{dr}, \var{i}+\var{n}, \var{j}+\var{n} ].
\end{equation}
An offset may be negative, as long as it never
results in a location that is less than zero.
The negative offset of a \var{tp} may be
written $\var{tp}\subtract \var{n}$.
\end{mydefinition}
\begin{mydefinition}
\label{d:grammar-offset}
We define the \dfn{offset} of a grammar in an input stream,
written
\begin{equation}
\Vtpm{offset} = \Vtpoffset{g}{\var{n},\var{T2}}
\end{equation}
to be the tree predicate map
\begin{multline}
\Vtpm{offset}(\Vstr{prefix} \cat \Vstr{w}) = \\
\left\lbrace
\Vtp{tp} + \var{n}
\; \middle| \;
\Vtp{tp} \in
\Vtpm{\var{Valid}(\var{g},\Vstr{w})}
\right\rbrace
\end{multline}
where $\Vstr{prefix} \in (\var{T2} \cup \var{T}) ^\ast$ and
$\size{\var{prefix}} = \var{n}$.
Here
\var{T} is the set of terminals for \var{g}
and \var{T2} is a set of additional terminals.
If \var{T2} is the empty set,
we may write
\begin{equation}
\Vtpm{offset} = \Vtpoffset{g}{\var{n}},
\end{equation}
so that, for example,
\begin{equation}
\Vtpm{\var{Valid}(\var{g})} = \Vtpoffset{g}{0}.
\end{equation}
An offset may be negative, as long as it never results
in a location that is less than zero.
\end{mydefinition}
When comparing two grammars,
\begin{gather*}
\var{g1} = [ \var{V1}, \var{N1}, \var{P1}, \var{S1} ] \quad \text{and} \\
\var{g2} = [ \var{V2}, \var{N2}, \var{P2}, \var{S2} ],
\end{gather*}
we say they are symbolically disjoint if $\var{V1} \cap \var{V2} = \emptyset$.
We say they are symbolically equivalent if every \Vsym{A} such that
$\Vsym{A} \in \var{V1} \land \Vsym{A} \in \var{V2}$
has exactly the same set of derivations in both grammars.
\begin{mydefinition}
A grammar \var{g1} is a subgrammar of another symbolically equivalent
grammar \var{g2}, if
\begin{gather*}
\var{g2} = [ \var{V2}, \var{N2}, \var{P2}, \var{S2} ], \\
\text{where} \quad \var{g1} = [ \var{V1}, \var{N1}, \var{P1}, \var{S1} ], \\
\var{V1} \subset \var{V2}, \quad
\var{N1} \subseteq \var{N2}, \quad
\var{P1} \subset \var{P2} \quad \text{and} \quad
\var{S1} \neq \var{S2}.
\end{gather*}
\end{mydefinition}
Note that by this definition of subgrammar,
a grammar is never a subgrammar of itself.
In a symbolically equivalent context,
we can offset a set of tree
predicates by \var{n} without affecting their validity.
The next observation states this more formally.
\begin{myobservation}
\label{t:offset-valid}
Consider two grammars,
\begin{gather*}
\var{g1} = [ \var{V1}, \var{N1}, \var{P1}, \var{S1} ] \quad \text{and} \\
\var{g2} = [ \var{V2}, \var{N2}, \var{P2}, \var{S2} ],
\end{gather*}
where \var{g2} is a subgrammar of \var{g1}.
If
\begin{gather*}
\Vstr{w2} \in (\var{V1} \backslash \var{N1})^+, \\
0 \le \Vloc{n} < \size{\var{w2}} \quad \text{and} \\
\var{w1} = \bigl( (\var{V2} \backslash \var{N2}) \times \var{n} \bigr) \cat \var{w2}, \\
\text{then} \quad
\Vtpoffset{g2}{\var{n}}(\var{w2}) = \var{Valid-Predicates}(\var{g1}, \Vsym{S2}, \var{n}, \var{w1}).
\end{gather*}
\end{myobservation}
\begin{proof}
Define two grammars
\begin{gather}
\var{g1} = [ \var{V1}, \var{N1}, \var{P1}, \var{S1} ] \quad \text{and} \\
\var{g2} = [ \var{V2}, \var{N2}, \var{P2}, \var{S2} ]
\end{gather}
where \var{g2} is a subgrammar of \var{g1}.
We need to show
that the tree predicates derived using \var{g2} from
\begin{equation}
\label{e:offset-valid-proof-1}
\bigl[ [\var{S2} \de \mydot \Vsf{rhs} ], 0, 0 \bigr]
\end{equation}
for the input string
\begin{equation}
\label{e:offset-valid-proof-2}
\var{w2} = \Vstr{relevant}
\end{equation}
will match one-to-one with those derived using \var{g1} from
\begin{equation}
\label{e:offset-valid-proof-3}
\bigl[ [\var{S2} \de \mydot \Vsf{rhs} ], \var{n}, \var{n} \bigr]
\end{equation}
for the input
\begin{equation}
\label{e:offset-valid-proof-4}
\var{w1} = \bigl( (\var{V1} \backslash \var{N1}) \times \var{n} \bigr) \; \cat \; \Vstr{relevant}
\end{equation}
Let
\begin{gather}
\label{e:offset-valid-proof-5}
\var{vp1} = \var{Valid-Predicates}(\var{g1}, \Vsym{S2}, \var{n}, \var{w1}) \quad \text{and} \\
\label{e:offset-valid-proof-6}
\var{vp2} = \var{Valid-Predicates}(\var{g2}, \Vsym{S2}, 0, \var{w2})
\end{gather}
We will consider \var{tp1}, an arbitrary tree predicate from \var{vp1},
and show that
\begin{equation}
\label{e:offset-valid-proof-8}
\var{tp1} \in \var{vp1} \iff \var{tp1} \subtract \var{n} \in \var{vp2}.
\end{equation}
In the following, we will show the ``if'' direction,
but at each step we will
also be careful that it is a mutual implication,
so that the ``only if'' direction can be shown easily immediately afterwards.
For the ``if'' direction,
assume that
\begin{equation}
\var{tp1} = \bigl[ [\var{lhs} \de \Vsf{rhs1} \mydot \Vsf{rhs2} ], \var{i}, \var{j} \bigr] \in \var{vp1}.
\end{equation}
Then, for some $\Vsf{after} \in \var{V1}^\ast$,
by Definition \ref{d:valid-tp}
\begin{gather}
\label{e:offset-valid-proof-9}
[\var{lhs} \de \var{rhs1} \cat \var{rhs2}] \in \var{P1}, \\
\label{e:offset-valid-proof-10}
\Vsym{S2} \destar
\Crange{\var{w1}}{\var{n}}{\var{i}-1} \cat \var{lhs} \cat \var{after} \quad \text{and} \\
\label{e:offset-valid-proof-11}
\var{rhs1} \destar \Crange{\var{w1}}{\var{i}}{\Vloc{j}-1}.
\end{gather}
By the symbolic equivalence of \var{g1} and \var{g2}, we know that every derivation
from any symbol they share can be performed in both grammars.
This means that every symbol derivable from
\var{S2} must be in both \var{V1} and \var{V2}.
It also means that any rule in \var{P1} with a shared symbol on its LHS,
must also be in \var{P2}, so that
\begin{equation}
\label{e:offset-valid-proof-12}
[\var{lhs} \de \var{rhs1} \cat \var{rhs2}] \in \var{P2}.
\end{equation}
It also means that
\begin{equation}
\label{e:offset-valid-proof-13}
\var{after} \in \var{V2}^\ast.
\end{equation}
Note that we use symbolic equivalence to show
\eqref{e:offset-valid-proof-12}
and \eqref{e:offset-valid-proof-13} and not the
subgrammar relationship.
This is in order to preserve the chain of mutual implications.
We know from \eqref{e:offset-valid-proof-2}
and \eqref{e:offset-valid-proof-4} that
\begin{multline}
\label{e:offset-valid-proof-14}
\Crange{\var{w1}} {\var{n}} {(\var{i}-1)} \\
= \Crange{\var{relevant}} {0} {(\var{i}-\var{n}-1)} \\
= \Crange{\var{w2}} {0} {(\var{i}-\var{n}-1)}
\end{multline}
and that
\begin{multline}
\label{e:offset-valid-proof-15}
\Crange{\var{w1}} {\var{i}} {(\var{j}-1)} \\
= \Crange{\var{relevant}} {(\var{i}-\var{n})} {(\var{j}-\var{n}-1)} \\
= \Crange{\var{w2}} {(\var{i}-\var{n})} {(\var{j}-\var{n}-1)}.
\end{multline}
Note that, since the notation for ranges is inclusive, the notation
for a zero length range is $\Crange{\var{s}}{\var{x}}{(\var{x}-1)}$.
$\Crange{\var{s}}{\var{0}}{(-1)}$ is a possible value, and indicates
a zero-length string based at location 0.
Combining
\eqref{e:offset-valid-proof-10}
and \eqref{e:offset-valid-proof-14}, we obtain
\begin{equation}
\label{e:offset-valid-proof-16}
\Vsym{S2} \destar
\Crange{\var{w2}}{0}{(\var{i}-\var{n}-1)} \cat \var{lhs} \cat \var{after}.
\end{equation}
Similarly, combining
\eqref{e:offset-valid-proof-11}
and \eqref{e:offset-valid-proof-15}, we obtain
\begin{equation}
\label{e:offset-valid-proof-17}
\var{rhs1} \destar \Crange{\var{w2}}{(\var{i}-\var{n})}{(\Vloc{j}-\var{n}-1)}.
\end{equation}
Combining \eqref{e:offset-valid-proof-12},
\eqref{e:offset-valid-proof-13},
\eqref{e:offset-valid-proof-16},
and \eqref{e:offset-valid-proof-17},
and using the definition of validity for tree predicates
(Definition \ref{d:valid-tp}),
shows us that
\begin{equation}
\var{tp1}-\var{n} = \bigl[ [\var{lhs} \de \Vsf{rhs1} \mydot \Vsf{rhs2} ], \var{i}-\var{n}, \var{j}-\var{n} \bigr] \in \var{vp2}.
\end{equation}
This was what we needed to show the ``if'' direction of
\eqref{e:offset-valid-proof-8}.
To show the ``only if'' direction,
we note that every step of the argument for the ``if'' direction
was a mutual implication.
\end{proof}
\section{An approach to the Earley algorithm}
\label{s:earley-approach}
Tree predicates are often treated as Earley items.
An Earley item (type \type{EIM}) is a duple
\[
[\Vdr{dotted-rule}, \Vorig{x}]
\]
of dotted rule and origin.
The origin is the location where recognition of the rule
started.
It is sometimes called the ``parent''.
For convenience, the type \type{ORIG} will be a synonym
for \type{LOC}, indicating that the variable designates
the origin element of an Earley item.
An Earley set (type \type{ES}) is a set of Earley items at a single dot location,
call it \Vloc{j},
so that every Earley item $[\Vdr{dotted-rule}, \Vorig{i}]$
in an Earley set corresponds one-to-one with
the tree predicate
\begin{equation*}
[\Vdr{dotted-rule}, \Vorig{i}, \Vloc{j}].
\end{equation*}
An Earley table is a set of Earley sets.
An Earley parser builds a table of Earley sets,
\begin{equation*}
\EVtable{\Earley}{i},
\quad \text{where} \quad
0 \le \Vloc{i} \le \size{\Cw}.
\end{equation*}
Earley sets are of type \type{ES}.
Earley sets are often named by their location,
so that \Ves{i} means the Earley set at \Vloc{i}.
The type designator \type{ES} is sometimes omitted.
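In code, the Earley table is naturally an array of sets of
Earley items, indexed by dot location.
A minimal Python sketch, for illustration only
(Marpa's actual data structures are more elaborate),
reusing the \texttt{DottedRule} class sketched earlier:
\begin{verbatim}
from typing import List, NamedTuple, Set

class EarleyItem(NamedTuple):
    dotted_rule: "DottedRule"  # the class sketched earlier
    origin: int                # where recognition of the rule began

def empty_table(n: int) -> List[Set[EarleyItem]]:
    # One Earley set per location 0 .. n, for an input of length n.
    return [set() for _ in range(n + 1)]
\end{verbatim}
An item \texttt{EarleyItem(dr, i)} in the set at index \texttt{j}
corresponds to the tree predicate $[\Vdr{dr}, \Vorig{i}, \Vloc{j}]$.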
Tree predicates and Earley items (EIM's) can be separated into
classes based on their predot symbol.
If a tree predicate or EIM has no predot symbol, it is a prediction.
If the predot symbol is a terminal, it is a scanned tree predicate (EIM),
or a scansion.
If the predot symbol is a non-terminal,
it is a fused tree predicate (EIM), or a fusion.
If a tree predicate (EIM) is not a prediction,
that is, if it is either a fusion or a scansion,
it can be called a reduction.
Reductions and predictions are disjoint sets.
Note that, in Marpa, terminals
and non-terminals are not disjoint,
so that scansions and fusions are not disjoint sets.
It is possible for an EIM (and therefore its corresponding tree predicate)
to be
both a scansion and a fusion in the same Earley set of the same
parse.
We say that an Earley item is valid in an Earley set
if and only if
the corresponding tree predicate is valid.
Let \var{X} be one of the EIM classifications: prediction,
scansion, fusion or reduction.
We say that an Earley set is correct up to \var{X}
if and only if all of its Earley items of type \var{X} are correct.
We say that an Earley set is complete up to \var{X},
if and only if it contains every correct Earley item of type \var{X}.
We say that an Earley set is valid up to \var{X},
if and only if it is correct and complete up to \var{X}.
For example, we say that Earley set \Vloc{i} is correct up to scansion
if and only if all of its scanned Earley items are correct.
We say that an Earley set is correct
if and only if all of its Earley items are correct.
We say that an Earley set is complete
if and only if it contains every correct Earley item.
We say that an Earley set is valid
if and only if it is correct and complete.
We say that an Earley table is correct, complete or valid
as far as \Vloc{i}
if and only if every Earley set \Ves{j}, for $\var{j} \le \var{i}$,
is, respectively,
correct, complete or valid.
We say that an Earley table is correct, complete or valid
if and only if
all of its Earley sets are correct, complete or valid.
\begin{subequations}
\begin{lemma}
\label{l:eim-correctness-is-transitive}
Let \textup{\Vdr{predecessor}} be a dotted rule,
let \textup{\Vdr{successor}} be its successor,
and let \textup{\Vsym{transition}} be the postdot symbol in
\textup{\Vdr{predecessor}} and the predot symbol in
\textup{\Vdr{successor}}.
If the EIM
\begin{equation}
\label{e:eim-reduction-1}
\textup{
$\bigl[\Vdr{predecessor}, \Vorig{i} \bigr]$ is correct at \Ves{j},
}
\end{equation}
and
\begin{equation}
\label{e:eim-reduction-2}
\textup{
$\Vsym{transition} \destar \Cwrange{\var{j}}{\var{k} - 1}$
}
\end{equation}
where $\var{j} < \var{k}$,
then the EIM
\begin{equation}
\label{e:eim-reduction-3}
\textup{
$\bigl[\Vdr{successor}, \Vorig{i} \bigr]$ at \Ves{k}
is correct.
}
\end{equation}
\end{lemma}
\begin{proof}
Let \Vdr{predecessor} be the dotted rule
\begin{equation}
\textup{ $[\Vsym{lhs} \de \Vsf{before} \mydot \Vsym{transition} \cat \Vsf{after}].$ }
\end{equation}
\Vdr{successor} is therefore
\begin{equation}
\label{e:eim-reduction-3-1}
[\Vsym{lhs} \de \Vsf{before} \cat \Vsym{transition} \mydot \Vsf{after}].
\end{equation}
From assumption \eqref{e:eim-reduction-1}, we know that
\begin{equation}
\label{e:eim-reduction-4}
\Vsym{start} \destar
\Cwrange{0}{\Vloc{i} - 1} \cat \Vsym{lhs} \cat \Vsf{post-rule}.
\end{equation}
Also from \eqref{e:eim-reduction-1},
we have
\begin{equation}
\label{e:eim-reduction-5}
\Vsf{before} \destar \Cwrange{\var{i}}{\var{j} - 1}.
\end{equation}
Combining
\eqref{e:eim-reduction-5}
with the assumption \eqref{e:eim-reduction-2},
we have
\begin{equation}
\label{e:eim-reduction-6}
\Vsf{before} \cat \Vsym{transition} \destar \Cwrange{\var{i}}{\var{k} - 1}.
\end{equation}
If, for substitution in the definition of tree predicate validity,
we match \eqref{e:eim-reduction-3-1} with \eqref{e:tree-predicate-2},
\eqref{e:eim-reduction-4} with \eqref{e:tree-predicate-3}
and
\eqref{e:eim-reduction-6} with \eqref{e:tree-predicate-4},
we see that the tree predicate
\begin{equation}
[ \Vdr{successor}, \var{i}, \var{k} ]
\end{equation}
is valid.
And, by the definition of an Earley item,
this tree predicate is
equivalent to
\eqref{e:eim-reduction-3}.
\end{proof}
\end{subequations}
\begin{subequations}
\begin{theorem}
\label{l:eim-prediction-correctness}
Let \Vdr{cause} be a dotted rule,
where $\Vsym{transition} = \Postdot{\var{cause}}$.
If the EIM
\begin{equation}
\label{e:eim-prediction-1}
\bigl[\Vdr{cause}, \Vorig{i} \bigr]
\text{ is correct at \Ves{j},}
\end{equation}
and
\begin{gather}
\label{e:eim-prediction-2}
\Vsym{transition} \de \Vsym{initial} \cat \Vsf{remainder} \quad \text{and} \\
\label{e:eim-prediction-2-5}
\Vsym{initial} \de \Vsf{rhs}
\end{gather}
are rules,
then the EIM
\begin{equation}
\label{e:eim-prediction-3}
\bigl[ [ \Vsym{initial} \de \mydot \Vsf{rhs} ], \Vorig{j} \bigr]
\text{ at \Ves{j}}
\end{equation}
is correct.
\end{theorem}
\begin{proof}
Let \Vdr{cause} be the dotted rule
\begin{equation}
\label{e:eim-prediction-3-5}
[\Vsym{lhs} \de \Vsf{before} \mydot \Vsym{transition} \cat \Vsf{after}].
\end{equation}
From assumption \eqref{e:eim-prediction-1}, we know that
\begin{equation}
\label{e:eim-prediction-4-5}
\Vsym{start} \destar
\Cwrange{0}{\Vloc{j} \negthickspace - 1}
\cat \Vsym{transition} \cat \Vsf{post-rule}.
\end{equation}
From \eqref{e:eim-prediction-4-5}
and \eqref{e:eim-prediction-2}, we know that
\begin{multline}
\label{e:eim-prediction-4-7}
\Vsym{start} \destar \\
\Cwrange{0}{\Vloc{j} \negthickspace - 1}
\cat \Vsym{initial} \cat \Vsf{remainder} \cat \Vsf{post-rule}.
\end{multline}
For substitution in the definition of tree predicate validity,
we match the rule
$[\Vsym{initial} \de \Vsf{rhs}]$
with \eqref{e:tree-predicate-2}
and \eqref{e:eim-prediction-4-7} with \eqref{e:tree-predicate-3}.
For \Vsf{predot} in
\eqref{e:tree-predicate-2}
and \eqref{e:tree-predicate-4},
we substitute the empty sentential form,
so that
\begin{equation}
\Vsf{predot} \destar \epsilon
\end{equation}
matches \eqref{e:tree-predicate-4}.
With these substitutions in the definition
of tree predicate validity,
we see that the tree predicate
\begin{equation}
\bigl[ [\Vsym{initial} \de \mydot \Vsf{rhs}], \Vloc{j}, \Vloc{j} \bigr]
\end{equation}
is valid.
And, by the definition of an Earley item,
this tree predicate is
equivalent to
\eqref{e:eim-prediction-3}.
\end{proof}
\end{subequations}
\section{Operations of the Earley algorithm}
\label{s:earley-ops}
Because the Marpa project's motivation is implementation,
this paper will describe the Marpa algorithm,
and prove its correctness, using pseudo-code.
Nonetheless, this section's more abstract outline,
as rules of inference whose closure produces all the
valid tree predicates,
may be helpful as a briefer introduction.
\footnote{
Among the many
more formal presentations of the Earley algorithm,
we found that of~\cite{Wich2005} especially elegant
and useful.
}
We will also use it as a framework within which to define
this {\doc}'s terminology.
Consider \var{Initial}, the singleton set of tree predicates,
containing only the initial tree predicate, as follows:
\begin{equation}
\Bigset{ \bigl[ [ \Vsym{accept} \de \mydot \Vsym{start} ], 0, 0 \bigr] }
\end{equation}
The set of valid tree predicates will be the closure of \var{Initial} under
three operations: the scanner, the fuser and the predictor.
In the following, \var{g} will be an arbitrary grammar in Marpa internal form,
and \Cw will be an arbitrary input.
\subsection{Scanner}
For arbitrary \Vsym{lhs}, \Vsf{before}, \Vsf{after}, \Vloc{i}, \Vloc{j},
where
\begin{gather*}
\Vdr{before} = [ \Vsym{lhs} \de \Vsf{before} \mydot \CVw{\var{j}} \cat \Vsf{after} ]
\quad \text{and} \\
\Vdr{after} = [ \Vsym{lhs} \de \Vsf{before} \cat \CVw{\var{j}} \mydot \Vsf{after} ],
\end{gather*}
from any tree predicate of the form
\begin{equation*}
\Vtp{predecessor} = [ \Vdr{before}, \var{i}, \var{j} ]
\end{equation*}
infer the tree predicate
\begin{equation*}
\Vtp{scansion} = [ \Vdr{after}, \var{i}, \var{j}+1 ] .
\end{equation*}
In this case, \Vtp{predecessor} is the \dfn{predecessor} and \Vtp{scansion} is the \dfn{successor}.
$\CVw{\var{j}}$ is the \dfn{cause} and \Vtp{scansion} is the \dfn{effect}.
$\CVw{\var{j}}$ is also called the \dfn{transition symbol}
of the scanner operation.
\Vtp{scansion} is called a \dfn{scanned Earley item}
or a \dfn{scansion}.
The \dfn{antecedents} of the scansion are a duple:
its cause ($\CVw{\var{j}}$) and its predecessor.
\subsection{Fuser}
For arbitrary \Vsym{lhs}, \Vsf{before}, \Vsf{after}, \Vsym{trans}, \Vsf{rhs},
\Vloc{i}, \Vloc{j}, \Vloc{k},
where
\begin{gather*}
\Vdr{before} = [ \Vsym{lhs} \de \Vsf{before} \mydot \Vsym{trans} \cat \Vsf{after} ]
\quad \text{and} \\
\Vdr{after} = [ \Vsym{lhs} \de \Vsf{before} \cat \Vsym{trans} \mydot \Vsf{after} ],
\end{gather*}
from any two tree predicates of the form
\begin{gather*}
\Vtp{predecessor} = [ \Vdr{before}, \var{i}, \var{j} ] \\
\Vtp{component} = \bigl[ [ \Vsym{trans} \de \Vsf{rhs} \mydot ], \var{j}, \var{k} \bigr]
\end{gather*}
infer the tree predicate
\begin{equation*}
\Vtp{fusion} = [ \Vdr{after}, \var{i}, \var{k} ] .
\end{equation*}
In this case, \Vtp{predecessor} is the \dfn{predecessor} and \Vtp{fusion} is the \dfn{successor}.
\Vtp{component} is the \dfn{cause} and \Vtp{fusion} is the \dfn{effect}.
\Vsym{trans} is also called the \dfn{transition symbol}
of the fuser operation.
\Vtp{component} is called
the \dfn{component}\footnote{
The term ``component'' comes from Irons \cite{Irons}.
}
of the fusion operation.
\Vtp{fusion} is called a \dfn{fused Earley item}
or a \dfn{fusion}.
The \dfn{antecedents} of the fusion are a duple:
its cause (\Vtp{component})
and its predecessor (\Vtp{predecessor}).
What we call a fusion in this \doc{}
is traditionally called a completion.
The older terminology is overloaded in an
extremely confusing way,
and we hope that readers will view
our departure from it with
more relief than annoyance.\footnote{
A completion can be either the outcome
of a completer operation (sense 1), or an Earley item with the dot
at the end of the rule (sense 2).
A completion in sense 1 may or may not be a completion in sense 2.
On the other hand, the cause of a completion in sense 1 must always
be a completion in sense 2.
In this \doc, the term ``completion'' will only be used in sense 2.
Use of the term ``completion''
in the traditional sense 1
is abandoned in favor of the term ``fusion''.
}
\subsection{Predictor}
For arbitrary \Vsym{lhs}, \Vsf{before}, \Vsf{after}, \Vsym{postdot}, \Vsf{rhs},
\Vloc{i}, \Vloc{j},
where
\begin{equation*}
\Vdr{before} = [ \Vsym{lhs} \de \Vsf{before} \mydot \Vsym{postdot} \cat \Vsf{after} ]
\end{equation*}
from any tree predicate of the form
\begin{equation*}
\Vtp{cause} = [ \Vdr{before}, \var{i}, \var{j} ]
\end{equation*}
and any rule of the form
\begin{equation*}
\Vrule{predicted} = [ \Vsym{postdot} \de \Vsf{rhs} ],
\end{equation*}
infer the tree predicate
\begin{equation*}
\Vtp{prediction} = \bigl[ [ \Vsym{postdot} \de \mydot \Vsf{rhs} ], \var{j}, \var{j} \bigr].
\end{equation*}
In this case,
\Vtp{cause} is the \dfn{cause} and \Vtp{prediction} is the \dfn{effect}.
\Vtp{prediction} is called a \dfn{predicted} tree predicate
or a \dfn{prediction}.
The \dfn{antecedents} of the prediction are a duple:
its cause (\Vtp{cause})
and the prediction's rule (\Vrule{predicted}).
\subsection{Earley items}
The inferences described above in terms of tree predicates can also
be described in terms of Earley items.
All terms defined for tree predicates are applied to the
corresponding Earley items as well.
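Taken together, the three operations define a closure,
which the following Python sketch computes naively.
It illustrates the inference rules themselves rather than
Marpa's recognizer, which processes Earley sets in order
and adds Leo memoization.
\begin{verbatim}
def earley_closure(rules, start, w):
    # Naive fixed point over the scanner, fuser and predictor.
    # rules: dict mapping a LHS symbol to a list of RHS tuples.
    # An item (lhs, rhs, pos, i, j) is an Earley item plus its
    # dot location j -- that is, a tree predicate.
    items = {("accept", (start,), 0, 0, 0)}  # the initial predicate
    changed = True
    while changed:
        new = set()
        for (lhs, rhs, pos, i, j) in items:
            nxt = rhs[pos] if pos < len(rhs) else None
            # Predictor: each rule for the postdot symbol
            # is predicted at j.
            for prhs in rules.get(nxt, ()):
                new.add((nxt, tuple(prhs), 0, j, j))
            # Scanner: the postdot symbol matches input symbol w[j].
            if nxt is not None and j < len(w) and w[j] == nxt:
                new.add((lhs, rhs, pos + 1, i, j + 1))
            # Fuser: combine this completion with each predecessor
            # whose dot location equals this item's origin.
            if nxt is None:
                for (l2, r2, p2, i2, j2) in items:
                    if j2 == i and p2 < len(r2) and r2[p2] == lhs:
                        new.add((l2, r2, p2 + 1, i2, j))
        changed = not new <= items
        items |= new
    return items
\end{verbatim}
The input is accepted if and only if the completed start item
\texttt{("accept", (start,), 1, 0, len(w))} is in the closure.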
\section{Leo memoization}
\label{s:leo-memoization}
To
deal with right recursion in linear time,
Marpa memoizes certain Earley item completions,
using the technique of
\cite{Leo1991}.
A Leo item (LIM) is a triple
\begin{equation*}
[ \Vdr{top}, \Vsym{transition}, \Vorig{top} ]
\end{equation*}
where \Vsym{transition} is the transition symbol.
Like EIM's,
each LIM is associated with a specific Earley set.
When a LIM is associated with an Earley set \Ves{i},
that LIM is said to be ``in'' \Ves{i}.
\Ves{i} is also said to ``contain'' the LIM.
Let
\begin{gather*}
\Vrule{memo} = [ \Vsym{lhs} \de \Vsf{pre-final} \cat \Vsym{transition} ] \\
\Vrule{cause} = [ \Vsym{transition} \de \Vsf{cause-rhs} ]
\end{gather*}
be rules in the grammar.
Let \Vdr{summit} be a dotted rule,
and let \Vorig{summit} and \Vorig{memo} be two locations,
where
$\Vorig{summit} \le \Vorig{memo}$.
\Vrule{memo}, \Vrule{cause} and
the rule of \Vdr{summit} may or may not be distinct.
We say that an Earley item is {\bf Leo-memoized} at \Ves{i} if it is
\begin{equation*}
\bigl[ [ \Vsym{lhs} \de \Vsf{pre-final} \cat \Vsym{transition} \mydot ],
\Vorig{memo} \bigr],
\end{equation*}
where, for some location \Vloc{l},
Earley set \Ves{l} physically
contains the Leo item
\begin{equation*}
[ \Vdr{summit}, \Vsym{transition}, \Vorig{summit} ],
\end{equation*}
and physically contains the Earley item
\begin{equation*}
\Veim{predecessor} =
\bigl[ [ \Vsym{lhs} \de \Vsf{pre-final} \mydot \Vsym{transition} ],
\Vorig{memo} \bigr],
\end{equation*}
%
and
%
\begin{equation*}
\Veim{cause} =
\bigl[ [ \Vsym{transition} \de \Vsf{cause-rhs} \mydot ],
\Vorig{l} \bigr]
\end{equation*}
is either physically contained or memoized at \Ves{i}.
Quite often we will simply say that a Leo-memoized
Earley item is {\bf memoized}.
\Veim{predecessor} must be physically present in the Earley sets,
but \Veim{cause} may itself be Leo-memoized, so that memoized Earley items
form ``trails'', starting with a physical Earley item which is the
``trailhead'', and leading to a {\bf summit}.
All the Earley items on a Leo
trail are completions.
A detailed description of Leo memoization can be found in
\cite{Leo1991};
details of Marpa's implementation,
and proofs of correctness and of the complexity claims,
will follow.
Here we will make some comments as aids to the intuition.
The Leo memoization is only intended to memoize deterministic
Leo trails, something which the implementation guarantees.
This means that, in the above,
the choice of \Veim{predecessor} can be expected to be unique:
no other Earley item in \Ves{l} will have \Vsym{transition}
as its postdot symbol.
When this is not the case, the Leo item is not created.
Not creating a Leo item is always safe.
The omission of Leo items has no effect
on correctness -- they are purely memoizations for efficiency.
Leo's memoization was created in response to the problems
presented to Earley's algorithm by right recursion.
Right recursion presented time and space complexity problems
for Earley's original algorithm.
Whenever Earley's did not
know whether an Earley set was going to be the last in a right recursion,
it needed to physically track the potential completions in that set.
The length of these chains of completions grew linearly with the length of the
right recursion and, as a result, time and space for right recursion in
Earley's original algorithm was quadratic.
Leo's memoization lets the top and bottom of the Leo trail (which we call its
``summit'' and ``trailhead'', respectively)
stand in for the entire trail.
The full Leo trail can be reconstructed afterwards using the Leo items,
once the location where the right recursion ends is known.
Marpa performs this reconstruction during its evaluation phase.
The Leo memoization guarantees that, for any deterministic right recursion,
a small, constant number of Earley items and Leo items is sufficient, the
rest being memoized.
Leo's original algorithm did not restrict the use of Leo memoization to
right recursions -- it would also memoize any trails
involving rightmost non-nulling symbols, even those whose maximum length
was a fixed constant.
Leo memoization is not expensive, but it does have some cost, and experience
with the Marpa implementation led us to restrict use of Leo memoization
to those situations in which right recursions,
and therefore Leo trails of arbitrary length, were possible.
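In the same informal Python representation as before, and purely as
an aid to intuition, a Leo fusion can be sketched as follows; the
tuple layout of the LIM is ours, not Marpa's.
\begin{verbatim}
# A sketch of how a LIM lets one operation stand in for a whole
# Leo trail: the LIM carries the summit's dotted rule with the
# dot before the transition symbol, and a Leo fusion advances it
# in O(1), however long the trail.

def leo_fusion(lim, k):
    (top_lhs, top_rhs, top_dot), transition, top_origin = lim
    assert top_rhs[top_dot] == transition
    return ((top_lhs, top_rhs, top_dot + 1), top_origin, k)

# For the right recursion A -> 'a' A | 'a', a single LIM yields
# the summit completion [A -> a A ., 0, k] directly.
lim = (('A', ('a', 'A'), 1), 'A', 0)
assert leo_fusion(lim, 5) == (('A', ('a', 'A'), 2), 0, 5)
\end{verbatim}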
The set of Leo-memoized Earley items is {\bf not} disjoint from
the set of Earley items actually in the Earley sets --
an Earley item may be memoized even if it actually exists.
At points,
we will need to compare the Earley sets
produced by the different recognizers.
\EVtable{\alg{Recce}}{i} will be
the Earley set at \Vloc{i}
in the table of Earley sets of
the \alg{Recce} recognizer.
For example,
\EVtable{\Marpa}{j} will be Earley set \Vloc{j}
in \Marpa's table of Earley sets.
In contexts where it is clear which recognizer is
intended,
\Vtable{k}, or \Ves{k}, will symbolize Earley set \Vloc{k}
in that recognizer's table of Earley sets.
If \Ves{working} is an Earley set,
$\size{\Ves{working}}$ is the number of Earley items
in \Ves{working}.
\Rtablesize{\alg{Recce}} is the total number
of Earley items in all Earley sets for \alg{Recce},
\begin{equation*}
\Rtablesize{\alg{Recce}} =
\sum\limits_{\Vloc{i}=0}^{\size{\Cw}}
{\bigsize{\EVtable{\alg{Recce}}{i}}}.
\end{equation*}
For example,
\Rtablesize{\Marpa} is the total number
of Earley items in all the Earley sets of
a \Marpa{} parse.
Recall that
there was a unique acceptance symbol,
\Vsym{accept}, in \Cg{}.
The input \Cw{} is accepted if and only if,
for some \Vstr{rhs},
\begin{equation*}
\bigl[[\Vsym{accept} \de \Vstr{rhs} \mydot], 0\bigr] \in \bigEtable{\Vsize{\Cw}}.
\end{equation*}
\section{The Leo algorithm}
\label{s:leo}
In \cite{Leo1991}, Joop Leo presented a method for
dealing with right recursion in \On{} time.
Leo shows that,
with his modification, Earley's algorithm
is \On{} for all LR-regular grammars.
(LR-regular is LR where the lookahead
is of unbounded length, but is restricted to
distinguishing between regular languages.)
Summarizing Leo's method,
it consists of spotting potential right recursions
and memoizing them.
Leo restricts the memoization to situations where
the right recursion is unambiguous.
Potential right recursions are memoized by
Earley set, using what Leo called
``transitive items''.
In this \doc{} Leo's ``transitive items''
will be called Leo items,
and will be of type \type{LIM}.
In each Earley set, there is at most one Leo item per symbol.
A Leo item (LIM) is the triple
\begin{equation*}
[ \Vdr{top}, \Vsym{transition}, \Vorig{top} ]
\end{equation*}
where \Vsym{transition} is the transition symbol,
and
\begin{equation*}
\Veim{top} = [\Vdr{top}, \Vorig{top}]
\end{equation*}
is the Earley item to be added on fusions over
\Vsym{transition}.
Leo items memoize what would otherwise be sequences
of Earley items.
Leo items only memoize unambiguous (or
deterministic) sequences,
so that the top of the sequence can represent
the entire sequence --
the only role the other EIM's in the sequence
play in the parse is to derive the top EIM.
We will call these memoized sequences Leo sequences.
To guarantee that a Leo sequence is deterministic,
\Leo{} enforced \dfn{Leo uniqueness}.
Define containment of a dotted rule in an Earley set
of EIM's as
\begin{equation*}
\begin{split}
& \mymathop{Contains}(\Ves{i}, \Vdr{d}) \defined \exists \, \Veim{b}, \Vorig{j} \mid \\
& \qquad \Veim{b} = [ \Vdr{d}, \Vorig{j} ]
\land \Veim{b} \in \Ves{i}.
\end{split}
\end{equation*}
A dotted rule \Vdr{d} is \dfn{Leo unique} in the Earley set
at \Ves{i}
if and only if
\begin{equation*}
\begin{split}
& \Penult{\Vdr{d}} \neq \Lambda \\
& \land \forall \, \Vdr{d2} \bigl( \mymathop{Contains}(\Ves{i}, \Vdr{d2}) \implies \\
& \qquad \Postdot{\Vdr{d}} = \Postdot{\Vdr{d2}} \implies \Vdr{d} = \Vdr{d2} \bigr).
\end{split}
\end{equation*}
If \Vdr{d} is Leo unique, then the symbol $\Postdot{\Vdr{d}}$ is
also said to be \dfn{Leo unique}.
In cases where a symbol \Vsym{transition} is Leo unique in \Ves{i},
we can speak of the dotted rule for \Vsym{transition},
and the rule for \Vsym{transition},
because there can be only one of each.
In the previous definitions,
it is important to emphasize that \Vdr{d2} ranges over all the dotted
rules of Earley set \Ves{i},
even those which are ineligible for Leo memoization.
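Again as an informal aid only, the Leo-uniqueness test can be
sketched in the same Python representation; \texttt{is\_penult}
stands in for the $\Penult{\Vdr{d}} \neq \Lambda$ test, which we
assume is precomputed from the grammar.
\begin{verbatim}
# A sketch of the Leo-uniqueness test.  The uniqueness check
# ranges over every dotted rule in the Earley set, whether or
# not it is eligible for Leo memoization.

def leo_unique(earley_set, d, is_penult):
    if not is_penult(d):
        return False
    (lhs, rhs, dot) = d
    postdot = rhs[dot]
    same_postdot = {dr for (dr, origin) in earley_set
                    if dr[2] < len(dr[1]) and dr[1][dr[2]] == postdot}
    return same_postdot == {d}   # d is the only such dotted rule

es = [(('A', ('a', 'B'), 1), 0), (('B', ('b',), 0), 1)]
assert leo_unique(es, ('A', ('a', 'B'), 1), lambda d: True)
\end{verbatim}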
Let \var{n} be the length of a Leo sequence.
In \Earley, each such sequence would be expanded in every
Earley set that is the origin of an EIM included in the
sequence, and the total number of EIM's would be
\order{\var{n}^2}.
With Leo memoization, a single EIM stands in for the sequence.
There are \Oc{} Leo items per Earley set,
so the cost of the sequence is \Oc{} per Earley set,
or \On{} for the entire sequence.
If, at evaluation time,
it is desirable to expand the Leo sequence,
only those items actually involved in the parse
need to be expanded.
All the EIM's of a potential right-recursion
will be in one Earley set and the number of EIM's
will be \On{},
so that even including expansion of the Leo sequence
for evaluation, the time and space complexity of
the sequence remains \On{}.
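As a concrete aid to intuition
(the example is ours, not Leo's),
consider the right recursion
\begin{equation*}
\Vsym{A} \de \texttt{a} \cat \Vsym{A} \sep \texttt{a}
\end{equation*}
with input $\texttt{a}^{\var{n}}$.
In \Earley, the Earley set at \Vloc{j} contains the completions
\begin{equation*}
[ \Vsym{A} \de \texttt{a} \cat \Vsym{A} \mydot, \Vorig{i} ]
\quad \text{for } 0 \le \var{i} < \var{j} \subtract 1,
\end{equation*}
because the parser cannot yet know whether the recursion ends at \Vloc{j}.
Summed over all Earley sets, these chains contain
$\order{\var{n}^2}$ EIM's.
With Leo memoization, each Earley set carries a single LIM for
\Vsym{A}, and only the summit of the trail is physically added.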
\begin{sloppypar}
Recall that we
call a dotted rule \Vdr{d} a \dfn{penult} if $\Penult{\var{d}} \neq \Lambda$.
In Leo's original algorithm, any penult
was treated as a potential right-recursion.
\Marpa{} applies the Leo memoizations in more restricted circumstances.
For \Marpa{} to consider a dotted rule
\begin{equation*}
\Vdr{candidate} = [\Vrule{candidate}, \var{i}]
\end{equation*}
for Leo memoization,
\Vdr{candidate} must be a penult and
\Vrule{candidate} must be right-recursive.
\end{sloppypar}
By restricting Leo memoization to right-recursive rules,
\Marpa{} incurs the cost of Leo memoization only in cases
where Leo sequences could be infinitely
long.
This more careful targeting of the memoization is for efficiency reasons.
If all penults are memoized,
many memoizations will be performed where
the longest potential Leo sequence is short,
and the payoff is therefore very limited.
Omission of a memoization does not affect correctness,
so \Marpa{}'s restriction of Leo memoization
preserves the correctness shown in Leo~\cite{Leo1991}.
Later in this \doc{} we will
show that this change also leaves
the complexity results of
Leo~\cite{Leo1991} intact.
\section{The Marpa Recognizer}
\label{s:pseudocode}
\subsection{Complexity}
Alongside the pseudocode of this section
are observations about its space and time complexity.
In what follows,
we will charge all time and space resources
to Earley items,
or to attempts to add Earley items.
We will show that,
to each Earley item actually added,
or to each attempt to add a duplicate Earley item,
we can charge amortized \Oc{} time and space.
At points, it will not be immediately
convenient to speak of
charging a resource
to an Earley item
or to an attempt to add a duplicate
Earley item.
In those circumstances,
we speak of charging time and space
\begin{itemize}
\item to the parse; or
\item to the Earley set; or
\item to the current procedure's caller.
\end{itemize}
We can charge time and space to the parse itself,
as long as the total time and space charged is \Oc{}.
Afterwards, this resource can be re-charged to
the initial Earley item, which is present in all parses.
Soft and hard failures of the recognizer use
worst-case \Oc{} resource,
and are charged to the parse.
We can charge resources to the Earley set,
as long as the time or space is \Oc{}.
Afterwards,
the resource charged to the Earley set can be
re-charged to an arbitrary member of the Earley set,
for example, the first.
If an Earley set is empty,
the parse must fail,
and the time can be charged to the parse.
In a procedure,
resource can be ``caller-included''.
Caller-included resource is not accounted for in
the current procedure,
but passed upward to the procedure's caller,
to be accounted for there.
A procedure to which caller-included resource is passed will
sometimes pass the resource upward to its own caller,
although of course the top-level procedure does not do this.
For each procedure, we will state whether
the time and space we are charging is inclusive or exclusive.
The exclusive time or space of a procedure is that
which it uses directly,
ignoring resource charges passed up from called procedures.
Inclusive time or space includes
resource passed upward to the
current procedure from called procedures.
Earley sets may be represented by \Ves{i},
where \var{i} is the Earley set's location \Vloc{i}.
The two notations should be regarded as interchangeable.
The actual implementation of either
should be the equivalent of a pointer to
a data structure containing,
at a minimum,
the Earley items,
the Leo items,
a memoization of the Earley set's location as an integer,
and a per-set-list.
Per-set-lists will be described in Section \ref{s:per-set-lists}.
\begin{algorithm}[ht]
\caption{Marpa Top-level}
\begin{algorithmic}[1]
\Procedure{Main}{}
\State \Call{Initial}{}
\For{ $\var{i}, 1 \le \var{i} \le \Vsize{w}$ }
\State \Comment At this point, $\Ves{x}$ is complete, for $0 \le \var{x} < \var{i}$
\State \Call{Scan pass}{$\var{i}, \var{w}[\var{i} \subtract 1]$}
\If{$\size{\Ves{i}} = 0$}
\State reject \Cw{} and return
\EndIf
\State \Call{Fusion pass}{\var{i}}
\State \Call{Prediction pass}{\var{i}}
\EndFor
\State \Call{Accept or Reject Logic}{}
\EndProcedure
\end{algorithmic}
\end{algorithm}
\subsection{Complexity of Marpa Top-level}
Exclusive time and space
for the loop over the Earley sets,
including any time passed up from the
\call{Scan pass}{},
\call{Fusion pass}{},
and \call{Prediction pass}{},
is charged to the Earley sets.
Overhead is charged to the parse.
All these resource charges are \Oc{}.
\subsection{Ruby Slippers parsing}
The Marpa parse engine is different from previous Earley
parse engines in separating
the \call{Scan pass}{}, on one hand, from
\call{Fusion pass}{},
and \call{Prediction pass}{}, on the other.
Because of this separation,
when the scanning of tokens that start at location \Vloc{i} begins,
the Earley sets for all locations prior to \Vloc{i} are complete.
As a result, there is a point at which the parser can pause,
after the Earley set at \Vloc{i} is completely built,
but before any tokens which begin at location \Vloc{i} have
been scanned.
An application may examine the state of the parse,
and may alter the input in response to what it finds.
\begin{algorithm}[ht]
\caption{Marpa Top-level}
\begin{algorithmic}[1]
\Procedure{Accept or Reject Logic}{}
\If{$[\Vdr{accept}, 0] \in \Etable{\Vsize{w}}$}
\State accept \Cw{}
\Else
\State reject \Cw{}
\EndIf
\EndProcedure
\end{algorithmic}
\end{algorithm}
\subsection{Complexity of Accept or Reject Logic}
The time and space complexity is clearly \Oc{},
which is caller-included.
The caller will charge this time and space
to the parse.
\begin{algorithm}[ht]
\caption{Initialization}
\begin{algorithmic}[1]
\Procedure{Initial}{}
\State \Call{Add EIM}{$0_{ES}, \Vdr{start}, 0$}
\State \Call{Prediction pass}{$0_{ES}$}
\EndProcedure
\end{algorithmic}
\end{algorithm}
\subsection{Initialization}
\label{p:initial-op}
Inclusive time and space is \Oc{}
and is charged to the parse.
\subsection{Prediction pass}
\label{p:prediction-op}
\begin{algorithm}[ht]
\caption{Prediction pass}
\label{a:prediction-pass}
\begin{algorithmic}[1]
\Procedure{Prediction pass}{\Vloc{i}}
\State Note: Let \var{stack} be a stack of rules.
\State Let \var{seen} be the set of rules which have already
\State \hspace{2.5em} been put on the stack.
\State Initially, \var{stack} and \var{seen} are empty,
\For{each Earley item $\Veim{work} \in \Vtable{i}$
}\label{a:prediction-pass-loop1}
\State $[\Vdr{work}, \Vloc{origin}] \gets \Veim{work}$
\State \Call{Predict from dotted rule}{\Vdr{work}, \var{stack}, \var{seen}}
\EndFor
\For{each \Vrule{predict} on the working stack}
\State Note: Where $\Vrule{predict} = \Vsym{lhs} \de \Vsf{rhs}$
\State $\Vdr{predict} \gets [ \Vsym{lhs} \de \mydot \Vsf{rhs} ]$
\State \Call{Add EIM}{\Ves{i}, \Vdr{predict}, \Vorig{i}}
\State \Call{Predict from dotted rule}{\Vdr{predict}, \var{stack}, \var{seen}}
\EndFor
\EndProcedure
\end{algorithmic}
\end{algorithm}
\begin{algorithm}[ht]
\caption{Push predictions onto stack}
\begin{algorithmic}[1]
\Procedure{Predict from dotted rule}{\Vdr{x}, \var{stack}, \var{seen}}
\State Let $\var{predictions}$ be all \Vrule{x} such that
\State \hspace{2.5em} the LHS of \Vrule{x} is $\Postdot{\Vdr{x}}$
\For{each $\Vrule{prediction} \in \var{predictions}$}
\If{$\Vrule{prediction} \notin \var{seen}$}
\State Push \Vrule{prediction} onto the working stack
\State Add \Vrule{prediction} to \var{seen}
\EndIf
\EndFor
\EndProcedure
\end{algorithmic}
\end{algorithm}
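As a cross-check on the pseudocode above, and only as an informal
sketch, the two loops can be rendered in Python;
\texttt{rules\_by\_lhs} and the list representation of the Earley
set are assumptions of the sketch, not Marpa's data structures.
\begin{verbatim}
# A sketch of the prediction pass as a worklist closure.  Each
# rule enters the stack at most once (the 'seen' set), so the
# work is O(1) per prediction attempted.

def prediction_pass(earley_set, i, rules_by_lhs):
    stack, seen = [], set()

    def predict_from(dr):
        lhs, rhs, dot = dr
        if dot < len(rhs):                    # dr has a postdot symbol
            for rule in rules_by_lhs.get(rhs[dot], ()):
                if rule not in seen:
                    seen.add(rule)
                    stack.append(rule)

    for dr, _origin in list(earley_set):      # first loop: existing items
        predict_from(dr)
    while stack:                              # second loop: the stack
        lhs, rhs = stack.pop()
        dr = (lhs, rhs, 0)                    # [lhs -> . rhs]
        if (dr, i) not in earley_set:         # stands in for Add EIM
            earley_set.append((dr, i))
        predict_from(dr)
\end{verbatim}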
\subsection{The prediction pass is correct}
\label{p:prediction-correct}
\begin{theorem}
Algorithm~\ref{a:prediction-pass} is correct.
\end{theorem}
\begin{proof}
Examination of Algorithm~\ref{a:prediction-pass} will show
that each Earley item it adds corresponds to a tree
predicate that meets the criteria of
Theorem~\ref{l:eim-prediction-correctness}
for correctness.
\end{proof}
\subsection{The prediction pass is complete}
\label{p:prediction-complete}
\begin{theorem}
If called when Earley set \Vloc{j} is either complete
up to reductions,
or contains the start Earley item, then
Algorithm~\ref{a:prediction-pass} is complete.
\end{theorem}
\begin{proof}
The cause of a prediction is either the start Earley item,
another prediction,
or a reduction.
From
Theorem~\ref{l:eim-prediction-correctness},
we see that a complete set of predictions can
be found by examining all possible pairs of
Earley items (causes) and rules.
From
Theorem~\ref{l:eim-prediction-correctness},
we can also see that
the cause of a prediction must be an Earley item that
is valid in the
same Earley set,
and that the rule must have, as its LHS,
the postdot symbol of the cause.
For this proof, we will call a prediction ``direct'' if
its cause is the initial Earley item or a reduction.
We will call a prediction ``indirect'' if it is not a direct
prediction.
We can see directly that
the cause of an indirect prediction must be a prediction
which is not the initial Earley item.
The loop near line \ref{a:prediction-pass-loop1}
in Algorithm \ref{a:prediction-pass}
can be seen to
examine, and use as causes, all Earley items already in
Earley set \Vloc{j}, and as rules,
all rules whose LHS is the postdot symbol of one of those Earley items.
By assumption this includes the start item, if it is correct,
as well as any valid reductions.
This shows that
Algorithm~\ref{a:prediction-pass} will leave
the Earley set \var{j} complete up to all direct predictions.
To show that Algorithm~\ref{a:prediction-pass} leaves
Earley set \var{j} complete up to indirect predictions,
we first note that, if a sequence of predictions leads to a cycle,
it will be terminated at the first prediction that has
already been placed on the working stack.
This means that all sequences will be finite.
It can be seen that, once a rule has been put on the
working stack,
placing it on the stack again will add no new Earley item
predictions, so that it is safe to
terminate a cycle at the first repetition.
From this, we see that all sequences of indirect predictions
will be finite.
An induction on the length of the sequences,
with the direct predictions as the base,
will show that
Algorithm~\ref{a:prediction-pass} adds all indirect predictions.
We have seen that Algorithm~\ref{a:prediction-pass}
leaves Earley set \var{j} complete up to both direct and indirect
predictions.
Therefore Algorithm~\ref{a:prediction-pass}
leaves Earley set \var{j} complete up to predictions.
\end{proof}
\subsection{The scan pass}
\label{p:scan-op}
\var{transitions} is a set of tables, one per Earley set.
The tables in the set are indexed by symbol.
Symbol indexing is \Oc{}, since the number of symbols
is a constant, but
since the number of Earley sets grows with
the length of the parse,
it cannot be assumed that Earley sets can be indexed by location
in \Oc{} time.
For the operation $\var{transitions}(\Vloc{l}, \Vsym{s})$
to be in \Oc{} time,
\Vloc{l} must represent a link directly to the Earley set.
In the case of scanning,
the lookup is always in the previous Earley set,
which can easily be tracked in \Oc{} space
and retrieved in \Oc{} time.
Inclusive time and space can be charged to the
\Veim{predecessor}.
Overhead is charged to the Earley set at \Vloc{i}.
\begin{algorithm}[ht]
\caption{Marpa Scan pass}
\begin{algorithmic}[1]
\Procedure{Scan pass}{$\Vloc{i},\Vsym{a}$}
\State Note: Each pass through this loop is an EIM attempt
\For{each $\Veim{predecessor} \in \var{transitions}((\var{i} \subtract 1),\var{a})$}
\State $[\Vdr{from}, \Vloc{origin}] \gets \Veim{predecessor}$
\State $\Vdr{to} \gets \GOTO(\Vdr{from}, \Vsym{a})$
\State \Call{Add EIM}{$\Ves{i}, \Vdr{to}, \Vloc{origin}$}
\EndFor
\EndProcedure
\end{algorithmic}
\end{algorithm}
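The following Python fragment, an informal sketch only, shows the
shape of the scan pass over such a \var{transitions} table; the
names and the list representation are ours.
\begin{verbatim}
# A sketch of the scan pass.  transitions[i-1] maps a symbol to
# the EIMs of Earley set i-1 having it as postdot symbol.

def scan_pass(tables, transitions, i, token):
    for (lhs, rhs, dot), origin in transitions[i - 1].get(token, ()):
        advanced = (lhs, rhs, dot + 1)           # GOTO over the token
        if (advanced, origin) not in tables[i]:  # stands in for Add EIM
            tables[i].append((advanced, origin))
\end{verbatim}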
\subsection{Correctness of the scan pass}
\label{p:scan-correct}
\subsection{Completeness of the scan pass}
\label{p:scan-complete}
\begin{algorithm}[ht]
\caption{Fusion pass}
\begin{algorithmic}[1]
\Procedure{Fusion pass}{\Vloc{i}}
\State Note: \Vtable{i} may include EIM's added
\State \hspace{2.5em} during passes of the loop
\State \hspace{2.5em} by \Call{Fuse one LHS}{}.
\State \hspace{2.5em} The loop must traverse these.
\For{each Earley item $\Veim{work} \in \Vtable{i}$}
\State $[\Vdr{work}, \Vloc{origin}] \gets \Veim{work}$
\State $\Vsymset{lh-sides} \gets$ a set containing the LHS
\State \hspace\algorithmicindent of every completed rule in \Veim{work}
\For{each $\Vsym{lhs} \in \Vsymset{lh-sides}$}
\State \Call{Fuse one LHS}{\Vloc{i}, \Vloc{origin}, \Vsym{lhs}}
\EndFor
\EndFor
\State \Call{Memoize transitions}{\Vloc{i}}
\EndProcedure
\end{algorithmic}
\end{algorithm}
\subsection{Fusion pass}
The loop over \Vtable{i} must also include
any items added by \call{Fuse one LHS}{}.
This can be done by implementing \Vtable{i} as an ordered
set and adding new items at the end.
Exclusive time is clearly \Oc{} per
\Veim{work},
and is charged to the \Veim{work}.
Additionally,
some of the time required by
\call{Fuse one LHS}{} is caller-included,
and therefore charged to this procedure.
Inclusive time from \call{Fuse one LHS}{}
is \Oc{} per call,
as will be seen in section \ref{p:fuse-one-lhs},
and is charged to the \Veim{work}
that is current
during that call to \call{Fuse one LHS}{}.
Overhead may be charged to the Earley set at \Vloc{i}.
\begin{algorithm}[ht]
\caption{Memoize transitions}
\begin{algorithmic}[1]
\Procedure{Memoize transitions}{\Vloc{i}}
\For{every \Vsym{postdot}, a postdot symbol of $\Ves{i}$}
\State Note: \Vsym{postdot} is ``Leo eligible" if it is
\State \hspace\algorithmicindent Leo unique and its rule is right recursive
\If{\Vsym{postdot} is Leo eligible}
\State Set $\var{transitions}(\Vloc{i},\Vsym{postdot})$
\State \hspace\algorithmicindent to a LIM
\Else
\State Set $\var{transitions}(\Vloc{i},\Vsym{postdot})$
\State \hspace\algorithmicindent to the set of EIM's that have
\State \hspace\algorithmicindent \Vsym{postdot} as their postdot symbol
\EndIf
\EndFor
\EndProcedure
\end{algorithmic}
\end{algorithm}
\subsection{Memoize transitions}
The \var{transitions} table for \Ves{i}
is built once all EIMs have been
added to \Ves{i}.
We first look at the resource,
excluding the processing of Leo items.
The non-Leo processing can be done in
a single pass over \Ves{i},
in \Oc{} time per EIM.
Inclusive time and space are charged to the
Earley items being examined.
Overhead is charged to \Ves{i}.
We now look at the resource used in the Leo processing.
A transition symbol \Vsym{transition}
is Leo eligible if it is Leo unique
and its rule is right recursive.
(If \Vsym{transition} is Leo unique in \Ves{i}, it will be the
postdot symbol of only one dotted rule in \Ves{i}.)
All but one of the determinations needed to decide
if \Vsym{transition} is Leo eligible can be precomputed
from the grammar,
and the resource to do this is charged to the parse.
The precomputation, for example,
for every rule, determines if it is right recursive.
One part of the test for
Leo eligibility cannot be done as a precomputation.
This is the determination whether there is only one dotted
rule in \Ves{i} whose postdot symbol is
\Vsym{transition}.
This can be done
in a single pass over the EIM's of \Ves{i}
that notes the postdot symbols as they are encountered
and whether any is encountered twice.
The time and space,
including that for the creation of a LIM if necessary,
will be \Oc{} time per EIM examined,
and can be charged to the EIM being examined.
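A minimal Python sketch of this single pass follows, as an informal
aid only; \texttt{right\_recursive} (the set of rules precomputed
from the grammar to be right-recursive) and \texttt{make\_lim} are
assumptions of the sketch, not Marpa's interfaces.
\begin{verbatim}
# A sketch of building the transitions table for a finished
# Earley set, deciding Leo eligibility along the way.  The
# sketch assumes a single predecessor EIM when a LIM is built.

def memoize_transitions(earley_set, right_recursive, make_lim):
    by_postdot = {}
    for eim in earley_set:
        (lhs, rhs, dot), origin = eim
        if dot < len(rhs):
            by_postdot.setdefault(rhs[dot], []).append(eim)
    table = {}
    for postdot, eims in by_postdot.items():
        dotted_rules = {dr for dr, _origin in eims}
        (lhs, rhs, dot) = next(iter(dotted_rules))
        leo_eligible = (len(dotted_rules) == 1       # Leo unique
                        and dot == len(rhs) - 1      # a penult
                        and (lhs, rhs) in right_recursive)
        table[postdot] = make_lim(eims[0]) if leo_eligible else eims
    return table
\end{verbatim}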
\begin{algorithm}[ht]
\caption{Fuse one LHS symbol}
\begin{algorithmic}[1]
\Procedure{Fuse one LHS}{\Vloc{i}, \Vloc{origin}, \Vsym{lhs}}
\State Note: If the transitions contain a LIM, that LIM is unique,
\State \hspace{2.5em} and we do not look at any of the EIM's.
\If{$\exists \Vlim{lim}, \var{lim} \in \var{transitions}(\Vloc{origin},\Vsym{lhs})$}
\State Perform a \Call{Leo fusion operation}{}
\State \hspace\algorithmicindent for operands \Vloc{i}, \Vlim{lim}
\State return
\EndIf
\State Note: Each pass through this loop is an EIM attempt
\For{each $\var{eim} \in \var{transitions}(\Vloc{origin},\Vsym{lhs})$}
\State Perform an \Call{Earley fusion operation}{}
\State \hspace\algorithmicindent for operands \Vloc{i}, \Veim{eim}, \Vsym{lhs}
\EndFor
\EndProcedure
\end{algorithmic}
\end{algorithm}
\subsection{Fuse one LHS}
\label{p:fuse-one-lhs}
To show that
\begin{equation*}
\var{transitions}(\Vloc{origin},\Vsym{lhs})
\end{equation*}
can be traversed in \Oc{} time,
we note
that the number of symbols is a constant
and assume that \Vloc{origin} is implemented
as a link back to the Earley set,
rather than as an integer index.
This requires that \Veim{work}
in \call{Fusion pass}{}
carry a link
back to its origin.
As implemented, Marpa's
Earley items have such links.
Inclusive time
for the loop over the EIM attempts
is charged to each EIM attempt.
Overhead is \Oc{} and caller-included.
\begin{algorithm}[ht]
\caption{Earley fusion operation}
\begin{algorithmic}[1]
\Procedure{Earley fusion operation}{\Vloc{i}, \Veim{from}, \Vsym{trans}}
\State $[\Vdr{from}, \Vloc{origin}] \gets \Veim{from}$
\State $\Vdr{to} \gets \GOTO(\Vdr{from}, \Vsym{trans})$
\State \Call{Add EIM}{\Ves{i}, \Vdr{to}, \Vloc{origin}}
\EndProcedure
\end{algorithmic}
\end{algorithm}
\subsection{Earley Fusion operation}
\label{p:fusion-op}
\begin{sloppypar}
Exclusive time and space is clearly \Oc{}.
\call{Earley fusion operation}{} is always
called as part of an EIM attempt,
and inclusive time and space is charged to the EIM
attempt.
\end{sloppypar}
\subsection{Correctness of Earley fusion}
\label{p:earley-fusion-correct}
\subsection{Completeness of Earley fusion}
\label{p:earley-fusion-complete}
\begin{algorithm}[ht]
\caption{Leo fusion operation}
\begin{algorithmic}[1]
\Procedure{Leo fusion operation}{\Vloc{i}, \Vlim{from}}
\State $[\Vdr{from}, \Vsym{trans}, \Vloc{origin}] \gets \Vlim{from}$
\State $\Vdr{to} \gets \GOTO(\Vdr{from}, \Vsym{trans})$
\State \Call{Add EIM}{\Ves{i}, \Vdr{to}, \Vloc{origin}}
\EndProcedure
\end{algorithmic}
\end{algorithm}
\subsection{Leo fusion operation}
\label{p:leo-op}
Exclusive time and space is clearly \Oc{}.
\call{Leo fusion operation}{} is always
called as part of an EIM attempt,
and inclusive time and space is charged to the EIM
attempt.
\begin{algorithm}[ht]
\caption{Add EIM}\label{a:pair}
\begin{algorithmic}[1]
\Procedure{Add EIM}{$\Ves{i},\Vdr{confirmed}, \Vloc{origin}$}
\State $\Veim{confirmed} \gets [\Vdr{confirmed}, \Vloc{origin}]$
\If{\Veim{confirmed} is new}
\State Add \Veim{confirmed} to \Vtable{i}
\EndIf
\EndProcedure
\end{algorithmic}
\end{algorithm}
\subsection{Correctness of Leo fusion}
\label{p:leo-fusion-correct}
\subsection{Completeness of Leo fusion}
\label{p:leo-fusion-complete}
\subsection{Correctness of fusion}
\label{p:fusion-correct}
\subsection{Completeness of fusion}
\label{p:fusion-complete}
\subsection{Adding an Earley item}
\label{p:add-eim}
This operation adds a confirmed EIM.
Inclusive time and space is charged to the
calling procedure.
Trivially, the space is \Oc{} per call.
We show that time is also \Oc{}
by singling out the two non-trivial cases:
checking that an Earley item is new,
and adding it to the Earley set.
\Marpa{} checks whether an Earley item is new
in \Oc{} time
by using a data structure called a PSL.
PSL's are the subject of Section \ref{s:per-set-lists}.
An Earley item can be added to the current
set in \Oc{} time
if the Earley set is implemented as a linked
list, to the head of which the new Earley item is added.
The resource used by \call{Add EIM}{}
is always caller-included.
\subsection{Per-set lists}
\label{s:per-set-lists}
In the general case,
where \var{x} is an arbitrary datum,
it is not possible
to use the duple $[\Ves{i}, \var{x}]$
as a search key and expect the search to use
\Oc{} time.
Within \Marpa, however, there are specific cases
where it is desirable to do exactly that.
This is accomplished by
taking advantage of special properties of the search.
If it can be arranged that there is
a link direct to the Earley set \Ves{i},
and that $0 \leq \var{x} < \var{c}$,
where \var{c} is a constant of reasonable size,
then a search can be made in \Oc{} time,
using a data structure called a PSL.
Data structures identical to or very similar to PSL's are
briefly outlined in both
\cite[p. 97]{Earley1970} and
\cite[Vol. 1, pages 326-327]{AU1972}.
But neither source gives them a name.
The term PSL
(``per-Earley set list'')
is new
with this \doc{}.
A PSL is a fixed-length array of
integers, indexed by an integer,
and kept as part of each Earley set.
While \Marpa{} is building a new Earley set,
\Ves{j},
the PSL for every previous Earley set, \Vloc{i},
tracks the Earley items in \Ves{j} that have \Vloc{i}
as their origin.
The maximum number of Earley items that must be tracked
in each PSL is
the number of dotted rules,
\Vsize{\Cdr},
which is a constant of reasonable size
that depends on \Cg{}.
It would take more than \Oc{} time
to clear and rebuild the PSL's each time
that a new Earley set is started.
This overhead is avoided by ``time-stamping'' each PSL
entry with the Earley set
that was current when that PSL
entry was last updated.
As before,
where \Ves{i} is an Earley set,
let \Vloc{i} be its location,
and vice versa.
\Vloc{i} is an integer which is
assigned as Earley sets are created.
Let $\ID{\Vdr{x}}$ be an integer
uniquely identifying a dotted rule.
Numbering the dotted rules from 0 on up as rules are added to
the grammar
is an easy way to create a practical choice of $\ID{\Vdr{x}}$.
Let $\PSL{\Ves{x}}{\var{y}}$
be the entry for integer \var{y} in the PSL in
the Earley set at \Vloc{x}.
Consider the case where Marpa is building \Ves{j}
and wants to check whether Earley item \Veim{x} is new,
where $\Veim{x} = [ \Vdr{x}, \Vorig{x} ]$.
To check if \Veim{x} is new,
Marpa checks
\begin{equation*}
\var{time-stamp} = \PSL{\Vorig{x}}{\ID{\Vdr{x}}}
\end{equation*}
If the entry has never been used,
we assume that $\var{time-stamp} = \Lambda$.
If $\var{time-stamp} \ne \Lambda \land \var{time-stamp} = \Vloc{j}$,
then \Veim{x} is not new,
and will not be added to the Earley set.
If $\var{time-stamp} = \Lambda \lor \var{time-stamp} \ne \Vloc{j}$,
then \Veim{x} is new.
\Veim{x} is added to the Earley set,
and a new time-stamp is set, as follows:
\begin{equation*}
\PSL{\Vorig{x}}{\ID{\Vdr{x}}} \gets \Vloc{j}.
\end{equation*}
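To make the time-stamping concrete, here is a small Python sketch,
ours and not Marpa's implementation, of an Earley set carrying a
PSL and of the \Oc{} duplicate check.
\begin{verbatim}
# A sketch of PSL-based duplicate checking.  None is used in
# Lambda's role, for a PSL entry that has never been set.

class EarleySet:
    def __init__(self, loc, dotted_rule_count):
        self.loc = loc                          # memoized location
        self.items = []
        self.psl = [None] * dotted_rule_count

def add_if_new(origin_set, current_set, dr_id, item):
    # O(1): 'item' is a duplicate iff the origin set's PSL entry
    # for its dotted rule was stamped while building current_set.
    if origin_set.psl[dr_id] == current_set.loc:
        return False                            # not new
    origin_set.psl[dr_id] = current_set.loc     # re-stamp
    current_set.items.append(item)
    return True
\end{verbatim}
Because each stamp is compared against the current location, stale
entries left over from earlier Earley sets are harmless, and the
PSL arrays never need clearing.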
\subsection{Complexity summary}
For convenience, we collect and summarize here
some of the observations of this section.
\begin{myobservation}
The time and space charged to an Earley item
which is actually added to the Earley sets
is \Oc{}.
\end{myobservation}
\begin{myobservation}
The time charged to an attempt
to add a duplicate Earley item to the Earley sets
is \Oc{}.
\end{myobservation}
For evaluation purposes, \Marpa{} adds a link to
each EIM that records each attempt to
add that EIM,
whether originally or as a duplicate.
Traditionally, complexity results treat parsers
as recognizers, and such costs are ignored.
This will be an issue when the space complexity
for unambiguous grammars is considered.
\begin{myobservation}
The space charged to an attempt
to add a duplicate Earley item to the Earley sets
is \Oc{} if links are included,
zero otherwise.
\end{myobservation}
As noted in Section \ref{p:add-eim},
the time and space used by predicted Earley items
and attempts to add them is charged elsewhere.
\begin{myobservation}
No space or time is charged to predicted Earley items,
or to attempts to add predicted Earley items.
\end{myobservation}
\section{Marpa is correct}
\label{s:correct}
\subsection{Nulling symbols}
\label{s:nulling}
Recall that Marpa grammars,
without loss of generality,
contain neither empty rules nor
properly nullable symbols.
This corresponds directly
to a grammar rewrite in the \Marpa{} implementation,
and its reversal during \Marpa's evaluation phase.
For the correctness and complexity proofs in this \doc{},
we assume an additional rewrite,
this time to eliminate nulling symbols.
Elimination of nulling symbols is also
without loss of generality, as can be seen
if we assume that a history
of the rewrite is kept,
and that the rewrite is reversed
after the parse.
Clearly, whether a grammar \Cg{} accepts
an input \Cw{}
will not depend on the nulling symbols in its rules.
\subsection{Marpa's Earley sets grow at worst linearly}
\begin{theorem}\label{t:es-count}
For a context-free grammar,
and a parse location \Vloc{i},
\begin{equation*}
\textup{
$\bigsize{\EVtable{\Marpa}{i}} = \order{\var{i}}$.
}
\end{equation*}
\end{theorem}
\begin{proof}
EIM's have the form $[\Vdr{x}, \Vorig{x}]$.
\Vorig{x} is the origin of the EIM,
which in Marpa cannot be after the current
Earley set at \Vloc{i},
so that
\begin{equation*}
0 \le \Vorig{x} \le \Vloc{i}.
\end{equation*}
The possibilities for \Vdr{x} are finite,
since the number of dotted rules is a constant,
$\size{\Cdr}$,
which depends on \Cg{}.
Since duplicate EIM's are never added to an Earley set,
the maximum size of Earley set \Vloc{i} is therefore
\begin{equation*}
\Vloc{i} \times \size{\Cdr} = \order{\Vloc{i}}.\qedhere
\end{equation*}
\end{proof}
\subsection{Marpa's Earley sets are correct}
\begin{theorem}\label{t:table-correct}
Marpa's Earley sets are correct.
\end{theorem}
Proof omitted.\qedsymbol
\subsection{Marpa is correct}
We are now in a position to show that Marpa is correct.
\begin{theorem}
\textup{ $\myL{\Marpa,\Cg} = \myL{\Cg}$ }
\end{theorem}
\begin{proof}
From Theorem \ref{t:table-correct},
we know that
\begin{equation*}
[\Vdr{accept},0] \in \EVtable{\Leo}{\Vsize{w}}
\end{equation*}
if and only if
\begin{equation*}
[\Vdr{accept},0] \in \EVtable{\Marpa}{\Vsize{w}}.
\end{equation*}
From the acceptance criteria in the \Leo{} definitions
and the \Marpa{} pseudocode,
it follows that
\begin{equation*}
\myL{\Marpa,\Cg} = \myL{\Leo,\Cg}.
\end{equation*}
By Theorem 4.1 in \cite{Leo1991}, we know that
\begin{equation*}
\myL{\Leo,\Cg} = \myL{\Cg}.
\end{equation*}
The theorem follows from
the previous two equalities.
\end{proof}
\section{Complexity preliminaries}
\label{s:complexity}
In the following we extend the notation for sentential forms to
allow location to be indicated.
Specifically, to indicate in the sentential form
\begin{equation*}
\Vsf{a} \cat \Vsf{b} \mydot \Vsf{c}
\end{equation*}
that \Vloc{x} is between \Vsf{a} and \Vsf{b},
and that the dot is at location \var{y},
we will write
\begin{equation*}
\Vsf{a} \; [\var{x}] \; \Vsf{b} \; [\var{y}] \; \Vsf{c}.
\end{equation*}
In this section
we will examine a Marpa internal grammar \var{g}
for an arbitrary partial input,
$\Vstr{seen} = \Cwrange{0}{\var{j} \subtract 1}$.
We assume that $\var{j} > 0$, but otherwise
the choice of \var{seen} and \Vloc{j}
is without loss of generality.
We will assume that \var{g} is $LR(\pi)$,
where $\pi$ is a finite left congruence.
We will be concerned with one cell of the partition,
\var{B}, $\var{B} \in \pi$.
\var{B} is chosen arbitrarily and without loss of generality.
\begin{theorem}
\label{t:leo-lemma}
For all
\Vsf{prefix1}, \Vsf{prefix2},
$\Vstr{unseen1} \in \var{B}$, and $\Vstr{unseen2} \in \var{B}$, let
\begin{gather}
\label{e:leo-lemma-derivation-shorter}
\begin{split}
\Vsym{S} & \deRstar \Vsf{prefix1} \cat \Vstr{unseen1} \\
& \deRstar \Vstr{seen} \cat \Vstr{unseen1},
\end{split} \\
\label{e:leo-lemma-derivation-longer}
\begin{split}
\Vsym{S} & \deRstar \Vsf{prefix2} \cat \Vstr{unseen2} \\
& \deRstar \Vstr{seen} \cat \Vstr{unseen2}
\end{split}
\end{gather}
Then, either
\begin{gather}
\label{e:leo-lemma-conclusion-1}
\var{prefix1} \deRstar \var{prefix2} \quad \text{or} \\
\label{e:leo-lemma-conclusion-2}
\var{prefix2} \deRstar \var{prefix1}.
\end{gather}
$\blacklozenge$
\end{theorem}
Joop Leo proved this using a reduction in an unpublished addendum to \cite{Leo1991}.
The following is a proof by induction.
\begin{proof}
We number the direct steps of each right derivation,
from last to first, so
that
\begin{equation}
\label{leo-lemma-step-1-0}
\Vstr{seen} \diamond \Vstr{unseen1}
\end{equation}
is step 0 of the derivation for \Vsf{prefix1}.
In the following, we will be comparing steps, but we will only be interested in the
part of the sentential forms before \Vloc{j}.
We call \var{j} the ``threshold''.
For convenience we will mark \var{j} with a diamond.
Without loss of generality, we assume that
the derivation for \Vsf{prefix2} is longer than that of \Vsf{prefix1},
so that what we want to show is
\begin{align}
\label{leo-lemma-step-2-S} \Vsym{S} & \deRstar \Vsf{prefix2} \diamond \Vstr{unseen2} \\
\label{leo-lemma-step-2-n} & \deRstar \Vsf{prefix1} \diamond \Vstr{unseen2} \\
\label{leo-lemma-step-2-0} & \deRstar \Vstr{seen} \diamond \Vstr{unseen2},
\end{align}
which we do by induction.
As the basis of the induction, we compare
the pre-threshold portion of
Step \eqref{leo-lemma-step-1-0} with
\eqref{leo-lemma-step-2-0},
and note they are identical.
To show the step of the induction, assume
that the pre-threshold portions of step \var{n}
in derivation \ref{e:leo-lemma-derivation-longer}
is identical
to the pre-threshold portions of step \var{n}
in derivation \ref{e:leo-lemma-derivation-shorter}.
Without loss of generality, we show the direct derivation
from step $\var{n}+1$ to step \var{n}
for the two derivations as
\begin{gather}
\Vsf{before1} \cat \Vsym{lhs1} \cat \Vstr{after1} \diamond \Vstr{unseen1} \\
\label{e:handle-1} \deR \Vsf{before1} \cat \Vsf{rhs1} \cat \Vstr{after1} \diamond \Vstr{unseen1}
\end{gather}
and
\begin{gather}
\Vsf{before2} \cat \Vsym{lhs2} \cat \Vstr{after2} \diamond \Vstr{unseen2} \\
\label{e:handle-2} \deR \Vsf{before2} \cat \Vsf{rhs2} \cat \Vstr{after2} \diamond \Vstr{unseen2}
\end{gather}
where from the assumption for the step we know that
\begin{multline}
\label{e:leo-lemma-induction-step-assumption}
\Vsf{before1} \cat \Vsf{rhs1} \; [\var{end1}] \; \Vstr{after1} \\
= \Vsf{before2} \cat \Vsf{rhs2} \; [\var{end2}] \; \Vstr{after2}.
\end{multline}
In \eqref{e:leo-lemma-induction-step-assumption}
\Vloc{end1} and \Vloc{end2} indicate where the ends of the next handle
must be found in their respective derivation.
We know the two sentential forms
in \eqref{e:leo-lemma-induction-step-assumption}
are equal, but we need to show that $\var{end1} = \var{end2}$.
To do this, we proceed by reductio.
We assume for the reductio that
\begin{equation}
\label{e:leo-lemma-2}
\var{end1} \neq \var{end2}.
\end{equation}
Assumption \eqref{e:leo-lemma-2} means that
$\Vstr{after1} \neq \Vstr{after2}$.
Without loss of generality, we assume that
$\size{\Vstr{after1}} > \size{\Vstr{after2}}$.
We assumed that $\var{unseen1} \in \var{B}$
and $\var{unseen2} \in \var{B}$,
so that
\begin{equation}
\label{e:leo-lemma-unseen-mod-pi-assumption}
\var{unseen1} \equiv \var{unseen2} \pmod{\pi}.
\end{equation}
By assumption,
$\pi$ is a left congruence, so that from
\eqref{e:leo-lemma-unseen-mod-pi-assumption} it follows that
\begin{gather}
\label{e:leo-lemma-1}
\Vstr{after2} \cat \Vstr{unseen1} \equiv
\Vstr{after2} \cat \Vstr{unseen2} \pmod{\pi},
\end{gather}
so that both strings are in the same cell of the partition, $\pi$.
By the LR-property, if there is a handle just before
$\Vstr{after2} \cat \Vstr{unseen1}$ in \eqref{e:leo-lemma-1},
there must also be a handle just before
$\Vstr{after2} \cat \Vstr{unseen2}$ in \eqref{e:leo-lemma-1}.
Since \var{end1} and \var{end2} are the handle locations,
that means that
$\var{end1} = \var{end2}$.
But this contradicts
\eqref{e:leo-lemma-2}, our assumption for the reductio.
So, in fact,
\begin{equation}
\label{e:leo-lemma-ends-equal}
\Vloc{end1} = \Vloc{end2}.
\end{equation}
From
\eqref{e:leo-lemma-induction-step-assumption}
and
\eqref{e:leo-lemma-ends-equal}, we have
\begin{equation}
\label{e:leo-lemma-after-eq}
\Vstr{after1} = \Vstr{after2}.
\end{equation}
By the LR-property, we also know that the productions
of the two handles will be identical;
that is, that
\begin{gather}
\label{e:leo-lemma-lhs-eq} \Vsym{lhs1} = \Vsym{lhs2} \quad \text{and} \\
\label{e:leo-lemma-rhs-eq} \Vsf{rhs1} = \Vsf{rhs2}.
\end{gather}
From
\eqref{e:leo-lemma-induction-step-assumption},
\eqref{e:leo-lemma-ends-equal},
\eqref{e:leo-lemma-lhs-eq}
and \eqref{e:leo-lemma-rhs-eq},
we have
\begin{equation}
\label{e:leo-lemma-before-eq}
\Vsf{before1} = \Vsf{before2}.
\end{equation}
From
\eqref{e:leo-lemma-before-eq},
\eqref{e:leo-lemma-lhs-eq}
and \eqref{e:leo-lemma-after-eq},
we can show that
\begin{equation}
\Vsf{before1} \cat \Vsym{lhs1} \cat \Vstr{after1} =
\Vsf{before2} \cat \Vsym{lhs2} \cat \Vstr{after2},
\end{equation}
which shows the step of the induction,
and completes the induction.
By the induction, we have
\eqref{leo-lemma-step-2-S} and
\eqref{leo-lemma-step-2-n},
so that
\begin{equation*}
\Vsf{prefix2} \diamond \Vstr{unseen2} \deRstar
\Vsf{prefix1} \diamond \Vstr{unseen2}
\end{equation*}
and therefore
\begin{equation*}
\Vsf{prefix2} \deRstar \Vsf{prefix1}.
\end{equation*}
\end{proof}
\begin{theorem}
\label{t:completions-per-cell}
The number of completed tree predicates at \var{j}
for \var{g}, \var{B}, \Cwrange{0}{\var{j}\subtract 1},
excluding those in right recursions,
is less than or equal to $\size{\Crules}$, the number of rules in \var{g}.
\myendTheorem
\end{theorem}
\begin{proof}
Assume, for a reductio, that there are
$\size{\Crules} + 1$ completed tree predicates,
and that no two recurse.
Then at least two of them will share the same dotted rule.
Call their shared dotted rule, $[ \Vsym{lhs} \de \Vsf{rhs} \mydot ]$.
By assumption they share the same current location, \Vloc{j},
and therefore differ only in their origin.
Call the two origins, \Vloc{i} and \typedPrime{i}{'}{loc}, so that
the two tree predicates are of the form
\begin{gather}
[ [ \var{lhs} \de \var{rhs} \mydot ], \var{i}, \var{j} ] \\
[ [ \var{lhs} \de \var{rhs} \mydot ], \var{i}', \var{j} ].
\end{gather}
Then by the definition of tree predicates,
there will be the two rightmost derivation steps
\begin{gather}
\label{e:non-recursive-completion-step-1}
\Vsf{before1} \cat \var{lhs} \mydot \Cwrange{\var{j}}{\var{n}} \quad \text{and} \\
\label{e:non-recursive-completion-step-2}
\Vsf{before2} \cat \var{lhs} \mydot \Cwrange{\var{j}}{\var{n}}.
\end{gather}
By theorem \ref{t:leo-lemma}, we have
\begin{equation}
\label{e:non-recursive-recursion}
\Vsf{before1} \cat \var{lhs} \mydot \Cwrange{\var{j}}{\var{n}} \deRstar
\Vsf{before2} \cat \var{lhs} \mydot \Cwrange{\var{j}}{\var{n}}.
\end{equation}
From \eqref{e:non-recursive-recursion}, we see that the pair
\eqref{e:non-recursive-completion-step-1} and
\eqref{e:non-recursive-completion-step-2},
must recurse.
This is contrary to the assumption for the reductio,
and completes the reductio.
To show the theorem, we note that any recursion of tree predicates
is on the predot symbol,
and therefore any recursion of completed tree predicates
must be a right recursion.
\end{proof}
We say a rightmost derivation step is \dfn{factored at location \var{i}}
if \Vloc{i} is not location 0 and there is a symbol of the
derivation with input position \var{i}.
Note that since the input position is always before a symbol,
a derivation step is never factored at
the last input location.
A sentence of $\myL{\Cg}$ is factored at every possible factoring location.
Each reduction of a derivation step
may eliminate one or more of the factoring locations.
A reduction will never add a new factoring location.
When we have reduced back to the derivation which consists
of only the start symbol, \Vsym{S},
there are no factoring locations.
\begin{mydefinition}
\label{d:last-prefix}
The
most reduced derivation factored at \var{i}
is the last
derivation step in a series of rightmost
reductions that is factored at \var{i}.
It is also called the \dfn{last derivation factored at \var{i}}.
The least reduced derivation not factored at \var{i} is the first
derivation step in a series of reductions that is not factored at \var{i}.
It is also called the \dfn{first derivation not factored at \var{i}}.
The \dfn{last prefix at \var{i}} is the sentential form that is
the portion of the last derivation factored at \var{i}
at locations 0 through $\var{i}\subtract 1$, inclusive.
\end{mydefinition}
Less formally, the last prefix at \var{i}
is the prefix of the rightmost
derivation that is as reduced as possible
without location \var{i} being swallowed up by a symbol.
Once that happens, the prefix of location \var{i} is no
longer well defined.
\begin{theorem}
\label{t:last-prefix-unique}
For \var{g}, \var{B}, \var{j} and \Cwrange{0}{\var{j}\subtract 1},
the last prefix at \var{j} is unique.
\myendTheorem
\end{theorem}
\begin{proof}
Assume, for a reductio, that there are two last prefixes at \var{j}.
For convenience, we will mark the position of \var{j} with a diamond.
Call the two prefixes \Vsf{prefix1}
and \Vsf{prefix2}.
Then,
where $\Vstr{seen} = \Cwrange{0}{\var{j}\subtract 1}$,
$\Vstr{unseen1} \in \var{B}$ and
$\Vstr{unseen2} \in \var{B}$,
we have
\begin{gather}
\begin{split}
\var{S} & \deRstar \var{prefix1} \diamond \var{unseen1} \\
& \deRstar \Vstr{seen} \diamond \var{unseen1} \quad \text{and}
\end{split} \\
\begin{split}
\var{S} & \deRstar \var{prefix2} \diamond \var{unseen2} \\
& \deRstar \var{seen} \diamond \var{unseen2}
\end{split}
\end{gather}
But by Theorem \ref{t:leo-lemma},
this means one of \Vsf{prefix1} and
\Vsf{prefix2} derives the other.
The last prefix at \var{j} cannot be derived from another
prefix factored at \var{j}, so this contradicts the assumption
for the reductio.
\end{proof}
\begin{theorem}
\label{t:medials-per-cell}
The number of medial tree predicates at \var{j}
for \var{g}, \var{B}, \Cwrange{0}{\var{j}\subtract 1}
is less than or equal to the number of
medial dotted rules in \var{g}.
\myendTheorem
\end{theorem}
\begin{proof}
We say that a derivation factors internally at \var{j} if it is of
the form
\begin{multline}
\label{e:medial-derivation-1}
\Vsf{before} \cat \Vsym{A} \cat \Vsf{after} \\
\de \var{before} \cat \Vsf{rhs1} \; [\Vloc{j}] \; \Vsf{rhs2} \cat \var{after} \\
\text{where} \quad \var{rhs1} \neq \epsilon, \var{rhs2} \neq \epsilon
\end{multline}
If any tree has a derivation that factors internally at \var{j}, it must
be unique to that tree.
Any reduction will see \var{j} as properly ``inside'' either \Vsym{A} or one
of \var{A}'s indirect parent symbols.
Any child derivation will either be from one of \var{before}, \var{rhs1}, \var{rhs2} or \var{after}.
Every location of \var{before} and \var{rhs1} is less than or equal to \var{j}.
Every location of \var{after} and \var{rhs2} is greater than or equal to \var{j}.
Therefore no symbol of a child derivation can properly contain \var{j}
and, as a further consequence,
no child derivation can factor internally at \var{j}.
Consider a medial tree predicate at \var{j} in \var{B}.
We can describe it, without loss of generality, as being of the form
\begin{equation}
\bigl[ [ \Vsym{A} \de \Vsf{rhs1} \mydot \Vsf{rhs2} ], \Vloc{i}, \Vloc{j} \bigr]
\end{equation}
By the definition of tree predicate, there will be a corresponding derivation
which we can describe, again without loss of generality,
as being of the form
shown in \eqref{e:medial-derivation-1}.
Let \var{c} be the number of medial dotted rules in \Cg.
Assume, for a reductio, that there are $\var{c} + 1$ medial tree predicates at \var{j}.
This means that at least two distinct tree predicates must share the same dotted rule.
Let the second one, without loss of generality, be of the form
\begin{equation}
\bigl[ [ \Vsym{A} \de \Vsf{rhs1} \mydot \Vsf{rhs2} ], \Vloc{i2}, \Vloc{j} \bigr]
\end{equation}
and let its corresponding derivation be
\begin{multline}
\label{e:medial-derivation-2}
\Vsf{before2} \cat \Vsym{A} \cat \Vsf{after2} \\
\de \var{before2} \; [\Vloc{i2}] \; \Vsf{rhs1} \; [\Vloc{j}] \; \Vsf{rhs2} \cat \var{after2}
\end{multline}
We see from
\eqref{e:medial-derivation-1} that $\var{before} \cat \var{rhs1}$ is
the most reduced prefix of \var{B} at \var{j} or,
as we have called it,
the last prefix at \var{j}.
From \eqref{e:medial-derivation-2} we see that
$\var{before2} \cat \var{rhs1}$ is also the last prefix at \var{j},
as defined in Definition \ref{d:last-prefix}.
Therefore,
by Theorem \ref{t:last-prefix-unique},
\begin{gather}
\var{before} \cat \var{rhs1} = \var{before2} \cat \var{rhs1}
\quad \text{and} \\
\label{e:before-medial-eq}
\var{before} = \var{before2}.
\end{gather}
For the two tree predicates to be distinct, we must have
$\var{i} \neq \var{i2}$ and,
given \eqref{e:before-medial-eq},
we know that
\begin{gather}
\var{before} \; [\var{i}] \; \var{rhs1} \destar \Cwrange{0}{\var{j}\subtract 1}
\quad \text{and} \\
\var{before} \; [\var{i2}] \; \var{rhs1} \destar \Cwrange{0}{\var{j}\subtract 1}
\end{gather}
so that $\var{before} \cat \var{rhs1}$ produces
two distinct factorings of \Cwrange{0}{\var{j}\subtract 1}.
If any substring of any derivation step of a reduced grammar allows distinct factorings,
that grammar is ambiguous.
\Cg{} is a Marpa internal grammar and therefore reduced,
so \Cg{} must be ambiguous.
But \Cg{}, by assumption, is LR, and therefore is not ambiguous.
So \Cg{} is both ambiguous and not ambiguous, which completes the reductio.
\end{proof}
\begin{mytheorem}
\label{t:lrr-medial-Oc}
The number of medial tree predicates at any location of an LRR parse
is \Oc.
\end{mytheorem}
\begin{proof}
An LRR parse, by definition, is an $LR(\pi)$ parse of finite index.
The partition has \Oc{} cells.
By Theorem \ref{t:medials-per-cell},
in an LRR parse,
the number of medial tree predicates
at each location
for each cell is \Oc.
The number of medial tree predicates is therefore $\Oc \times \Oc = \Oc$.
\end{proof}
\begin{mytheorem}
\label{t:lrr-completed-Oc}
In an LRR parse,
the number of
completed tree predicates at any location
is \Oc,
if right recursions are excluded.
\end{mytheorem}
\begin{proof}
An LRR parse, by definition, is an $LR(\pi)$ parse of finite index,
and therefore the partition has \Oc{} cells.
By Theorem
\ref{t:completions-per-cell}
in an LRR parse,
the number of completed tree predicates
at each location,
for each cell, is \Oc,
if right recursions are excluded.
The number of non-right-recursive
completed tree predicates at any location
is therefore $\Oc \times \Oc = \Oc$.
\end{proof}
\section{Linear complexity results}
\begin{mydefinition}
Let \var{lb-set} be
a set of tree predicate maps.
Let
\begin{equation}
\var{all-inputs} =
\bigcup_{\var{lb} \in \var{lb-set}}
{\left\lbrace \var{x} \;
\middle| \; \var{lb} :\var{x} \mapsto \var{y}
\right\rbrace}
\end{equation}
and
\begin{multline}
\var{all-predicates} = \\
\bigcup{
\bigl\lbrace \var{tp}
\; \bigm| \;
\var{tp} \in \var{y}
\; \land \; \var{lb} \in \var{lb-set}
\; \land \; \var{lb} : \var{x} \mapsto \var{y}
\bigr\rbrace
}.
\end{multline}
Then their union
is a new tree predicate mapping
which is defined as
\begin{equation*}
\bigcup_{\var{lb} \in \var{lb-set}}{\var{lb}(\var{w})}.
\end{equation*}
\end{mydefinition}
\begin{mydefinition}
\label{d:leo-bounded}
Let \var{tps} be a set of tree predicates,
and \var{T} the set of terminals it implies.
We call \var{tps} \dfn{Leo bounded}
if and only if
for every
\begin{equation*}
\Vtp{tp} \in \var{tps},
\quad \Vstr{w} \in \var{T} \quad \text{and}
\quad \Vloc{i}, 0 < \var{i} < (\size{\var{w}} \subtract 1),
\end{equation*}
all of the following are true:
\begin{gather}
\label{e:lb-tp-bounded}
\left|
\left\lbrace
\var{tp}
\; \middle| \;
\begin{aligned}
& \var{tp} \in \var{Valid}(\var{tps}, \var{w})[\var{i}] \\
& \land \quad \neg \var{RR}(\var{tps}, \var{tp})
\end{aligned}
\right\rbrace
\right|
= \Oc, \\
\label{e:lb-cost-bounded} \bigsize{ \var{Costed}(\var{tps}, \var{tp}) } = \Oc \quad \text{and} \\
\label{e:lb-rr-deterministic}
\var{RR}(\var{tps}, \var{tp}) \implies \bigsize{ \var{Costed}(\var{tps}, \var{tp}) } \le 1.
\end{gather}
We call a Marpa internal grammar \var{g}
\dfn{Leo bounded} if and only if
$\var{Valid}(\var{g})$ is Leo-bounded.
We call a set of sets of tree predicates, \var{tpss},
\dfn{Leo bounded}
if the union of its elements,
\begin{equation*}
\bigcup_{\var{tps} \in \var{tpss}}\var{tps},
\end{equation*}
is Leo bounded.
\end{mydefinition}
\begin{mytheorem}
Any subset of a Leo-bounded set of tree predicates is
Leo-bounded.
\end{mytheorem}
\begin{proof}
Examination of the definition of Leo bounded
(Definition \ref{d:leo-bounded})
will show that the conditions
\eqref{e:lb-tp-bounded},
\eqref{e:lb-cost-bounded},
and \eqref{e:lb-rr-deterministic}
all set maximums on the number of tree predicates.
Therefore removal of a tree predicate from a Leo-bounded predicate
set will not cause any of the remaining tree predicates to fail
the conditions.
\end{proof}
In the pseudo-code, it was shown that all time and
space is charged to
\begin{itemize}
\item an Earley set,
\item a medial tree predicate, or
\item a completed tree predicate which is
not in a deterministic right recursion.
\end{itemize}
\begin{definition}
We say that a tree predicate is \dfn{Marpa-charged}
if it is medial,
or if it is a completion which is not part
of a deterministic right recursion.
\end{definition}
\begin{mytheorem}
\label{t:leo-bounded-is-linear}
If \var{g} is Leo bounded,
Marpa runs in \On{} time and space,
where \var{n} is the input length.
\end{mytheorem}
\begin{proof}
Since \var{g} is Leo bounded,
by condition \eqref{e:lb-tp-bounded},
the number of Marpa-charged tree predicates
at any location is \Oc.
By condition \eqref{e:lb-cost-bounded},
the number of attempts to add each of them is \Oc.
In the pseudocode,
it was shown that all charges
of time and space are either to the Earley sets
or to attempts to add tree predicates.
Each Earley set is charged \Oc{} time and space.
Each add-attempt is also charged \Oc{} time and space.
For each Earley set the total time and space charged is therefore
$\Oc + \Oc \times \Oc \times \Oc = \Oc$.
By assumption,
there are \var{n} Earley sets,
so the total time and space charged by the parse is $\Oc \times \On = \On$.
\end{proof}
\begin{theorem}
\label{e:lrr-is-leo-bounded}
LRR is Leo-bounded,
and Marpa runs in \On{} time and space for all LRR grammars,
where \var{n} is the number of locations.
\end{theorem}
\begin{proof}
Tree predicates may be predictions,
completions or medial.
From the pseudo-code, we know that predictions are charged
to the Earley set, and the charge will be \Oc{} time and space.
By Theorem \ref{t:lrr-completed-Oc},
there are at most \Oc{} completions,
excluding right recursions.
Deterministic right recursions are not Marpa-charged.
Because LRR grammars are unambiguous, all of their right recursions
are deterministic, and therefore none of them are Marpa-charged.
So at most \Oc{} completions per location are Marpa-charged.
By Theorem \ref{t:lrr-medial-Oc},
there are at most \Oc{} medial tree predicates per location.
There are therefore $\Oc+\Oc=\Oc$ Marpa-charged tree predicates
per location.
From the pseudo-code, we know that each attempt to
add a Marpa-charged tree predicate is charged \Oc{} time
and space.
Because LRR parses are unambiguous, there will be
exactly one attempt to add each tree predicate,
so that the total charges for tree predicates come to
$1 \times \Oc \times \Oc = \Oc$ time and space.
Counting the charges for each Earley set and the charges for
each tree predicate, the charges for each location are
$\Oc+\Oc=\Oc$.
There are \On{} locations, so the total charges for the parse
are $\Oc \times \On = \On$.
\end{proof}
The theorems which follow in the remainder of this section show
\On{} time and space for classes of grammar beyond LRR.
These classes include many ambiguous grammars.
In the sequel we will often want to use tree predicates from one grammar
in another grammar, with input positions moved appropriately.
The following definition will be convenient.
\begin{definition}
Let \Vtp{tp} and \Vtp{tp2} be tree predicates,
where
\begin{equation}
\Vtp{tp} = [ \Vrule{r}, \Vorig{i}, \Vloc{j} ].
\end{equation}
Let \var{n} be an integer.
We say \var{tp2} is \var{tp} \dfn{offset} by \var{n},
\begin{equation}
\var{tp2} = \var{tp} + \var{n},
\end{equation}
if and only if
\begin{equation}
\var{tp2} = [ \Vrule{r}, \var{i}+\var{n}, \var{j}+\var{n}].
\end{equation}
\myendTheorem
\end{definition}
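For example, under this definition,
$[ \Vrule{r}, 2, 5 ] + 3 = [ \Vrule{r}, 5, 8 ]$:
both locations are moved by the offset,
while the rule is unchanged.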
\begin{definition}
Let \var{tp-set} and \var{tp-set2} be sets of tree predicates,
and \var{n} a non-negative integer.
\var{tp-set} is \var{tp-set2} \dfn{offset} by \var{n},
$\var{tp-set} = \var{tp-set2} + \var{n}$,
if and only if
\begin{equation}
\var{tp-set} = \left\lbrace
\Vtp{x} + \var{n} \middle| \var{x} \in \var{tp-set2}
\right\rbrace
\end{equation}
\end{definition}
\begin{theorem}
Let \var{n} be a non-negative integer,
let \var{tp-set} be the set of tree predicates produced by Marpa
for grammar \var{g}
with input \var{w},
and let \var{tp-set2} be $\var{tp-set} + \var{n}$.
For any tree predicate $\var{tp} \in \var{tp-set}$,
the number of attempts to add $\var{tp} + \var{n}$ to \var{tp-set2}
will be the same as the number of attempts to add \var{tp} to \var{tp-set}.
\myendTheorem
\end{theorem}
\begin{proof}
Let \var{tp} be
\begin{equation}
[ \Vdr{dr}, \Vorig{i}, \Vloc{j} ].
\end{equation}
If \var{dr} is a scanned dotted rule,
there will always be exactly one attempt to add it.
We next examine the most complicated case,
that of fused dotted rules.
The number of add attempts for a fused dotted rule
is equal to the number of valid completer proofs.
Without loss of generality, let
\begin{multline}
\label{e:fused-1}
\var{fused-tp} =
\bigl[ [ \Vsym{lhs} \de \Vsf{rhs1} \Vsym{transition} \mydot \Vsf{rhs2} ], \Vorig{i}, \Vloc{k} \bigr]
\end{multline}
be a fused tree predicate in \var{tp-set}.
Then
its completer proofs will have the form,
\begin{multline}
\label{e-completer-proof-1}
\bigl[ [ \Vsym{lhs} \de \Vsf{rhs1} \mydot \Vsym{transition} \Vsf{rhs2} ], \Vorig{i}, \Vloc{j} \bigr]
\quad \text{and} \\
\bigl[ [ \Vsym{transition} \de \Vsf{transition-rhs} ], \Vorig{j}, \Vloc{k} \bigr].
\end{multline}
From the definition of offset for tree predicates,
it follows that the tree predicates
\begin{multline}
\label{e-completer-proof-2}
\bigl[ [ \Vsym{lhs} \de \Vsf{rhs1} \mydot \Vsym{transition} \Vsf{rhs2} ],
\Vorig{i}+\var{n}, \Vloc{j}+\var{n} \bigr]
\quad \text{and} \\
\bigl[ [ \Vsym{transition} \de \Vsf{transition-rhs} ], \Vorig{j}+\var{n}, \Vloc{k}+\var{n} \bigr]
\end{multline}
will be in $\var{tp-set2} = \var{tp-set} + \var{n}$.
By the definition of completer proof,
\eqref{e-completer-proof-2} is a completer proof
of $\var{fused-tp} + \var{n}$.
% TODO -- Show the bijection?
Offset by \var{n} is a bijection between \var{tp-set} and \var{tp-set2},
so that it induces a bijection between the Cartesian products
$\var{tp-set} \times \var{tp-set}$ and
$\var{tp-set2} \times \var{tp-set2}$,
and therefore a bijection between the completer
proofs in \var{tp-set} and those in \var{tp-set2}.
Therefore the number of completer proofs for
\var{fused-tp} and $\var{fused-tp} + \var{n}$ must be the same.
Finally, we consider predicted tree predicates.
In Marpa, the number of attempts to add a predicted tree predicate
at \Vloc{j}
depends on the postdot symbols of the scanned and fused tree predicates at \Vloc{j}.
We see that
\begin{equation*}
\bigl[ [ \Vsym{lhs} \de \Vsf{rhs1} \mydot \Vsym{post-dot} \Vsf{rhs2} ], \Vorig{i}, \Vloc{j} \bigr] \in \var{tp-set}
\end{equation*}
if and only if
\begin{equation*}
\bigl[ [ \Vsym{lhs} \de \Vsf{rhs1} \mydot \Vsym{post-dot} \Vsf{rhs2} ], \Vorig{i}+\var{n}, \Vloc{j}+\var{n} \bigr] \in \var{tp-set2}
\end{equation*}
so that the number of attempts to add
\begin{equation*}
\bigl[ [ \Vsym{post-dot} \de \mydot \Vsf{predicted-rhs} ], \Vorig{j}, \Vloc{j} \bigr]
\end{equation*}
to \var{tp-set} will be the same as the number of attempts to add
\begin{equation*}
\bigl[ [ \Vsym{post-dot} \de \mydot \Vsf{predicted-rhs} ], \Vorig{j}+\var{n}, \Vloc{j}+\var{n} \bigr]
\end{equation*}
to \var{tp-set2}.
\end{proof}
\begin{definition}
Let \var{lb-set} be a set of Leo bounds.
We say that \var{lb-set} has \dfn{bounded overlap} if
for all \Vstr{w}, \Vloc{i},
\begin{equation}
\label{e:def-bounded-overlap}
\Bigl|
\bigl\lbrace \var{lb}
\bigm|
\var{lb} \in \var{lb-set} \land
\size{\var{lb}(\var{w},\var{i})} > 0
\bigr\rbrace
\Bigr| = \Oc
\end{equation}
\end{definition}
Less formally, a set of Leo bounds has bounded overlap if
there is a finite limit to the number of Leo bounds which have
tree predicates at any input location.
This concept is useful primarily where \var{lb-set} has transfinite
cardinality, since
it is trivially true that any set of finite cardinality is
a bounded overlap set.
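As an illustration, suppose, hypothetically,
that \var{lb-set} is the countably infinite family
$\set{ \Vtpm{lb}[\var{k}] \mid \var{k} \ge 0 }$,
where each $\Vtpm{lb}[\var{k}]$ has tree predicates only at
locations \var{k} and $\var{k}+1$.
Then, for any \Vstr{w} and \Vloc{i},
\begin{equation*}
\Bigl|
\bigl\lbrace \Vtpm{lb}[\var{k}]
\bigm|
\size{\Vtpm{lb}[\var{k}](\var{w},\var{i})} > 0
\bigr\rbrace
\Bigr| \le 2 = \Oc,
\end{equation*}
so that \var{lb-set} has bounded overlap,
even though its cardinality is not finite.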
To deal with unions of Leo bounds,
it is convenient to
single out
the issues around right recursion.
We therefore state and prove the following lemma,
which takes care of the other conditions.
\begin{lemma}
\label{t:leo_bound_union}
The union of a set of Leo bounds is a Leo bound,
if it has bounded overlap and if it does not introduce
non-determinism to existing right recursions.
\end{lemma}
\begin{proof}
Call the set of Leo bounds, \var{lb-set},
and call their union, \var{lb-union}.
By definition, \Vtpm{lb-union} is a tree predicate map.
We need to show that \var{lb-union} is also a Leo bound.
We do this by showing, by cases, that
the conditions of Definition \ref{d:leo-bounded} are preserved.
To show \eqref{e:lb-tp-bounded},
we consider
an arbitrary input \var{w} and an arbitrary location, \Vloc{i}.
Call the set of tree predicates at \Vloc{i} for \var{w},
\var{tp-i},
and call the set of Leo bounds which contribute tree predicates
at \Vloc{i},
\var{lb-i},
so that
\begin{gather}
\var{tp-i} = \bigcup_{\var{lb} \in \var{lb-set}} \Vtpm{lb}(\var{w},\var{i})
\quad \text{and} \\
\label{e:bounded-union-3}
\var{lb-i} = \bigl\lbrace
\var{lb} \in \var{lb-set}
\bigm|
\size{\Vtpm{lb}(\var{w},\var{i})} > 0
\bigr\rbrace.
\end{gather}
By the bounded overlap assumption,
\var{lb-i} is bounded;
in the notation of
\eqref{e:def-bounded-overlap},
\begin{equation}
\label{e:bounded-union-1}
\size{\var{lb-i}} = \Oc.
\end{equation}
Each element of \var{lb-i} is a Leo bound by definition,
and from condition \eqref{e:lb-tp-bounded} of Definition \ref{d:leo-bounded},
we know
\begin{equation}
\label{e:bounded-union-2}
\forall \var{lb} \in \var{lb-i}, \; \size{\var{lb}(\var{w},\var{i})} = \Oc.
\end{equation}
Using
\eqref{e:bounded-union-1}
and \eqref{e:bounded-union-2},
we see that
\begin{equation}
\label{e:bounded-union-4}
\sum_{\var{lb} \in \var{lb-i}}
{\bigl| \var{lb}(\var{w},\var{i}) \bigr| }
= \Oc \times \Oc.
\end{equation}
Since, by \eqref{e:bounded-union-3},
Leo bounds which are not elements of \var{lb-i}
contribute no tree predicates at \Vloc{i},
\begin{equation}
\size{\Vtpm{lb-union}(\var{w}, \var{i})} =
\sum_{\var{lb} \in \var{lb-i}}
{\bigl| \Vtpm{lb}(\var{w},\var{i}) \bigr| }.
\end{equation}
Using
\eqref{e:bounded-union-4},
\begin{equation}
\label{e:bounded-union-5}
\size{\Vtpm{lb-union}(\var{w}, \var{i})} = \Oc \times \Oc = \Oc.
\end{equation}
Because the choice of \Vstr{w} and \var{i} was arbitrary,
\eqref{e:bounded-union-5} shows
that condition \eqref{e:lb-tp-bounded}
of Definition \ref{d:leo-bounded}
is preserved in \Vtpm{lb-union}.
We now
show condition \eqref{e:lb-cost-bounded}
of Definition \ref{d:leo-bounded},
that the number of costed proofs for each tree predicate remains bounded.
To do this, we again
consider an arbitrary \Vloc{i}
and an arbitrary input \Vstr{w}.
This time we also consider an arbitrary tree predicate at \Vloc{i}, \Vtp{tp}.
Every costed proof is a duple of predecessor and cause.
To find the cardinality of the costed proofs for \var{tp},
we examine the possible values of each element of this duple.
A cause must be either a scanned terminal symbol
or another tree predicate at \Vloc{i}.
By \eqref{e:bounded-union-5},
the number of tree predicates at \var{i} is \Oc{},
so that the total number of possible causes is $\Oc + 1$.
Each cause of a costed proof uniquely determines a middle location,
call it \Vloc{mid}.
The predecessor of the costed proof must be a tree predicate at
\var{mid}.
By \eqref{e:bounded-union-5},
the number of tree predicates at \var{mid} is \Oc.
Since the number of causes is $\Oc + 1$ and the number of predecessors
for each cause is \Oc,
the number of costed proofs is
\begin{equation}
\label{e:bounded-union-6}
\var{Costed}(\Vtpm{lb-union}(\var{w}), \var{tp}) = \Oc \times (\Oc + 1) = \Oc.
\end{equation}
Because the choice of \Vstr{w} and \var{i} and \Vtp{tp} was arbitrary,
\eqref{e:bounded-union-6} shows
that condition \eqref{e:lb-cost-bounded}
of Definition \ref{d:leo-bounded}
is preserved in \Vtpm{lb-union}.
By the assumption of the lemma,
condition \eqref{e:lb-rr-deterministic}
of Definition \ref{d:leo-bounded} is preserved;
\eqref{e:bounded-union-5} showed
condition \eqref{e:lb-tp-bounded};
and \eqref{e:bounded-union-6}
showed condition \eqref{e:lb-cost-bounded}.
\var{lb-union} is therefore a Leo bound.
\end{proof}
\begin{theorem}
The union of a set of Leo bounds is a Leo bound,
if it has bounded overlap and the elements of the set
contain no right recursions.
\end{theorem}
\begin{proof}
Under the assumption,
condition \eqref{e:lb-rr-deterministic}
of Definition \ref{d:leo-bounded} holds vacuously,
and the result follows directly from
Lemma \ref{t:leo_bound_union}.
\end{proof}
\begin{theorem}
The union of a set of Leo bounds is a Leo bound,
if it has bounded overlap and the elements of the set
are symbolically disjoint.
\end{theorem}
\begin{proof}
Call the set of Leo bounds, \var{lb-set}.
New costed proofs would have to involve tree predicates
from two or more of the elements of \var{lb-set}.
This is clearly not possible if the elements
are symbolically disjoint.
If there are no new costed proofs,
condition \eqref{e:lb-rr-deterministic}
of Definition \ref{d:leo-bounded} will continue to hold.
The result follows directly from
Lemma \ref{t:leo_bound_union}.
\end{proof}
\begin{theorem}
The union of a set of Leo bounds is a Leo bound,
if the elements of the set
do not properly overlap.
\end{theorem}
\begin{proof}
Call the set of Leo bounds, \var{lb-set}.
By assumption, condition \eqref{e:lb-rr-deterministic}
of Definition \ref{d:leo-bounded} holds for the costed proofs
of each element of \var{lb-set}.
Any new costed proof must involve two or more elements
of \var{lb-set}.
In a Marpa internal grammar,
neither the cause nor the predecessor of a costed proof
is allowed to be zero-length, so that any new costed proof would have to
be of a tree predicate which properly overlaps
two or more of the elements of \var{lb-set}.
But, by assumption, none of them do.
So condition \eqref{e:lb-rr-deterministic}
of Definition \ref{d:leo-bounded} will continue to hold.
In addition,
since the tree predicates of the elements of \var{lb-set} cannot be
zero-length,
elements which do not properly overlap can overlap only
at a boundary between two elements of \var{lb-set}.
Therefore \var{lb-set} has bounded overlap, with a bound of 2.
Since the union does not introduce non-determinism to any existing
right recursive tree predicate,
and it has bounded overlap,
the result follows directly from
Lemma \ref{t:leo_bound_union}.
\end{proof}
\begin{mydefinition}[Grammar substitution]
\label{d:grammar-substitution}
\index{grammar substitution}
\index{substitution (of grammars)}
Let
\begin{equation*}
\var{g-top} = \bigl[
\var{V-top},
\var{N-top},
\var{P-top},
\var{S-top} \bigr],
\end{equation*}
be a grammar whose
terminal symbols are
\begin{equation*}
\Vsym{t}[1], \Vsym{t}[2], \dotsc, \Vsym{t}[\var{n}].
\end{equation*}
Let
\begin{equation}
\var{g}[\var{i}] = \bigl[
\var{V}[\var{i}],
\var{N}[\var{i}],
\var{P}[\var{i}],
\var{S}[\var{i}] \bigr],
1 \le \var{i} \le \var{n}
\end{equation}
be a series of grammars which are pairwise symbolically
disjoint, and symbolically disjoint from \var{g-top}.
The \dfn{substitution} of every
\begin{equation*}
\var{g}[\var{i}], 1 \le \var{i} \le \var{n}
\end{equation*}
for the terminal with the same index,
$\var{t}[\var{i}]$,
is the grammar
\begin{equation}
\var{g-subst} = \bigl[
\var{V-subst},
\var{N-subst},
\var{P-subst},
\Vsym{S-subst} \bigr],
\end{equation}
where \Vsym{S-subst} is a new start symbol and
\begin{gather}
\var{V-subst} = \set{ \var{S-subst} } \cup \var{V-top} \cup \bigcup_{1 \le \var{i} \le \var{n}} \var{V}[\var{i}], \\
\var{N-subst} = \set{ \var{S-subst} } \cup \var{N-top} \cup
\bigl\lbrace \var{t}[\var{i}] \bigm| 1 \le \var{i} \le \var{n} \bigr\rbrace \cup
\bigcup_{1 \le \var{i} \le \var{n}} \var{N}[\var{i}] \quad \text{and} \\
\var{P-subst} = \bigl\lbrace [ \Vsym{S-subst} \de \var{S-top} ] \bigr\rbrace \cup
\var{P-top} \cup \bigcup_{1 \le \var{i} \le \var{n}} \var{P}[\var{i}] \cup
\bigl\lbrace [ \var{t}[\var{i}] \de \var{S}[\var{i}] ] \bigm| 1 \le \var{i} \le \var{n} \bigr\rbrace.
\end{gather}
The new rules of the form $[ \var{t}[\var{i}] \de \var{S}[\var{i}] ]$
are called \dfn{shim rules}.
Note that the shim rules make each terminal $\var{t}[\var{i}]$
of \var{g-top} a non-terminal of \var{g-subst}.
\end{mydefinition}
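As a minimal illustration, with all symbols arbitrary,
let \var{g-top} have the single rule
$[ \Vsym{S-top} \de \var{t}[1] \, \var{t}[1] ]$,
so that its only terminal is $\var{t}[1]$,
and let $\var{g}[1]$ have the single rule
$[ \var{S}[1] \de \Vsym{c} ]$,
where \Vsym{c} does not occur in \var{g-top}.
The substitution of $\var{g}[1]$ for $\var{t}[1]$ then has the rules
\begin{equation*}
\var{P-subst} = \bigset{
[ \Vsym{S-subst} \de \Vsym{S-top} ],
[ \Vsym{S-top} \de \var{t}[1] \, \var{t}[1] ],
[ \var{t}[1] \de \var{S}[1] ],
[ \var{S}[1] \de \Vsym{c} ]
},
\end{equation*}
of which $[ \var{t}[1] \de \var{S}[1] ]$ is the shim rule,
and \var{g-subst} derives the single sentence $\Vsym{c}\,\Vsym{c}$.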
\begin{theorem}[Linear grammar substitution]
\label{t:linear-grammar-substitution}
\index{Linear grammar substitution theorem}
\index{Theorems!linear grammar substitution}
Let \var{g-top}, \var{g-subst} and
$\var{g}[1], \var{g}[2], \dotsc \var{g}[\var{n}]$
be as in Definition \ref{d:grammar-substitution}.
If \var{g-top} is Leo-bounded,
and the set
\begin{equation}
\bigl\lbrace \var{g}[\var{i}] \bigm| 1 \le \var{i} \le \var{n} \bigr\rbrace
\end{equation}
is Leo-bound compatible,
then \var{g-subst} is Leo-bounded.
\end{theorem}
\begin{proof}
\end{proof}
\begin{definition}
\label{d:grammar-union}
Let \var{g1} and \var{g2} be two grammars.
Rewrite each so that they do not share LHS symbols.
(LHS symbols which are equivalent,
in the sense that they derive exactly the same
set of sentential forms, need not be rewritten.)
Call the rewritten grammars,
\begin{gather*}
\var{g1}' = [ \var{V1}', \var{N1}', \var{rule1}', \var{S1}'] \quad \text{and} \\
\var{g2}' = [ \var{V2}', \var{N2}', \var{rule2}', \var{S2}'].
\end{gather*}
Let \Vsym{S} be a non-terminal which is new,
that is, $\var{S} \notin \var{V1}' \cup \var{V2}'$.
Let \var{\Crules} be
\begin{equation*}
\bigset{
[\Vsym{S} \de \var{S1}'],
[\Vsym{S} \de \var{S2}']}
\cup \var{rule1}' \cup \var{rule2}'
\end{equation*}
Let \var{V} be $\set{\Vsym{S}} \cup \var{V1}' \cup \var{V2}'$ and
let \var{N} be $\set{\Vsym{S}} \cup \var{N1}' \cup \var{N2}'$.
Then \var{g}, the \dfn{union} of \var{g1} and \var{g2} is
\begin{equation}
\var{g} = [\var{V}, \var{N}, \var{\Crules}, \var{S}].
\end{equation}
The \dfn{union} of a set of grammars is the closure of
the union of pairs from the set.
$\blacklozenge$
\end{definition}
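For a minimal illustration, with all symbols arbitrary,
let \var{g1} have start symbol \Vsym{A} and the single rule
$[ \Vsym{A} \de \Vsym{a} ]$,
and let \var{g2} have start symbol \Vsym{B} and the single rule
$[ \Vsym{B} \de \Vsym{b} ]$.
The two grammars share no LHS symbols,
so no rewriting is required,
and, with \Vsym{S} as the new start symbol,
the rules of the union are
\begin{equation*}
\bigset{
[ \Vsym{S} \de \Vsym{A} ],
[ \Vsym{S} \de \Vsym{B} ],
[ \Vsym{A} \de \Vsym{a} ],
[ \Vsym{B} \de \Vsym{b} ]
},
\end{equation*}
so that the union derives exactly the sentences of \var{g1} and \var{g2}.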
\begin{theorem}
\label{e:finite-union-is-leo-bounded}
The Leo-bounded grammars are closed under finite union.
\end{theorem}
\begin{proof}
Consider the union of $\var{z}+1$ grammars,
which we will call $\var{g}[0], \var{g}[1], \dotsc, \var{g}[\var{z}]$.
Let the rules added for the union be described as
\begin{gather}
\Vsym{S-union} \de \var{S}[\var{x}], \quad 0 \le \var{x} \le \var{z},
\end{gather}
where \var{S-union} is the new start symbol for the union
and $\var{S}[\var{x}]$ is the start symbol of grammar $\var{g}[\var{x}]$.
Let $\var{Valid}(\var{g}[\var{x}], \Vloc{i})$
be the set of valid tree predicates for
$\var{g}[\var{x}]$ at \Vloc{i}.
The set of tree predicates for the union
at \Vloc{i}, $\var{Valid}(\var{g-union}, \var{i})$, includes
\begin{equation}
\bigcup_{0 \le \var{x} \le \var{z}} \var{Valid}(\var{g}[\var{x}], \var{i}).
\end{equation}
To
$\var{Valid}(\var{g-union}, 0)$
we must add the predictions from the new start symbol
\begin{equation}
\Bigl\lbrace
\Bigl[ \bigl[ \var{S-union} \de \mydot \var{S}[\var{x}] \bigr], 0, 0 \Bigr]
\Bigm| 0 \le \var{x} \le \var{z}
\Bigr\rbrace.
\end{equation}
Finally, for every completion of one of the basis grammars,
\begin{equation}
\Bigl[ \bigl[ \var{S}[\var{x}] \de \Vsf{rhs} \mydot \bigr], 0, \var{i} \Bigr],
\end{equation}
we must add a completion of the union
\begin{equation}
\Bigl[ \bigl[ \var{S-union} \de \var{S}[\var{x}] \mydot \bigr], 0, \var{i} \Bigr].
\end{equation}
At most $\var{z}+1$ tree predicates will be added at any location,
and the added tree predicates meet the other conditions for preserving
Leo-boundedness.
\end{proof}
Concatenation also preserves Leo-boundedness,
with a condition: the grammars which are the basis
of the concatenation must have the prefix property.
But, if the grammars which are the basis of a series
of concatenations have the prefix property,
the concatenations will preserve it.
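As a reminder of what the prefix property requires:
no sentence of a grammar with the prefix property
is a proper prefix of another of its sentences.
For example, a grammar whose language is
$\set{ \var{a}, \var{ab} }$
lacks the prefix property,
since \var{a} is a proper prefix of \var{ab};
in a concatenation, such a grammar
could end at more than one location for the same input.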
\begin{theorem}
The Leo-bounded grammars with the prefix property
are closed under concatenation.
\end{theorem}
\begin{proof}
Consider the concatenation of $\var{z}+1$ grammars, each
of which has the prefix property.
Call the grammar operands $\var{g}[0], \var{g}[1], \dotsc, \var{g}[\var{z}]$,
and let $\var{S}[\var{x}]$ be the start symbol of grammar $\var{g}[\var{x}]$.
We will create a new grammar, \var{g-cat},
that concatenates them by adding
a new start symbol, \var{S-cat},
and a new start rule,
\begin{equation}
\Vsym{S-cat} \de \var{S}[0] \cat \var{S}[1] \dots \var{S}[\var{z}].
\end{equation}
Because the operand grammars have the prefix property,
for an arbitrary input \Cwrange{0}{\var{j}\subtract 1},
each operand grammar will have at most one start
and one end location.
(Our input may only allow recognition of part of the concatenation.)
Note that, while we have assumed that $\var{g}[\var{z}]$
has the prefix property, we do not use that assumption here.
Let \Vloc{current} be a location, $0 \le \var{current} \le \var{j}$.
Let \var{current-g} be the operand grammar at \var{current}.
If \Vloc{current} is the start of one grammar and the end of another,
\var{current-g} is the one that starts at \Vloc{current}.
Let \Vloc{offset} be the start position of \var{current-g}.
$\var{Valid}(\var{g-cat}, \var{current})$
includes all the tree predicates from the current operand grammar,
\begin{multline}
\bigl[
[ \Vsym{lhs} \de \Vsf{rhs1} \mydot \Vsf{rhs2} ],
\var{origin},
\var{current}
\bigr] \\
\in \var{Valid}(\var{current-g},\Vloc{current})
\end{multline}
offset appropriately:
\begin{equation}
\left[
\begin{aligned}
& [ \Vsym{lhs} \de \Vsf{rhs1} \mydot \Vsf{rhs2} ], \\
& \qquad \var{offset} + \var{origin}, \\
& \qquad \qquad \var{offset} + \var{current}
\end{aligned}
\right].
\end{equation}
We will call a tree predicate added to
$\var{Valid}(\var{g-cat})$ by offsetting tree predicates from
one of its operand grammars, an ``offset tree predicate''.
To the offset tree predicates, we must add some others,
which we will call ``original tree predicates''.
These will track the parse's progress through the start rule.
The original tree predicates will be
the prediction of
the concatenation's start rule,
\begin{equation}
\Bigl[ \bigl[
\Vsym{S-cat} \de \mydot \var{S}[0] \cat \var{S}[1] \dots \var{S}[\var{z}] \bigr],
0, 0 \Bigr],
\end{equation}
and, for every grammar, $\var{g}[\var{n}]$, where
$\var{end}[\var{n}]$ is its final location,
\begin{equation}
\Bigl[
\bigl[