% Skip to content
% Find file
% Fetching contributors…
% Cannot retrieve contributors at this time
% 19675 lines (18515 sloc) 542 KB
% Copyright 2015 Jeffrey Kegler
% This document is licensed under
% a Creative Commons Attribution-NoDerivs 3.0 United States License.
% \RequirePackage[l2tabu, orthodox]{nag}
\usepackage{amsfonts}% to get the \mathbb alphabet
% \usepackage[columns=1]{idxlayout}
% In "Writing a thesis with LATEX", Lapo F. Mori
% says anything but '[tb]' and '[p]' for floats
% is counterproductive. I switched over based on
% his suggestion, and he's absolutely right.
% for escaping in index commands ...
% the quote sign does not seem to work in amsmidx
% \makeindex{recce-figures}
% This is used to find the font size if need be
% its uses are usually commented out
\newcommand\thefontsize[1]{{#1 The current font size is: \f@size pt\par}}
% \DeclareMathSizes{6}{6}{6}{6}
% \DeclareMathSizes{8}{8}{8}{8}
% \DeclareMathSizes{10}{10}{10}{10}
% \DeclareMathSizes{10.95}{10.95}{10.95}{10.95}
% \DeclareMathSizes{12}{12}{12}{12}
% \DeclareMathSizes{14.4}{14.4}{14.4}{14.4}
% \DeclareMathSizes{20.74}{20.74}{20.74}{20.74}
% \DeclareMathSizes{24.88}{24.88}{24.88}{24.88}
\newcommand{\mysee}[2]{\emph{see} #1}
% \newcommand{\todo}[1]{\par{\large\textbf Todo: #1}\par}
% I use multi-character variable names, and TeX's usual math italic
% does not work well for them. Rather than get into changing the
% default math font, I use a special macro for variables.
% It also allows me to use hyphens in variable names.
% Many of the macros have two forms --
% '\xyz' and '\Vxyz'. The \Vxyz is the same
% as the \xyz form, except that it typesets its
% argument as a math variable in the style of this
% monograph.
% definitions for "elements" -- x[i]
% definitions for "operators" -- x(a, ...)
\myfnname{#1}^{\displaystyle #2}(#3)
% For a type name, when it occurs in text
% For defining a type
\index{recce-definitions}{#1 (type)}%
\index{recce-notation}{#1@\ensuremath{\var{x}_{#1}} (#1 type)}%
% \size{x}: |x|, the size (cardinality/length) of x, with auto-sized bars.
\newcommand{\size}[1]{\ensuremath{\left | {#1} \right |}}
% \bigsize{x}: as \size, but with fixed \big bars, which avoid the extra
% spacing that \left/\right insert.
\newcommand{\bigsize}[1]{\ensuremath{\bigl| {#1} \bigr|}}
% \order{x}: big-O notation, O(x), in calligraphic O.
\newcommand{\order}[1]{\ensuremath{{\mathcal O}(#1)}}
% I use hyphens in variable names,
% so I need to ensure that subtraction is
% clearly distinguished by the typography
\newcommand{\incr}[1]{\ensuremath{#1 + 1}}
% \newcommand{\lblarrow}[1]
% {\mathrel{\mbox{$\:\stackrel{\!{#1}}{\mapsto\!}\:$}}}
% \set{x}: { x } with auto-sized braces.
\newcommand{\set}[1]{{\left\lbrace #1 \right\rbrace} }
% \bigset{x} and \Bigset{x}: as \set, but with fixed \big / \Big braces.
\newcommand{\bigset}[1]{{\bigl\lbrace #1 \bigr\rbrace} }
\newcommand{\Bigset}[1]{{\Bigl\lbrace #1 \Bigr\rbrace} }
% \symset{x}: x subscripted with {SYM}, marking x as a set of symbols.
\newcommand{\symset}[1]{#1_{\lbrace SYM \rbrace} }
% \newcommand{\mkdelim}{\,\,}
% Marker helpers: wrap \mk / \Vmk with \mkdelim spacing on the right (r),
% left (l), or both sides (m).
% NOTE(review): the definition of \mkdelim on the line above appears to be
% commented out -- confirm \mkdelim is defined elsewhere in this file.
\newcommand{\mkr}[1]{\mkdelim \mk{#1}}
\newcommand{\mkl}[1]{\mk{#1} \mkdelim}
\newcommand{\mkm}[1]{\mkdelim \mk{#1} \mkdelim}
\newcommand{\Vmkl}[1]{\Vmk{#1} \mkdelim}
\newcommand{\Vmkr}[1]{\mkdelim \Vmk{#1}}
\newcommand{\Vmkm}[1]{\mkdelim \Vmk{#1} \mkdelim}
% I want a hash sign to be a 'last index' operator, but
% the regular hash sign is ungainly and not resizeable.
% This tames it a bit.
% \CW: blackboard-bold W.
\newcommand{\CW}{\ensuremath{\mathbb W}}
% \Maxleosilo{i}{j}: the Max-Leo-silo function applied to (i, j).
\newcommand{\Maxleosilo}[2]{\ensuremath{\myfnname{Max-Leo-silo}(#1, #2)}}
% \Rtablesize{i}: |table[i]|, the size of the i'th table entry,
% with fixed \big bars (see the comment on \bigsize above).
\newcommand\Rtablesize[1]{\ensuremath{\bigl| \myfnname{table}[#1] \bigr|}}
% I want to use 'call' outside of pseudocode
% I don't like to put whole paragraphs in italics,
% so I make this simple variation on the "plain" theoremstyle
{\normalfont} % BODYFONT
{0pt} % INDENT (empty value is the same as 0pt)
{\bfseries} % HEADFONT
{5pt plus 1pt minus 1pt} % HEADSPACE
% \newtheorem{oldtheorem}{Theorem}[chapter]
% \begin{center}
% \vspace{-.4\baselineskip}
% \rule{3em}{.5pt}
% \end{center}
% The definition/observation/construction environments wrap their ``bare''
% theorem-style counterparts. The empty center environment in each end
% code supplies vertical separation after the body; the separating rule
% itself is commented out above.
\newenvironment{definition}{ \begin{baredefinition}
}{ \begin{center}
\end{center} \end{baredefinition} }
\newenvironment{observation}{ \begin{bareobservation}
}{ \begin{center}
\end{center} \end{bareobservation} }
\newenvironment{construction}{ \begin{bareconstruction}
}{ \begin{center}
\end{center} \end{bareconstruction} }
\ifnum #1 < 10 0\fi%
\ifnum #1 < 100 0\fi%
\ifnum #1 < 1000 0\fi%
% My macros for definition & theorem titles and references
% The 'q' forms are 'quiet' -- they do not index
% Don't use definition titles for index -- use \xdfn instead.
\padnum{\theoldtheorem}@Definition \theoldtheorem. #1%
% \newcommand\dtitle[1]{\textbf{#1}:}
{(D\ref{#2} ``#1'')}%
% \newcommand\qdtitle[1]{\textbf{#1}:}
\padnum{\theoldtheorem}@Theorem \theoldtheorem. #1%
\padnum{\theoldtheorem}@Lemma \theoldtheorem. #1%
Algorithm \padnum{\thealgorithm}@Algorithm \thealgorithm: #1%
Figure \padnum{\thefigure}@Figure \thefigure: #1%
\hyphenation{oper-and oper-ands}
% I use parboxes in equations. This sets a useful width for them.
\vspace{-\prevdepth} % remove the depth of the last line
\vspace{1ex} % add a fixed vertical space
% \makeindex
\title{The Marpa book}
\author{Jeffrey Kegler}
Copyright \copyright\ 2015 Jeffrey Kegler.
This document is licensed under
a Creative Commons Attribution-NoDerivs 3.0 United States License.
\thanks{\textbf{This is a draft.}}
\thanks{Date: \today}
% \begin{abstract}
% Marpa is
% a practical and fully implemented
% algorithm for the recognition,
% parsing and evaluation of context-free grammars.
% The Marpa recognizer is the first
% practical implementation
% of the improvements
% to Earley's algorithm found in
% Joop Leo's%
% \index{recce-general}{Leo, Joop}
% 1991 paper.
% Marpa has a new parse engine that
% allows the user to alternate
% between Earley-style syntax-driven parsing,
% and procedural parsing of the type available
% with recursive descent.
% Marpa is left-eidetic so that,
% unlike recursive descent,
% Marpa's procedural logic has available
% full information about
% the state of the parse so far.
% The Marpa recognizer described
% is a simplification of that
% in our 2013 paper~\cite{Marpa-2013}.
% \end{abstract}
\chapter{Status of this paper}
This paper is a draft.
Please use the date at the bottom of the
first page
to make sure this is the latest revision.
If this revision is not the latest, please ignore it.
If this does seem to be the latest version,
and you are adventurous,
then read on.
Some chapters are in the ``advanced draft'' stage.
Advanced draft chapters are still subject to revision,
but the author hopes they are stable enough
to make comments and corrections useful.
Readers should note that changes in early draft chapters
sometimes require changes
to chapters whose content was thought to
be settled.
Therefore, it is possible that
even chapters in advanced draft status
will change dramatically.
TODO: When done with this revision, update chapters
Other chapters are in the \dfn{early draft} stage.
``Early draft'' means that the author's thoughts are not well settled,
and the chapters are likely to contain inconsistencies and errors.
Comments and corrections on early draft
chapters are not encouraged ---
the material may be already slated for deletion,
rewriting or rethinking.
The Marpa project was intended to create
a practical and highly available tool
to generate and use general context-free parsers.
Tools of this kind
had long existed
for LALR~\cite{Johnson} and
regular expressions.
But, despite an encouraging academic literature,
no such tool had existed for context-free parsing.
The first stable version of Marpa was uploaded to
a public archive on Solstice Day 2011.
This monograph describes the algorithm used
in the most recent version of Marpa.
It is a simplification of the algorithm presented
in our earlier paper~\cite{Marpa-2013}.
\section{A proven algorithm}
While the presentation in this monograph is theoretical,
the approach is practical.
The Marpa::R2 implementation has been widely available
for some time,
and has seen considerable use,
including in production environments.
Many of the ideas in the parsing literature
satisfy theoretical criteria,
but in practice turn out to face significant obstacles.
An algorithm may be as fast as reported, but may turn
out not to allow
adequate error reporting.
Or a modification may speed up the recognizer,
but require additional processing at evaluation time,
leaving no advantage to compensate for
the additional complexity.
In this monograph, we describe the Marpa
as it was implemented for Marpa::R2.
In many cases,
we believe there are better approaches than those we
have described.
But we treat these techniques,
however solid their theory,
as conjectures.
Whenever we mention a technique
that was not actually implemented in Marpa,
we will always explicitly state that
that technique is not in Marpa as implemented.
\subsection{General context-free parsing}
As implemented,
Marpa parses
all ``proper'' context-free grammars.
Proper context-free grammars are those which
are free of cycles,
unproductive symbols,
and inaccessible symbols.
Worst case time bounds are never worse than
those of Earley~\cite{Earley1970},
and therefore never worse than $\order{\var{n}^3}$.
\subsection{Linear time for practical grammars}
Currently, the grammars suitable for practical
use are thought to be a subset
of the deterministic context-free grammars.
Using a technique discovered by Joop Leo,
Marpa parses all of these in linear time.
Leo's modification of Earley's algorithm is
\On{} for LR-regular grammars.
Leo's modification
also parses many ambiguous grammars in linear time.
The original Earley algorithm kept full information
about the parse ---
including partial and fully
recognized rule instances ---
in its tables.
At every parse location,
before any symbols
are scanned,
Marpa's parse engine makes available
information about the state of the parse so far.
This information is
in useful form,
and can be accessed efficiently.
\subsection{Recoverable from read errors}
If Marpa reads a token which it cannot accept,
the error is fully recoverable.
An application can try to read another token.
The application can do this repeatedly
as long as none of the tokens are accepted.
Once the application provides
a token that is accepted by the parser,
parsing will continue
as if the unsuccessful read attempts had never been made.
\subsection{Ambiguous tokens}
Marpa allows ambiguous tokens.
These are often useful in natural language processing
where, for example,
the same word might be a verb or a noun.
Use of ambiguous tokens can be combined with
recovery from rejected tokens so that,
for example, an application could react to the
rejection of a token by reading two others.
\section{Using the features}
\subsection{Error reporting}
An obvious application of left-eideticism is error reporting.
Marpa's abilities in this respect are considerable. % NOTE(review): word(s) lost in extraction -- confirm original wording
For example,
users typically regard an ambiguity as an error
in the grammar.
Marpa, as currently implemented,
can detect an ambiguity and report
specifically where it occurred
and what the alternatives were.
\subsection{Event driven parsing}
As implemented,
Marpa allows the user to define ``events''.
Events can be defined that trigger when a specified rule is complete,
when a specified rule is predicted,
when a specified symbol is nulled,
when a user-specified lexeme has been scanned,
or when a user-specified lexeme is about to be scanned.
A mid-rule event can be defined by adding a nulling symbol
at the desired point in the rule,
and defining an event which triggers when the symbol is nulled.
\subsection{Ruby slippers parsing}
Left-eideticism, efficient error recovery,
and the event mechanism can be combined to allow
the application to change the input in response to
feedback from the parser.
In traditional parser practice,
error detection is an act of desperation.
In contrast,
Marpa's error detection is so painless
that it can be used as the foundation
of new parsing techniques.
For example,
if a token is rejected,
the lexer is free to create a new token
in the light of the parser's expectations.
This approach can be seen
as making the parser's
``wishes'' come true,
and we have called it
``Ruby Slippers Parsing''.
One use of the Ruby Slippers technique is to
parse with a clean
but oversimplified grammar,
programming the lexical analyzer to make up for the grammar's
shortcomings on the fly.
As part of Marpa::R2~\cite{Marpa-R2},
the author has implemented an HTML parser,
based on a grammar that assumes that all start
and end tags are present.
Such an HTML grammar is too simple even to describe perfectly
standard-conformant HTML,
but the lexical analyzer is
programmed to supply start and end tags as requested by the parser.
The result is a simple and cleanly designed parser
that parses very liberal HTML
and accepts all input files,
in the worst case
treating them as highly defective HTML.
\subsection{Ambiguity as a language design technique}
In current practice, ambiguity is avoided in language design.
This is very different from the practice in the languages humans choose
when communicating with each other.
Human languages exploit ambiguity in order to design highly flexible,
powerfully expressive languages.
For example,
the language of this monograph, English, is notoriously ambiguous.
Ambiguity of course can present a problem.
A sentence in an ambiguous
language may have undesired meanings.
But note that this is not a reason to ban potential ambiguity ---
it is only a problem with actual ambiguity.
Syntax errors, for example, are undesired, but nobody tries
to design languages to make syntax errors impossible.
A language in which every input was well-formed and meaningful
would be cumbersome and even dangerous:
all typos in such a language would be meaningful,
and the parser would never warn the user about errors, because
there would be no such thing.
With Marpa, ambiguity can be dealt with in the same way
that syntax errors are dealt with in current practice.
The language can be designed to be ambiguous,
but any actual ambiguity can be detected
and reported at parse time.
This exploits Marpa's ability
to report exactly where
and what the ambiguity is.
Marpa::R2's own parser description language, the SLIF,
uses ambiguity in this way.
\subsection{Auto-generated languages}
\cite[pp.~6--7]{Culik1973} points out that the ability
to efficiently parse LR-regular languages
opens the way to auto-generated languages.
In particular,
\cite{Culik1973} notes that a parser which
can parse any LR-regular language will be
able to parse a language generated using syntax macros.
\subsection{Second order languages}
In the literature, the term ``second order language''
is usually used to describe languages with features
which are useful for second-order programming.
True second-order languages --- languages which
are auto-generated
from other languages ---
have not been seen as practical,
since there was no guarantee that the auto-generated
language could be efficiently parsed.
With Marpa, this barrier is raised.
As an example,
Marpa::R2's own parser description language, the SLIF,
allows ``precedenced rules''.
Precedenced rules are specified in an extended BNF.
The BNF extensions allow precedence and associativity
to be specified for each RHS.
Marpa::R2's precedenced rules are implemented as
a true second order language.
The SLIF representation of the precedenced rule
is parsed to create a BNF grammar which is equivalent,
and which has the desired precedence.
To do this, the SLIF does a standard textbook transformation.
The transformation starts
with a set of rules,
each of which has a precedence and
an associativity specified.
The result of the transformation is a set of
rules in pure BNF.
The SLIF's advantage is that it is powered by Marpa,
and therefore the SLIF can be certain that the grammar
that it auto-generates will
parse in linear time.
Notationally, Marpa's precedenced rules
are an improvement over
similar features
in LALR-based parser generators like
yacc or bison.
In the SLIF,
there are two important differences.
First, in the SLIF's precedenced rules,
precedence is generalized, so that it does
not depend on the operators:
there is no need to identify operators,
much less class them as binary, unary, etc.
This more powerful and flexible precedence notation
allows the definition of multiple ternary operators,
and multiple operators with arity above three.
Second, and more important, a SLIF user is guaranteed
to get exactly the language that the precedenced rule specifies.
The user of the yacc equivalent must hope their
syntax falls within the limits of LALR.
\section{How to read this document}
TODO: When done with this revision, update chapters
Chapter \ref{ch:preliminaries} describes the notation and conventions
of this monograph.
Chapter \ref{ch:rewrite} deals with Marpa's
grammar rewrites.
The next three sections develop the ideas for Earley's algorithm.
Chapter \ref{ch:dotted} describes dotted rules.
Chapter \ref{ch:earley-items} describes Earley items.
Chapter \ref{ch:tethers} introduces tethers,
which are chains of top-down causation.
Chapter \ref{ch:earley-tables} describes
the remaining ideas behind
basic Earley implementations,
Earley sets
and Earley tables.
Chapter \ref{ch:silos} introduces silos.
Like tethers, silos are chains of causation,
but unlike tethers, in silos the causation is
largely bottom-up.
Chapter \ref{ch:leo} describes Leo's modification
to Earley's algorithm.
Chapter \ref{ch:pseudocode} presents the pseudocode
for Marpa's recognizer.
Chapter \ref{ch:proofs} % NOTE(review): chapter reference lost in extraction -- confirm label
contains a proof of Marpa's correctness.
Chapter \ref{ch:complexity} sets out our
time and space complexity results.
Because of its practical applications,
we expect this monograph to be of interest to many
who do not ordinarily read documents with this
level of mathematical apparatus.
For those readers, we offer some suggestions
which will be well known to our more mathematical readers.
In most fields, texts are intended to be,
and often are, read through to the end,
starting at page one.
A monograph of this kind is rarely
read that way.
Even the most mathematically sophisticated
reader will skip most or all
of the proofs
on a first reading.
And a mathematically inclined
reader will usually
not read a proof line-by-line
unless and until her previous readings
have convinced her that the proof
is of sufficient
interest to deserve this kind of attention.
This is not to say that we think
that the proofs are unimportant.
The proofs explore how our ideas
and claims
are connected to each other.
There is an aesthetic satisfaction in
this deeper level of knowledge.
And the proofs
increase our
assurance that our claims are, in fact, true.
But the proofs are also of practical use,
even to the programmer who is willing to
take our word for everything in these pages.
If, when coding,
the programmer only knows ``what'' and ``how'',
he will find it hard to keep all of this material
in his mind at once.
If the programmer knows,
not just ``what'' and ``how'',
but also ``why'',
he will understand the connections among
these ideas.
When the programmer
understands ``why'',
the book is always open in front of him,
turned to whatever page it is that
he needs.
We expect
most readers of this monograph
to have a practical bent.
For those readers,
one way to start is to read the preliminary
material for as long as it seems relevant,
skipping the lemmas,
as well as the proofs for both lemmas and theorems.
When impatience for something ``closer to
the metal'' arises,
this reader should
jump ahead
to the pseudocode in
Chapter \ref{ch:pseudocode}.
We assume familiarity with the theory of parsing.
Earley's algorithm is described in full,
but previous familiarity will be helpful.
This monograph will
use subscripts to indicate commonly occurring types.
$\var{X}_T$ & The variable \var{X} of type \type{T} \\
$\var{set-one}_\set{T}$ & The variable \var{set-one} of type set of \type{T} \\
\type{SYM} & The type for a symbol \\
\type{STR} & The type for a string \\
\type{EIM} & The type for an Earley item \\
\sym{\var{a}} & A variable \var{a} of type \type{SYM} \\
\str{\var{a}} & A variable \var{a} of type \type{STR} \\
\Veim{a} & A variable \var{a} of type \type{EIM} \\
\Vsymset{set-two} & The variable \var{set-two}, a set of strings \\
\Veimset{set-two} & The variable \var{set-two}, a set of Earley items
Strings and symbols occur frequently and have a special notation:
\Vsym{a} & \var{a}, a symbol variable \\
\Vstr{a} & \var{a}, a string variable
Subscripts may be omitted when the type
is obvious from the context.
The notation for
constants is the same as that for variables.
Multi-character variable names will be common.
Concatenation is shown
only when useful for clarity.
All other operations are always explicit.
Multiplication & $\var{a} \times \var{b}$ \\
Concatenation & $\var{a} \Cat \var{b}$ \\
Subtraction & $\var{symbol-count} \subtract \var{terminal-count}$ \\
Where $\myfnname{f}$ is a function,
we use the notation $\myfnname{f}^{\displaystyle \var{n}}$
for the iterated function, so that
\myfnname{f}^0(\var{x}) \quad & \defined \quad \var{x}, \\
\myfnname{f}^1(\var{x}) \quad & \defined \quad \myfnname{f}(\var{x}), \\
\myfnname{f}^2(\var{x}) \quad & \defined \quad
\myfnname{f}(\myfnname{f}(\var{x})), \quad \text{etc.} \\
\text{Also,} \quad \myfnname{f}^\ast \quad & \defined \quad
\myfnname{f}^\var{n} \quad \text{for some $\var{n} \ge 0$ and} \\
\myfnname{f}^+ \quad & \defined \quad
\myfnname{f}^\var{n} \quad \text{for some $\var{n} \ge 1$}.
The statements of this monograph often require us to introduce
many new variables at once,
so that we might say,
``for some \var{a}, \var{b}, \var{c}, \ldots{} \var{z},
let \ldots{}''.
When we introduce a definition, and it
contains new variables
which cause no loss of generality,
we will prefer to simply say so,
noting any exceptions.
In cases where brevity is important,
such as in proofs,
we may abbreviate
``without loss of generality'' as \dfn{WLOG},
``assumption'' as \dfn{ASM},
``theorem'' as \dfn{Th},
``definition'' as \dfn{Def}.
We use the standard notation for equations,
``(\textit{n})'' to refer to equation \textit{n}.%
To indicate references, we write,
where \var{n} is a reference number,
to say Lemma \textit{n},
to say Theorem \textit{n}
to say Definition \textit{n}.
A definition reference may define several terms:
when we wish to pinpoint one of these we write
the reference
(D\textit{n} ``\textit{x}'')%
\index{recce-notation}{(D0 "x")@(D\textit{n} ``\textit{x}'')}%
\index{recce-notation}{D0 x@(D\textit{n} ``\textit{x}'')}
to say the definition
of ``\textit{x}'' in the Definition section
with reference number \textit{n}.
For example, we would write
& \text{(D42) to say Definition 42;} \\
& \text{(D7 ``\var{x}'') to say the definition of ``x'' in Definition section 7;} \\
& \text{(T11) to say Theorem 11; and} \\
& \text{(L22) to say Lemma 22.}
Metonymy, the substitution of one thing for another thing with
a related meaning,
is common in language.
For example, ``Hollywood'' is a town in California,
but the word is often used to mean the U.S. entertainment
Where our use of metonymy is non-obvious or non-intuitive,
we will make it a matter of explicit definition.
For example,
in what follows, we will define Earley items,
which contain dotted rules.
Dotted rules in turn contain rules.
When we apply a rule notion to a dotted rule,
we will mean to apply that notion to the rule of the dotted rule.
When we apply a dotted rule notion to an Earley item,
we will mean to apply that notion to the dotted rule
of the Earley item.
Metonymic application of
notions will be transitive so that,
for example,
when we apply a rule notion to an Earley item,
we will mean to apply that notion to the rule
of the dotted rule of the Earley item.
\section{Undefineds and non-well-defineds}
We will use the symbol \undefined%
to mean ``undefined''
in various contexts.
For partial functions,
we will use the term ``domain'' as in category theory,
so that a partial function is not necessarily defined
for every element of its domain.
For example, we will write
\Vop{f}{x} = \undefined
to say that the partial function \myfnname{f} is
undefined for the domain element \var{x}.
\dtitle{Comparison involving undefineds}
If a value is undefined,
a second value is considered to be equal to it if and only if
that second value is also undefined:
\var{x} \, = \, \undefined \;\; \implies \;\; (\var{x} \, = \, \var{y} \; \equiv \; \var{y} \, = \, \undefined).
Traditionally, an expression is not well-defined if any element of
the expression is not well-defined.
We will find it convenient to define the result of
logical conjunction ($\land$),
logical disjunction ($\lor$),
and implication ($\Longrightarrow$)
% [ \Longrightarrow, not \implies, to avoid math spacing ]
for some cases where their second argument is not well-defined.
Where its second argument is not well-defined,
an implication is well-defined and true,
if its first argument is false;
a logical conjunction is well-defined and false,
if its first argument is false; and
a logical disjunction is well-defined and true,
if its first argument is true.
Note that, under this convention,
logical conjunction and logical disjunction are asymmetric.
For example, in most cases the value of a function
will not be well-defined if its argument is undefined.
In the following, let
$\var{x} = \undefined$,
and assume that \Vop{F}{\undefined} is not well-defined.
& (\var{x} = \undefined) \lor \; \Vop{F}{x}
&& \text{is well-defined and true;}
& (\var{x} \neq \undefined) \land \; \Vop{F}{x}
&& \text{is well-defined and false; and}
& (\var{x} \neq \undefined) \implies \Vop{F}{x}
&& \text{is well-defined and true.}
Note that
the first arguments in
are well-defined, because
of \dref[comparison of undefineds]{comparison-involving-undefineds}.
These conventions eliminate the need for a lot of
special cases.
We believe the reader will find these conventions
natural and convenient in practice.
These conventions about undefinedness apply even when
we write the logical operations out verbally
(``if \var{X} and \var{Y}, then \var{X} or \var{Y}'')
instead of symbolically
($\var{X} \; \land \; \var{Y} \implies \var{X} \; \lor \; \var{Y}$).
When referring to the value of a variable in an algorithm, we
will usually need, not just the variable's name, but its line
location and perhaps other information, such as an ordinal
describing the pass through a loop.
We will write \vat{\var{v}}{\var{n}}%
for the value
of \var{v} just after the execution of line \var{n}.
Where more than the line number
is needed to specify the value of \var{v},
we will use additional arguments,
as described for each algorithm.
For example,
for some algorithm
might specify the value
of \var{v} just after the execution of line \var{n}
on the \var{p}'th pass through a loop.
Unless otherwise specified, the indexes
of a sequence are consecutive integers,
starting with zero.
Where \var{s1} and \var{s2} are sequences,
we write
for the last index of \var{s1};
$\var{s1} + \var{s2}$,%
for the concatenation of the series \var{s2} after the series \var{s1}.
We will write
\Vel{s1}{\var{a} \ldots \var{z}}
for the subsequence
\Vel{s1}{a}, \;\;
\el{s1}{\Vincr{a}}, \;\;
\ldots \;\;
The following theorem will prove useful.
\ttitle{Sequence overlap}
TODO: This theorem not reviewed. Is it needed?
Can I delete it?
Let \var{mast} be a master sequence containing two subsequences,
\var{con} and \var{con2},
such that \var{con} does not contain the top or bottom
of \var{con2}:
\var{con} \subseteq \var{mast}
\; \land \; \var{con2} \subseteq \var{mast} \\
\el{con2}{0} \notin \var{con}
\; \land \; \el{con2}{\Vlastix{con2}} \notin \var{con}.
Let \var{a} and \var{b} be elements,
such that
\var{a} \in \var{con}
\; \land \; \var{b} \in \var{con}.
\var{a} \in \var{con2} \equiv \var{b} \in \var{con2}.
Assume, for a reductio,
that exactly one of \var{a} and \var{b}
is an element of \var{con2}.
Without loss of generality, we formalize
the assumption for the reductio as
\var{a} \in \var{con2} \; \land \; \var{b} \notin \var{con2}.
Since \var{con} and \var{con2} are,
by assumption for the theorem, subsequences of \var{mast},
\var{a} & = \Vel{mast}{aix} \\
\var{b} & = \Vel{mast}{bix} \\
\Vel{con}{0} & = \Vel{mast}{loc} \\
\Vel{con}{\Vlastix{con}} & = \Vel{mast}{hic} \\
\Vel{con2}{0} & = \Vel{mast}{loc2} \\
\Vel{con2}{\Vlastix{con2}} & = \Vel{mast}{hic2}
We will find it useful to express facts about containment
as relations among indexes of \var{mast}:
& \var{a} \in \var{con} && \text{ASM Th} \\
& \var{loc} \le \var{aix} \le \var{hic} &&
& \var{b} \in \var{con} && \text{ASM Th} \\
& \var{loc} \le \var{bix} \le \var{hic} &&
Assume for an inner reductio that
\var{aix} \le \var{bix}
From \eqref{t:sequence-overlap-18},
and \eqref{t:sequence-overlap-50},
we have
\var{loc2} \le \var{aix} \le \var{hic2} < \var{bix} \\
\therefore \; \var{aix} \le \var{hic2} < \var{bix}.
we have
& \var{loc} \le \var{aix} \le \var{hic2} < \var{bix} \le \var{hic} &&
\therefore \; & \var{loc} \le \var{hic2} < \var{hic} &&
\therefore \; & \el{con2}{\Vlastix{con2}} \in \var{con} &&
\eqref{t:sequence-overlap-61} is contrary
to assumption for theorem.
This shows the inner reductio,
from which we conclude that
\eqref{t:sequence-overlap-50} is false,
and therefore that
\var{aix} > \var{bix}
Again using \eqref{t:sequence-overlap-18},
and \eqref{t:sequence-overlap-30},
but this time with \eqref{t:sequence-overlap-70},
we have
& \var{bix} < \var{loc2} \le \var{aix} \le \var{hic2} \\
\therefore \; & \var{bix} < \var{loc2} \le \var{aix} \\
\therefore \; & \var{loc} \le \var{bix} \le \var{loc2} < \var{aix} \le \var{hic} &&
\therefore \; & \var{loc} \le \var{loc2} < \var{hic} &&
\therefore \; & \el{con2}{0} \in \var{con} &&
\eqref{t:sequence-overlap-81} is contrary to assumption for the theorem,
which shows the outer reductio and the theorem.
Where \Vsymset{syms} is a non-empty set of symbols,
let $\var{syms}^\ast$ be the set of all strings
(type \dtype{STR}) formed
from those symbols.
Let $\var{syms}^+$ be
\bigl\{ \Vstr{x}
\bigm| \Vstr{x} \in \var{syms}^\ast \;\; \land \;\; \Vstr{x} \neq \epsilon
A grammar
is a 4-tuple:
(\Vsymset{nt}, \Vsymset{term}, \var{rules}, \Vsym{accept}).
\Vsymset{nt} is a set of symbols called non-terminals,
and \Vsymset{term} is a set of symbols called terminals.
Here $\Vsym{accept} \in \var{nt}$.
The vocabulary of the grammar is the union of
the sets of terminals and non-terminals:
\[ \Vsymset{vocab} = \Vsymset{nt} \cup \Vsymset{term}. \]
If a string of symbols contains only terminal symbols,
that string is called a \dfn{sentence}.
When a string contains only terminals, we also write its length
in symbols as
\index{recce-notation}{\Pipe{}str\Pipe{}@\Vsize{str} (size of a string of terminals)}
The length of a string which contains non-terminals will be defined later,
when we discuss inputs.
\Vruleset{rules} is a set of rules (type \dtype{RULE}),
where a rule is a duple
of the form $[\Vsym{lhs} \de \Vstr{rhs}]$,
such that
\Vsym{lhs} \in \var{nt} \quad \text{and}
\quad \Vstr{rhs} \in \var{vocab}^\ast
\Vsym{lhs} is referred to as the left hand side (LHS)
of \Vrule{r}.
\Vstr{rhs} is referred to as the right hand side (RHS)
of \Vrule{r}.
The LHS and RHS of \Vrule{r} may also be
referred to as
We will sometimes treat
\RHS{\Vrule{r}} as a sequence,
so that
to refer to the \var{i}'th RHS symbol instance
of \Vrule{r}.
For example,
\el{\RHS{\Vrule{r}}}{0} is the first
symbol instance on the RHS of \Vrule{r};
\el{\RHS{\Vrule{r}}}{2} is the third
symbol instance on the RHS of \Vrule{r}.
Where \Vrule{r} is a rule,
\index{recce-notation}{\Pipe{}rule\Pipe{}@\Vsize{rule}!size of a RULE}
is the length of its RHS, in symbols;
\index{recce-notation}{#rule@\Vlastix{rule}!last index of the RHS of a RULE}
is equal to \Vlastix{\RHS{\Vrule{r}}}.
Therefore, the last symbol instance of the RHS of \Vrule{r}
may be referred to as any of
\el{\RHS{\Vrule{r}}}{\Vlastix{\RHS{\Vrule{r}}}}, \\
\el{\RHS{\Vrule{r}}}{\Vlastix{\Vrule{r}}} \quad \text{or} \\
\el{\RHS{\Vrule{r}}}{(\Vdecr{\Vsize{\RHS{\Vrule{r}}}})}. \\
An alternative way of referring to the \var{i}'th RHS symbol instance
of \Vrule{r} is
\op{RHS}{\Vrule{r}, \var{i}}.%
\index{recce-notation}{RHS(r, ix)@\op{RHS}{\Vrule{r}, \var{ix}}}
We will write
\el{\RHS{\Vrule{r}}}{\var{a} \ldots \var{z}}
for the subsequence of RHS symbol instances
\el{\RHS{\Vrule{r}}}{\var{a}}, \;\;
\el{\RHS{\Vrule{r}}}{\Vincr{a}}, \;\;
\ldots \;\;
The rules imply the traditional rewriting system.
We write
& \myparbox{%
$\Vstr{x} \derives \Vstr{y}$
to say that
\Vstr{x} derives \Vstr{y} in exactly one step.
If a superscript is placed over the arrow,
it indicates
the number of derivation steps.
So we also write
& \myparbox{%
$\Vstr{x} \xderives{1} \Vstr{y}$
to say that
\Vstr{x} derives \Vstr{y} in one step;
} \\
& \myparbox{%
$\Vstr{x} \xderives{\var{n}} \Vstr{y}$
to say that
\Vstr{x} derives \Vstr{y} in \var{n} steps;
} \\
& \myparbox{%
$\Vstr{x} \xderives{0} \Vstr{y}$
is a derivation in zero steps;
} \\
& \myparbox{%
$\Vstr{x} \destar \Vstr{y}$
to say that
\Vstr{x} derives \Vstr{y} in zero or more steps;
} \\
& \myparbox{%
$\Vstr{x} \deplus \Vstr{y}$
to say that
\Vstr{x} derives \Vstr{y} in one or more steps.
A derivation in zero steps,
is called a
\xdfn{trivial derivation}{trivial (derivation)}.
A symbol \Vsym{x} is
\xdfn{nullable}{nullable!in traditional parsing theory} if and only if
$\Vsym{x} \destar \epsilon$.
We say that symbol \Vsym{x} is
\xdfn{nulling}{nulling!in traditional parsing theory}
if and only if
\text{for all \Vstr{y}, if $\Vsym{x} \destar \Vstr{y}$,
then $\Vstr{y} \destar \epsilon$.
A symbol is a
\xdfn{proper nullable}{proper nullable!in traditional parsing theory}
if it is nullable, but not nulling.
Note that,
following \cite[Vol. 1, p. 86]{AU1972},
if $\Vstr{x} \destar \Vstr{y}$ then,
for some \var{n}, $\var{n} \ge 0$,
we have
$\Vstr{x} \xderives{\var{n}} \Vstr{y}$
if $\Vstr{x} \deplus \Vstr{y}$,
then for some \var{n},
$\var{n} \ge 1$, we have
$\Vstr{x} \xderives{\var{n}} \Vstr{y}$.
\dtitle{Nulling strings}
We say that a string, call it \Vstr{x}, is
if and only if
\Vstr{x} \destar \epsilon.
We say that a string, call it \Vstr{x}, is
if and only if
for all \Vstr{y}, if $\Vstr{x} \destar \Vstr{y}$,
then $\Vstr{y} \destar \epsilon$.
These definitions can be satisfied vacuously.
If \Vstr{x} is the empty string of symbols,
then \Vstr{x} is nullable.
And, if \Vstr{x} is the empty string of symbols,
then \Vstr{x} is nulling.
The literature does not always distinguish between two
meanings of the term ``derivation step''.
It can sometimes mean a single string in a derivation,
and at other times means the action of one string
deriving another.
In this paper, we will say ``step'' to mean a single
string in a derivation,
and we will call the transition from one string to another,
a \dfn{derivation move} or, when it is clear in context,
a \dfn{move}.
For example,
\Vstr{x} \derives \Vstr{y},
By ``step'' we mean
\Vstr{x} and \Vstr{y} considered separately,
so that there are two steps in
By ``move''
we mean \eqref{eq:def-move-1} considered as a whole,
so that
\eqref{eq:def-move-1} is a single ``move'',
with \Vstr{x} as its left side,
\Vstr{y} as its right side.
We say that \Vstr{desc} is a direct descendant of \Vsym{A} if
it is \Vstr{A-rhs} where $\Vsym{A} \de \Vstr{A-rhs}$ is a rule,
or if it is the empty string where \Vsym{A} is a nulling terminal.
We say that a derivation is \dfn{leftmost} if at each of its steps,
its leftmost nonterminal is replaced by one of its direct descendants.
We say that a derivation is \dfn{rightmost} if at each of its steps,
its rightmost nonterminal is replaced by one of its direct descendants.
When we want to make clear which step a symbol instance is from,
we will write $\var{si}@\var{x}$ to indicate
the symbol instance \Vinst{si} at Step \var{x}.
We will sometimes write the symbol instance using only
the symbol, in which case
$\var{A}@\var{x}$ will indicate
an instance of symbol \Vsym{A} at Step \var{x},
For example, let
\Vsym{A} \derives \Vsym{A}
be a derivation,
where the left hand side of the derivation
in \eqref{eq:symbol-step-instance-notation}
is Step 1 of the derivation,
so that
the right hand side of the derivation
in \eqref{eq:symbol-step-instance-notation}
must be Step 2 of the derivation.
$\var{A}@1$ indicates the instance of \Vsym{A} in Step 1,
$\var{A}@2$ indicates the instance of \Vsym{A} in Step 2,
so that
\var{A}@1 \derives \var{A}@2
is equivalent to
Consider the derivation
\Vsym{A} \derives \Vstr{rhs} \destar \Vstr{left} \cat \Vsym{A} \cat \Vstr{right}
We say that the rule $\Vsym{A} \de \Vstr{rhs}$
and the symbol \Vsym{A} are
\dfn{middle-recursive} & if
\text{$\Vstr{left} \ndestar \epsilon$ and $\Vstr{right} \ndestar \epsilon$} \\
\dfn{left-recursive} & if $\Vstr{left} \destar \epsilon$ \\
\dfn{right-recursive} & if $\Vstr{right} \destar \epsilon$ \\
\dfn{cyclic} & if
\text{$\Vstr{left} \destar \epsilon$ and $\Vstr{right} \destar \epsilon$}.
Except where otherwise stated,
our discussions in this monograph will
assume, without loss of generality,
a grammar of interest in that context,
\var{g}, such that
\var{g} = (\Vsymset{nt}, \Vsymset{term}, \var{rules}, \Vsym{accept}), \quad \text{where}
\Vsymset{vocab} = \Vsymset{nt} \cup \Vsymset{term}.
The language of \var{g} is
\myL{\Cg} \defined \left\lbrace
\Vstr{z} \mid \Vstr{z} \in \var{term}^\ast \land \Vsym{accept} \destar \Vstr{z}
\Earley{} will refer to the Earley's original
\Leo{} will refer to Leo's revision of \Earley{}
as described in~\cite{Leo1991}.
\Marpa{} will refer to the parser described in
this monograph.
Where $\alg{Recce}$ is a recognizer,
$\myL{\alg{Recce},\Cg}$ will be the language accepted by $\alg{Recce}$
when parsing \Cg{}.
TODO: Reviewed to HERE
\section{Marpa internal grammars}
Following Aycock and Horspool~\cite{AH2002},
Marpa grammars use a rewrite to eliminate proper nullables,
and nulling rules.
\dtitle{Marpa external and internal grammars}
The pre-rewrite grammar is called a
\dfn{Marpa external grammar},
or, more briefly,
\dfn{external grammar}.
The post-rewrite grammar is a
\dfn{Marpa internal grammar}%
\index{recce-definitions}{grammar, Marpa internal|mysee{Marpa internal grammar}}%
\index{recce-definitions}{grammar, internal|mysee{Marpa internal grammar}}
or, more briefly,
\xdfn{internal grammar}{internal grammar|mysee{Marpa internal grammar}}.
Because of the rewrite,
a Marpa internal grammar
has no nulling rules, and
has no proper nullable symbols.
In the rest of this monograph,
when we refer to either a
\xdfn{grammar}{grammar|myixentry{defaulting to Marpa internal grammar}}
or a
\xdfn{Marpa grammar}{Marpa grammar|myixentry{defaulting to Marpa internal grammar}}%
\index{recce-definitions}{grammar, Marpa|myixentry{defaulting to Marpa internal grammar}}
we will mean a Marpa internal grammar,
unless otherwise stated.
The external, pre-rewrite grammars
are the only ones visible to
the users of Marpa::R2.
The internal, post-rewrite grammars
are the ones actually used by the parse engine,
and therefore are
the ones described in this paper.
Details of this rewrite are given
in Chapter \ref{ch:rewrite}.
\dtitle{Telluric symbols and strings}
We say that
a symbol of a Marpa internal grammar is
if and only if it is non-nulling.
We say that a string is
if and only if
the string contains a telluric symbol,
We use
the term ``telluric'' only for Marpa internal grammars.
In Marpa internal grammars, we know that there
are no proper nullable symbols,
so that we
can rely on
``telluric'' to be a strict antonym of ``nullable''.
This is \textbf{not} the case for Marpa external grammars ---
non-nulling symbols of Marpa external grammars may be proper
nullables, so that ``non-nulling'' is not always
an antonym of ``nullable''.
We will make frequent use
of the fact that Marpa internal telluric symbols
are never nullable.
Calling such symbols ``telluric'' makes
it clear they have this property,
and will save us much confusion.
The term ``telluric'' originally means, roughly, ``of the earth''.
Later we will use it in contrast to the term
In the statement of the following theorems,
as in the rest of this monograph,
we will assume that we are speaking of Marpa internal grammars,
unless stated otherwise.
\ttitle{Nullable symbol if and only if nulling}
A symbol is nullable if and only if it is nulling.
For the purposes of this proof we read
\op{Nulling}{\Vsym{x}} as ``symbol \var{x} is nulling'',
and we read
\op{Nullable}{\Vsym{x}} as ``symbol \var{x} is nullable''.
Recall that, by default, we are speaking of Marpa internal
No symbol in a Marpa internal grammar is a proper nullable,
that is,
& \myparbox{%
there is no
\Vsym{s} such that
$\op{Nullable}{\Vsym{s}} \land \neg \op{Nulling}{\Vsym{s}}$
\dref[Marpa internal grammar]{def:marpa-grammar},
} \\
& \myparbox{%
For all \Vsym{s},
$\neg \op{Nullable}{\Vsym{s}} \lor \op{Nulling}{\Vsym{s}}$
} \\
& \myparbox{%
For all \Vsym{s},
$\op{Nullable}{\Vsym{s}} \implies \op{Nulling}{\Vsym{s}}$
} \\
& \myparbox{%
For all \Vsym{s},
$\op{Nulling}{\Vsym{s}} \implies \op{Nullable}{\Vsym{s}}$
Def of nulling and nullable for symbols.
} \\
& \myparbox{%
For all \Vsym{s},
$\op{Nulling}{\Vsym{s}} \iff \op{Nullable}{\Vsym{s}}$
which shows the theorem.
\ttitle{Telluric is not nullable}
A symbol is telluric,
if and only if
it is both non-nulling and non-nullable.
For the purposes of this proof,
we write \op{Telluric}{\Vsym{x}}
to say that ``symbol \var{x} is telluric'';
to say that ``symbol \var{x} is nulling'';
and \op{Nullable}{\Vsym{x}}
to say that ``symbol \var{x} is nullable''.
& \myparbox{%
For all \Vsym{s},
} \\
& \myparbox{%
For all \Vsym{s},
show the theorem.
\ttitle{Telluric string if and only if not nullable}
A string is telluric if and only if
it is both non-nullable and non-nulling.
Assume for a reductio that \Vstr{tell}
is a telluric string.
\Vstr{tell} contains a telluric symbol,
call it \Vsym{tell}
\dref[telluric string]{def:telluric}.
Assume for a reductio,
that \Vstr{tell} is nullable,
so that, without loss of generality,
$\Vstr{tell} = \Vstr{pre} \Vsym{tell} \Vstr{post}
\derives \epsilon$.
} \\
$\Vsym{tell} \derives \epsilon$
} \\
\Vsym{tell} is nullable
} \\
\Vsym{tell} is not nullable
and \eqref{eq:telluric-string-iff-nullable-16}
show the reductio.
From the reductio,
we conclude that
& \myparbox{%
\Vstr{tell} is not nullable.
TODO: finish
All LHS symbols are telluric.
\begin{proof} TODO \end{proof}
All nullable symbols are terminals.
\begin{proof} TODO \end{proof}
A string is derived from a telluric string
if and only if
it contains at least one telluric symbol.
\begin{proof} TODO \end{proof}
A string is derived from a telluric string
if and only if it is telluric.
\begin{proof} TODO \end{proof}
A sentence is derived from a telluric string
if and only if
it has an input length of at least one.
\begin{proof} TODO \end{proof}
Since there are no nulling rules in Marpa's internal grammars,
a symbol is never nulled as the result of a derivation step.
Therefore, where \Vsym{nulling} is a nulling symbol,
\Vsym{nulling} & \nderives{1} \epsilon \quad \text{and} \\
\Vsym{nulling} & \ndeplus \epsilon.
Strictly speaking, it is not wrong to say that
\Vsym{nulling} & \xderives{0} \epsilon \quad \text{or} \\
\Vsym{nulling} & \destar \epsilon
although it is misleading,
and we avoid it in favor of
\Vsym{nulling} = \epsilon,
which more clearly conveys what it means for a
Marpa internal symbol to be ``nulling''.
\dtitle{Accept rule and symbol}
A Marpa internal grammar always has
a dedicated acceptance rule, \Vrule{accept}
and a dedicated acceptance symbol,
\Vsym{accept} = \LHS{\Vrule{accept}},
such that
for all \Vrule{x},
\Vsym{accept} \notin \RHS{\Vrule{x}}
\Vsym{accept} = \LHS{\Vrule{x}} \implies \Vrule{accept} = \Vrule{x}.
We assume that a Marpa grammar is cycle-free ---
that none of its rules are cyclic.
We assume that every symbol is productive ---
that is, that it derives a sentence.
We assume that every symbol is accessible ---
that is, that it is derivable from the start symbol.
Let the actual input to
the parse be \Cw{} such that $\Cw \in \myL{\Cg}$.
Locations in the input will be of type \dtype{LOC}.
Let \Vsize{w}%
\index{recce-notation}{\Pipe{}input\Pipe{}@\Vsize{input} (size of an input)}
be the length of the input, counted in symbols.
When we state our complexity results later,
they will often be in terms of $\var{n}$,
where $\var{n} = \Vsize{w}$.
Let \CVw{i} be character
at position \var{i}
of the input.
String position is zero-based,
so that
$0 \le \Vloc{i} < \Vsize{w}$.
Let $\var{w}[\var{a}, \var{b}]$
be the contiguous substring
from position \var{a} to
position \var{b}, inclusive,
so that
\[ \bigsize{\var{w}[\var{a}, \var{b}]} = (\var{b} \subtract \var{a}) + 1. \]
Our definition
of \Cw{} does not allow zero-length inputs.
The Marpa parser
deals with null parses
and nulling grammars as special cases,
and this monograph will not consider them.
(Nulling grammars are those that recognize only the null string.)
Parsers typically do work while examining their input,
so that they are, in effect, working with a set of possible inputs,
of which the actual input is just one element.
Reasoning about the set of inputs possible based on what has been
seen so far plays little role in
traditional deterministic parsers,
which do limited tracking
of input already seen,
and have even less of an idea of the inputs not yet seen.
But Marpa is left-eidetic ---
it has a full, exact idea of the input already seen ---
and Earley parsers
also have a very exact idea of what the unseen portion of the input
could be.
We will call the current set of inputs, \CW{}.
\CW{} will always be such that
\Cw \in \CW \quad \text{and} \quad \CW{} \subseteq \myL{\Cg}.
We say that \CW{} is
\xdfn{seen from}{seen from \var{i} to \var{j}!wrt an input set}
\Vloc{i} to \Vloc{j} if and only if,
for all \Vstr{w1}, \Vstr{w2}
\Vstr{w1} \in \CW \land \Vstr{w2} \in \CW
\el{w1}{\var{i}, \, (\Vdecr{j}) \, } =
\el{w2}{\var{i}, \, (\Vdecr{j}) \, }.
Most parsing, including Earley parsing, takes place from left-to-right,
and Marpa examines its input from left to right.
We say that
\CW{} is
\xdfn{seen to}{seen to \var{j}!wrt an input set}
or that
\CW{} is
\xdfn{seen as far as}{seen as far as \var{j}!wrt an input set}
if \CW{} is seen between locations 0 and \Vloc{j}.
In this monograph we will usually speak of input sets that are seen
as far as some \Vloc{j}.
If \CW{} is seen to location 0, none of its input symbols have been
If \CW{} is seen to location \Vsize{\Cw},
where \Cw{} is the actual input,
$\CW = \lbrace \Cw \rbrace$,
and all of its input symbols
have been seen.
In most contexts, the current set of inputs will be assumed to be \CW{}.
For example,
instead of saying that \CW{} is seen as far as \Vloc{k},
we may say that ``the input is seen as far as \Vloc{k}''.
We will say that a derivation is
\xdfn{fully seen}{fully seen!wrt a derivation},
or more simply
\xdfn{full}{full!wrt a derivation},
if its last step is \Cw{}.
Intuitively, a \dfn{symbol instance} is
a symbol in the context of a parse.
In a fully seen derivation,
the right and left locations are well-defined
for every symbol instance of every step.
More formally,
a symbol instance is a triple whose elements
are a left location, a symbol name and a right location.
We often represent symbol instances in the form
\Vinst{inst} = \Vmkl{j} \Vsym{up} \Vmkr{k}.
\index{recce-notation}{[]inst[]@\Vmkl{j} \Vsym{up} \Vmkr{k} (symbol instance)}
\Vmk{j} and \Vmk{k} are always optional.
We also write
for \var{j},
the left location of \Vinst{inst};
for \var{k},
the right location of \Vinst{inst};
for \Vsym{up},
the symbol name of \Vinst{inst};
for \xxsubtract{\var{k}}{\var{j}},
the length of \Vinst{inst} in terms of input symbols; and
for \Vsize{\Vinst{inst}},
where $\Symbol{\Vinst{inst}} = \Vsym{A}$,
and the left and right locations of \Vinst{inst}
are understood from the context.
When \Vstr{si} is a sequence of symbol instances of length \Vsize{si}
whose indexes are 0 \ldots{} \var{last},
we will write
\Symbol{\el{si}{0}}, \;\;
\Symbol{\el{si}{1}}, \;\;
\ldots \;\;
We will write \Vsize{\Vstr{si}}%
\Vsize{\Vstr{str}} (length of a sentential form in terms of the input)}
\sum_{\var{i}=0}^\var{last} \Symbol{\Vel{si}{i}}.
Note that \Vsize{\Vstr{si}} is the length of the string
in terms of input symbols, and is, in general,
not a count of the
symbols in \Vstr{si}.
At some points, such as when we translate a derivation
to other notation,
we will want to justify the conversion to and from
a derivation carefully.
To do this, we will treat a derivation as a two-dimensional
ragged array.
The rows will be derivation steps,
and the columns of these variable-length rows will be symbol
We will write \drvVV{d}{s}{i} for
the \var{i}'th symbol instance of the \var{s}'th step
of the derivation \var{d}.
Steps will be numbered from 0, starting at the root.
Symbol instances will be numbered from 0, starting at the left.
Let \var{d} be a fully seen derivation,
and let $\var{wlen} = \Vsize{\Cw}$.
For every \var{d}, we will have
\drv{d}{0}{0} = \mk{0} \Vsym{accept} \Vmk{wlen}
and where the length of the derivation is \var{dlen},
for all \var{a} such that $0 \le \var{a} < \var{wlen}$,
\drv{d}{\Vdecr{dlen}}{\var{a}} = \Vinst{a} = \Vmk{a} \Vsym{a} \mk{\Vincr{a}}.
where \Vsym{a} is $\Cw[\var{a}]$,
the symbol at location \Vloc{a} of the input.
We will use type \dtype{STR} for sequences of symbol instances
as well as symbols.
We write \Vel{s1}{i} to refer
to the \var{i}'th symbol or symbol instance
in a string,
where the first symbol or symbol instance is at
We write \el{s1}{\var{i} \ldots \var{j}} to refer
to the contiguous substring of \Vstr{s1} which starts with
\Vel{s1}{i} and ends with \Vel{s1}{j}.
The range is inclusive, so that
the length of \el{s1}{\var{i} \ldots \var{j}}
is $(\xxsubtract{\var{j}}{\var{i}})+1$.
We will find it convenient to write
\drv{d}{\var{s}}{\var{a} \ldots \var{z}}
for the sequence of symbol instances
\drvVV{d}{s}{a}, \;\;
\drv{d}{\var{s}}{\Vincr{a}}, \;\;
\ldots \;\;
\section{Focused derivations}
We now
recall our previous definition of a rightmost
Following ~\cite[Vol. 1, page 141, Lemma 2.12]{AU1972},
a rightmost derivation is defined in terms of a series of
expansions of a derivation tree, beginning at the root.
A rightmost derivation is one that always expands the rightmost
non-terminal into its direct descendants.
\dtitle{Focused derivation}
We say that the derivation \var{d}
\xdfn{focused}{focused (derivation)}
at \Vloc{k},
if \CW{} is seen to \Vloc{k},
if in every step \var{s},
if there is a non-terminal symbol instance
$\Vinst{ki} = \drvVV{d}{s}{x}$
such that $\Left{\Vinst{ki}} \le \Vloc{k} < \Right{\Vinst{ki}}$,
\var{d} expands
\drvVV{d}{s}{x}; and
otherwise, \var{d} expands the rightmost non-terminal symbol
We say that \var{d} is
\xdfn{focused within}{focused within!wrt a symbol instance and a derivation}
a symbol instance \Vinst{si}
if and only if it is
focused at some \Vloc{k} such
\Left{\var{si}} \le \Vloc{k} < \Right{\var{si}}.
We say that \var{d} is
\xdfn{focused within}{focused within!wrt an EIM and a derivation}
if and only if it is
focused at some \Vloc{k} such that
\Left{\LHS{\Veim{eim}}} \le \Vloc{k} < \Right{\LHS{\Veim{eim}}}.
Recall from parsing theory that every derivation has an equivalent rightmost
derivation, and that the rightmost derivation is unique in its derivation
Similarly, for every \Vloc{k},
every derivation which is seen to \Vincr{k}
has an equivalent derivation that is focused at \Vloc{k},
and that \Vloc{k}-focused derivation is unique in its
derivation tree.
\ttitle{Properties of focused derivations}
Let \var{d} be a derivation focused within
the symbol instance
\Vinst{i1}, where
\Vinst{i1} is at derivation step \var{s},
and \Symbol{\var{i1}} is
a non-terminal other than \Vsym{accept}.
Then \Vinst{i1} is a direct descendant
of a symbol instance \Vinst{i2},
such that
\text{\var{i2} is at derivation step \Vdecr{s},} \\
\text{\Symbol{\var{i2}} is a non-terminal,} \\
\Right{\var{i2}} \ge \Right{\var{i1}}, \\
\Left{\var{i2}} \le \Left{\var{i1}}, \\
\text{and \var{d} is focused within \var{i2}.}
In addition,
for some \Vstr{pre}, \Vstr{post},
where $\Symbol{\var{i1}} = \Vsym{i1}$ and
$\Symbol{\var{i2}} = \Vsym{i2}$,
[\Vsym{i2} \de \Vstr{pre} \Vsym{i1} \Vstr{post} ] \in \Crules,
& \Vsym{i2} && \text{Step \Vdecr{s}} \\
\derives \quad & \Vstr{pre} \Vsym{i1} \Vstr{post} \qquad && \text{Step \var{s}}
Since \Vinst{i1} is not the accept symbol,
there is a derivation step \Vdecr{s}.
In each derivation step, a symbol instance is either
copied over into the next step or expanded into
its direct descendants.
in step \Vdecr{s}, there is either another copy of symbol
instance \Vinst{i1}, or \Vinst{i1} is the direct descendant
of another symbol instance.
Either way, call that other symbol instance, \Vinst{i2}.
\Vinst{i2} is in derivation step
\Vdecr{s} by definition, which gives us
We show
by cases.
In the first case, where \Vinst{i1} is a direct descendant
of \Vinst{i2}, we recall that
In every parse using a context-free grammar,
the right and left locations of
a direct descendant are always inside
of the single symbol from which the
direct descendant was expanded.
The input may not have been seen to \Right{\var{i2}},
is true of all factorings of all inputs,
and we may conclude that
hold for every input in \CW{}.
If \Vinst{i1} is a copy of \Vinst{i2},
follow trivially.
By assumption, \var{d} is focused within \Vinst{i1},
and therefore at some \Vloc{k} such that
\Left{\var{i1}} \le \Vloc{k} < \Right{\var{i1}}.
follows from
We show \eqref{eq:focusing-props-13}
by cases.
In the first case,
\Vinst{i2} is a copy of \Vinst{i1}
so that
$\Symbol{\var{i2}} = \Symbol{\var{i1}}$,
and since
\Symbol{\var{i1}} is a non-terminal,
\Symbol{\var{i2}} is a non-terminal.
In the second case,
\Vinst{i2} is not a copy of \Vinst{i1},
so that \Vinst{i1} is a direct descendant of
In this second case,
\Vinst{i2} is a non-terminal
by definition.
This gives us both cases,
We have already shown
so we know that,
by the definition of a focused derivation,
\Vinst{i2} is expanded into
its direct descendants in step \var{s}.
and \eqref{eq:focusing-props-27}
follow from this observation and the definition
of a derivation.
For convenience the step numbers are shown in
The labeling of Step \var{s} follows from assumption
for the theorem.
The labeling of Step \Vdecr{s} follows from
\section{Location markers}
In the context of a grammar \Cg{} and an input \Cw{},
we will often use location-marked derivations.
Location-marked derivation steps are like the derivation steps
of the traditional rewriting system except that they also contain
location markers of the form \Vmk{x}, where \var{x} is a
location in \Cw{}.\footnote{
Our use of the location marker notation
was inspired by~\cite{Wich2005}.}
When not otherwise stated,
use of the location marker \Vmk{x}
implies that \CW{} has been seen to \Vmk{x}.
In its most general form,
a derivation step with a single location marker is
\Vstr{pre} \Vmkm{x} \Vstr{post}.
& \Vsym{accept} \destar \Vstr{before} \cat \Vstr{pre} \cat \Vstr{post} \cat \Vstr{after} \\
& \land \quad \Vstr{before} \cat \Vstr{pre} \destar \var{w}[0, (\Vdecr{x})] \\
& \land \quad \Vstr{post} \cat \Vstr{after} \destar \var{w}[\var{x}, (\Vsize{\Cw} \subtract 1)]
Derivations may have many location markers.
The meaning of a derivation with \var{j} different location markers,
\[ \var{m}[1], \var{m}[2] \ldots \var{m}[\var{j}], \]
is the same as the meaning of the conjunction of an ordered set of \var{j} derivations,
where the \var{i}'th member has all the markers removed except for $\var{m}[\var{i}]$.
For example,
& \Vsym{accept} \destar \Vstr{before} \Vmkm{i} \Vsym{A} \cat \Vstr{after} \\
& \qquad \derives \Vstr{before} \Vmkm{i} \Vstr{predot} \Vmkm{j} \Vstr{postdot} \cat \Vstr{after}.
is the equivalent of the logical conjunction of two derivations:
& \Vsym{accept} \destar \Vstr{before} \,[\var{i}]\, \Vsym{A} \cat \Vstr{after} \\
& \qquad \derives \Vstr{before} \,[\var{i}]\, \Vstr{predot} \Vstr{postdot} \cat \Vstr{after}
\end{split} \\
& \Vsym{accept} \destar \Vstr{before} \cat \Vsym{A} \cat \Vstr{after} \\
& \qquad \derives \Vstr{before} \cat \Vstr{predot} \,[\var{j}]\, \Vstr{postdot} \cat \Vstr{after}.
In this example,
\eqref{eq:location-marker-def-12} and
imply that
\Vstr{predot} \destar \var{w}[\var{i}, (\var{j} \subtract 1)]
and therefore
also implies
Derivations with location markers may be
composed in the same way as derivations without them,
as long as the location markers in the combined
derivation are consistent.
The location marker notation is intuitive and based on the
traditional notation for derivations,
but, since it is new, we will present examples.
For the examples of this section,
start with the traditional derivation
\Vstr{A} \\
\derives \Vstr{Aa} \Vsym{B} \Vsym{D} \Vstr{Aa} \\
\derives \Vstr{Aa} \Vstr{Ba} \Vsym{C} \Vstr{Bz} \Vsym{D} \Vstr{Az}
\textbf{Insert new location markers}:
If a location marker is not currently used, we can insert it
to mark any location that has been seen.
In this example,
we insert
new markers
\Vmk{i}, \Vmk{j}, \Vmk{k}, \mk{\ell} and \Vmk{m}:
\Vstr{A} \\
\derives \Vmk{i} \Vstr{Aa} \Vmk{j} \Vsym{B} \Vmk{k} \Vsym{D} \mk{\ell} \Vstr{Aa} \Vmk{m} \\
\derives \Vstr{Aa} \Vstr{Ba} \Vsym{C} \Vstr{Bz} \Vsym{D} \Vstr{Az}
Introduction of a location marker \Vmk{x}, unless otherwise stated,
assumes that the \CW{} has been seen to \Vmk{x},
so in this example, we assumed that input has been seen to
the rightmost marker, \Vmk{m}.
The examples to follow deal with movement of existing
location markers, so that it will have already been assumed
that \CW{} has been seen as far as those location markers.
\textbf{Move location markers from direct descendants to their parents}:
If a location marker is before the first of its direct descendants, we can move
it to before its parent in the previous step.
In this example, \Vmk{i} is moved:
\Vmk{i} \Vstr{A} \\
\derives \Vmk{i} \Vstr{Aa} \Vmk{j} \Vsym{B} \Vmk{k} \Vsym{D} \mk{\ell} \Vstr{Aa} \Vmk{m} \\
\derives \Vstr{Aa} \Vstr{Ba} \Vsym{C} \Vstr{Bz} \Vsym{D} \Vstr{Az}
Also, if a location marker is after the last of its direct descendants,
we can move it to after its parent in the previous step.
In this example, \Vmk{m} is moved:
\Vmk{i} \Vstr{A} \Vmk{m} \\
\derives \Vmk{i} \Vstr{Aa} \Vmk{j} \Vsym{B} \Vmk{k} \Vsym{D} \mk{\ell} \Vstr{Aa} \Vmk{m} \\
\derives \Vstr{Aa} \Vstr{Ba} \Vsym{C} \Vstr{Bz} \Vsym{D} \Vstr{Az}
\textbf{Move location markers from parents to direct descendants}:
Similarly if a location marker is before a parent, we can
move it to before the first of its direct descendants in the next step;
if a location marker is after a parent, we can
move it to after the last of its direct descendants in the next step.
In this example, \Vmk{j} and \Vmk{k} are moved:
\Vmk{i} \Vstr{A} \Vmk{m} \\
\derives \Vmk{i} \Vstr{Aa} \Vmk{j} \Vsym{B} \Vmk{k} \Vsym{D} \mk{\ell} \Vstr{Az} \Vmk{m} \\
\derives \Vstr{Aa} \Vmk{j} \Vstr{Ba} \Vsym{C} \Vstr{Bz} \Vmk{k} \Vsym{D} \Vstr{Az}
\textbf{Delete location markers}:
Location markers which are no longer of interest may be removed.
In this example, \Vmk{i} and \Vmk{m} are deleted:
\Vstr{A} \\
\derives \Vstr{Aa} \Vmk{j} \Vsym{B} \Vmk{k} \Vsym{D} \mk{\ell} \Vstr{Az} \\
\derives \Vstr{Aa} \Vmk{j} \Vstr{Ba} \Vsym{C} \Vstr{Bz} \Vmk{k} \Vsym{D} \Vstr{Az}
\textbf{Move location marker to before or after same symbol instance}:
A location marker which is before (after) a symbol instance in one derivation
step may be moved to before (after) the same symbol instance in another derivation step.
In this example, \mk{\ell} is moved:
\Vstr{A} \\
\derives \Vstr{Aa} \Vmk{j} \Vsym{B} \Vmk{k} \Vsym{D} \mk{\ell} \Vstr{Az} \\
\derives \Vstr{Aa} \Vmk{j} \Vstr{Ba} \Vsym{C} \Vstr{Bz} \Vmk{k} \Vsym{D} \mk{\ell} \Vstr{Az}
A derivation may be simplified from the bottom up,
by removing the symbol instances outside of two markers.
In this example, the last step is simplified outside of \Vloc{j} and \Vloc{k}:
\Vstr{A} \\
\derives \Vstr{Aa} \Vmk{j} \Vsym{B} \Vmk{k} \Vsym{D} \mk{\ell} \Vstr{Az} \\
\derives \ldots \Vmk{j} \Vstr{Ba} \Vsym{C} \Vstr{Bz} \Vmk{k} \ldots
When it is clear what is happening, the dots may be omitted:
\Vstr{A} \\
\derives \Vstr{Aa} \Vmk{j} \Vsym{B} \Vmk{k} \Vsym{D} \mk{\ell} \Vstr{Az} \\
\derives \Vmk{j} \Vstr{Ba} \Vsym{C} \Vstr{Bz} \Vmk{k}
\textbf{Null passing}:
A location marker may be duplicated from one side of a nulling symbol
to the other.
For this example,
assume that
$\Vstr{nul1} = \epsilon$,
$\Vstr{tel1} \neq \epsilon$,
$\Vstr{tel2} \neq \epsilon$.
\Vstr{tel1} \Vmk{x} \Vstr{nul1} \Vstr{tel2},
we can duplicate the \Vmk{x}
\Vstr{tel1} \Vmk{x} \Vstr{nul1} \Vmk{x} \Vstr{tel2}.
If one of the two \Vmk{x} markers is now clutter,
we may also delete it:
\Vstr{tel1} \Vstr{nul1} \Vmk{x} \Vstr{tel2}.
\subsection{Simplification and nulling symbols}
Caution must be exercised because nulling symbols
can fall on either side of a location-marker.
The following theorem shows that what looks like a cycle
in location-marked notation is,
in fact, a cycle,
and therefore cannot occur in a Marpa grammar.
\ttitle{Location marked cycles}
Let \Vstr{sf} be a sentential form containing at
least one telluric symbol.
\Vmk{i} \Vstr{sf} \Vmk{j} \ndeplus \Vmk{i} \Vstr{sf} \Vmk{j}
We represent
the possible nulls
outside our location markers in
in full generality as
& \Vstr{nul1L} \Vmk{i} \Vstr{sf} \Vmk{j} \Vstr{nul1R} && \text{Step \var{x}} \\
\deplus & \Vstr{nul2L} \Vmk{i} \Vstr{sf} \Vmk{j} \Vstr{nul2R}. && \text{Step \var{y}}
$\Vstr{nul1L} = \Vstr{nul1R} = \Vstr{nul2L} = \Vstr{nul2R} = \epsilon$.
We assume
for an outer reductio.
By assumption for the theorem there is at least
one telluric symbol in \Vstr{sf}.
Let \Vsym{tell} be one of those telluric symbols in Step \var{x},
and let \Vsym{descs} be its descendants in Step \var{y}.
We will write \Vop{T}{x} for the number of telluric symbols in \Vstr{x}
or \Vsym{x}.
No telluric symbol is nulling, so that
$\bigop{T}{\Vsym{tell}} \le \bigop{T}{\Vstr{descs}}$,
and therefore
$\bigop{T}{\Vstr{descs}} \ge 1$.
Assume for an inner reductio, that
there is some \Vsym{tell2} in
Step \var{x} of
such that $\bigop{T}{\Vstr{descs2}} > 1$,
where \Vstr{descs2} are the descendants of \Vsym{tell2}
in Step \var{y}.
Let \var{tell-cnt} be the number of telluric symbols in
\var{tell-cnt} =
\bigop{T}{ \Vstr{nul1L} } +
\bigop{T}{ \Vstr{sf}@\var{x}} +
\bigop{T}{ \Vstr{nul1R} }.
Since $\Vstr{nul1L} = \Vstr{nul1R} = \epsilon$,
we have
$\var{tell-cnt} = \bigop{T}{\Vstr{sf}@\var{x}}$.
We have
\var{tell-cnt} = \bigop{T}{\Vstr{sf}@\var{x}}
= \bigop{T}{\Vstr{sf}@\var{y}}
by the self-identity of \Vstr{sf}.
$\Vstr{nul2L} = \Vstr{nul2R} = \epsilon$,
so that if there is any
telluric symbol
in Step \var{y},
it will be in
we see that if one telluric symbol in $\var{sf}@\var{x}$
\var{n} telluric symbols in $\var{sf}@\var{y}$,
then the other $\Vdecr{tell-cnt}$ telluric symbols
$\var{sf}@\var{x}$ must derive
telluric symbols.
This means that, if $\var{n} > 1$,
at least one telluric symbol in
zero telluric symbols in $\var{sf}@\var{y}$.
In other words,
if $\var{n} > 1$,
at least one telluric symbol in
$\var{sf}@\var{x}$ must be nulling.
But telluric symbols are never nulling,
which shows the inner reductio.
We conclude from the inner reductio that
no telluric symbol in
derives more than one telluric symbol in
We have already shown that every telluric symbol in
Step \var{x}
derives at least one telluric symbol in Step \var{y}.
So we have that,
for any telluric symbol
if \Vstr{descs2}@\var{y} are its direct descendants in
Step \var{y},
then $\bigop{T}{\Vstr{descs2}@\var{y}} = 1$.
Recall our earlier assumption
in the outer reductio
that \Vsym{tell} is an arbitrary
telluric symbol in \var{sf}@\var{x}
are its descendants.
We know from
$\bigop{T}{\Vstr{descs}@\var{y}} = 1$.
That is, each telluric symbol in Step \var{x}
maps one-to-one to a telluric descendant in
Step \var{y}.
What is the telluric symbol that \Vsym{tell}
maps to in \Vstr{descs}?
All the telluric symbols in
must be in
$\var{sf}@\var{x}$ and $\var{sf}@\var{y}$,
$\var{sf}@\var{x} = \var{sf}@\var{y}$,
so that the telluric symbols in
must be the same, and in
the same order,
as those in
So a one-to-one mapping of telluric symbols from
to telluric symbols in $\var{sf}@\var{y}$
must map each telluric symbol to itself.
Therefore, the telluric symbol in
is $\Vsym{tell}@\var{x}$.
We can therefore write the derivation of
\Vstr{descs} from
without loss of generality,
\Vsym{tell} \deplus \Vstr{nulL} \Vsym{tell} \Vstr{nulR} = \Vstr{descs}
$\Vstr{nulL} = \Vstr{nulR} = \epsilon$.
But, by the definition of cyclic,
is a cycle.
Cycles are not allowed in Marpa grammars,
which shows the outer reductio,
and the theorem.
\chapter{Rewriting the grammar}
We have already noted
that no rules of \Cg{}
have a zero-length RHS,
and that all symbols must be either nulling or telluric.
These restrictions follow Aycock and Horspool~\cite{AH2002}.
The elimination of empty rules and proper nullables
is done by rewriting the grammar.
\cite{AH2002} shows how to do this
without loss of generality.
Because Marpa claims to be a practical parser,
it is important to emphasize
that all grammar rewrites in this monograph
allow the original grammar to be reconstructed
simply and efficiently at evaluation time.
As implemented,
the Marpa parser allows users to associate
semantics with an external grammar
that has none of the restrictions imposed
on the internal grammars.
As his external grammar,
the Marpa::R2 user
may specify any proper context-free grammar.
A ``proper'' grammar is one which is cycle-free,
and which contains no unproductive or inaccessible symbols.
In fact, as of this writing, Marpa::R2 has options
which allow grammars with cycles and inaccessible symbols,
but there is very little interest in these,
and future Marpa versions are likely to remove this support.
Marpa external grammars allow nullable,
properly nullable and nulling symbols,
as well as empty rules.
The user specifies his semantics in terms
of the external grammar.
Marpa rewrites the external grammar into
an internal grammar.
Parsing and evaluation
are performed
in such a way as to keep the internal grammar
invisible to
the user.
From the user's point of view,
the external grammar is the
one being used for the parse,
and the one to which
his semantics is applied.
The rewrite currently used by Marpa is an improvement over
that of~\cite{AH2002}.
Rules with proper nullables are identified
and a new grammar is created
in which the external grammar's rules are divided
up so that no rule has more than two proper nullables.
(A ``proper nullable'' is a nullable symbol which is not nulling.)
This is similar to a rewrite into Chomsky form.
The proper nullable symbol is then cloned into two others:
one nulling and one telluric.
All occurrences of the original proper nullable symbol are then replaced
with one of the two new symbols,
and new rules are created as necessary to ensure that all possible combinations
of nulling and telluric symbols are accounted for.
The rewrite in~\cite{AH2002} was similar, but did not do the Chomsky-style
rewrite before replacing the proper nullables.
As a result the number of rules in the internal grammar could be
an exponential function of the number of rules in the external grammar.
In our version, the worst case growth in the number of rules is linear.
This rewrite can be undone easily.
In fact,
in the current implementation of Marpa,
the reverse rewrite,
from internal to external,
is often done ``on the fly'',
as the parse proceeds.
This translation back and forth is
and is done for error reporting, tracing,
and in the implementation of Marpa::R2's event
Future plans for Marpa include more aggressive use
of rewrites.
It should be possible, not only to eliminate proper
nullables from the internal grammar,
but also to eliminate nulling symbols.
We conjecture that elimination of nulling symbols
from the internal grammar will greatly simplify the implementation.
The reader may observe that it would
simplify this monograph if it did not have to deal with nulling symbols.
Not all rewrites lend themselves to easy translation
and reversal.
As a future direction, we will look at a general schema
for ``safe'' grammar rewrites.
In this schema, Marpa's internal grammar will have
``brick'' and ``mortar'' symbols.
Internal brick symbols correspond, many-to-one, to
external symbols.
Internal mortar symbols exist entirely for the purposes
of the internal grammar.
Only brick symbols have semantics attached to them.
Assume that we have a parse, using internal symbols.
We define a ``brick traversal'' from a ``root'' brick non-terminal instance.
The ``brick traversal'' is pre-order
and stops traversing any path when it hits
a brick symbol instance other than the ``brick root''.
In this way, it traces a subtree of the parse,
where the root of the subtree is the brick root symbol instance,
and the leaves of the subtree are a sequence of other brick
symbol instances.
The leaves of the subtree, as encountered in pre-order,
constitute its ``brick frontier''.
Mortar symbols will only occur in the interior of this ``brick'' subtree,
never as its root or its leaf.
An internal symbol instance \textbf{matches}
an external symbol instance if and only if
\item they have the same symbol;
\item they have the same left location; and
\item they have the same right location.
For a rewrite to be ``safe'':
\item Every brick symbol must translate to exactly one
external symbol
\item Every terminal symbol instance must be a brick symbol.
\item The internal and external input sequences must be the same length.
\item The internal and external input sequences must allow
a shared indexing
scheme, which indexes the input symbols consecutively from
left to right.
For every index \var{i},
the \var{i}'th symbol instance in the internal input sequence
must match
the \var{i}'th symbol instance in the external input sequence.
\item Every brick traversal must translate to an external rule
and vice versa,
as follows:
\item The brick root symbol instance must match
the LHS symbol instance of the external rule instance.
The brick frontier must be the same length as
the RHS of the rule.
The brick frontier and the external rule RHS must allow a shared indexing
which indexes both of them
from left to right.
For every index \var{i},
the \var{i}'th
instance in the brick frontier must
match the \var{i}'th symbol instance
of the external rule RHS.
\chapter{Dotted rules}
\dtitle{Dotted rule}
Let $\Vrule{r} \in \Crules$
be a rule.
Recall that \Vsize{r}
is the length of its RHS.
A dotted rule (type \dtype{DR}) is a duple, $[\Vrule{r}, \var{dotix}]$.
\var{dotix} is
\dfn{dot RHS index}
\dfn{dot index},
and is such that
$0 \le \var{dotix} \le \size{\Vrule{r}}$.
The dot index
indicates the extent to which
the rule has been recognized.
It is often
represented with a large raised dot,
so that if
[\Vsym{A} \de \Vsym{X} \Vsym{Y} \Vsym{Z}]
is a rule,
\Vdr{dr} = [\Vsym{A} \de \Vsym{X} \Vsym{Y} \mydot \Vsym{Z}]
is the dotted rule with the dot at
$\var{dotix} = 2$,
that is,
between \Vsym{Y} and \Vsym{Z}.
We write
for the dot index of
and \Vop{Rule}{x}%
for the rule of \Vdr{x}.
For example, where \Vdr{dr} is as in
\op{Dotix}{\Vdr{dr}} = 2 \\
\text{and} \quad
\op{Rule}{\Vdr{dr}} = [\Vsym{A} \de \Vsym{X} \Vsym{Y} \Vsym{Z}].
In the discussions to follow
we will also refer to a
``dot location''.
The dot location should not be confused with the
dot index.
Dot locations will be locations in the input,
and will require the dotted rule
to be placed in the context of an input,
as will be explained in
Chapter \ref{ch:earley-items}.
We will sometimes write the dotted rule as a duple,
for example,
we might write
[ [ \Vsym{A} \de \Vsym{X} \Vsym{Y} \Vsym{Z} ], 2 ] \\
or redundantly, as
[ [ \Vsym{A} \de \Vsym{X} \Vsym{Y} \mydot \Vsym{Z} ], 2 ].
\dtitle{Rule notions applied to dotted rules}
Whenever we apply a rule notion to a dotted rule,
call it \Vdr{dr},
we mean to apply that notion to
the rule of the dotted rule,
or \Rule{\Vdr{dr}}.
\Postdot{\Vdr{x}} \defined
& \Vsym{B}, \; && \text{if $\Vdr{x} = [\Vsym{A} \de \Vstr{s1} \mydot \Vsym{B} \cat \Vstr{s2}]$} \\
& \undefined, \; && \text{if $\Vdr{x} = [\Vsym{A} \de \Vstr{rhs} \mydot]$}
\Predot{\Vdr{x}} \defined
& \Vsym{B}, \; && \text{if $\Vdr{x} = [\Vsym{A} \de \Vstr{s1} \cat \Vsym{B} \mydot \Vstr{s2}]$} \\
& \undefined, \; && \text{if $\Vdr{x} = [\Vsym{A} \de \mydot \Vstr{rhs} ]$}
\Next{\Vdr{x}} \defined
[\Vsym{A} \de \Vstr{s1} \cat \Vsym{B} \mydot \Vstr{s2}], \\
& \qquad && \text{if $\Vdr{x} =
[\Vsym{A} \de \Vstr{s1} \mydot \Vsym{B} \cat \Vstr{s2}]$} \\
& \undefined, && \text{if $\Vdr{x} = [\Vsym{A} \de \Vstr{rhs} \mydot]$}
\Prev{\Vdr{x}} \defined
[\Vsym{A} \de \Vstr{s1} \mydot \Vsym{B} \cat \Vstr{s2}], \\
& \qquad && \text{if $\Vdr{x} =
[\Vsym{A} \de \Vstr{s1} \cat \Vsym{B} \mydot \Vstr{s2}]$} \\
& \undefined, && \text{if $\Vdr{x} = [\Vsym{A} \de \mydot \Vstr{rhs} ]$}
\dtitle{Prefix and suffix}
If a dotted rule is
[\Vsym{A} \de \Vstr{prefix} \mydot \Vstr{suffix}]
we say that \Vstr{prefix} is the
\dfn{dot prefix},
and that
\Vstr{suffix} is the
\dfn{dot suffix}.
The \dfn{start dotted rule} is
\Vdr{start} = [\Vsym{accept} \de \mydot \Vsym{start} ].
The \dfn{accept dotted rule} is
\Vdr{accept} = [\Vsym{accept} \de \Vsym{start} \mydot ].
We divide all dotted rules into five disjoint types:
start, prediction, null-scan, read and reduction.
The start dotted rule was defined
in \eqref{eq:start-rule-def-10}.
Its type is
\xdfn{start}{start (dotted rule)}.
If a rule does not have a predot symbol and is not the start dotted rule,
it is a
\xdfn{predicted dotted rule}{predicted (dotted rule)}
or a
\xdfn{prediction}{prediction (dotted rule)}.
A predicted dotted rule
always has a dot position of zero,
for example,
\Vdr{predicted} = [\Vsym{A} \de \mydot \Vstr{alpha} ].
If a rule does have a predot symbol and that symbol is a nulling terminal,
that rule
is a
\xdfn{null-scan}{null-scan (dotted rule)}
dotted rule.
If a rule does have a predot symbol and that symbol is a telluric terminal,
the rule is a
\xdfn{read}{read (dotted rule)}
dotted rule.
If a rule does have a predot symbol and that symbol is a non-terminal
it is a
\xdfn{reduced}{reduced (dotted rule)}
dotted rule,
or a
\xdfn{reduction}{reduction (dotted rule)}.
\dtitle{Ethereal and telluric dotted rules}
A start dotted rule,
or a dotted rule with a telluric predot symbol
is called
a \xdfn{telluric}{telluric!DR}
dotted rule.
All other dotted rules are called
dotted rules.
The idea is that telluric dotted rules are ``grounded'' either
in the input or in the initial state of the parse,
while ethereal dotted rules emerge out of an ``invisible'' realm.
\ttitle{Ethereal and telluric dotted rules}
Prediction and null-scan dotted rules are ethereal
dotted rules.
Start, reduction and read dotted rules are telluric.
The theorem follows from
\dref[start DR]{def:start-dr},
\dref[prediction DR]{def:prediction-dr},
\dref[null-scan DR]{def:null-scan-dr},
\dref[read DR]{def:read-dr},
\dref[reduction DR]{def:reduction-dr},
\dref[telluric and ethereal DR's]{def:telluric-dr}.
\dtitle{Confirmed dotted rules}
\xdfn{confirmed dotted rule}{confirmed (dotted rule)},
\xdfn{confirmation}{confirmation (dotted rule)},
is a dotted rule
with a dot position greater than zero.
\xdfn{complete dotted rule}{complete (dotted rule)},
is a dotted rule with its dot
position after the end of its RHS,
for example,
\Vdr{complete} = [\Vsym{A} \de \Vstr{alpha} \mydot ].
\dtitle{Penultimate dotted rules}
\xdfn{penultimate dotted rule}{penultimate (dotted rule)},
or a
\xdfn{penult}{penult (dotted rule)},
is a dotted rule with exactly one symbol
between its
dot position and the end of its RHS,
for example,
\Vdr{penult} = [\Vsym{A} \de \Vstr{alpha} \mydot \Vsym{B} ].
When classifying dotted rules,
it is often convenient
to ignore the effect of nulling symbols.
Intuitively, if a dotted rule is of the kind ``X'',
then a quasi-X dotted rule is a dotted rule that would be
of kind X, if it were not for its nulling symbols.
A dotted rule which has only nulling symbols in its dot
suffix is
\xdfn{quasi-complete}{quasi-complete (dotted rule)}.
A quasi-complete dotted rule is called a
\xdfn{quasi-completion}{quasi-completion (dotted rule)}.
A dotted rule which has exactly one telluric symbol in
its dot suffix is
\xdfn{quasi-penultimate}{quasi-penultimate (dotted rule)}.
A quasi-penultimate dotted rule is called a
\xdfn{quasi-penult}{quasi-penult (dotted rule)}.
If a dotted rule is not quasi-complete,
it is said to be
\xdfn{quasi-incomplete}{quasi-incomplete (dotted rule)}.
A dotted rule which has only nulling symbols before the dot
is a
\xdfn{quasi-predicted}{quasi-predicted (dotted rule)}
dotted rule,
or a
\xdfn{quasi-prediction}{quasi-prediction (dotted rule)}.
If a dotted rule is not a
then it is a
\xdfn{quasi-confirmed}{quasi-confirmed (dotted rule)}
dotted rule,
or a
\xdfn{quasi-confirmation}{quasi-confirmation (dotted rule)}.
The definitions of the quasi-types may be satisfied vacuously:
for example,
all complete dotted rules are quasi-complete dotted rules
all predicted dotted rules are quasi-predicted dotted rules.
\dtitle{Completion of a dotted rule}
\Vdr{quasi} & = [\Vsym{A} \de \Vstr{alpha} \mydot \Vstr{nulls} ] \\
\Vdr{completion} & = [\Vsym{A} \de \Vstr{alpha} \cat \Vstr{nulls} \mydot ] \\
& \qquad \text{where} \quad \Vstr{nulls} = \epsilon
is a pair of dotted rules,
we say that \Vdr{completion} is the
\xdfn{completion dotted rule}{completion (dotted rule)!wrt another dotted rule}
of the quasi-complete dotted rule \Vdr{quasi}.
This definition may be satisfied vacuously:
every complete dotted rule is its own completion.
\ttitle{Quasi-predicted dotted rule is not quasi-complete}
In Marpa grammars,
no quasi-complete dotted rule
is a quasi-predicted dotted rule.
The rewrite of
Marpa grammars
eliminates all nullable rules.
So every rule must have a telluric symbol.
In a dotted rule, therefore,
there must be
at least one telluric symbol
and it must come either before the dot
or after it.
If a telluric symbol comes before the dot,
the dotted rule might be quasi-complete,
but it cannot be a quasi-prediction.
If a telluric symbol comes after the dot,
the dotted rule might be a quasi-prediction,
but it cannot be quasi-complete.
\section{Fleeting and lasting bases}
A dotted rule with a null predot symbol is called a
\xdfn{fleeting base}{fleeting base (dotted rule)}.
Any other dotted rule is called
\xdfn{lasting base}{lasting base (dotted rule)}.
A prediction is always a lasting base.
Every dotted rule,
even if it is not a lasting base itself,
has a
\xdfn{lasting base}{lasting base (dotted rule)!wrt another dotted rule}
\Vdr{bas} is the lasting base of \Vdr{dr}
if and only if
\Vdr{bas} is a lasting base,
\forall \; \var{i} : 0 \le \var{i} < \Dotix{\Vdr{dr}}
\implies \RHS{\Vdr{dr}, \var{i}} = \epsilon
Note that
may be satisfied vacuously ---
a prediction is its own lasting base.
\ttitle{Dotted rule lasting base}
Every dotted rule has a lasting base.
The lasting base of a dotted rule is another,
not necessarily distinct, dotted rule.
Let \Vdr{dr} be a dotted rule.
\Predot{\Vdr{dr}} \neq \epsilon \; \lor \;
\Predot{\Vdr{dr}} = \undefined,
then \Vdr{dr} is its own lasting base.
If $\Predot{\Vdr{dr}} = \epsilon$ so that,
without loss of generality,
\Vdr{dr} = [\Vsym{A} \de \Vstr{before} \Vstr{nulls} \Vsym{nul} \mydot \Vstr{after} ]
where $\Vstr{nulls} = \epsilon$
and $\Vsym{nul} = \epsilon$,
then the
lasting base
of \Vdr{dr} is
[\Vsym{A} \de \Vstr{before} \mydot \Vstr{nulls} \Vsym{nul} \Vstr{after} ]
\section{The transition function}
We define
a partial transition function from
pairs of dotted rule and symbol
to sets of dotted rules.
\GOTO: \Cdr, (\epsilon \cup \var{vocab}) \mapsto 2^\Cdr.
$\GOTO(\Vdr{from}, \epsilon)$ is a
\dfn{null transition}
and its result is a \dfn{null transition set}.
``null'' is an overloaded term,
so we more often call the null transition
an \dfn{ethereal transition}
and the null transition set
an \dfn{ethereal transition set}.
If a transition is not an ethereal transition,
it is a \dfn{telluric transition},
and if a transition set
is not an ethereal transition set,
it is a \dfn{telluric transition set}.
A telluric transition set is always the empty set
or a singleton set.
Only ethereal transition sets have
cardinalities greater than one.
The dotted rules in the set that results from an ethereal transition
will be either predictions or confirmed rules with
a nulling predot symbol.
Where the transition is over a symbol,
call it \Vsym{A},
\GOTO(\Vdr{from}, \Vsym{A}) = \\
& \left\lbrace \Next{\Vdr{from}} \right\rbrace,
&& \text{if $\Vsym{A} = \Postdot{\Vdr{from}}$} \\
& \emptyset,
&& \text{otherwise}
Ethereal transitions are more complicated,
but their analysis will come in useful later.
Let \var{null-scan-dr-op} be the set of
pairs of dotted rules
\left\lbrace \Vdr{cause}, \Vdr{effect} \right\rbrace \quad \text{such that} \\
\Vdr{effect} = \Next{\Vdr{cause}} \quad \text{and} \\
\Predot{\Vdr{effect}} \derives \epsilon.
\dtitle{Causes of null scan dotted rule}
In equation
we say that \Vdr{cause} is the
\xdfn{top-down cause}{top-down cause!DR as top-down cause of null-scan DR}
of \Vdr{effect},
and that \Predot{\Vdr{effect}} is the
\xdfn{bottom-up cause}{bottom-up cause!DR as bottom-up cause of null-scan DR}
of \Vdr{effect}.
We can use \var{null-scan-dr-op} to define an equivalence relation.
Intuitively, two dotted rules, \Vdr{dr1}
and \Vdr{dr2} are \dfn{ethereally equivalent} if
\Vdr{dr1} can be changed into \Vdr{dr2}
by iteration of \var{null-scan-dr-op}.
More formally, we define \var{eth-eq} to be the reflexive, symmetric
and transitive closure of
We say that \Vdr{dr1} is
\dfn{ethereally equivalent} to \Vdr{dr2} if
and only if
\Vdr{dr1} is an element of the
equivalence class of \var{eth-eq} under \Vdr{dr2}.
Let \var{predict-dr-op} be the set of
pairs of dotted rules
\left\lbrace \Vdr{cause}, \Vdr{effect} \right\rbrace \quad \text{such that} \\
\Postdot{\Vdr{cause}} = \LHS{\Vdr{effect}} \quad \text{and} \\
\Dotix{\Vdr{effect}} = 0
\dtitle{Causes of predicted dotted rules}
In equation
we say that \Vdr{cause} is the
\xdfn{top-down cause}{top-down cause!DR as top-down cause of a predicted DR}
of \Vdr{effect}.
For symmetry with other types of dotted rule,
we say that \Vdr{effect}
has a bottom-up cause,
but that the
\xdfn{bottom-up cause}{bottom-up cause!DR, as bottom-up cause of a predicted DR}
of \Vdr{effect}
is ethereal.
\var{epsilon-dr-op} = \var{null-scan-dr-op} \cup \var{predict-dr-op}.
We are now in a position to define the ethereal transition of \GOTO{}
from the dotted rule \Vdr{base}.
It is the transitive closure of \var{epsilon-dr-op}
over the singleton set containing the dotted rule argument of \GOTO{}
if there is a postdot symbol.
Otherwise it is the empty set.
\GOTO(\Vdr{from}, \epsilon) = \\
& \var{epsilon-dr-op}^+(\lbrace \Vdr{from} \rbrace),
&& \text{if $\Postdot{\Vdr{from}} \neq \undefined$} \\
& \emptyset,
&& \text{otherwise}
\xdfn{ethereal closure}{ethereal closure (of a dotted rule)}
is the reflexive and transitive closure of
\[\var{ethereal-dr-closure} \defined \var{epsilon-dr-op}^\ast.\]
We say that
the ethereal closure for a dotted rule is the ethereal closure of the singleton
set containing that dotted rule:
\var{ethereal-dr-closure}(\Vdr{base}) \defined \var{ethereal-dr-closure}(\lbrace \Vdr{base} \rbrace)
\Vdrset{ec} = \var{ethereal-dr-closure}(\lbrace \Vdr{base} \rbrace).
We also call \Vdrset{ec} an \dfn{ethereal closure}
and we say that \Vdr{base} is its \dfn{base}.
If \Vdr{base} is telluric, we say that
\Vdr{base} is a \dfn{telluric base}.
We call \Vdr{tell} a telluric base
of a dotted rule \Vdr{dr2} if and only if
it is telluric and
\[ \Vdr{dr2} \in \var{ethereal-dr-closure}(\lbrace \Vdr{tell} \rbrace). \]
\section{Ethereal closures}
\ttitle{Ethereal equivalents have same telluric base}
If the dotted rules \Vdr{dr1}
and \Vdr{dr2} are ethereally equivalent,
and \Vdr{dr1} is quasi-confirmed,
then \Vdr{dr1}
and \Vdr{dr2} have the same telluric base.
By assumption for the theorem,
\Vdr{dr1} is quasi-confirmed,
so that
by Theorem \ref{t:quasi-drs-disjoint},
is not a quasi-prediction.
Therefore, \Vdr{dr1} has a telluric symbol in its
dot prefix.
\Vdr{dr1} has a telluric base.
Without loss of generality,
we let
\Vdr{dr1} = [ \Vsym{A} \de \Vstr{pre} \cat \Vsym{tell} \cat \Vstr{nulls1} \mydot \Vstr{post1} ],
and let the telluric base be
\Vdr{tell} = [ \Vsym{A} \de \Vstr{pre} \cat \Vsym{tell} \mydot \Vstr{nulls1} \cat \Vstr{post1} ],
where $\Vstr{nulls1} = \epsilon$.
We now proceed by overlapping cases.
In the first case,
the dot in \Vdr{dr2} comes at or after the dot in \Vdr{dr1}.
Since \Vdr{dr2} is ethereally equivalent
we have,
if we rewrite \Vstr{post1}
as $\Vstr{nulls2} \cat \Vstr{post2}$,
\Vdr{dr2} =
[ \Vsym{A} \de \Vstr{pre} \cat \Vsym{tell} \cat \Vstr{nulls1} \cat \Vstr{nulls2} \mydot \Vstr{post2} ].
Therefore, from
and the definition of ethereal closure,
\[ \Vdr{dr2} \in \var{ethereal-dr-closure}(\lbrace \Vdr{tell} \rbrace).\]
By the definition of telluric base,
\Vdr{tell} is the telluric base of \Vdr{dr2}.
In the second case
the dot in \Vdr{dr2} comes at or before the dot in \Vdr{dr1}.
We may write
\Vdr{dr2} = [ \Vsym{A} \de \Vstr{pre} \cat \Vsym{tell} \cat \Vstr{nulls1a} \mydot \Vstr{nulls1b} \cat \Vstr{post1} ],
where $\Vstr{nulls1} = \Vstr{nulls1a} \cat \Vstr{nulls1b}.$
by the definition of telluric base,
\Vdr{tell} is the telluric base of \Vdr{dr2}.
In both cases,
we have shown that
\Vdr{tell} is the telluric base of \Vdr{dr2}.
\ttitle{Telluric base of a quasi-confirmed dotted rule is unique}
If a dotted rule is quasi-confirmed,
its telluric base is unique.
Let the dotted rule be \Vdr{dr}.
This theorem follows
from Theorem \ref{t:eth-eq-share-telluric-base},
if you set both of its dotted rules to \Vdr{dr}.
The complexity of the ethereal closure is of interest:
we may want to compute it on the fly,
and in any case,
we certainly want to show that
the ethereal closure has finite time complexity.
\algtitle{Add a generation to the ethereal closure}{alg:ethereal-generation}
\Procedure{Ethereal next}{\Vdr{this}, \Vdrset{results}, \Vdrset{work}}
\If{\Vdr{this} has no postdot symbol}
\State return
\State Here \Vdr{this} is $[ \Vsym{lhs} \de \Vstr{before} \mydot \Vsym{A} \cat \Vstr{after} ]$
\State \Comment We can state this without loss of generality
\If{$\Vsym{A}$ is a nulling symbol}
\State $\Vdr{new} \gets [ \Vsym{lhs} \de \Vstr{before} \cat \Vsym{A} \mydot \Vstr{after} ]$
\State Add \Vdr{new} to \Vdrset{results} \ldots
\State $\qquad$ but only if it has never been added before
\State Add \Vdr{new} to \Vdrset{work} \ldots
\State $\qquad$ but only if it has never been added before
\State return
\State Here \Vsym{A} must be a telluric symbol
\For{ each \Vrule{r} in \Cg{}}
\If{ $\LHS{\Vrule{r}} = \Vsym{A}$ }
\State Here \Vrule{r} is $[ \Vsym{A} \de \Vstr{rhs} ]$
\State \Comment We can state this without loss of generality
\State $\Vdr{new} \gets [ \Vsym{A} \de \mydot \Vstr{rhs} ]$
\State Add \Vdr{new} to \Vdrset{results} \ldots
\State $\qquad$ but only if it has never been added before
\State Add \Vdr{new} to \Vdrset{work} \ldots
\State $\qquad$ but only if it has never been added before
\State return
\algtitle{Create ethereal closure}{alg:ethereal-closure}
\Function{Create ethereal closure}{\Vdr{base}}
\State $\Vdrset{result} \gets \emptyset$
\State $\Vdrset{work} \gets \emptyset$
\State \Call{Ethereal next}{\Vdr{base}, \Vdrset{result}, \Vdrset{work}}
\While{$\Vdrset{work} \neq \emptyset$}
\State Remove a dotted rule from \Vdrset{work}, call it \Vdr{this}
\State \Call{Ethereal next}{\Vdr{this}, \Vdrset{result}, \Vdrset{work}}
\State return \Vdrset{result}
Algorithm \ref{alg:ethereal-closure}
is not actually used by any
of Marpa's versions ---
it is chosen because it is
convenient for exploring the theory.
In the actual implementation,
null-scans are dealt with implicitly,
while predictions are explicitly computed after
each Earley set is otherwise complete.
\ttitle{Ethereal closure is constant time}
Ethereal closure has time complexity \Oc{}.
We consider Algorithm \ref{alg:ethereal-closure}.
This clearly runs in \Oc{} time if there is a constant
number of calls to
Algorithm \ref{alg:ethereal-generation}.
To finish the proof, we need to show
Algorithm \ref{alg:ethereal-generation}
is called a constant number
of times.
Algorithm \ref{alg:ethereal-generation}
is called
once for the base dotted rule of the computation.
It is called again for every dotted rule added to the working set
of dotted rules, \Vdrset{work}.
We know that no dotted rule is added to
\Vdrset{work} twice.
Algorithm \ref{alg:ethereal-generation}
is called
at most once for each dotted rule.
\Cg{} has a fixed number of dotted rules,
so that Algorithm \ref{alg:ethereal-generation}
is called
at most \Oc{} times.
\ttitle{Ethereal closure algorithm is correct}
Algorithm \ref{alg:ethereal-closure} is correct.
From examining
Algorithm \ref{alg:ethereal-closure},
in particular
Algorithm \ref{alg:ethereal-generation},
we see that
the null transitions
for nulling postdot symbols are complete and consistent,
and therefore correct.
From examining
Algorithm \ref{alg:ethereal-closure},
in particular
Algorithm \ref{alg:ethereal-generation},
we see that the null transitions for predictions are
properly made,
so that
the set of predictions is consistent.
It remains to show that the set of predictions is complete.
Algorithm \ref{alg:ethereal-closure}
clearly adds all predictions derivable in a single step
to its results.
It also
calls the ``Ethereal next'' function
repeatedly, so that indirect predictions will be added.
But it will refuse to
add a dotted rule to its working set more than once.
We need to consider whether this means some predictions
will not be derived.
Consider a prediction
[ \Vsym{lhs-pred} \de \mydot \Vsym{pred-rhs} ]
which is derived through a series of dotted rule predictions
added to the work list
in Algorithm \ref{alg:ethereal-generation}.
For a reductio,
assume that one prediction,
call it
[ \Vsym{lhs-dup} \de \mydot \Vsym{rhs-dup} ],
occurs twice.
Without loss of generality, let that chain be
& [ \Vsym{lhs0} \de \mydot \Vsym{rhs0} ] && \text{Step 0} \\
& [ \Vsym{lhs1} \de \mydot \Vsym{rhs1} ] && \text{Step 1} \\
& \ldots && \\
& [ \Vsym{lhs-predup} \de \mydot \Vsym{lhs-dup} \cat \Vstr{after-predup} ] && \text{Step \Vdecr{i}} \\
& [ \Vsym{lhs-dup} \de \mydot \Vsym{rhs-dup} ] && \text{Step \var{i}} \\
& \ldots && \\
& [ \Vsym{lhs-predup2} \de \mydot \Vsym{lhs-dup} \cat \Vstr{after-predup2} ] && \text{Step \Vdecr{j}} \\
& [ \Vsym{lhs-dup} \de \mydot \Vsym{rhs-dup} ] && \text{Step \var{j}}\\
& \ldots && \\
& [ \Vsym{lhs-penult} \de \mydot \Vsym{rhs-penult} ] && \\
& [ \Vsym{lhs-last} \de \mydot \Vsym{rhs-last} ]
where Step \var{i} is the first occurrence of
[ \Vsym{lhs-dup} \de \mydot \Vsym{rhs-dup} ],
and Step \var{j} is the last.
We can create a shorter chain of predictions by removing the steps in
the chain from Step $\var{i}+1$ to Step \var{j}.
Call this process of removing steps, ``pruning duplicates''.
By pruning duplicates for every prediction which occurs twice in
the chain,
we see that we can create a chain that results in
\eqref{ethereal-closure-correct-5},
but which does not contain any prediction more than once.
We can also see that, since
Algorithm \ref{alg:ethereal-closure} follows all chains
that contain no duplicate predictions,
Algorithm \ref{alg:ethereal-closure} will add
\eqref{ethereal-closure-correct-5} to its result.
Since \eqref{ethereal-closure-correct-5} was chosen without loss
of generality,
we see that every prediction can be reached by following
a chain of predictions with no duplicate predictions,
and that therefore
Algorithm \ref{alg:ethereal-closure} adds a complete
set of predictions.
We now summarize our results.
By definition, the ethereal closure is the
transitive closure of the union
of predictions and null-scans.
In \eqref{ethereal-closure-correct-2}
we showed that the sets of
null-scans added are correct and,
in \eqref{ethereal-closure-correct-3},
that the set of predictions added is consistent.
We also showed
that the set of predictions
added is complete.
This shows the theorem.
\chapter{Earley items}
An Earley item (type \dtype{EIM})
is a triple
[\Vdr{dotted-rule}, \Vorig{x}, \Vloc{current} ]
\index{recce-notation}{[3]@[ dr, origin, current ]}%
\index{recce-notation}{[3]@[ dr, origin, current ]!EIM as 3-tuple}
of dotted rule, origin, and current location.
The \dfn{origin} is the location where recognition of the rule
begins.
(It is sometimes called the ``parent''.)
The \dfn{current} or \dfn{dot location} is the location
in the input, \Cw{}, of the dot position in \Vdr{dotted-rule}.
For convenience, the type \dtype{ORIG} will be a synonym
for \type{LOC}, indicating that the variable designates
the origin element of an Earley item.
\text{Where $\Veim{x} = [\Vdr{x}, \Vorig{x}, \Vloc{x}]$ we say that} \\
\DR{\Veim{x}} & = \Vdr{x}, \\
\Origin{\Veim{x}} & = \Vorig{x}, \\
\Current{\Veim{x}} & = \Vloc{x}, \\
\Left{\Veim{x}} & = \Vorig{x}, \; \text{and} \\
\Right{\Veim{x}} & = \Vloc{x}. \\
Traditionally, an Earley item is shown as a duple,
[\Vdr{dotted-rule}, \Vorig{x} ]
\index{recce-notation}{[2]@[ dr, origin ]}%
\index{recce-notation}{[2]@[ dr, origin ]!EIM as duple}
with \Vloc{current} omitted,
and we will sometimes use this form.
When the duple form is used,
the current location is specified by the context,
either explicitly or implicitly.
\dtitle{Dotted rule notions applied to EIMs}
Whenever we apply a dotted rule notion to an EIM,
we mean to apply that notion to the dotted rule of the EIM.
For example, a
\xdfn{complete EIM}{complete (EIM)!wrt another EIM}
is an EIM with a complete
dotted rule, and a
\xdfn{predicted EIM}{predicted (EIM)}
is an EIM with a predicted dotted rule.
If \Veim{quasi} = [ \Vdr{quasi}, \var{i}, \var{j} ]
is a quasi-complete EIM,
then its
\xdfn{completion EIM}{completion EIM!wrt another EIM}
\xdfn{completion (EIM)}{completion (EIM)!wrt another EIM}
is
\big[ [ \Rule{\Veim{quasi}}, \Vdecr{\Vsize{\Rule{\Veim{quasi}}}} ],
\var{i}, \var{j} \big].
\dtitle{Rule notions applied to EIMs}
Whenever we apply a rule notion to an EIM,
call it \Veim{e},
we mean to apply that notion to
the rule of the dotted rule of the EIM,
or \Rule{\DR{\Veim{e}}}.
\dtitle{Start EIM}
The \dfn{start EIM} is
\Veim{start} = [ [\Vsym{accept} \de \mydot \Vsym{start} ], 0, 0 ].
\dtitle{Accept EIM}
The \dfn{accept EIM} is
\Veim{accept} = [ [\Vsym{accept} \de \Vsym{start} \mydot ], 0, \Vsize{\Cw} ].
\ttitle{Earley item types}
Every EIM falls into one of these
five disjoint types:
start, prediction, read, null-scan and reduction.
Recall that EIM's take their type from their dotted rule.
The proof then follows directly from the definitions,
including Definition \ref{def:reduction-dr}.
\dtitle{Telluric and ethereal EIM's}
An EIM is \dfn{telluric}
if its dotted rule is telluric,
and \dfn{ethereal}
if its dotted rule is ethereal.
\ttitle{Telluric and ethereal EIM's}
An EIM is telluric if it is the
start EIM,
a read EIM or a reduction EIM.
An EIM is ethereal if it is a null-scan
EIM or a prediction EIM.
The theorem follows directly from
\tref{t:telluric-dr} and
\dref[telluric EIM's]{def:telluric-eim}.
A \xdfn{located symbol}{located symbol@located symbol|mysee{locsym}}
is a duple consisting of a symbol
and a parse location:
< \Vsym{sym}, \Vloc{loc} >.
\dtitle{Locsym of an EIM}
\qdfn{external locsym}%
\index{recce-definitions}{external locsym@external locsym!of an EIM}
of the EIM \Veim{eim}
is <\LHS{\Veim{eim}}, \Left{\Veim{eim}}>.
References to the
\index{recce-definitions}{locsym@locsym!of an EIM}%
\index{recce-definitions}{locsym@locsym|seealso {external locsym}}
of an EIM
are to its external locsym.
We also write the locsym of \Veim{eim} as \LSY{\Veim{eim}}.
\dtitle{Locsym of a parse instance}
\index{recce-definitions}{locsym@locsym!of an INST}
of the parse instance \Vinst{inst}
is <\Symbol{\Vinst{inst}}, \Left{\Vinst{inst}}>.
We also write the locsym of \Vinst{inst} as \LSY{\Vinst{inst}}.
\dtitle{Postdot locsym}
\qdfn{postdot locsym}%
\index{recce-definitions}{postdot locsym@postdot locsym!of an EIM}
of the EIM \Veim{eim}
is <\Postdot{\Veim{eim}}, \Current{\Veim{eim}}>,