l3kernel/l3regex.dtx

% \iffalse meta-comment
%
%% File: l3regex.dtx
%
% Copyright (C) 2011-2021 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version.  The latest version
% of this license is in the file
%
%    https://www.latex-project.org/lppl.txt
%
% This file is part of the "l3kernel bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
%    https://github.com/latex3/latex3
%
% for those people who are interested.
%
%<*driver>
\documentclass[full,kernel]{l3doc}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
%
% \title{^^A
%   The \textsf{l3regex} package: Regular expressions in \TeX{}^^A
% }
%
% \author{^^A
%  The \LaTeX{} Project\thanks
%    {^^A
%      E-mail:
%        \href{mailto:latex-team@latex-project.org}
%          {latex-team@latex-project.org}^^A
%    }^^A
% }
%
% \date{Released 2021-02-18}
%
% \maketitle
%
% \begin{documentation}
% \newenvironment{l3regex-syntax}
%   {\begin{itemize}\def\\{\char`\\}\def\makelabel##1{\hss\llap{\ttfamily##1}}}
%   {\end{itemize}}
%
% The \pkg{l3regex} package provides regular expression testing,
% extraction of submatches, splitting, and replacement, all acting
% on token lists. The syntax of regular expressions is mostly a subset
% of the \textsc{pcre} syntax (and very close to \textsc{posix}),
% with some additions
% due to the fact that \TeX{} manipulates tokens rather than characters.
% For performance reasons, only a limited set of features are implemented.
% Notably, back-references are not supported.
%
% Let us give a few examples. After
% \begin{verbatim}
%   \tl_set:Nn \l_my_tl { That~cat. }
%   \regex_replace_once:nnN { at } { is } \l_my_tl
% \end{verbatim}
% the token list variable \cs{l_my_tl} holds the text
% \enquote{\texttt{This cat.}}, where the first
% occurrence of \enquote{\texttt{at}} was replaced
% by \enquote{\texttt{is}}. A more complicated example is
% a pattern to emphasize each word and add a comma after it:
% \begin{verbatim}
%   \regex_replace_all:nnN { \w+ } { \c{emph}\cB\{ \0 \cE\} , } \l_my_tl
% \end{verbatim}
% The |\w| sequence represents any \enquote{word} character, and |+|
% indicates that the |\w| sequence should be repeated as many times as
% possible (at least once), hence matching a word in the input token
% list. In the replacement text, |\0| denotes the full match (here, a
% word).  The command |\emph| is inserted using |\c{emph}|, and its
% argument |\0| is put between braces |\cB\{| and |\cE\}|.
%
% If a regular expression is to be used several times,
% it can be compiled once, and stored in a regex
% variable using \cs{regex_const:Nn}. For example,
% \begin{verbatim}
%   \regex_const:Nn \c_foo_regex { \c{begin} \cB. (\c[^BE].*) \cE. }
% \end{verbatim}
% stores in \cs{c_foo_regex} a regular expression which matches the
% starting marker for an environment: \cs{begin}, followed by a
% begin-group token (|\cB.|), then any number of tokens which are
% neither begin-group nor end-group character tokens (|\c[^BE].*|),
% ending with an end-group token (|\cE.|). As explained in the next
% section, the parentheses \enquote{capture} the result of |\c[^BE].*|,
% giving us access to the name of the environment when doing
% replacements.
%
% \section{Syntax of regular expressions}
%
% We start with a few examples, and encourage the reader to apply
% \cs{regex_show:n} to these regular expressions.
% \begin{itemize}
% \item |Cat| matches the word \enquote{Cat} capitalized in this way,
%   but also matches the beginning of the word \enquote{Cattle}: use
%   |\bCat\b| to match a complete word only.
% \item |[abc]| matches one letter among \enquote{a}, \enquote{b},
%   \enquote{c}; the pattern \verb"(a|b|c)" matches the same three
%   possible letters (but see the discussion of submatches below).
% \item |[A-Za-z]*| matches any number (due to the quantifier
%   \verb"*") of Latin letters (not accented).
% \item |\c{[A-Za-z]*}| matches a control sequence made of Latin
%   letters.
% \item |\_[^\_]*\_| matches an underscore, any number of characters
%   other than underscore, and another underscore; it is equivalent to
%   |\_.*?\_| where |.| matches arbitrary characters and the
%   lazy quantifier |*?| means to match as few characters as
%   possible, thus avoiding matching underscores.
% \item |[\+\-]?\d+| matches an explicit integer with at most one
%   sign.
% \item \verb*"[\+\-\ ]*\d+\ *" matches an explicit integer with any
%   number of $+$ and $-$ signs, with spaces allowed except within the
%   mantissa, and surrounded by spaces.
% \item \verb*"[\+\-\ ]*(\d+|\d*\.\d+)\ *" matches an explicit integer or
%   decimal number; using \verb*"[.,]" instead of \verb*"\." would allow
%   the comma as a decimal marker.
% \item
%   \verb*"[\+\-\ ]*(\d+|\d*\.\d+)\ *((?i)pt|in|[cem]m|ex|[bs]p|[dn]d|[pcn]c)\ *"
%   \allowbreak matches an explicit dimension with any unit that \TeX{} knows, where
%   \verb*"(?i)" means to treat lowercase and uppercase letters
%   identically.
% \item \verb*"[\+\-\ ]*((?i)nan|inf|(\d+|\d*\.\d+)(\ *e[\+\-\ ]*\d+)?)\ *"
%   matches an explicit floating point number or the special values
%   \verb*"nan" and \verb*"inf" (with signs and spaces allowed).
% \item \verb*"[\+\-\ ]*(\d+|\cC.)\ *" matches an explicit integer or
%   control sequence (without checking whether it is an integer
%   variable).
% \item |\G.*?\K| at the beginning of a regular expression matches and
%   discards (due to |\K|) everything between the end of the previous
%   match (|\G|) and what is matched by the rest of the regular
%   expression; this is useful in \cs{regex_replace_all:nnN} when the
%   goal is to extract matches or submatches in a finer way than with
%   \cs{regex_extract_all:nnN}.
% \end{itemize}
% While it is impossible for a regular expression to match only integer
% expressions, \verb*"[\+\-\(]*\d+\)*([\+\-*/][\+\-\(]*\d+\)*)*" matches among
% other things all valid integer expressions (made only with explicit
% integers).  One should follow it with further testing.
%
% Most characters match exactly themselves,
% with an arbitrary category code. Some characters are
% special and must be escaped with a backslash (\emph{e.g.}, |\*|
% matches a star character). Some escape sequences of
% the form backslash--letter also have a special meaning
% (for instance |\d| matches any digit). As a rule,
% \begin{itemize}
% \item every alphanumeric character (\texttt{A}--\texttt{Z},
%   \texttt{a}--\texttt{z}, \texttt{0}--\texttt{9}) matches
%   exactly itself, and should not be escaped, because
%   |\A|, |\B|, \ldots{} have special meanings;
% \item non-alphanumeric printable ascii characters can (and should)
%   always be escaped: many of them have special meanings (\emph{e.g.},
%   use |\(|, |\)|, |\?|, |\.|);
% \item spaces should always be escaped (even in character
%   classes);
% \item any other character may be escaped or not, without any
%   effect: both versions match exactly that character.
% \end{itemize}
% Note that these rules play nicely with the fact that many
% non-alphanumeric characters are difficult to input into \TeX{}
% under normal category codes. For instance, |\\abc\%|
% matches the characters |\abc%| (with arbitrary category codes),
% but does not match the control sequence |\abc| followed by a
% percent character. Matching control sequences can be done
% using the |\c|\Arg{regex} syntax (see below).
%
% Any special character which appears at a place where its special
% behaviour cannot apply matches itself instead (for instance, a
% quantifier appearing at the beginning of a string), after raising a
% warning.
%
% Characters.
% \begin{l3regex-syntax}
%   \item[\\x\{hh\ldots{}\}] Character with hex code \texttt{hh\ldots{}}.
%   \item[\\xhh] Character with hex code \texttt{hh}.
%   \item[\\a] Alarm (hex 07).
%   \item[\\e] Escape (hex 1B).
%   \item[\\f] Form-feed (hex 0C).
%   \item[\\n] New line (hex 0A).
%   \item[\\r] Carriage return (hex 0D).
%   \item[\\t] Horizontal tab (hex 09).
% \end{l3regex-syntax}
%
% Character types.
% \begin{l3regex-syntax}
%   \item[.] A single period matches any token.
%   \item[\\d] Any decimal digit.
%   \item[\\h] Any horizontal space character,
%     equivalent to |[\ \^^I]|: space and tab.
%   \item[\\s] Any space character,
%     equivalent to |[\ \^^I\^^J\^^L\^^M]|.
%   \item[\\v] Any vertical space character,
%     equivalent to |[\^^J\^^K\^^L\^^M]|. Note that |\^^K| is a vertical space,
%     but not a space, for compatibility with Perl.
%   \item[\\w] Any word character, \emph{i.e.},
%     alphanumerics and underscore, equivalent to the explicit
%     class |[A-Za-z0-9\_]|.
%   \item[\\D] Any token not matched by |\d|.
%   \item[\\H] Any token not matched by |\h|.
%   \item[\\N] Any token other than the |\n| character (hex 0A).
%   \item[\\S] Any token not matched by |\s|.
%   \item[\\V] Any token not matched by |\v|.
%   \item[\\W] Any token not matched by |\w|.
% \end{l3regex-syntax}
% Of those, |.|, |\D|, |\H|, |\N|, |\S|, |\V|, and |\W| match arbitrary
% control sequences.
%
% Character classes match exactly one token in the subject.
% \begin{l3regex-syntax}
%   \item[{[\ldots{}]}] Positive character class.
%     Matches any of the specified tokens.
%   \item[{[\char`\^\ldots{}]}] Negative character class.
%     Matches any token other than the specified characters.
%   \item[{x-y}] Within a character class, this denotes a range (can be
%     used with escaped characters).
%   \item[{[:\meta{name}:]}] Within a character class (one more set of
%     brackets), this denotes the \textsc{posix} character class
%     \meta{name}, which can be \texttt{alnum}, \texttt{alpha},
%     \texttt{ascii}, \texttt{blank}, \texttt{cntrl}, \texttt{digit},
%     \texttt{graph}, \texttt{lower}, \texttt{print}, \texttt{punct},
%     \texttt{space}, \texttt{upper}, \texttt{word}, or \texttt{xdigit}.
%   \item[{[:\char`\^\meta{name}:]}] Negative \textsc{posix} character class.
% \end{l3regex-syntax}
% For instance, |[a-oq-z\cC.]| matches any lowercase Latin letter
% except |p|, as well as control sequences (see below for a description
% of |\c|).
%
% Quantifiers (repetition).
% \begin{l3regex-syntax}
%   \item[?] $0$ or $1$, greedy.
%   \item[??] $0$ or $1$, lazy.
%   \item[*] $0$ or more, greedy.
%   \item[*?] $0$ or more, lazy.
%   \item[+] $1$ or more, greedy.
%   \item[+?] $1$ or more, lazy.
%   \item[\{$n$\}] Exactly $n$.
%   \item[\{$n,$\}] $n$ or more, greedy.
%   \item[\{$n,$\}?] $n$ or more, lazy.
%   \item[\{$n,m$\}] At least $n$, no more than $m$, greedy.
%   \item[\{$n,m$\}?] At least $n$, no more than $m$, lazy.
% \end{l3regex-syntax}
%
% Anchors and simple assertions.
% \begin{l3regex-syntax}
%   \item[\\b] Word boundary: either the previous token is matched by
%     |\w| and the next by |\W|, or the opposite. For this purpose,
%     the ends of the token list are considered as |\W|.
%   \item[\\B] Not a word boundary: between two |\w| tokens
%     or two |\W| tokens (including the boundary).
%   \item[\char`^ \textrm{or} \\A]
%     Start of the subject token list.
%   \item[\char`$\textrm{,} \\Z \textrm{or} \\z] ^^A $
%     End of the subject token list.
%   \item[\\G] Start of the current match. This is only different from |^|
%     in the case of multiple matches: for instance
%     |\regex_count:nnN { \G a } { aaba } \l_tmpa_int| yields $2$, but
%     replacing |\G| by |^| would result in \cs{l_tmpa_int} holding the
%     value $1$.
% \end{l3regex-syntax}
%
% Alternation and capturing groups.
% \begin{l3regex-syntax}
%   \item[A\char`|B\char`|C] Either one of \texttt{A}, \texttt{B},
%     or \texttt{C}.
%   \item[(\ldots{})] Capturing group.
%   \item[(?:\ldots{})] Non-capturing group.
%   \item[(?\char`|\ldots{})] Non-capturing group which resets
%     the group number for capturing groups in each alternative.
%     The following group is numbered with the first unused
%     group number.
% \end{l3regex-syntax}
%
% The |\c| escape sequence allows testing the category code of tokens
% and matching control sequences. Each character category is represented
% by a single uppercase letter:
% \begin{itemize}
% \item |C| for control sequences;
% \item |B| for begin-group tokens;
% \item |E| for end-group tokens;
% \item |M| for math shift;
% \item |T| for alignment tab tokens;
% \item |P| for macro parameter tokens;
% \item |U| for superscript tokens (up);
% \item |D| for subscript tokens (down);
% \item |S| for spaces;
% \item |L| for letters;
% \item |O| for others; and
% \item |A| for active characters.
% \end{itemize}
% The |\c| escape sequence is used as follows.
% \begin{l3regex-syntax}
%   \item[\\c\Arg{regex}] A control sequence whose csname matches the
%     \meta{regex}, anchored at the beginning and end, so that |\c{begin}|
%     matches exactly \cs{begin}, and nothing else.
%   \item[\\cX] Applies to the next object, which can be a character,
%     character property, class, or group, and forces this object to
%     only match tokens with category |X| (any of |CBEMTPUDSLOA|). For
%     instance, |\cL[A-Z\d]| matches uppercase letters and digits of
%     category code letter, |\cC.| matches any control sequence, and
%     |\cO(abc)| matches |abc| where each character has category other.
%   \item[{\\c[XYZ]}] Applies to the next object, and forces it to only
%     match tokens with category |X|, |Y|, or |Z| (each being any of
%     |CBEMTPUDSLOA|). For instance, |\c[LSO](..)| matches two tokens of
%     category letter, space, or other.
%   \item[{\\c[\char`\^XYZ]}] Applies to the next object and prevents it
%     from matching any token with category |X|, |Y|, or |Z| (each being
%     any of |CBEMTPUDSLOA|). For instance, |\c[^O]\d| matches digits
%     which have any category different from other.
% \end{l3regex-syntax}
% The category code tests can be used inside classes; for instance,
% |[\cO\d \c[LO][A-F]]| matches what \TeX{} considers as hexadecimal
% digits, namely digits with category other, or uppercase letters from
% |A| to |F| with category either letter or other. Within a group
% affected by a category code test, the outer test can be overridden by
% a nested test: for instance, |\cL(ab\cO\*cd)| matches |ab*cd| where
% all characters are of category letter, except |*| which has category
% other.
%
% The |\u| escape sequence allows inserting the contents of a token list
% directly into a regular expression or a replacement, avoiding the need
% to escape special characters. Namely, |\u|\Arg{var~name} matches
% the exact contents of the variable \cs[no-index]{\meta{var~name}},
% which are obtained by applying \cs{exp_not:v} \Arg{var~name} at the
% time the regular expression is compiled. Within a |\c{...}|
% control sequence matching, the |\u| escape sequence only expands its
% argument once, in effect performing \cs{tl_to_str:v}. Quantifiers are
% not supported directly: use a group, for instance as in
% |(?:\u|\Arg{var~name}|){2,4}|.
%
% The option |(?i)| makes the match case insensitive (identifying
% \texttt{A}--\texttt{Z} with \texttt{a}--\texttt{z}; no Unicode support
% yet). This applies until the end of the group in which it appears, and
% can be reverted using |(?-i)|. For instance, in
% \verb"(?i)(a(?-i)b|c)d", the letters |a| and |d| are affected by the
% |i| option. Characters within ranges and classes are affected
% individually: |(?i)[Y-\\]| is equivalent to |[YZ\[\\yz]|, and
% |(?i)[^aeiou]| matches any character which is not a vowel. Neither
% character properties, nor |\c{...}| nor |\u{...}| are affected by the
% |i| option.
% ^^A \]
%
% In character classes, only |[|, |^|, |-|, |]|, |\| and spaces are
% special, and should be escaped. Other non-alphanumeric characters can
% still be escaped without harm. Any escape sequence which matches a
% single character (|\d|, |\D|, \emph{etc.}) is supported in character
% classes.  If the first character is |^|, then
% the meaning of the character class is inverted; |^| appearing anywhere
% else in the range is not special.  If the first character (possibly
% following a leading |^|) is |]| then it does not need to be escaped
% since ending the range there would make it empty.
% Ranges of characters
% can be expressed using |-|, for instance, |[\D 0-5]| and |[^6-9]| are
% equivalent.
%
% Capturing groups are a means of extracting information about the
% match. Parenthesized groups are labelled in the order of their
% opening parenthesis, starting at $1$. The contents of those groups
% corresponding to the \enquote{best} match (leftmost longest)
% can be extracted and stored in a sequence of token lists using for
% instance \cs{regex_extract_once:nnNTF}.
%
% The |\K| escape sequence resets the beginning of the match to the
% current position in the token list. This only affects what is reported
% as the full match. For instance,
% \begin{verbatim}
%   \regex_extract_all:nnN { a \K . } { a123aaxyz } \l_foo_seq
% \end{verbatim}
% results in \cs{l_foo_seq} containing the items |{1}| and |{a}|: the
% true matches are |{a1}| and |{aa}|, but they are trimmed by the use of
% |\K|. The |\K| command does not affect capturing groups: for instance,
% \begin{verbatim}
%   \regex_extract_once:nnN { (. \K c)+ \d } { acbc3 } \l_foo_seq
% \end{verbatim}
% results in \cs{l_foo_seq} containing the items |{c3}| and |{bc}|: the
% true match is |{acbc3}|, with first submatch |{bc}|, but |\K| resets
% the beginning of the match to the last position where it appears.
%
% \section{Syntax of the replacement text}
%
% Most of the features described in regular expressions do not make
% sense within the replacement text.  Backslash introduces various
% special constructions, described further below:
% \begin{itemize}
%   \item |\0| is the whole match;
%   \item |\1| is the submatch that was matched by the first (capturing)
%     group |(...)|; similarly for |\2|, \ldots{}, |\9| and
%     |\g{|\meta{number}|}|;
%   \item \verb*|\ | inserts a space (spaces are ignored when not
%     escaped);
%   \item |\a|, |\e|, |\f|, |\n|, |\r|, |\t|, |\xhh|, |\x{hhh}|
%     correspond to single characters as in regular expressions;
%   \item |\c|\Arg{cs~name} inserts a control sequence;
%   \item |\c|\meta{category}\meta{character} (see below);
%   \item |\u|\Arg{tl~var~name} inserts the contents of the
%     \meta{tl~var} (see below).
% \end{itemize}
% Characters other than backslash and space are simply inserted in the
% result (but since the replacement text is first converted to a string,
% one should also escape characters that are special for \TeX{}, for
% instance use~|\#|).  Non-alphanumeric characters can always be safely
% escaped with a backslash.
%
% For instance,
% \begin{verbatim}
%   \tl_set:Nn \l_my_tl { Hello,~world! }
%   \regex_replace_all:nnN { ([er]?l|o) . } { (\0--\1) } \l_my_tl
% \end{verbatim}
% results in \cs{l_my_tl} holding |H(ell--el)(o,--o) w(or--o)(ld--l)!|.
%
% The submatches are numbered according to the order in which the
% opening parentheses of capturing groups appear in the regular
% expression to match.  The $n$-th submatch is empty if there are fewer
% than $n$ capturing groups or for capturing groups that appear in
% alternatives that were not used for the match.  In case a capturing
% group matches several times during a match (due to quantifiers) only
% the last match is used in the replacement text. Submatches always keep
% the same category codes as in the original token list.
%
% By default, the category codes of characters inserted by the
% replacement are determined by the prevailing category code regime at
% the time where the replacement is made, with two exceptions:
% \begin{itemize}
% \item space characters (with character code $32$) inserted with
%   \verb*|\ | or |\x20| or |\x{20}| have category code~$10$ regardless
%   of the prevailing category code regime;
% \item if the category code would be $0$~(escape), $5$~(newline),
%   $9$~(ignore), $14$~(comment) or $15$~(invalid), it is replaced by
%   $12$~(other) instead.
% \end{itemize}
% The escape sequence |\c| allows inserting characters
% with arbitrary category codes, as well as control sequences.
% \begin{l3regex-syntax}
% \item[\\cX(\ldots{})] Produces the characters \enquote{\ldots{}} with
%   category~|X|, which must be one of |CBEMTPUDSLOA| as in regular
%   expressions.  Parentheses are optional for a single character (which
%   can be an escape sequence).  When nested, the innermost category
%   code applies, for instance |\cL(Hello\cS\ world)!| gives this text
%   with standard category codes.
% \item[\\c\Arg{text}] Produces the control sequence with csname
%   \meta{text}.  The \meta{text} may contain references to the
%   submatches |\0|, |\1|, and so on, as in the example for |\u| below.
% \end{l3regex-syntax}
%
% The escape sequence |\u|\Arg{tl~var~name} allows inserting the
% contents of the token list with name \meta{tl~var~name} directly into
% the replacement, giving an easier control of category codes.  When
% nested in |\c{|\ldots{}|}| and |\u{|\ldots{}|}| constructions, the
% |\u| and |\c|~escape sequences perform \cs{tl_to_str:v}, namely
% extract the value of the control sequence and turn it into a string.
% Matches can also be used within the arguments of |\c| and |\u|.  For
% instance,
% \begin{verbatim}
%   \tl_set:Nn \l_my_one_tl { first }
%   \tl_set:Nn \l_my_two_tl { \emph{second} }
%   \tl_set:Nn \l_my_tl { one , two , one , one }
%   \regex_replace_all:nnN { [^,]+ } { \u{l_my_\0_tl} } \l_my_tl
% \end{verbatim}
% results in \cs{l_my_tl} holding |first,\emph{second},first,first|.
%
% \section{Pre-compiling regular expressions}
%
% If a regular expression is to be used several times,
% it is better to compile it once rather than doing it
% each time the regular expression is used. The compiled
% regular expression is stored in a variable. All
% of the \pkg{l3regex} module's functions can be given their
% regular expression argument either as an explicit string
% or as a compiled regular expression.
%
% \begin{function}[added = 2017-05-26]{\regex_new:N}
%   \begin{syntax}
%     \cs{regex_new:N} \meta{regex~var}
%   \end{syntax}
%   Creates a new \meta{regex~var} or raises an error if the
%   name is already taken. The declaration is global. The
%   \meta{regex~var} is initially such that it never matches.
% \end{function}
%
% \begin{function}[added = 2017-05-26]
%   {\regex_set:Nn, \regex_gset:Nn, \regex_const:Nn}
%   \begin{syntax}
%     \cs{regex_set:Nn} \meta{regex~var} \Arg{regex}
%   \end{syntax}
%   Stores a compiled version of the \meta{regular expression}
%   in the \meta{regex~var}. For instance, this function can be used
%   as
%   \begin{verbatim}
%     \regex_new:N \l_my_regex
%     \regex_set:Nn \l_my_regex { my\ (simple\ )? reg(ex|ular\ expression) }
%   \end{verbatim}
%   The assignment is local for \cs{regex_set:Nn} and global for
%   \cs{regex_gset:Nn}. Use \cs{regex_const:Nn} for compiled expressions
%   which never change.
% \end{function}
%
% \begin{function}[added = 2017-05-26]{\regex_show:n, \regex_show:N}
%   \begin{syntax}
%     \cs{regex_show:n} \Arg{regex}
%   \end{syntax}
%   Shows how \pkg{l3regex} interprets the \meta{regex}. For instance,
%   \cs{regex_show:n} \verb+{\A X|Y}+ shows
%   \begin{verbatim}
%     +-branch
%       anchor at start (\A)
%       char code 88
%     +-branch
%       char code 89
%   \end{verbatim}
%   indicating that the anchor |\A| only applies to the first branch:
%   the second branch is not anchored to the beginning of the match.
% \end{function}
%
% \section{Matching}
%
% All regular expression functions are available in both |:n| and |:N|
% variants. The former require a \enquote{standard} regular expression,
% while the latter require a compiled expression as generated by
% \cs{regex_(g)set:Nn}.
%
% \begin{function}[TF, added = 2017-05-26]{\regex_match:nn, \regex_match:Nn}
%   \begin{syntax}
%     \cs{regex_match:nnTF} \Arg{regex} \Arg{token list} \Arg{true code} \Arg{false code}
%   \end{syntax}
%   Tests whether the \meta{regular expression} matches any part
%   of the \meta{token list}. For instance,
%   \begin{verbatim}
%     \regex_match:nnTF { b [cde]* } { abecdcx } { TRUE } { FALSE }
%     \regex_match:nnTF { [b-dq-w] } { example } { TRUE } { FALSE }
%   \end{verbatim}
%   leaves \texttt{TRUE} then \texttt{FALSE} in the input stream.
% \end{function}
%
% \begin{function}[added = 2017-05-26]{\regex_count:nnN, \regex_count:NnN}
%   \begin{syntax}
%     \cs{regex_count:nnN} \Arg{regex} \Arg{token list} \meta{int var}
%   \end{syntax}
%   Sets \meta{int var} within the current \TeX{} group level
%   equal to the number of times
%   \meta{regular expression} appears in \meta{token list}.
%   The search starts by finding the left-most longest match,
%   respecting greedy and lazy (non-greedy) operators. Then the search
%   starts again from the character following the last character
%   of the previous match, until reaching the end of the token list.
%   Infinite loops are prevented in the case where the regular expression
%   can match an empty token list: then we count one match between each
%   pair of characters.
%   For instance,
%   \begin{verbatim}
%     \int_new:N \l_foo_int
%     \regex_count:nnN { (b+|c) } { abbababcbb } \l_foo_int
%   \end{verbatim}
%   results in \cs{l_foo_int} taking the value $5$.
% \end{function}
%
% \section{Submatch extraction}
%
% \begin{function}[noTF, added = 2017-05-26]
%   {\regex_extract_once:nnN, \regex_extract_once:NnN}
%   \begin{syntax}
%     \cs{regex_extract_once:nnN} \Arg{regex} \Arg{token list} \meta{seq~var}
%     \cs{regex_extract_once:nnNTF} \Arg{regex} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code}
%   \end{syntax}
%   Finds the first match of the \meta{regular expression} in the
%   \meta{token list}. If it exists, the match is stored as the first
%   item of the \meta{seq~var}, and further items are the contents of
%   capturing groups, in the order of their opening parenthesis. The
%   \meta{seq~var} is assigned locally. If there is no match, the
%   \meta{seq~var} is cleared.  The testing versions insert the
%   \meta{true code} into the input stream if a match was found, and the
%   \meta{false code} otherwise.
%
%   For instance, assume that you type
%   \begin{verbatim}
%     \regex_extract_once:nnNTF { \A(La)?TeX(!*)\Z } { LaTeX!!! } \l_foo_seq
%       { true } { false }
%   \end{verbatim}
%   Then the regular expression (anchored at the start with |\A| and
%   at the end with |\Z|) must match the whole token list. The first
%   capturing group, |(La)?|, matches |La|, and the second capturing
%   group, |(!*)|, matches |!!!|. Thus, |\l_foo_seq| contains as a result
%   the items |{LaTeX!!!}|, |{La}|, and |{!!!}|, and the \texttt{true}
%   branch is left in the input stream.
%   Note that the $n$-th item of |\l_foo_seq|, as obtained using
%   \cs{seq_item:Nn}, corresponds to the submatch numbered $(n-1)$ in
%   functions such as \cs{regex_replace_once:nnN}.
% \end{function}
%
% \begin{function}[noTF, added = 2017-05-26]
%   {\regex_extract_all:nnN, \regex_extract_all:NnN}
%   \begin{syntax}
%     \cs{regex_extract_all:nnN} \Arg{regex} \Arg{token list} \meta{seq~var}
%     \cs{regex_extract_all:nnNTF} \Arg{regex} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code}
%   \end{syntax}
%   Finds all matches of the \meta{regular expression}
%   in the \meta{token list}, and stores all the submatch information
%   in a single sequence (concatenating the results of
%   multiple \cs{regex_extract_once:nnN} calls).
%   The \meta{seq~var} is assigned locally. If there is no match,
%   the \meta{seq~var} is cleared.
%   The testing versions insert the \meta{true code} into the input
%   stream if a match was found, and the \meta{false code} otherwise.
%   For instance, assume that you type
%   \begin{verbatim}
%     \regex_extract_all:nnNTF { \w+ } { Hello,~world! } \l_foo_seq
%       { true } { false }
%   \end{verbatim}
%   Then the regular expression matches twice, the resulting
%   sequence contains the two items |{Hello}| and |{world}|,
%   and the \texttt{true} branch is left in the input stream.
% \end{function}
%
% \begin{function}[noTF, added = 2017-05-26]{\regex_split:nnN, \regex_split:NnN}
%   \begin{syntax}
%     \cs{regex_split:nnN} \Arg{regular expression} \Arg{token list} \meta{seq~var}
%     \cs{regex_split:nnNTF} \Arg{regular expression} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code}
%   \end{syntax}
%   Splits the \meta{token list} into a sequence of parts, delimited by
%   matches of the \meta{regular expression}. If the \meta{regular expression}
%   has capturing groups, then the token lists that they match are stored as
%   items of the sequence as well. The assignment to \meta{seq~var} is local.
%   If no match is found the resulting \meta{seq~var} has the
%   \meta{token list} as its sole item. If the \meta{regular expression}
%   matches the empty token list, then the \meta{token list} is split
%   into single tokens.
%   The testing versions insert the \meta{true code} into the input
%   stream if a match was found, and the \meta{false code} otherwise.
%   For example, after
%   \begin{verbatim}
%     \seq_new:N \l_path_seq
%     \regex_split:nnNTF { / } { the/path/for/this/file.tex } \l_path_seq
%       { true } { false }
%   \end{verbatim}
%   the sequence |\l_path_seq| contains the items |{the}|, |{path}|,
%   |{for}|, |{this}|, and |{file.tex}|, and the \texttt{true} branch
%   is left in the input stream.
% \end{function}
%
% \section{Replacement}
%
% \begin{function}[noTF, added = 2017-05-26]
%   {\regex_replace_once:nnN,\regex_replace_once:NnN}
%   \begin{syntax}
%     \cs{regex_replace_once:nnN} \Arg{regular expression} \Arg{replacement} \meta{tl~var}
%     \cs{regex_replace_once:nnNTF} \Arg{regular expression} \Arg{replacement} \meta{tl~var} \Arg{true code} \Arg{false code}
%   \end{syntax}
%   Searches for the \meta{regular expression} in the contents of the
%   \meta{tl~var} and replaces the first match with the \meta{replacement}.
%   The result is assigned locally to \meta{tl~var}. In the \meta{replacement},
%   |\0| represents the full match, |\1| represents the contents of the
%   first capturing group, |\2| of the second, \emph{etc.}
% \end{function}
%
% \begin{function}[noTF, added = 2017-05-26]
%   {\regex_replace_all:nnN, \regex_replace_all:NnN}
%   \begin{syntax}
%     \cs{regex_replace_all:nnN} \Arg{regular expression} \Arg{replacement} \meta{tl~var}
%     \cs{regex_replace_all:nnNTF} \Arg{regular expression} \Arg{replacement} \meta{tl~var} \Arg{true code} \Arg{false code}
%   \end{syntax}
%   Replaces all occurrences of the \meta{regular expression} in the
%   contents of the \meta{tl~var} by the \meta{replacement}, where |\0| represents
%   the full match, |\1| represents the contents of the first capturing
%   group, |\2| of the second, \emph{etc.} Every match is treated
%   independently, and matches cannot overlap.  The result is assigned
%   locally to \meta{tl~var}.
% \end{function}
%
% \section{Constants and variables}
%
% \begin{variable}[added = 2017-12-11]{\l_tmpa_regex, \l_tmpb_regex}
%   Scratch regex for local assignment. These are never used by
%   the kernel code, and so are safe for use with any \LaTeX3-defined
%   function. However, they may be overwritten by other non-kernel
%   code and so should only be used for short-term storage.
% \end{variable}
%
% \begin{variable}[added = 2017-12-11]{\g_tmpa_regex, \g_tmpb_regex}
%   Scratch regex for global assignment. These are never used by
%   the kernel code, and so are safe for use with any \LaTeX3-defined
%   function. However, they may be overwritten by other non-kernel
%   code and so should only be used for short-term storage.
% \end{variable}
%
% \section{Bugs, misfeatures, future work, and other possibilities}
%
% The following need to be done now.
% \begin{itemize}
%   \item Rewrite the documentation in a more ordered way, perhaps add a
%     \textsc{bnf}?
% \end{itemize}
%
% Additional error-checking to come.
% \begin{itemize}
%   \item Clean up the use of messages.
%   \item Cleaner error reporting in the replacement phase.
%   \item Add tracing information.
%   \item Detect attempts to use back-references and other
%     non-implemented syntax.
%   \item Test for the maximum register \cs{c_max_register_int}.
%   \item Find out whether the fact that |\W| and friends match the
%     end-marker leads to bugs. Possibly update \cs[no-index]{__regex_item_reverse:n}.
%   \item The empty cs should be matched by |\c{}|, not by
%     |\c{csname.?endcsname\s?}|.
% \end{itemize}
%
% Code improvements to come.
% \begin{itemize}
%   \item Shift arrays so that the useful information starts at
%     position~$1$.
%   \item Only build |\c{...}| once.
%   \item Use arrays for the left and right state stacks when
%     compiling a regex.
%   \item Should \cs[no-index]{__regex_action_free_group:n} only be used for greedy
%     |{n,}| quantifiers? (I think not.)
%   \item Quantifiers for |\u| and assertions.
%   \item When matching, keep track of an explicit stack of
%     \texttt{curr_state} and \texttt{curr_submatches}.
%   \item If possible, when a state is reused by the same thread, kill
%     other subthreads.
%   \item Use an array rather than \cs[no-index]{l__regex_balance_tl}
%     to build the function \cs[no-index]{__regex_replacement_balance_one_match:n}.
%   \item Reduce the number of epsilon-transitions in alternatives.
%   \item Optimize simple strings: use fewer states (|abcade| should give
%     two states, for |abc| and |ade|). [Does that really make sense?]
%   \item Optimize groups with no alternative.
%   \item Optimize states with a single \cs[no-index]{__regex_action_free:n}.
%   \item Optimize the use of \cs[no-index]{__regex_action_success:} by inserting it
%     in state $2$ directly instead of having an extra transition.
%   \item Optimize the use of \cs{int_step_...} functions.
%   \item Groups don't capture within regexes for csnames; optimize and
%     document.
%   \item Better \enquote{show} for anchors, properties, and catcode tests.
%   \item Does |\K| really need a new state for itself?
%   \item When compiling, use a boolean \texttt{in_cs} and fewer magic
%     numbers.
%   \item Instead of checking whether the character is special or
%     alphanumeric using its character code, check if it is special in
%     regexes with \cs{cs_if_exist} tests.
% \end{itemize}
%
% The following features are likely to be implemented at some point
% in the future.
% \begin{itemize}
%   \item General look-ahead/behind assertions.
%   \item Regex matching on external files.
%   \item Conditional subpatterns with look ahead/behind: \enquote{if
%       what follows is [\ldots{}], then [\ldots{}]}.
%   \item |(*..)| and |(?..)| sequences to set some options.
%   \item UTF-8 mode for \pdfTeX{}.
%   \item Newline conventions are not done.
%     In particular, we should have an option for |.| not to match newlines.
%     Also, |\A| should differ from |^|, and |\Z|, |\z| and |$| should
%     differ.
%   \item Unicode properties: |\p{..}| and |\P{..}|;
%     |\X| which should match any \enquote{extended} Unicode sequence.
%     This requires manipulating a lot of data, probably using tree-boxes.
%   \item Provide a syntax such as |\ur{l_my_regex}| to use an
%     already-compiled regex in a more complicated regex.  This makes
%     regexes more easily composable.
% \end{itemize}
%
% The following features of \textsc{pcre} or Perl may or may not be
% implemented.
% \begin{itemize}
%   \item Callout with |(?C...)| or other syntax: some internal code
%     changes make that possible, and it can be useful for instance in
%     the replacement code to stop a regex replacement when some marker
%     has been found; this raises the question of a potential
%     |\regex_break:| and then of playing well with \cs{tl_map_break:}
%     called from within the code in a regex.  It also raises the
%     question of nested calls to the regex machinery, which is a
%     problem since \tn{fontdimen} are global.
%   \item Conditional subpatterns (other than with a look-ahead or
%     look-behind condition): this is non-regular, isn't it?
%   \item Named subpatterns: \TeX{} programmers have lived so far
%     without any need for named macro parameters.
% \end{itemize}
%
% The following features of \textsc{pcre} or Perl will definitely not be
% implemented.
% \begin{itemize}
%   \item Back-references: non-regular feature, this requires
%     backtracking, which is prohibitively slow.
%   \item Recursion: this is a non-regular feature.
%   \item Atomic grouping, possessive quantifiers: those tools, mostly
%     meant to fix catastrophic backtracking, are unnecessary in a
%     non-backtracking algorithm, and difficult to implement.
%   \item Subroutine calls: this syntactic sugar is difficult to include
%     in a non-backtracking algorithm, in particular because the
%     corresponding group should be treated as atomic.
%   \item Backtracking control verbs: intrinsically tied to
%     backtracking.
%   \item |\ddd|, matching the character with octal code \texttt{ddd}:
%     we already have |\x{...}| and the syntax is confusingly close to
%     what we could have used for backreferences (|\1|, |\2|, \ldots{}),
%     making it harder to produce useful error messages.
%   \item |\cx|, similar to \TeX{}'s own |\^^x|.
%   \item Comments: \TeX{} already has its own system for comments.
%   \item |\Q...\E| escaping: this would require reading the argument
%     verbatim, which is not in the scope of this module.
%   \item |\C| single byte in UTF-8 mode: \XeTeX{} and \LuaTeX{} serve
%     us characters directly, and splitting those into bytes is tricky,
%     encoding dependent, and most likely not useful anyways.
% \end{itemize}
%
% \end{documentation}
%
% \begin{implementation}
%
% \section{\pkg{l3regex} implementation}
%
%    \begin{macrocode}
%<*package>
%    \end{macrocode}
%
%    \begin{macrocode}
%<@@=regex>
%    \end{macrocode}
%
% \subsection{Plan of attack}
%
% Most regex engines use backtracking. This makes it possible to provide
% very powerful features (back-references come to mind first), but it is
% costly, and raises the problem of catastrophic backtracking. Since
% \TeX{} is not first and foremost a programming language, complicated
% code tends to run slowly, and we must use faster, albeit slightly more
% restrictive, techniques, coming from automata theory.
%
% Given a regular expression of $n$ characters, we do the following:
% \begin{itemize}
%   \item (Compiling.) Analyse the regex, finding invalid input, and
%     convert it to an internal representation.
%   \item (Building.) Convert the compiled regex to a non-deterministic
%     finite automaton (\textsc{nfa}) with $O(n)$ states which
%     accepts precisely token lists matching that regex.
%   \item (Matching.) Loop through the query token list one token (one
%     \enquote{position}) at a time, exploring in parallel every
%     possible path (\enquote{active thread}) through the \textsc{nfa},
%     considering active threads in an order determined by the
%     quantifiers' greediness.
% \end{itemize}
%
% We use the following vocabulary in the code comments (and in variable
% names).
% \begin{itemize}
%   \item \emph{Group}: index of the capturing group, $-1$ for
%     non-capturing groups. ^^A start/end index?
%   \item \emph{Position}: each token in the query is labelled by an
%     integer \meta{position}, with $\texttt{min_pos} - 1 \leq
%     \meta{position} \leq \texttt{max_pos}$. The lowest and highest
%     positions $\texttt{min_pos} - 1$ and $\texttt{max_pos}$
%     correspond to imaginary begin and end markers (with
%     non-existent category code and character code).
%     $\texttt{max_pos}$ is only set quite late in the processing.
%   \item \emph{Query}: the token list to which we apply the regular
%     expression.
%   \item \emph{State}: each state of the \textsc{nfa} is labelled by an
%     integer \meta{state} with $\texttt{min_state} \leq \meta{state} <
%     \texttt{max_state}$.
%   \item \emph{Active thread}: state of the \textsc{nfa} that is reached
%     when reading the query token list for the matching. Those threads
%     are ordered according to the greediness of quantifiers.
%   \item \emph{Step}: used when matching, starts at $0$, incremented
%     every time a character is read, and is not reset when searching
%     for repeated matches. The integer \cs{l_@@_step_int} is a
%     unique id for all the steps of the matching algorithm.
% \end{itemize}
%
% We use \pkg{l3intarray} to manipulate arrays of integers.
% We also abuse \TeX{}'s
% \tn{toks} registers, by accessing them directly by number rather than
% tying them to control sequences using the \tn{newtoks} allocation
% functions. Specifically, these arrays and \tn{toks} are used as
% follows.  When building,
% \tn{toks}\meta{state} holds the tests and actions to perform in the
% \meta{state} of the \textsc{nfa}.  When matching,
% \begin{itemize}
%   \item \cs{g_@@_state_active_intarray} holds the last \meta{step} in
%     which each \meta{state} was active.
%   \item \cs{g_@@_thread_info_intarray} consists of blocks for each
%     \meta{thread} (with $\texttt{min_thread} \leq \meta{thread} <
%     \texttt{max_thread}$).  Each block has
%     $1+2\cs{l_@@_capturing_group_int}$ entries: the \meta{state} in
%     which the \meta{thread} currently is, followed by the beginnings
%     of all submatches, and then the ends of all submatches. The
%     \meta{threads} are ordered starting from the best to the least
%     preferred.
%   \item \cs{g_@@_submatch_prev_intarray}, \cs{g_@@_submatch_begin_intarray}
%     and \cs{g_@@_submatch_end_intarray} hold, for each submatch (as would
%     be extracted by \cs{regex_extract_all:nnN}), the place where the
%     submatch started to be looked for and its two end-points.  For
%     historical reasons, the minimum index is twice \texttt{max_state},
%     and the used registers go up to \cs{l_@@_submatch_int}. They are
%     organized in blocks of \cs{l_@@_capturing_group_int} entries, each
%     block corresponding to one match with all its submatches stored in
%     consecutive entries.
% \end{itemize}
% When actually building the result,
% \begin{itemize}
%   \item \tn{toks}\meta{position} holds \meta{tokens} which \texttt{o}-
%     and \texttt{x}-expand to the \meta{position}-th token in the query.
%   \item \cs{g_@@_balance_intarray} holds the balance of begin-group and
%     end-group character tokens which appear before that point in the
%     token list.
% \end{itemize}
%
% The code is structured as follows. Variables are introduced in the
% relevant section. First we present some generic helper functions. Then
% comes the code for compiling a regular expression, and for showing the
% result of the compilation. The building phase converts a compiled
% regex to \textsc{nfa} states, and the automaton is run by the code in
% the following section. The only remaining brick is parsing the
% replacement text and performing the replacement. We are then ready for
% all the user functions. Finally, messages, and a little bit of tracing
% code.
%
% \subsection{Helpers}
%
% \begin{macro}{\@@_int_eval:w}
%   A private copy of the \tn{numexpr} primitive. Performance is key
%   here, so we do not use the slower route \emph{via}
%   \cs{int_eval:n}. Being a plain copy of the primitive, it does not
%   delimit the expression by itself: the code using it has to end the
%   expression.
%    \begin{macrocode}
\cs_new_eq:NN \@@_int_eval:w \tex_numexpr:D
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_standard_escapechar:}
%   Make the \tn{escapechar} into the standard backslash.  The
%   assignment is done with \cs{int_set:Nn}, hence it is local to the
%   current group.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_standard_escapechar:
  { \int_set:Nn \tex_escapechar:D { `\\ } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_toks_use:w}
%   Unpack a \tn{toks} given its number.  The function expands to
%   \tn{the}\tn{toks} and takes no argument itself: the register
%   number must follow in the input stream.
%    \begin{macrocode}
\cs_new:Npn \@@_toks_use:w { \tex_the:D \tex_toks:D }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_toks_clear:N, \@@_toks_set:Nn, \@@_toks_set:No}
%   Setters for a \tn{toks} register designated by its number.
%   \cs{@@_toks_set:Nn} is the \tn{toks} primitive itself, so the
%   braced value which follows the register number is stored as it
%   stands.  The \texttt{o}-type variant places \cs{exp_after:wN} in
%   front of the opening brace of the value, so that the first token
%   of the value is expanded once before the assignment takes place.
%   \cs{@@_toks_clear:N} stores an empty value.
%    \begin{macrocode}
\cs_new_eq:NN \@@_toks_set:Nn \tex_toks:D
\cs_new_protected:Npn \@@_toks_set:No #1
  { \@@_toks_set:Nn #1 \exp_after:wN }
\cs_new_protected:Npn \@@_toks_clear:N #1
  { \tex_toks:D #1 { } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_toks_memcpy:NNn}
%   Copy |#3| \tn{toks} registers from |#2| onwards to |#1| onwards,
%   like |C|'s |memcpy|.  Both |#1| and |#2| are integer variables
%   holding register numbers, and each of them is incremented once per
%   copied register: after the call they have been advanced by |#3|.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_toks_memcpy:NNn #1#2#3
  {
    \prg_replicate:nn {#3}
      {
        \tex_toks:D #1 = \tex_toks:D #2
        \int_incr:N #1
        \int_incr:N #2
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_toks_put_left:Nx}
% \begin{macro}{\@@_toks_put_right:Nx, \@@_toks_put_right:Nn}
%   During the building phase we wish to add \texttt{x}-expanded
%   material to \tn{toks}, either to the left or to the right. The
%   expansion is done \enquote{by hand} for optimization (these
%   operations are used quite a lot): the material |#2| is first
%   \texttt{x}-expanded into the scratch function \cs{@@_tmp:w} (which
%   is thus overwritten), then the \cs{exp_after:wN} chains unpack the
%   current contents of the \tn{toks} register |#1| and expand
%   \cs{@@_tmp:w} exactly once before the new value is assigned to the
%   register.  The \texttt{Nn} version of
%   \cs{@@_toks_put_right:Nx} is provided because it is more
%   efficient than \texttt{x}-expanding with \cs{exp_not:n}: it only
%   unpacks the register and leaves |#2| untouched.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_toks_put_left:Nx #1#2
  {
    \cs_set_nopar:Npx \@@_tmp:w { #2 }
    \tex_toks:D #1 \exp_after:wN \exp_after:wN \exp_after:wN
      { \exp_after:wN \@@_tmp:w \tex_the:D \tex_toks:D #1 }
  }
\cs_new_protected:Npn \@@_toks_put_right:Nx #1#2
  {
    \cs_set_nopar:Npx \@@_tmp:w {#2}
    \tex_toks:D #1 \exp_after:wN
      { \tex_the:D \tex_toks:D \exp_after:wN #1 \@@_tmp:w }
  }
\cs_new_protected:Npn \@@_toks_put_right:Nn #1#2
  { \tex_toks:D #1 \exp_after:wN { \tex_the:D \tex_toks:D #1 #2 } }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_curr_cs_to_str:}
%   Expands to the string representation of the current token (known
%   to be a control sequence), which is obtained from
%   \cs{l_@@_curr_token_tl}: that token list is expanded twice before
%   \cs{cs_to_str:N} is applied to the resulting token.
%   It should only be used in \texttt{x}-expansion to avoid losing a
%   leading space.
%    \begin{macrocode}
\cs_new:Npn \@@_curr_cs_to_str:
  {
    \exp_after:wN \exp_after:wN \exp_after:wN \cs_to_str:N
    \l_@@_curr_token_tl
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_intarray_item:NnF, \@@_intarray_item_aux:nNF}
%   Item of intarray, with a default value.  The index |#2| is first
%   evaluated with \cs{int_eval:n}.  If the result is positive, the
%   function expands to the corresponding item of the intarray |#1|
%   and the \meta{false code} is discarded; otherwise the
%   \meta{false code} (the default value) is left in the input stream
%   and the intarray is not accessed.
%    \begin{macrocode}
\cs_new:Npn \@@_intarray_item:NnF #1#2
  { \exp_args:Nf \@@_intarray_item_aux:nNF { \int_eval:n {#2} } #1 }
\cs_new:Npn \@@_intarray_item_aux:nNF #1#2
  {
    \if_int_compare:w #1 > \c_zero_int
      \exp_after:wN \use_i:nn
    \else:
      \exp_after:wN \use_ii:nn
    \fi:
    { \__kernel_intarray_item:Nn #2 {#1} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_maplike_break:}
%   Analogous to \cs{tl_map_break:}, this correctly exits
%   \cs{tl_map_inline:nn} and similar constructions and jumps to the
%   matching \cs{prg_break_point:Nn} \cs{@@_maplike_break:} |{| |}|.
%   It is implemented with \cs{prg_map_break:Nn}, using the function
%   itself as the marker and inserting no code after the break point.
%    \begin{macrocode}
\cs_new:Npn \@@_maplike_break:
  { \prg_map_break:Nn \@@_maplike_break: { } }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Constants and variables}
%
% \begin{macro}{\@@_tmp:w}
%   Temporary function used for various short-term purposes.  It is
%   declared here with an empty definition and is meant to be
%   redefined locally by the code which needs it.
%    \begin{macrocode}
\cs_new:Npn \@@_tmp:w { }
%    \end{macrocode}
% \end{macro}
%
% \begin{variable}
%   {
%     \l_@@_internal_a_tl,  \l_@@_internal_b_tl,
%     \l_@@_internal_a_int, \l_@@_internal_b_int,
%     \l_@@_internal_c_int, \l_@@_internal_bool,
%     \l_@@_internal_seq,   \g_@@_internal_tl,
%   }
%   Temporary variables used for various purposes: two local token
%   lists, three local integers, a local boolean, a local sequence,
%   and one global token list.
%    \begin{macrocode}
\tl_new:N   \l_@@_internal_a_tl
\tl_new:N   \l_@@_internal_b_tl
\int_new:N  \l_@@_internal_a_int
\int_new:N  \l_@@_internal_b_int
\int_new:N  \l_@@_internal_c_int
\bool_new:N \l_@@_internal_bool
\seq_new:N  \l_@@_internal_seq
\tl_new:N   \g_@@_internal_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_build_tl}
%   This temporary variable is specifically for use with the |tl_build|
%   machinery, and is kept separate from the general-purpose scratch
%   token lists.
%    \begin{macrocode}
\tl_new:N \l_@@_build_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\c_@@_no_match_regex}
%   This regular expression matches nothing, but is still a valid
%   regular expression. We could use a failing assertion, but I went for
%   an empty class. It is used as the initial value for regular
%   expressions declared using \cs{regex_new:N}.  In compiled form it
%   consists of a single branch holding a single class item whose list
%   of tests is empty (the meaning of the other arguments of
%   \cs{@@_class:NnnnN} is described with that function).
%    \begin{macrocode}
\tl_const:Nn \c_@@_no_match_regex
  {
    \@@_branch:n
      { \@@_class:NnnnN \c_true_bool { } { 1 } { 0 } \c_true_bool }
  }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_balance_int}
%   The integer \cs{l_@@_balance_int} counts the balance of
%   begin-group and end-group character tokens which appear before a
%   given point in the token list. This variable is also used to keep
%   track of the balance in the replacement text.
%    \begin{macrocode}
\int_new:N \l_@@_balance_int
%    \end{macrocode}
% \end{variable}
%
% \subsubsection{Testing characters}
%
% \begin{variable}{\c_@@_ascii_min_int, \c_@@_ascii_max_control_int, \c_@@_ascii_max_int}
%   Character codes delimiting the \textsc{ascii} ranges used by the
%   \textsc{posix} character classes: the smallest \textsc{ascii} code
%   ($0$), the largest code of the initial block of control characters
%   ($31$), and the largest \textsc{ascii} code ($127$).
%    \begin{macrocode}
\int_const:Nn \c_@@_ascii_min_int { 0 }
\int_const:Nn \c_@@_ascii_max_control_int { 31 }
\int_const:Nn \c_@@_ascii_max_int { 127 }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\c_@@_ascii_lower_int}
%   The difference between the character code of a lower-case
%   \textsc{ascii} letter and that of the corresponding upper-case
%   letter.  It is added or subtracted when changing the case of a
%   character for caseless matching.
%    \begin{macrocode}
\int_const:Nn \c_@@_ascii_lower_int { `a - `A }
%    \end{macrocode}
% \end{variable}
%
% \subsubsection{Internal auxiliaries}
%
% \begin{variable}{\q_@@_recursion_stop}
%   Internal recursion quark, private to this module.  It serves as
%   the end delimiter of the \texttt{w}-type functions
%   \cs{@@_use_none_delimit_by_q_recursion_stop:w} and
%   \cs{@@_use_i_delimit_by_q_recursion_stop:nw}.
%    \begin{macrocode}
\quark_new:N \q_@@_recursion_stop
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}[EXP]{
%     \@@_use_none_delimit_by_q_recursion_stop:w,
%     \@@_use_i_delimit_by_q_recursion_stop:nw
%   }
%   Functions to gobble up to a quark.  The first one removes
%   everything up to and including \cs{q_@@_recursion_stop} and leaves
%   nothing.  The second one does the same but leaves its first
%   argument in the input stream.
%    \begin{macrocode}
\cs_new:Npn \@@_use_none_delimit_by_q_recursion_stop:w
  #1 \q_@@_recursion_stop { }
\cs_new:Npn \@@_use_i_delimit_by_q_recursion_stop:nw
  #1 #2 \q_@@_recursion_stop {#1}
%    \end{macrocode}
% \end{macro}
%
% \begin{variable}{\q_@@_nil}
%   Internal quark, private to this module; it is the quark tested
%   for by the \cs[no-index]{@@_quark_if_nil:NF} conditional.
%    \begin{macrocode}
\quark_new:N \q_@@_nil
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\@@_quark_if_nil:NF}
%   Branching quark conditional.  Only the \texttt{F} form of the
%   \texttt{N}-type conditional is requested from
%   \cs{__kernel_quark_new_conditional:Nn}, hence this is the only
%   function defined here.
%    \begin{macrocode}
\__kernel_quark_new_conditional:Nn \@@_quark_if_nil:N { F }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_break_point:TF}
% \begin{macro}{\@@_break_true:w}
%   When testing whether a character of the query token list matches
%   a given character class in the regular expression, we often
%   have to test it against several ranges of characters, checking
%   if any one of those matches. This is done with a structure like
%   \begin{quote}
%     \meta{test1} \ldots{} \meta{test$\sb{n}$} \\
%     \cs{@@_break_point:TF} \Arg{true code} \Arg{false code}
%   \end{quote}
%   If any of the tests succeeds, it calls \cs{@@_break_true:w},
%   which cleans up and leaves \meta{true code} in the input stream.
%   Otherwise, \cs{@@_break_point:TF} leaves the \meta{false code}
%   in the input stream.  In terms of implementation,
%   \cs{@@_break_true:w} is delimited by \cs{@@_break_point:TF}: it
%   discards all the remaining tests together with the
%   \meta{false code}, while \cs{@@_break_point:TF} itself, which is
%   only reached when no test has succeeded, simply selects its second
%   argument.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_break_true:w
   #1 \@@_break_point:TF #2 #3 {#2}
\cs_new_protected:Npn \@@_break_point:TF #1 #2 { #2 }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_item_reverse:n}
%   This function makes showing regular expressions easier, and lets us
%   define |\D| in terms of |\d| for instance. There is a subtlety: the
%   end of the query is marked by $-2$, and thus matches |\D| and
%   other negated properties; this case is caught by another part of
%   the code.  The tests |#1| are followed by a local
%   \cs{@@_break_point:TF} whose \meta{true code} is empty and whose
%   \meta{false code} is \cs{@@_break_true:w}.  If one of the tests
%   succeeds, nothing is left in the input stream and the tests which
%   follow in the enclosing structure are performed as usual; if none
%   succeeds, \cs{@@_break_true:w} is called, which makes the
%   enclosing structure succeed.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_item_reverse:n #1
  {
    #1
    \@@_break_point:TF { } \@@_break_true:w
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {\@@_item_caseful_equal:n, \@@_item_caseful_range:nn}
%   Simple comparisons triggering \cs{@@_break_true:w} when true.
%   Both compare their arguments with \cs{l_@@_curr_char_int}.  The
%   range test is written as two negated strict comparisons, hence
%   both end-points |#1| and |#2| belong to the range.  The
%   \cs{exp_after:wN} tokens close the conditionals before
%   \cs{@@_break_true:w} grabs its delimited argument.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_item_caseful_equal:n #1
  {
    \if_int_compare:w #1 = \l_@@_curr_char_int
      \exp_after:wN \@@_break_true:w
    \fi:
  }
\cs_new_protected:Npn \@@_item_caseful_range:nn #1 #2
  {
    \reverse_if:N \if_int_compare:w #1 > \l_@@_curr_char_int
      \reverse_if:N \if_int_compare:w #2 < \l_@@_curr_char_int
        \exp_after:wN \exp_after:wN \exp_after:wN \@@_break_true:w
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {\@@_item_caseless_equal:n, \@@_item_caseless_range:nn}
%   For caseless matching, we perform the test both on the
%   \texttt{curr_char} and on the \texttt{case_changed_char}. Before
%   doing the second set of tests, we make sure that
%   \texttt{case_changed_char} has been computed: the value
%   \cs{c_max_int} of \cs{l_@@_case_changed_char_int} is the marker
%   for \enquote{not yet computed}, in which case
%   \cs{@@_compute_case_changed_char:} is called.  The second set of
%   tests is only reached when the first one has not already called
%   \cs{@@_break_true:w}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_item_caseless_equal:n #1
  {
    \if_int_compare:w #1 = \l_@@_curr_char_int
      \exp_after:wN \@@_break_true:w
    \fi:
    \if_int_compare:w \l_@@_case_changed_char_int = \c_max_int
      \@@_compute_case_changed_char:
    \fi:
    \if_int_compare:w #1 = \l_@@_case_changed_char_int
      \exp_after:wN \@@_break_true:w
    \fi:
  }
\cs_new_protected:Npn \@@_item_caseless_range:nn #1 #2
  {
    \reverse_if:N \if_int_compare:w #1 > \l_@@_curr_char_int
      \reverse_if:N \if_int_compare:w #2 < \l_@@_curr_char_int
        \exp_after:wN \exp_after:wN \exp_after:wN \@@_break_true:w
      \fi:
    \fi:
    \if_int_compare:w \l_@@_case_changed_char_int = \c_max_int
      \@@_compute_case_changed_char:
    \fi:
    \reverse_if:N \if_int_compare:w #1 > \l_@@_case_changed_char_int
      \reverse_if:N \if_int_compare:w #2 < \l_@@_case_changed_char_int
        \exp_after:wN \exp_after:wN \exp_after:wN \@@_break_true:w
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compute_case_changed_char:}
%   This function is called when \cs{l_@@_case_changed_char_int} has
%   not yet been computed (or rather, when it is set to the marker value
%   \cs{c_max_int}). If the current character code is in the range
%   $[65,90]$ (upper-case), then add $32$, making it lowercase. If it is
%   in the lower-case letter range $[97,122]$, subtract $32$.  For any
%   other character code, \cs{l_@@_case_changed_char_int} is simply
%   set equal to \cs{l_@@_curr_char_int}.  The offset $32$ is stored
%   in \cs{c_@@_ascii_lower_int}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compute_case_changed_char:
  {
    \int_set_eq:NN \l_@@_case_changed_char_int \l_@@_curr_char_int
    \if_int_compare:w \l_@@_curr_char_int > `Z \exp_stop_f:
      \if_int_compare:w \l_@@_curr_char_int > `z \exp_stop_f: \else:
        \if_int_compare:w \l_@@_curr_char_int < `a \exp_stop_f: \else:
          \int_sub:Nn \l_@@_case_changed_char_int
            { \c_@@_ascii_lower_int }
        \fi:
      \fi:
    \else:
      \if_int_compare:w \l_@@_curr_char_int < `A \exp_stop_f: \else:
        \int_add:Nn \l_@@_case_changed_char_int
          { \c_@@_ascii_lower_int }
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_item_equal:n, \@@_item_range:nn}
%   Those must always be defined to expand to a \texttt{caseful}
%   (default) or \texttt{caseless} version, and not be protected: they
%   must expand when compiling, to hard-code which tests are caseless or
%   caseful.  Here the two names are only reserved, with the character
%   |?| as a placeholder meaning; the working definitions are given
%   elsewhere.
%    \begin{macrocode}
\cs_new_eq:NN \@@_item_equal:n ?
\cs_new_eq:NN \@@_item_range:nn ?
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_item_catcode:nT, \@@_item_catcode_reverse:nT}
% \begin{macro}{\@@_item_catcode:}
%   The argument is a sum of powers of $4$ with exponents given by the
%   allowed category codes (between $0$ and $13$). Dividing by a given
%   power of $4$ gives an odd result if and only if that category code
%   is allowed. If the catcode does not match, then skip the character
%   code tests which follow.  The auxiliary \cs{@@_item_catcode:}
%   provides the divisor: it expands to |"| followed by hexadecimal
%   digits representing $4^{c}$ where $c$ is the value of
%   \cs{l_@@_curr_catcode_int} (no digits are provided for $c=5$ and
%   $c=9$).  For a value of $c$ beyond $13$ it expands to |"1*0|, so
%   that the whole expression evaluates to zero and the test fails.
%   The \texttt{reverse} variant applies \cs{@@_item_reverse:n} to
%   the character code tests.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_item_catcode:
  {
    "
    \if_case:w \l_@@_curr_catcode_int
         1       \or: 4       \or: 10      \or: 40
    \or: 100     \or:         \or: 1000    \or: 4000
    \or: 10000   \or:         \or: 100000  \or: 400000
    \or: 1000000 \or: 4000000 \else: 1*0
    \fi:
  }
\cs_new_protected:Npn \@@_item_catcode:nT #1
  {
    \if_int_odd:w \int_eval:n { #1 / \@@_item_catcode: } \exp_stop_f:
      \exp_after:wN \use:n
    \else:
      \exp_after:wN \use_none:n
    \fi:
  }
\cs_new_protected:Npn \@@_item_catcode_reverse:nT #1#2
  { \@@_item_catcode:nT {#1} { \@@_item_reverse:n {#2} } }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_item_exact:nn, \@@_item_exact_cs:n}
%   \cs{@@_item_exact:nn} succeeds when both the category code |#1|
%   and the character code |#2| coincide with those of the current
%   token.  \cs{@@_item_exact_cs:n} matches an exact control sequence,
%   more precisely one of several possible control sequences, whose
%   names are given in |#1| separated by \cs{scan_stop:}.  It only
%   acts when the current category code is zero: the name of the
%   current control sequence is then surrounded by \cs{scan_stop:} on
%   both sides and searched for in the list |#1|, itself surrounded by
%   \cs{scan_stop:}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_item_exact:nn #1#2
  {
    \if_int_compare:w #1 = \l_@@_curr_catcode_int
      \if_int_compare:w #2 = \l_@@_curr_char_int
        \exp_after:wN \exp_after:wN \exp_after:wN \@@_break_true:w
      \fi:
    \fi:
  }
\cs_new_protected:Npn \@@_item_exact_cs:n #1
  {
    \int_compare:nNnT \l_@@_curr_catcode_int = \c_zero_int
      {
        \__kernel_tl_set:Nx \l_@@_internal_a_tl
          { \scan_stop: \@@_curr_cs_to_str: \scan_stop: }
        \tl_if_in:noT { \scan_stop: #1 \scan_stop: }
          \l_@@_internal_a_tl
          { \@@_break_true:w }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_item_cs:n}
%   Match a control sequence (the argument is a compiled regex).
%   First test the catcode of the current token to be zero.
%   Then perform the matching test, and break if the csname
%   indeed matches.  The nested match is run inside a group.  Since it
%   sets the global \cs{g_@@_success_bool}, the value which that
%   boolean had before is saved in \cs{l_@@_saved_success_bool} and
%   restored once the outcome has been examined.  When the csname
%   matches, \cs{@@_break_true:w} is inserted after the group with
%   \cs{group_insert_after:N}, so that it acts once the group is
%   closed.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_item_cs:n #1
  {
    \int_compare:nNnT \l_@@_curr_catcode_int = 0
      {
        \group_begin:
          \@@_single_match:
          \@@_disable_submatches:
          \@@_build_for_cs:n {#1}
          \bool_set_eq:NN \l_@@_saved_success_bool
            \g_@@_success_bool
          \exp_args:Nx \@@_match_cs:n { \@@_curr_cs_to_str: }
          \if_meaning:w \c_true_bool \g_@@_success_bool
            \group_insert_after:N \@@_break_true:w
          \fi:
          \bool_gset_eq:NN \g_@@_success_bool
            \l_@@_saved_success_bool
        \group_end:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Character property tests}
%
% \begin{macro}
%   {
%     \@@_prop_d:, \@@_prop_h:, \@@_prop_s:,
%     \@@_prop_v:, \@@_prop_w:, \@@_prop_N:
%   }
%   Character property tests for |\d|, |\W|, \emph{etc.} These character
%   properties are not affected by the |(?i)| option. The characters
%   recognized by each one are as follows: |\d=[0-9]|,
%   |\w=[0-9A-Z_a-z]|, \verb*+\s=[\ \^^I\^^J\^^L\^^M]+,
%   \verb*+\h=[\ \^^I]+, |\v=[\^^J-\^^M]|, and the upper case
%   counterparts match anything that the lower case does not match.  The
%   order in which the various tests appear is optimized for usual
%   mostly lower case letter text.  The property |\N| has no lower
%   case counterpart here: it is defined directly, by reversing an
%   equality test with the line feed character, and thus matches any
%   other character.  All tests use the \texttt{caseful} comparisons.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_prop_d:
  { \@@_item_caseful_range:nn { `0 } { `9 } }
\cs_new_protected:Npn \@@_prop_h:
  {
    \@@_item_caseful_equal:n { `\ }
    \@@_item_caseful_equal:n { `\^^I }
  }
\cs_new_protected:Npn \@@_prop_s:
  {
    \@@_item_caseful_equal:n { `\ }
    \@@_item_caseful_equal:n { `\^^I }
    \@@_item_caseful_equal:n { `\^^J }
    \@@_item_caseful_equal:n { `\^^L }
    \@@_item_caseful_equal:n { `\^^M }
  }
\cs_new_protected:Npn \@@_prop_v:
  { \@@_item_caseful_range:nn { `\^^J } { `\^^M } } % lf, vtab, ff, cr
\cs_new_protected:Npn \@@_prop_w:
  {
    \@@_item_caseful_range:nn { `a } { `z }
    \@@_item_caseful_range:nn { `A } { `Z }
    \@@_item_caseful_range:nn { `0 } { `9 }
    \@@_item_caseful_equal:n { `_ }
  }
\cs_new_protected:Npn \@@_prop_N:
  {
    \@@_item_reverse:n
      { \@@_item_caseful_equal:n { `\^^J } }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_posix_alnum:, \@@_posix_alpha:, \@@_posix_ascii:,
%     \@@_posix_blank:, \@@_posix_cntrl:, \@@_posix_digit:,
%     \@@_posix_graph:, \@@_posix_lower:, \@@_posix_print:,
%     \@@_posix_punct:, \@@_posix_space:, \@@_posix_upper:,
%     \@@_posix_word: , \@@_posix_xdigit:
%   }
%   \textsc{posix} properties, all built from \texttt{caseful} range
%   and equality tests.  The definitions are given starting from the
%   elementary classes: three classes are plain copies of the
%   corresponding character property tests (\texttt{digit} of |\d|,
%   \texttt{blank} of |\h|, \texttt{word} of |\w|), and the composite
%   classes \texttt{alpha}, \texttt{alnum} and \texttt{xdigit} reuse
%   the elementary ones.
%    \begin{macrocode}
\cs_new_eq:NN \@@_posix_digit: \@@_prop_d:
\cs_new_eq:NN \@@_posix_blank: \@@_prop_h:
\cs_new_eq:NN \@@_posix_word: \@@_prop_w:
\cs_new_protected:Npn \@@_posix_lower:
  { \@@_item_caseful_range:nn { `a } { `z } }
\cs_new_protected:Npn \@@_posix_upper:
  { \@@_item_caseful_range:nn { `A } { `Z } }
\cs_new_protected:Npn \@@_posix_alpha:
  { \@@_posix_lower: \@@_posix_upper: }
\cs_new_protected:Npn \@@_posix_alnum:
  { \@@_posix_alpha: \@@_posix_digit: }
\cs_new_protected:Npn \@@_posix_xdigit:
  {
    \@@_posix_digit:
    \@@_item_caseful_range:nn { `A } { `F }
    \@@_item_caseful_range:nn { `a } { `f }
  }
\cs_new_protected:Npn \@@_posix_ascii:
  {
    \@@_item_caseful_range:nn
      { \c_@@_ascii_min_int } { \c_@@_ascii_max_int }
  }
\cs_new_protected:Npn \@@_posix_cntrl:
  {
    \@@_item_caseful_range:nn
      { \c_@@_ascii_min_int } { \c_@@_ascii_max_control_int }
    \@@_item_caseful_equal:n { \c_@@_ascii_max_int }
  }
\cs_new_protected:Npn \@@_posix_graph:
  { \@@_item_caseful_range:nn { `! } { `\~ } }
\cs_new_protected:Npn \@@_posix_print:
  { \@@_item_caseful_range:nn { `\  } { `\~ } }
\cs_new_protected:Npn \@@_posix_punct:
  {
    \@@_item_caseful_range:nn { `! } { `/ }
    \@@_item_caseful_range:nn { `: } { `@ }
    \@@_item_caseful_range:nn { `[ } { `` }
    \@@_item_caseful_range:nn { `\{ } { `\~ }
  }
\cs_new_protected:Npn \@@_posix_space:
  {
    \@@_item_caseful_equal:n { `\  }
    \@@_item_caseful_range:nn { `\^^I } { `\^^M }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Simple character escape}
%
% Before actually parsing the regular expression or the replacement
% text, we go through them once, converting |\n| to the character $10$,
% \emph{etc.} In this pass, we also convert any special character
% (\texttt{*}, \texttt{?}, \texttt{\{}, etc.) or escaped alphanumeric
% character into a marker indicating that this was a special sequence,
% and replace escaped special characters and non-escaped alphanumeric
% characters by markers indicating that those were \enquote{raw}
% characters. The rest of the code can then avoid caring about escaping
% issues (those can become quite complex to handle in combination with
% ranges in character classes).
%
% Usage: \cs{@@_escape_use:nnnn} \meta{inline~1} \meta{inline~2}
% \meta{inline~3} \Arg{token list}. The \meta{token list} is converted to
% a string, then read from left to right, interpreting backslashes as
% escaping the next character.  Unescaped characters are fed to the
% function \meta{inline~1}, and escaped characters are fed to the function
% \meta{inline~2} within an \texttt{x}-expansion context (typically those
% functions perform some tests on their argument to decide how to output
% them).  The escape sequences |\a|, |\e|, |\f|, |\n|, |\r|, |\t| and
% |\x| are recognized, and those are replaced by the corresponding
% character, then fed to \meta{inline~3}. The result is then left in the
% input stream. Spaces are ignored unless escaped.
%
% The conversion is done within an \texttt{x}-expanding assignment.
%
% \begin{macro}{\@@_escape_use:nnnn}
%   The result is built in \cs{l_@@_internal_a_tl}, which is then left
%   in the input stream.  Tracing code is added as appropriate inside
%   this token list.  Go through |#4| once, applying |#1|,
%   |#2|, or |#3| as relevant to each character (after de-escaping
%   it).  The three inline functions are installed as
%   \cs{@@_escape_unescaped:N}, \cs{@@_escape_escaped:N} and
%   \cs{@@_escape_raw:N}, in which |##1| stands for the character.
%   These definitions and the change of \tn{escapechar} are made
%   inside a group; the \cs{exp_after:wN} in front of
%   \cs{group_end:} expands \cs{l_@@_internal_a_tl} before the group
%   is closed, so that its contents survive.  The converted form of
%   |#4| is stored globally in \cs{g_@@_internal_tl} and is followed,
%   when looping, by the end-marker |{ break }|.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_escape_use:nnnn #1#2#3#4
  {
    \group_begin:
      \tl_clear:N \l_@@_internal_a_tl
      \cs_set:Npn \@@_escape_unescaped:N ##1 { #1 }
      \cs_set:Npn \@@_escape_escaped:N ##1 { #2 }
      \cs_set:Npn \@@_escape_raw:N ##1 { #3 }
      \@@_standard_escapechar:
      \__kernel_tl_gset:Nx \g_@@_internal_tl
        { \__kernel_str_to_other_fast:n {#4} }
      \tl_put_right:Nx \l_@@_internal_a_tl
        {
          \exp_after:wN \@@_escape_loop:N \g_@@_internal_tl
          { break } \prg_break_point:
        }
      \exp_after:wN
    \group_end:
    \l_@@_internal_a_tl
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_escape_loop:N}
% \begin{macro}+\@@_escape_\:w+
%   \cs{@@_escape_loop:N} reads one character: if it is special
%   (space, backslash, or end-marker), perform the associated action,
%   otherwise it is simply an unescaped character. After a backslash,
%   the same is done, but unknown characters are \enquote{escaped}.
%    \begin{macrocode}
\cs_new:Npn \@@_escape_loop:N #1
  {
    % If a handler named after this character exists, use it; otherwise
    % the character is unescaped.  Then move on to the next character.
    % Handlers that need to stop or redirect the loop do so by grabbing
    % the loop call that follows them.
    \cs_if_exist_use:cF { @@_escape_\token_to_str:N #1:w }
      { \@@_escape_unescaped:N #1 }
    \@@_escape_loop:N
  }
\cs_new:cpn { @@_escape_ \c_backslash_str :w }
    \@@_escape_loop:N #1
  {
    % Backslash handler: it removes the pending loop call and grabs the
    % character that follows the backslash.  Handlers for escaped
    % characters have a slash in their name; without such a handler the
    % character is passed to the escaped handler.
    \cs_if_exist_use:cF { @@_escape_/\token_to_str:N #1:w }
      { \@@_escape_escaped:N #1 }
    \@@_escape_loop:N
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}
%   {\@@_escape_unescaped:N, \@@_escape_escaped:N, \@@_escape_raw:N}
%   Those functions are never called before being given a new meaning,
%   so their definitions here don't matter.
%    \begin{macrocode}
\cs_new_eq:NN \@@_escape_unescaped:N ? % placeholder, set by \@@_escape_use:nnnn
\cs_new_eq:NN \@@_escape_escaped:N   ? % placeholder, set by \@@_escape_use:nnnn
\cs_new_eq:NN \@@_escape_raw:N       ? % placeholder, set by \@@_escape_use:nnnn
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_escape_break:w, \@@_escape_/break:w,
%     \@@_escape_/a:w, \@@_escape_/e:w, \@@_escape_/f:w,
%     \@@_escape_/n:w, \@@_escape_/r:w, \@@_escape_/t:w
%   }
% \begin{macro}+\@@_escape_ :w+
%   The loop is ended upon seeing the end-marker
%   \enquote{\texttt{break}}, with an error if the string ended in a
%   backslash.  Spaces are ignored, and |\a|, |\e|, |\f|, |\n|, |\r|,
%   |\t| take their meaning here.
%    \begin{macrocode}
\cs_new_eq:NN \@@_escape_break:w \prg_break: % end-marker: leave the loop
\cs_new:cpn { @@_escape_/break:w }
  {
    % End-marker found right after a backslash: report the trailing
    % backslash, then leave the loop.
    \__kernel_msg_expandable_error:nn { regex } { trailing-backslash }
    \prg_break:
  }
\cs_new:cpn { @@_escape_~:w } { } % unescaped space: produce nothing
\cs_new:cpx { @@_escape_/a:w }
  { \exp_not:N \@@_escape_raw:N \iow_char:N \^^G } % char code 7
\cs_new:cpx { @@_escape_/t:w }
  { \exp_not:N \@@_escape_raw:N \iow_char:N \^^I } % char code 9
\cs_new:cpx { @@_escape_/n:w }
  { \exp_not:N \@@_escape_raw:N \iow_char:N \^^J } % char code 10
\cs_new:cpx { @@_escape_/f:w }
  { \exp_not:N \@@_escape_raw:N \iow_char:N \^^L } % char code 12
\cs_new:cpx { @@_escape_/r:w }
  { \exp_not:N \@@_escape_raw:N \iow_char:N \^^M } % char code 13
\cs_new:cpx { @@_escape_/e:w }
  { \exp_not:N \@@_escape_raw:N \iow_char:N \^^[ } % char code 27
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_escape_/x:w}
% \begin{macro}{\@@_escape_x_end:w}
%   When |\x| is encountered, \cs{@@_escape_x_test:N} is responsible for
%   grabbing some hexadecimal digits, and feeding the result to
%   \cs{@@_escape_x_end:w}. If the number is too big interrupt the
%   assignment and produce an error, otherwise call \cs{@@_escape_raw:N}
%   on the corresponding character token.
%    \begin{macrocode}
\cs_new:cpn { @@_escape_/x:w } \@@_escape_loop:N
  {
    % Remove the pending loop call and start reading a hexadecimal
    % number; the leading zero means the number is never empty.  The
    % test functions leave digits in the input, followed by a semicolon
    % which delimits the argument of the end function.
    \exp_after:wN \@@_escape_x_end:w
    \int_value:w "0 \@@_escape_x_test:N
  }
\cs_new:Npn \@@_escape_x_end:w #1 ;
  {
    % The argument is the character code collected after the x escape.
    \int_compare:nNnTF {#1} > \c_max_char_int
      {
        % Too large: report the value, also shown in hexadecimal.
        \__kernel_msg_expandable_error:nnff { regex } { x-overflow }
          {#1} { \int_to_Hex:n {#1} }
      }
      {
        % Build the character with category code 12 and hand it to the
        % raw handler.
        \exp_last_unbraced:Nf \@@_escape_raw:N
          { \char_generate:nn {#1} { 12 } }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_escape_x_test:N, \@@_escape_x_testii:N}
%   Find out whether the first character is a left brace (allowing any
%   number of hexadecimal digits), or not (allowing up to two
%   hexadecimal digits). We need to check for the end-of-string marker.
%   Eventually, call either \cs{@@_escape_x_loop:N} or
%   \cs{@@_escape_x:N}.
%    \begin{macrocode}
\cs_new:Npn \@@_escape_x_test:N #1
  {
    % At the end-marker, just terminate the number with a semicolon.
    \str_if_eq:nnTF {#1} { break } { ; }
      {
        % Skip spaces; any other character goes to the second test.
        \if_charcode:w \c_space_token #1
          \exp_after:wN \@@_escape_x_test:N
        \else:
          \exp_after:wN \@@_escape_x_testii:N
          \exp_after:wN #1
        \fi:
      }
  }
\cs_new:Npn \@@_escape_x_testii:N #1
  {
    % A left brace starts the braced form, read by the digit loop.
    \if_charcode:w \c_left_brace_str #1
      \exp_after:wN \@@_escape_x_loop:N
    \else:
      % Unbraced form.  A hexadecimal digit is left in the input by the
      % test itself, and a second digit is then looked for.  Any other
      % character ends the number and is given back to the main loop.
      \@@_hexadecimal_use:NTF #1
        { \exp_after:wN \@@_escape_x:N }
        { ; \exp_after:wN \@@_escape_loop:N \exp_after:wN #1 }
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_escape_x:N}
%   This looks for the second digit in the unbraced case.
%    \begin{macrocode}
\cs_new:Npn \@@_escape_x:N #1
  {
    % At the end-marker, just terminate the number.
    \str_if_eq:nnTF {#1} { break } { ; }
      {
        % In both branches the number ends here and the main loop
        % resumes.  A hexadecimal digit has been left in the input by
        % the test; any other character is given back to the loop.
        \@@_hexadecimal_use:NTF #1
          { ; \@@_escape_loop:N }
          { ; \@@_escape_loop:N #1 }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_escape_x_loop:N, \@@_escape_x_loop_error:n}
%   Grab hexadecimal digits, skip spaces, and at the end, check that
%   there is a right brace, otherwise raise an error outside the
%   assignment.
%    \begin{macrocode}
\cs_new:Npn \@@_escape_x_loop:N #1
  {
    % End-marker before any right brace: end the number, report the
    % missing brace with an empty argument, and give the end-marker
    % back to the main loop.
    \str_if_eq:nnTF {#1} { break }
      { ; \@@_escape_x_loop_error:n { } {#1} }
      {
        % Hexadecimal digits are left in the input by the test.
        \@@_hexadecimal_use:NTF #1
          { \@@_escape_x_loop:N }
          {
            % Spaces are skipped.
            \token_if_eq_charcode:NNTF \c_space_token #1
              { \@@_escape_x_loop:N }
              {
                % Anything else ends the number.  A right brace resumes
                % the main loop; another character is an error.
                ;
                \exp_after:wN
                \token_if_eq_charcode:NNTF \c_right_brace_str #1
                  { \@@_escape_loop:N }
                  { \@@_escape_x_loop_error:n {#1} }
              }
          }
      }
  }
\cs_new:Npn \@@_escape_x_loop_error:n #1
  {
    % Report the character found instead of a right brace, then let the
    % main loop process that character normally.
    \__kernel_msg_expandable_error:nnn { regex } { x-missing-rbrace } {#1}
    \@@_escape_loop:N #1
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_hexadecimal_use:NTF}
%   \TeX{} detects uppercase hexadecimal digits for us but not the
%   lowercase letters, which we need to detect and replace by their
%   uppercase counterpart.
%    \begin{macrocode}
\prg_new_conditional:Npnn \@@_hexadecimal_use:N #1 { TF }
  {
    % Prefix the character with a hexadecimal 1: the value exceeds 1
    % exactly when TeX accepts the character as a hexadecimal digit.
    % In that case leave the digit itself in the input.
    \if_int_compare:w 1 < "1 \token_to_str:N #1 \exp_stop_f:
      #1 \prg_return_true:
    \else:
      % Otherwise map the lowercase letters a to f, by their offset
      % from a, to the uppercase digit left in the input.
      \if_case:w
        \int_eval:n { \exp_after:wN ` \token_to_str:N #1 - `a }
           A
      \or: B
      \or: C
      \or: D
      \or: E
      \or: F
      \else:
        % Not a hexadecimal digit: return false and remove the return
        % true that follows the end of this conditional.
        \prg_return_false:
        \exp_after:wN \use_none:n
      \fi:
      \prg_return_true:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]
%   {\@@_char_if_alphanumeric:NTF, \@@_char_if_special:NTF}
%   These two tests are used in the first pass when parsing a regular
%   expression. That pass is responsible for finding escaped and
%   non-escaped characters, and recognizing which ones have special
%   meanings and which should be interpreted as \enquote{raw}
%   characters. Namely,
%   \begin{itemize}
%     \item alphanumerics are \enquote{raw} if they are not escaped, and
%       may have a special meaning when escaped;
%     \item non-alphanumeric printable ascii characters are
%       \enquote{raw} if they are escaped, and may have a special
%       meaning when not escaped;
%     \item characters other than printable ascii are always
%       \enquote{raw}.
%   \end{itemize}
%   The code is ugly, and highly based on magic numbers and the ascii
%   codes of characters. This is mostly unavoidable for performance
%   reasons.  Maybe the tests can be optimized a little bit more.
%   Here, \enquote{alphanumeric} means \texttt{0}--\texttt{9},
%   \texttt{A}--\texttt{Z}, \texttt{a}--\texttt{z};
%   \enquote{special} character means non-alphanumeric
%   but printable ascii, from space (hex \texttt{20}) to
%   tilde (hex \texttt{7E}).
%    \begin{macrocode}
\prg_new_conditional:Npnn \@@_char_if_special:N #1 { TF }
  {
    % Decision tree on the character code, split first at Z.
    \if_int_compare:w `#1 > `Z \exp_stop_f:
      % Above z: special only below the ascii upper bound constant
      % (its value is defined elsewhere in the file).
      \if_int_compare:w `#1 > `z \exp_stop_f:
        \if_int_compare:w `#1 < \c_@@_ascii_max_int
          \prg_return_true: \else: \prg_return_false: \fi:
      \else:
        % Between Z and z: special unless it is a lowercase letter.
        \if_int_compare:w `#1 < `a \exp_stop_f:
          \prg_return_true: \else: \prg_return_false: \fi:
      \fi:
    \else:
      % Between 9 and Z: special unless it is an uppercase letter.
      \if_int_compare:w `#1 > `9 \exp_stop_f:
        \if_int_compare:w `#1 < `A \exp_stop_f:
          \prg_return_true: \else: \prg_return_false: \fi:
      \else:
        % Up to 9: digits are not special; below the digit 0, only
        % characters from the space upwards are special.
        \if_int_compare:w `#1 < `0 \exp_stop_f:
          \if_int_compare:w `#1 < `\ \exp_stop_f:
            \prg_return_false: \else: \prg_return_true: \fi:
        \else: \prg_return_false: \fi:
      \fi:
    \fi:
  }
\prg_new_conditional:Npnn \@@_char_if_alphanumeric:N #1 { TF }
  {
    % Decision tree on the character code, split first at Z.
    \if_int_compare:w `#1 > `Z \exp_stop_f:
      % Above Z: only the lowercase letters qualify.
      \if_int_compare:w `#1 > `z \exp_stop_f:
        \prg_return_false:
      \else:
        \if_int_compare:w `#1 < `a \exp_stop_f:
          \prg_return_false: \else: \prg_return_true: \fi:
      \fi:
    \else:
      % Between 9 and Z: only the uppercase letters qualify.
      \if_int_compare:w `#1 > `9 \exp_stop_f:
        \if_int_compare:w `#1 < `A \exp_stop_f:
          \prg_return_false: \else: \prg_return_true: \fi:
      \else:
        % Up to 9: only the digits qualify.
        \if_int_compare:w `#1 < `0 \exp_stop_f:
          \prg_return_false: \else: \prg_return_true: \fi:
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Compiling}
%
% A regular expression starts its life as a string of characters. In
% this section, we convert it to internal instructions, resulting in a
% \enquote{compiled} regular expression. This compiled expression is
% then turned into states of an automaton in the building
% phase. Compiled regular expressions consist of the following:
% \begin{itemize}
%   \item \cs{@@_class:NnnnN} \meta{boolean} \Arg{tests} \Arg{min}
%     \Arg{more} \meta{lazyness}
%   \item \cs{@@_group:nnnN} \Arg{branches} \Arg{min} \Arg{more}
%     \meta{lazyness}, also \cs{@@_group_no_capture:nnnN} and
%     \cs{@@_group_resetting:nnnN} with the same syntax.
%   \item \cs{@@_branch:n} \Arg{contents}
%   \item \cs{@@_command_K:}
%   \item \cs{@@_assertion:Nn} \meta{boolean} \Arg{assertion test},
%     where the \meta{assertion test} is \cs{@@_b_test:} or
%     \cs{@@_Z_test:} or \cs{@@_A_test:} or \cs{@@_G_test:}
% \end{itemize}
% Tests can be the following:
% \begin{itemize}
%   \item \cs{@@_item_caseful_equal:n} \Arg{char code}
%   \item \cs{@@_item_caseless_equal:n} \Arg{char code}
%   \item \cs{@@_item_caseful_range:nn} \Arg{min} \Arg{max}
%   \item \cs{@@_item_caseless_range:nn} \Arg{min} \Arg{max}
%   \item \cs{@@_item_catcode:nT} \Arg{catcode bitmap} \Arg{tests}
%   \item \cs{@@_item_catcode_reverse:nT} \Arg{catcode bitmap} \Arg{tests}
%   \item \cs{@@_item_reverse:n} \Arg{tests}
%   \item \cs{@@_item_exact:nn} \Arg{catcode} \Arg{char code}
%   \item \cs{@@_item_exact_cs:n} \Arg{csnames}, more precisely given as
%     \meta{csname} \cs{scan_stop:} \meta{csname} \cs{scan_stop:}
%     \meta{csname} and so on in a brace group.
%   \item \cs{@@_item_cs:n} \Arg{compiled regex}
% \end{itemize}
%
% \subsubsection{Variables used when compiling}
%
% \begin{variable}{\l_@@_group_level_int}
%   We make sure to open the same number of groups as we close.
%    \begin{macrocode}
\int_new:N \l_@@_group_level_int % zeroed by \@@_compile:w, checked by \@@_compile_end:
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_mode_int}
% \begin{variable}
%   {
%     \c_@@_cs_in_class_mode_int,
%     \c_@@_cs_mode_int,
%     \c_@@_outer_mode_int,
%     \c_@@_catcode_mode_int,
%     \c_@@_class_mode_int,
%     \c_@@_catcode_in_class_mode_int
%   }
%   While compiling, ten modes are recognized, labelled $-63$, $-23$,
%   $-6$, $-2$, $0$, $2$, $3$, $6$, $23$, $63$. See
%   section~\ref{sec:regex-modes}.  We only define some of these as
%   constants.
%    \begin{macrocode}
\int_new:N \l_@@_mode_int % current compilation mode
\int_const:Nn \c_@@_cs_in_class_mode_int { -6 } % control sequence in a class
\int_const:Nn \c_@@_cs_mode_int { -2 } % control sequence
\int_const:Nn \c_@@_outer_mode_int { 0 } % outer
\int_const:Nn \c_@@_catcode_mode_int { 2 } % catcode test
\int_const:Nn \c_@@_class_mode_int { 3 } % class inside the outer mode
\int_const:Nn \c_@@_catcode_in_class_mode_int { 6 } % catcode test in a class
%    \end{macrocode}
% \end{variable}
% \end{variable}
%
% \begin{variable}{\l_@@_catcodes_int, \l_@@_default_catcodes_int}
% \begin{variable}{\l_@@_catcodes_bool}
%   We wish to allow constructions such as |\c[^BE](..\cL[a-z]..)|,
%   where the outer catcode test applies to the whole group, but is
%   superseded by the inner catcode test. For this to work, we need to
%   keep track of lists of allowed category codes:
%   \cs{l_@@_catcodes_int} and \cs{l_@@_default_catcodes_int} are
%   bitmaps, sums of $4^c$, for all allowed catcodes $c$. The latter is
%   local to each capturing group, and we reset
%   \cs{l_@@_catcodes_int} to that value after each character or
%   class, changing it only when encountering a |\c| escape. The boolean
%   records whether the list of categories of a catcode test has to be
%   inverted: compare |\c[^BE]| and |\c[BE]|.
%    \begin{macrocode}
\int_new:N \l_@@_catcodes_int % bitmap of catcodes allowed for the next item
\int_new:N \l_@@_default_catcodes_int % value the bitmap is reset to after each item
\bool_new:N \l_@@_catcodes_bool % whether the catcode list is to be inverted
%    \end{macrocode}
% \end{variable}
% \end{variable}
%
% \begin{variable}
%   {
%     \c_@@_catcode_C_int, \c_@@_catcode_B_int, \c_@@_catcode_E_int,
%     \c_@@_catcode_M_int, \c_@@_catcode_T_int, \c_@@_catcode_P_int,
%     \c_@@_catcode_U_int, \c_@@_catcode_D_int, \c_@@_catcode_S_int,
%     \c_@@_catcode_L_int, \c_@@_catcode_O_int, \c_@@_catcode_A_int
%   }
% \begin{variable}{\c_@@_all_catcodes_int}
%   Constants: $4^c$ for each category, and the sum of all powers of $4$.
%    \begin{macrocode}
\int_const:Nn \c_@@_catcode_C_int { "1 } % 4^0
\int_const:Nn \c_@@_catcode_B_int { "4 } % 4^1
\int_const:Nn \c_@@_catcode_E_int { "10 } % 4^2
\int_const:Nn \c_@@_catcode_M_int { "40 } % 4^3
\int_const:Nn \c_@@_catcode_T_int { "100 } % 4^4
\int_const:Nn \c_@@_catcode_P_int { "1000 } % 4^6
\int_const:Nn \c_@@_catcode_U_int { "4000 } % 4^7
\int_const:Nn \c_@@_catcode_D_int { "10000 } % 4^8
\int_const:Nn \c_@@_catcode_S_int { "100000 } % 4^10
\int_const:Nn \c_@@_catcode_L_int { "400000 } % 4^11
\int_const:Nn \c_@@_catcode_O_int { "1000000 } % 4^12
\int_const:Nn \c_@@_catcode_A_int { "4000000 } % 4^13
\int_const:Nn \c_@@_all_catcodes_int { "5515155 } % sum of the twelve values above
%    \end{macrocode}
% \end{variable}
% \end{variable}
%
% \begin{variable}{\l_@@_internal_regex}
%   The compilation step stores its result in this variable.
%    \begin{macrocode}
\cs_new_eq:NN \l_@@_internal_regex \c_@@_no_match_regex % initial value; set by \@@_compile_end:
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_show_prefix_seq}
%   This sequence holds the prefix that makes up the line displayed to
%   the user. The various items must be removed from the right, which is
%   tricky with a token list, hence we use a sequence.
%    \begin{macrocode}
\seq_new:N \l_@@_show_prefix_seq % a sequence, so that items can be removed from the right
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_show_lines_int}
%   A hack. To know whether a given class has a single item in it or
%   not, we count the number of lines when showing the class.
%    \begin{macrocode}
\int_new:N \l_@@_show_lines_int % line count used when showing a class
%    \end{macrocode}
% \end{variable}
%
% \subsubsection{Generic helpers used when compiling}
%
% \begin{macro}{\@@_two_if_eq:NNNNTF}
%   Used to compare pairs of things like \cs{@@_compile_special:N} |?|
%   together.  It's often inconvenient to get the catcodes of the
%   character to match so we just compare the character code.
%   Besides, the expanding behaviour of \cs{if:w} is very useful as that
%   means we can use \cs{c_left_brace_str} and the like.
%    \begin{macrocode}
\prg_new_conditional:Npnn \@@_two_if_eq:NNNN #1#2#3#4 { TF }
  {
    % True when the first and third arguments have the same meaning and
    % the second and fourth have the same character code.  The
    % character test expands its arguments, so string constants such
    % as the left brace string can be used.
    \if_meaning:w #1 #3
      \if:w #2 #4
        \prg_return_true:
      \else:
        \prg_return_false:
      \fi:
    \else:
      \prg_return_false:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_get_digits:NTFw}
% \begin{macro}[rEXP]{\@@_get_digits_loop:nw}
%   If followed by some raw digits, collect them one by one in the
%   integer variable |#1|, and take the \texttt{true} branch. Otherwise,
%   take the \texttt{false} branch.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_get_digits:NTFw #1#2#3#4#5
  {
    % Arguments: integer variable, true code, false code, then the next
    % two tokens of the input (a compile function and its character).
    % On a raw digit, start assigning to the integer and let the loop
    % supply further digits; otherwise run the false code and put the
    % two tokens back.
    \@@_if_raw_digit:NNTF #4 #5
      { #1 = #5 \@@_get_digits_loop:nw {#2} }
      { #3 #4 #5 }
  }
\cs_new:Npn \@@_get_digits_loop:nw #1#2#3
  {
    % Arguments: the true code, then the next two tokens of the input.
    % A raw digit is left in the input as part of the number being
    % assigned.  Anything else ends the number, after which the true
    % code runs with the two tokens put back behind it.
    \@@_if_raw_digit:NNTF #2 #3
      { #3 \@@_get_digits_loop:nw {#1} }
      { \scan_stop: #1 #2 #3 }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_if_raw_digit:NNTF}
%   Test used when grabbing digits for the |{m,n}| quantifier.
%   It only accepts non-escaped digits.
%    \begin{macrocode}
\prg_new_conditional:Npnn \@@_if_raw_digit:NN #1#2 { TF }
  {
    % The first argument must be the raw-character compile function.
    \if_meaning:w \@@_compile_raw:N #1
      % Prefix the character with a 1: the value exceeds 1 exactly when
      % the character is a decimal digit.
      \if_int_compare:w 1 < 1 #2 \exp_stop_f:
        \prg_return_true:
      \else:
        \prg_return_false:
      \fi:
    \else:
      \prg_return_false:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Mode}
% \label{sec:regex-modes}
%
% When compiling the \textsc{nfa} corresponding to a given regex string,
% we can be in ten distinct modes, which we label by some magic numbers:
% \begin{itemize}
%   \item[-6] |[\c{...}]| control sequence in a class,
%   \item[-2] |\c{...}| control sequence,
%   \item[0] |...| outer,
%   \item[2] |\c...| catcode test,
%   \item[6] |[\c...]| catcode test in a class,
%   \item[-63] |[\c{[...]}]| class inside mode $-6$,
%   \item[-23] |\c{[...]}| class inside mode $-2$,
%   \item[3] |[...]| class inside mode $0$,
%   \item[23] |\c[...]| class inside mode $2$,
%   \item[63] |[\c[...]]| class inside mode $6$.
% \end{itemize}
% This list is exhaustive, because |\c| escape sequences cannot be
% nested, and character classes cannot be nested directly. The choice of
% numbers is such as to optimize the most useful tests, and make
% transitions from one mode to another as simple as possible.
% \begin{itemize}
%   \item Even modes mean that we are not directly in a character class.
%     In this case, a left bracket appends $3$ to the mode. In a
%     character class, a right bracket changes the mode as $m\to
%     (m-15)/13$, truncated.
%   \item Grouping, assertion, and anchors are allowed in non-positive
%     even modes ($0$, $-2$, $-6$), and do not change the
%     mode. Otherwise, they trigger an error.
%   \item A left bracket is special in even modes, appending $3$ to the
%     mode; in those modes, quantifiers and the dot are recognized, and
%     the right bracket is normal. In odd modes (within classes), the
%     left bracket is normal, but the right bracket ends the class,
%     changing the mode from $m$ to $(m-15)/13$, truncated; also, ranges
%     are recognized.
%   \item In non-negative modes, left and right braces are normal. In
%     negative modes, however, left braces trigger a warning; right
%     braces end the control sequence, going from $-2$ to $0$ or $-6$ to
%     $3$, with error recovery for odd modes.
%   \item Properties (such as the |\d| character class) can appear in
%     any mode.
% \end{itemize}
%
% \begin{macro}[EXP]{\@@_if_in_class:TF}
%   Test whether we are directly in a character class (at the innermost
%   level of nesting). There, many escape sequences are not recognized,
%   and special characters are normal. Also, for every raw character, we
%   must look ahead for a possible raw dash.
%    \begin{macrocode}
\cs_new:Npn \@@_if_in_class:TF
  {
    % Odd modes are those directly inside a character class.  The two
    % branches are picked up from the input after this function.
    \if_int_odd:w \l_@@_mode_int
      \exp_after:wN \use_i:nn
    \else:
      \exp_after:wN \use_ii:nn
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_if_in_cs:TF}
%   Right braces are special only directly inside control sequences (at
%   the inner-most level of nesting, not counting groups).
%    \begin{macrocode}
\cs_new:Npn \@@_if_in_cs:TF
  {
    % True only in even, negative modes; odd modes (classes) are false.
    \if_int_odd:w \l_@@_mode_int
      \exp_after:wN \use_ii:nn
    \else:
      % Three expansions are needed to skip both pending conditionals.
      \if_int_compare:w \l_@@_mode_int < \c_@@_outer_mode_int
        \exp_after:wN \exp_after:wN \exp_after:wN \use_i:nn
      \else:
        \exp_after:wN \exp_after:wN \exp_after:wN \use_ii:nn
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_if_in_class_or_catcode:TF}
%   Assertions are only allowed in modes $0$, $-2$, and $-6$,
%   \emph{i.e.}, even, non-positive modes.
%    \begin{macrocode}
\cs_new:Npn \@@_if_in_class_or_catcode:TF
  {
    % True in odd modes (classes) and in positive even modes (catcode
    % tests); false in even, non-positive modes.
    \if_int_odd:w \l_@@_mode_int
      \exp_after:wN \use_i:nn
    \else:
      \if_int_compare:w \l_@@_mode_int > \c_@@_outer_mode_int
        \exp_after:wN \exp_after:wN \exp_after:wN \use_i:nn
      \else:
        \exp_after:wN \exp_after:wN \exp_after:wN \use_ii:nn
      \fi:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_if_within_catcode:TF}
%   This test takes the true branch if we are in a catcode test, either
%   immediately following it (modes $2$ and $6$) or in a class on which
%   it applies (modes $23$ and $63$). This is used to tweak how left
%   brackets behave in modes $2$ and $6$.
%    \begin{macrocode}
\cs_new:Npn \@@_if_within_catcode:TF
  {
    % True in every strictly positive mode.
    \if_int_compare:w \l_@@_mode_int > \c_@@_outer_mode_int
      \exp_after:wN \use_i:nn
    \else:
      \exp_after:wN \use_ii:nn
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_chk_c_allowed:T}
%   The |\c| escape sequence is only allowed in modes $0$ and $3$,
%   \emph{i.e.}, not within any other |\c| escape sequence.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_chk_c_allowed:T
  {
    % The argument following this function is run in the outer mode and
    % in the class mode.  In any other mode an error is raised and the
    % argument is discarded.
    \int_compare:nNnTF \l_@@_mode_int = \c_@@_outer_mode_int
      { \use:n }
      {
        \int_compare:nNnTF \l_@@_mode_int = \c_@@_class_mode_int
          { \use:n }
          {
            \__kernel_msg_error:nn { regex } { c-bad-mode }
            \use_none:n
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_mode_quit_c:}
%   This function changes the mode as it is needed just after a catcode
%   test.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_mode_quit_c:
  {
    % Leave a catcode test: the catcode mode goes back to the outer
    % mode, and the catcode-in-class mode goes back to the class mode.
    % Every other mode is left unchanged.
    \int_case:nn { \l_@@_mode_int }
      {
        { \c_@@_catcode_mode_int }
          { \int_set_eq:NN \l_@@_mode_int \c_@@_outer_mode_int }
        { \c_@@_catcode_in_class_mode_int }
          { \int_set_eq:NN \l_@@_mode_int \c_@@_class_mode_int }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Framework}
%
% \begin{macro}{\@@_compile:w, \@@_compile_end:}
%   Used when compiling a user regex or a regex for the |\c{...}| escape
%   sequence within another regex. Start building a token list within a
%   group (with \texttt{x}-expansion at the outset), and set a few
%   variables (group level, catcodes), then start the first branch. At
%   the end, make sure there are no dangling classes nor groups, close
%   the last branch: we are done building \cs{l_@@_internal_regex}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile:w
  {
    % Open the group in which the compiled regex is built; the matching
    % group end is in \@@_compile_end:.
    \group_begin:
      \tl_build_begin:N \l_@@_build_tl
      \int_zero:N \l_@@_group_level_int
      % Start with every category code allowed.
      \int_set_eq:NN \l_@@_default_catcodes_int
        \c_@@_all_catcodes_int
      \int_set_eq:NN \l_@@_catcodes_int \l_@@_default_catcodes_int
      % Items start out as their caseful versions.
      \cs_set:Npn \@@_item_equal:n  { \@@_item_caseful_equal:n }
      \cs_set:Npn \@@_item_range:nn { \@@_item_caseful_range:nn }
      % Start the first branch.  The false conditional hides a closing
      % brace, so that an unmatched opening brace is stored.
      \tl_build_put_right:Nn \l_@@_build_tl
        { \@@_branch:n { \if_false: } \fi: }
  }
\cs_new_protected:Npn \@@_compile_end:
  {
      % Still inside a class: complain, then close it by calling the
      % right bracket handler, followed by two filler tokens.
      \@@_if_in_class:TF
        {
          \__kernel_msg_error:nn { regex } { missing-rbrack }
          \use:c { @@_compile_]: }
          \prg_do_nothing: \prg_do_nothing:
        }
        { }
      % Unclosed groups: complain once, then close each one.
      \if_int_compare:w \l_@@_group_level_int > 0 \exp_stop_f:
        \__kernel_msg_error:nnx { regex } { missing-rparen }
          { \int_use:N \l_@@_group_level_int }
        \prg_replicate:nn
          { \l_@@_group_level_int }
          {
              % Store two closing braces followed by the three trailing
              % arguments of the group function.
              \tl_build_put_right:Nn \l_@@_build_tl
                {
                  \if_false: { \fi: }
                  \if_false: { \fi: } { 1 } { 0 } \c_true_bool
                }
              \tl_build_end:N \l_@@_build_tl
              % Expand the inner result before its group ends, then
              % append it to the token list of the enclosing level.
              \exp_args:NNNo
            \group_end:
            \tl_build_put_right:Nn \l_@@_build_tl
              { \l_@@_build_tl }
          }
      \fi:
      % Close the last branch, finish building, and carry the expanded
      % result out of the group into the internal regex variable.
      \tl_build_put_right:Nn \l_@@_build_tl { \if_false: { \fi: } }
      \tl_build_end:N \l_@@_build_tl
      \exp_args:NNNx
    \group_end:
    \tl_set:Nn \l_@@_internal_regex { \l_@@_build_tl }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile:n}
%   The compilation is done between \cs{@@_compile:w} and
%   \cs{@@_compile_end:}, starting in mode~$0$. Then
%   \cs{@@_escape_use:nnnn} distinguishes special characters, escaped
%   alphanumerics, and raw characters, interpreting |\a|, |\x| and other
%   sequences. The $4$ trailing \cs{prg_do_nothing:} are needed because
%   some functions defined later look up to $4$ tokens ahead. Before
%   ending, make sure that any |\c{...}| is properly closed.  No need to
%   check that brackets are closed properly since \cs{@@_compile_end:}
%   does that.  However, catch the case of a trailing |\cL|
%   construction.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile:n #1
  {
    \@@_compile:w
      \@@_standard_escapechar:
      \int_set_eq:NN \l_@@_mode_int \c_@@_outer_mode_int
      % Prefix every character of the regex with a compile function.
      % Unescaped characters are special or raw, escaped characters are
      % escaped (when alphanumeric) or raw, and characters produced by
      % escape sequences are always raw.
      \@@_escape_use:nnnn
        {
          \@@_char_if_special:NTF ##1
            \@@_compile_special:N \@@_compile_raw:N ##1
        }
        {
          \@@_char_if_alphanumeric:NTF ##1
            \@@_compile_escaped:N \@@_compile_raw:N ##1
        }
        { \@@_compile_raw:N ##1 }
        { #1 }
      % Filler tokens for the functions that look ahead in the input.
      \prg_do_nothing: \prg_do_nothing:
      \prg_do_nothing: \prg_do_nothing:
      % A catcode test with nothing after it.
      \int_compare:nNnT \l_@@_mode_int = \c_@@_catcode_mode_int
        { \__kernel_msg_error:nn { regex } { c-trailing } }
      % A control sequence construction that was never closed.
      \int_compare:nNnT \l_@@_mode_int < \c_@@_outer_mode_int
        {
          \__kernel_msg_error:nn { regex } { c-missing-rbrace }
          \@@_compile_end_cs:
          \prg_do_nothing: \prg_do_nothing:
          \prg_do_nothing: \prg_do_nothing:
        }
    \@@_compile_end:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_escaped:N, \@@_compile_special:N}
%   If the special character or escaped alphanumeric has a particular
%   meaning in regexes, the corresponding function is used. Otherwise,
%   it is interpreted as a raw character. We distinguish special
%   characters from escaped alphanumeric characters because they behave
%   differently when appearing as an end-point of a range.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_special:N #1
  {
    % Use the handler named after the special character if there is
    % one; otherwise compile the character as raw.
    \cs_if_exist_use:cF { @@_compile_#1: }
      { \@@_compile_raw:N #1 }
  }
\cs_new_protected:Npn \@@_compile_escaped:N #1
  {
    % Handlers for escaped characters have a slash in their name; with
    % no such handler the character is compiled as raw.
    \cs_if_exist_use:cF { @@_compile_/#1: }
      { \@@_compile_raw:N #1 }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_one:n}
%   This is used after finding one \enquote{test}, such as |\d|, or a
%   raw character. If that followed a catcode test (\emph{e.g.}, |\cL|),
%   then restore the mode. If we are not in a class, then the test is
%   \enquote{standalone}, and we need to add \cs{@@_class:NnnnN} and
%   search for quantifiers. In any case, insert the test, possibly
%   together with a catcode test if appropriate.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_one:n #1
  {
    % Leave the catcode test mode if we were in one.
    \@@_mode_quit_c:
    % Outside a class, the test is wrapped in a class of its own; the
    % false conditional leaves the opening brace unmatched.
    \@@_if_in_class:TF { }
      {
        \tl_build_put_right:Nn \l_@@_build_tl
          { \@@_class:NnnnN \c_true_bool { \if_false: } \fi: }
      }
    % Store the test, wrapped in a catcode test when the current bitmap
    % is smaller than the all-catcodes value.
    \tl_build_put_right:Nx \l_@@_build_tl
      {
        \if_int_compare:w \l_@@_catcodes_int <
          \c_@@_all_catcodes_int
          \@@_item_catcode:nT { \int_use:N \l_@@_catcodes_int }
            { \exp_not:N \exp_not:n {#1} }
        \else:
          \exp_not:N \exp_not:n {#1}
        \fi:
      }
    % Reset the catcode bitmap, then look for a quantifier unless we
    % are inside a class.
    \int_set_eq:NN \l_@@_catcodes_int \l_@@_default_catcodes_int
    \@@_if_in_class:TF { } { \@@_compile_quantifier:w }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {\@@_compile_abort_tokens:n, \@@_compile_abort_tokens:x}
%   This function places the collected tokens back in the input stream,
%   each as a raw character. Spaces are not preserved.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_abort_tokens:n #1
  {
    % Convert the argument to a string and apply the raw-character
    % compile function to each of its characters.
    \use:x
      {
        \exp_args:No \tl_map_function:nN { \tl_to_str:n {#1} }
          \@@_compile_raw:N
      }
  }
\cs_generate_variant:Nn \@@_compile_abort_tokens:n { x }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Quantifiers}
%
% \begin{macro}{\@@_compile_quantifier:w}
%   This looks ahead and finds any quantifier (special character equal
%   to either of \texttt{?+*\{}).
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_quantifier:w #1#2
  {
    % The arguments are the next two tokens of the input: a compile
    % function and its character.  Only a special character can be a
    % quantifier.
    \token_if_eq_meaning:NNTF #1 \@@_compile_special:N
      {
        % Use the quantifier handler for this character if there is
        % one; otherwise there is no quantifier and both tokens are
        % put back.
        \cs_if_exist_use:cF { @@_compile_quantifier_#2:w }
          { \@@_compile_quantifier_none: #1 #2 }
      }
      { \@@_compile_quantifier_none: #1 #2 }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_quantifier_none:}
% \begin{macro}{\@@_compile_quantifier_abort:xNN}
%   Those functions are called whenever there is no quantifier, or a
%   braced construction is invalid (equivalent to no quantifier, and
%   whatever characters were grabbed are left raw).
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_quantifier_none:
  {
    % Store a closing brace, then the arguments for exactly one
    % repetition with the false boolean.
    \tl_build_put_right:Nn \l_@@_build_tl
      { \if_false: { \fi: } { 1 } { 0 } \c_false_bool }
  }
\cs_new_protected:Npn \@@_compile_quantifier_abort:xNN #1#2#3
  {
    % Arguments: the characters collected so far, then the two tokens
    % at which parsing failed.  Act as if there were no quantifier,
    % warn, compile the collected characters as raw, and put the two
    % tokens back.
    \@@_compile_quantifier_none:
    \__kernel_msg_warning:nnxx { regex } { invalid-quantifier } {#1} {#3}
    \@@_compile_abort_tokens:x {#1}
    #2 #3
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_compile_quantifier_lazyness:nnNN}
%   Once the \enquote{main} quantifier (\texttt{?}, \texttt{*},
%   \texttt{+} or a braced construction) is found, we check whether it
%   is lazy (followed by a question mark). We then add to the compiled
%   regex a closing brace (ending \cs{@@_class:NnnnN} and friends),
%   the minimum number of repetitions, the number of further optional
%   repetitions ($-1$ meaning no upper bound), and a boolean,
%   \texttt{true} for lazy and \texttt{false} for greedy operators.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_quantifier_lazyness:nnNN #1#2#3#4
  {
    % The last two arguments are the next two tokens of the input.  If
    % they are a special question mark the quantifier is lazy and the
    % tokens are consumed.
    \@@_two_if_eq:NNNNTF #3 #4 \@@_compile_special:N ?
      {
        \tl_build_put_right:Nn \l_@@_build_tl
          { \if_false: { \fi: } { #1 } { #2 } \c_true_bool }
      }
      {
        % Otherwise the quantifier is greedy and the tokens are put
        % back for normal processing.
        \tl_build_put_right:Nn \l_@@_build_tl
          { \if_false: { \fi: } { #1 } { #2 } \c_false_bool }
        #3 #4
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_compile_quantifier_?:w,
%     \@@_compile_quantifier_*:w,
%     \@@_compile_quantifier_+:w
%   }
%   For each \enquote{basic} quantifier, |?|, |*|, |+|, feed the correct
%   arguments to \cs{@@_compile_quantifier_lazyness:nnNN}, $-1$ means
%   that there is no upper bound on the number of repetitions.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_quantifier_?:w }
  { \@@_compile_quantifier_lazyness:nnNN { 0 } { 1 } } % zero or one
\cs_new_protected:cpn { @@_compile_quantifier_*:w }
  { \@@_compile_quantifier_lazyness:nnNN { 0 } { -1 } } % zero or more
\cs_new_protected:cpn { @@_compile_quantifier_+:w }
  { \@@_compile_quantifier_lazyness:nnNN { 1 } { -1 } } % one or more
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}+\@@_compile_quantifier_{:w+ ^^A}
% \begin{macro}
%   {
%     \@@_compile_quantifier_braced_auxi:w,
%     \@@_compile_quantifier_braced_auxii:w,
%     \@@_compile_quantifier_braced_auxiii:w,
%   }
%   Three possible syntaxes: \texttt{\{\meta{int}\}},
%   \texttt{\{\meta{int},\}}, or \texttt{\{\meta{int},\meta{int}\}}. Any
%   other syntax causes us to abort and put whatever we collected back
%   in the input stream, as \texttt{raw} characters, including the
%   opening brace. Grab a number into \cs{l_@@_internal_a_int}. If
%   the number is followed by a right brace, the range is $[a,a]$. If
%   followed by a comma, try to grab one more number, and call the
%   \texttt{auxii} auxiliary if there is none or the \texttt{auxiii}
%   auxiliary if there is one. Those auxiliaries check for a closing
%   brace, leading to the range $[a,\infty]$ or $[a,b]$, encoded as
%   $\{a\}\{-1\}$ and $\{a\}\{b-a\}$.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_quantifier_ \c_left_brace_str :w }
  {
    % Try to read the first number into \l_@@_internal_a_int.  If
    % digits were found continue with auxi; otherwise abort, with the
    % opening brace as the only text collected so far.
    \@@_get_digits:NTFw \l_@@_internal_a_int
      { \@@_compile_quantifier_braced_auxi:w }
      { \@@_compile_quantifier_abort:xNN { \c_left_brace_str } }
  }
\cs_new_protected:Npn \@@_compile_quantifier_braced_auxi:w #1#2
  {
    % #1#2: the pair of tokens following the first number.  The pair
    % is compared, as an expanded string, with the two accepted
    % continuations.
    \str_case_e:nnF { #1 #2 }
      {
        % Closing brace: the range is [a,a], encoded with a second
        % argument equal to 0.
        { \@@_compile_special:N \c_right_brace_str }
          {
            \exp_args:No \@@_compile_quantifier_lazyness:nnNN
              { \int_use:N \l_@@_internal_a_int } { 0 }
          }
        % Comma: look for a second number; go to auxiii if one was
        % found and to auxii otherwise.
        { \@@_compile_special:N , }
          {
            \@@_get_digits:NTFw \l_@@_internal_b_int
              { \@@_compile_quantifier_braced_auxiii:w }
              { \@@_compile_quantifier_braced_auxii:w }
          }
      }
      {
        % Anything else: abort with the opening brace and the first
        % number as collected text, and put #1#2 back.
        \@@_compile_quantifier_abort:xNN
          { \c_left_brace_str \int_use:N \l_@@_internal_a_int }
        #1 #2
      }
  }
\cs_new_protected:Npn \@@_compile_quantifier_braced_auxii:w #1#2
  {
    % Reached after a first number and a comma with no second number.
    % #1#2: the next pair of tokens, which must be a special closing
    % brace.
    \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N \c_right_brace_str
      {
        % No upper bound: second argument -1.
        \exp_args:No \@@_compile_quantifier_lazyness:nnNN
          { \int_use:N \l_@@_internal_a_int } { -1 }
      }
      {
        % Abort; the collected text is the brace, the number and the
        % comma.  #1#2 are put back.
        \@@_compile_quantifier_abort:xNN
          { \c_left_brace_str \int_use:N \l_@@_internal_a_int , }
        #1 #2
      }
  }
\cs_new_protected:Npn \@@_compile_quantifier_braced_auxiii:w #1#2
  {
    % Reached after two numbers separated by a comma, stored in
    % \l_@@_internal_a_int and \l_@@_internal_b_int.  #1#2: the next
    % pair of tokens, which must be a special closing brace.
    \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N \c_right_brace_str
      {
        % A range whose start exceeds its end is an error; recover by
        % setting the second number to 0.  Otherwise replace the end
        % b by the width b - a, which is what gets stored.
        \if_int_compare:w \l_@@_internal_a_int >
          \l_@@_internal_b_int
          \__kernel_msg_error:nnxx { regex } { backwards-quantifier }
            { \int_use:N \l_@@_internal_a_int }
            { \int_use:N \l_@@_internal_b_int }
          \int_zero:N \l_@@_internal_b_int
        \else:
          \int_sub:Nn \l_@@_internal_b_int \l_@@_internal_a_int
        \fi:
        % Both values are expanded to digits before being passed on.
        \exp_args:Noo \@@_compile_quantifier_lazyness:nnNN
          { \int_use:N \l_@@_internal_a_int }
          { \int_use:N \l_@@_internal_b_int }
      }
      {
        % Abort; the collected text is the brace and both numbers
        % with their comma.  #1#2 are put back.
        \@@_compile_quantifier_abort:xNN
          {
            \c_left_brace_str
            \int_use:N \l_@@_internal_a_int ,
            \int_use:N \l_@@_internal_b_int
          }
        #1 #2
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{Raw characters}
%
% \begin{macro}{\@@_compile_raw_error:N}
%   Within character classes, and following catcode tests, some escaped
%   alphanumeric sequences such as |\b| do not have any meaning. They
%   are replaced by a raw character, after spitting out an error.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_raw_error:N #1
  {
    % #1: the character of the invalid escape.  Report it, then
    % compile it as a raw character.
    \__kernel_msg_error:nnx { regex } { bad-escape } {#1}
    \@@_compile_raw:N #1
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_raw:N}
%   If we are in a character class and the next character is an
%   unescaped dash, this denotes a range. Otherwise, the current
%   character |#1| matches itself.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_raw:N #1#2#3
  {
    % #1: the raw character.  #2#3: the next pair of tokens, which is
    % consumed only when it is a special dash inside a class and is
    % put back in every other case.
    \@@_if_in_class:TF
      {
        \@@_two_if_eq:NNNNTF #2 #3 \@@_compile_special:N -
          { \@@_compile_range:Nw #1 }
          {
            % The test compares with the character code of #1.
            \@@_compile_one:n
              { \@@_item_equal:n { \int_value:w `#1 } }
            #2 #3
          }
      }
      {
        \@@_compile_one:n
          { \@@_item_equal:n { \int_value:w `#1 } }
        #2 #3
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_range:Nw, \@@_if_end_range:NNTF}
%   We have just read a raw character followed by a dash; this should be
%   followed by an end-point for the range. Valid end-points are: any
%   raw character; any special character, except a right bracket. In
%   particular, escaped characters are forbidden.
%    \begin{macrocode}
\prg_new_protected_conditional:Npnn \@@_if_end_range:NN #1#2 { TF }
  {
    % #1#2: the pair of tokens following the dash.  True when #1 is
    % \@@_compile_raw:N, or when #1 is \@@_compile_special:N and #2 is
    % not a right bracket; false for any other #1.
    \if_meaning:w \@@_compile_raw:N #1
      \prg_return_true:
    \else:
      \if_meaning:w \@@_compile_special:N #1
        \if_charcode:w ] #2
          \prg_return_false:
        \else:
          \prg_return_true:
        \fi:
      \else:
        \prg_return_false:
      \fi:
    \fi:
  }
\cs_new_protected:Npn \@@_compile_range:Nw #1#2#3
  {
    % #1: the character before the dash.  #2#3: the pair of tokens
    % after the dash, #3 being the candidate end-point.
    \@@_if_end_range:NNTF #2 #3
      {
        % Valid end-point, consumed here.  A range whose start has a
        % larger character code than its end is an error and adds
        % nothing to the class.
        \if_int_compare:w `#1 > `#3 \exp_stop_f:
          \__kernel_msg_error:nnxx { regex } { range-backwards } {#1} {#3}
        \else:
          \tl_build_put_right:Nx \l_@@_build_tl
            {
              % Equal end-points give a single-character test.  The
              % braced character code of #3 after the conditional is
              % the last argument in both cases.
              \if_int_compare:w `#1 = `#3 \exp_stop_f:
                \@@_item_equal:n
              \else:
                \@@_item_range:nn { \int_value:w `#1 }
              \fi:
              { \int_value:w `#3 }
            }
        \fi:
      }
      {
        % Invalid end-point: warn, add #1 and the dash as two separate
        % characters, and put #2#3 back.
        \__kernel_msg_warning:nnxx { regex } { range-missing-end }
          {#1} { \c_backslash_str #3 }
        \tl_build_put_right:Nx \l_@@_build_tl
          {
            \@@_item_equal:n { \int_value:w `#1 \exp_stop_f: }
            \@@_item_equal:n { \int_value:w `- \exp_stop_f: }
          }
        #2#3
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Character properties}
%
% \begin{macro}{\@@_compile_.:, \@@_prop_.:}
%   In a class, the dot has no special meaning. Outside, insert
%   \cs{@@_prop_.:}, which matches any character or control
%   sequence, and refuses $-2$ (end-marker).
%    \begin{macrocode}
\cs_new_protected:cpx { @@_compile_.: }
  {
    % The definition is expanded once, now: \exp_not:N keeps the mode
    % test for run time and \exp_not:c builds the name of the property
    % test, which contains a dot, at definition time.
    \exp_not:N \@@_if_in_class:TF
      { \@@_compile_raw:N . }
      { \@@_compile_one:n \exp_not:c { @@_prop_.: } }
  }
\cs_new_protected:cpn { @@_prop_.: }
  {
    % Succeed for any current character code larger than -2.  The
    % \exp_after:wN closes the conditional before \@@_break_true:w
    % acts.
    \if_int_compare:w \l_@@_curr_char_int > - 2 \exp_stop_f:
      \exp_after:wN \@@_break_true:w
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_compile_/d:, \@@_compile_/D:,
%     \@@_compile_/h:, \@@_compile_/H:,
%     \@@_compile_/s:, \@@_compile_/S:,
%     \@@_compile_/v:, \@@_compile_/V:,
%     \@@_compile_/w:, \@@_compile_/W:,
%     \@@_compile_/N:,
%   }
%   The constants \cs{@@_prop_d:}, \emph{etc.} hold
%   a list of tests which match the corresponding character
%   class, and jump to the \cs{@@_break_point:TF} marker.
%   As for a normal character, we check for quantifiers.
%    \begin{macrocode}
\cs_set_protected:Npn \@@_tmp:w #1#2
  {
    % #1: lowercase escape letter; #2: its uppercase counterpart.
    % The cpx definitions are expanded now, so \exp_not:c builds the
    % name of the property test once, at definition time.
    \cs_new_protected:cpx { @@_compile_/#1: }
      { \@@_compile_one:n \exp_not:c { @@_prop_#1: } }
    % The uppercase escape compiles the same test wrapped in
    % \@@_item_reverse:n.
    \cs_new_protected:cpx { @@_compile_/#2: }
      {
        \@@_compile_one:n
          { \@@_item_reverse:n \exp_not:c { @@_prop_#1: } }
      }
  }
\@@_tmp:w d D
\@@_tmp:w h H
\@@_tmp:w s S
\@@_tmp:w v V
\@@_tmp:w w W
\cs_new_protected:cpn { @@_compile_/N: }
  { \@@_compile_one:n \@@_prop_N: } % no reversed counterpart is defined here
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Anchoring and simple assertions}
%
% \begin{macro}{\@@_compile_anchor_letter:NNN}
% \begin{macro}{\@@_compile_/A:, \@@_compile_/G:, \@@_compile_/Z:, \@@_compile_/z:, \@@_compile_/b:, \@@_compile_/B:}
% \begin{macro}+\@@_compile_^:+
% \begin{macro}+\@@_compile_$:+
%   In modes where assertions are forbidden, anchors such as |\A|
%   produce an error (|\A|~is invalid in classes); otherwise they add an
%   \cs{@@_assertion:Nn} test as appropriate (the only negative
%   assertion is~|\B|).  The test functions are defined later.  The
%   implementation for
%   |$| and |^| is only different from |\A| etc because these are valid
%   in a class.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_anchor_letter:NNN #1#2#3
  {
    % #1: letter of the escape, only used for error recovery.
    % #2: boolean stored as first argument of \@@_assertion:Nn.
    % #3: test function stored, braced, as its second argument.
    \@@_if_in_class_or_catcode:TF { \@@_compile_raw_error:N #1 }
      {
        \tl_build_put_right:Nn \l_@@_build_tl
          { \@@_assertion:Nn #2 {#3} }
      }
  }
\cs_set_protected:Npn \@@_tmp:w #1#2#3
  {
    % #1: escape letter; #2: boolean and #3: test function, all three
    % handed unchanged to \@@_compile_anchor_letter:NNN.  \@@_tmp:w is
    % a scratch macro.
    \cs_new_protected:cpn { @@_compile_/#1: }
      { \@@_compile_anchor_letter:NNN #1 #2 #3 }
  }
\@@_tmp:w A \c_true_bool  \@@_A_test:
\@@_tmp:w G \c_true_bool  \@@_G_test:
\@@_tmp:w Z \c_true_bool  \@@_Z_test:
\@@_tmp:w z \c_true_bool  \@@_Z_test:
\@@_tmp:w b \c_true_bool  \@@_b_test:
\@@_tmp:w B \c_false_bool \@@_b_test:
\cs_set_protected:Npn \@@_tmp:w #1#2
  {
    % #1: the character, obtained at the two calls by expanding
    % \iow_char:N; #2: the test function stored in the assertion.
    \cs_new_protected:cpn { @@_compile_#1: }
      {
        % In a class or after a catcode test the character is
        % compiled as a raw character, without any error.
        \@@_if_in_class_or_catcode:TF { \@@_compile_raw:N #1 }
          {
            \tl_build_put_right:Nn \l_@@_build_tl
              { \@@_assertion:Nn \c_true_bool {#2} }
          }
      }
  }
\exp_args:Nx \@@_tmp:w { \iow_char:N \^ } { \@@_A_test: }
\exp_args:Nx \@@_tmp:w { \iow_char:N \$ } { \@@_Z_test: }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsubsection{Character classes}
%
% \begin{macro}{\@@_compile_]:}
%   Outside a class, right brackets have no meaning. In a class, change
%   the mode ($m\to (m-15)/13$, truncated) to reflect the fact that we
%   are leaving the class. Look for quantifiers, unless we are still in
%   a class after leaving one (the case of |[...\cL[...]...]|).
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_]: }
  {
    \@@_if_in_class:TF
      {
        % For modes above \c_@@_catcode_in_class_mode_int one extra
        % closing brace is added (brace-balanced through \if_false:).
        \if_int_compare:w \l_@@_mode_int >
          \c_@@_catcode_in_class_mode_int
          \tl_build_put_right:Nn \l_@@_build_tl { \if_false: { \fi: } }
        \fi:
        % New mode: (mode - 15) / 13 with TeX's integer division.
        \tex_advance:D \l_@@_mode_int - 15 \exp_stop_f:
        \tex_divide:D \l_@@_mode_int 13 \exp_stop_f:
        % Only an even resulting mode leads to a search for
        % quantifiers; \exp_after:wN closes the conditional first.
        \if_int_odd:w \l_@@_mode_int \else:
          \exp_after:wN \@@_compile_quantifier:w
        \fi:
      }
      { \@@_compile_raw:N ] }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_[:}
%   In a class, left brackets might introduce a \textsc{posix} character
%   class, or mean nothing. Immediately following |\c|\meta{category},
%   we must insert the appropriate catcode test, then parse the class; we
%   pre-expand the catcode as an optimization. Otherwise (modes $0$,
%   $-2$ and $-6$) just parse the class.  The mode is updated later.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_[: }
  {
    \@@_if_in_class:TF
      { \@@_compile_class_posix_test:w }
      {
        \@@_if_within_catcode:TF
          {
            % The value of \l_@@_catcodes_int is expanded to digits
            % first and passed as an argument delimited by ";".
            \exp_after:wN \@@_compile_class_catcode:w
              \int_use:N \l_@@_catcodes_int ;
          }
          { \@@_compile_class_normal:w }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_class_normal:w}
%   In the \enquote{normal} case, we insert \cs{@@_class:NnnnN}
%   \meta{boolean} in the compiled code. The \meta{boolean} is true for
%   positive classes, and false for negative classes, characterized by a
%   leading |^|. The auxiliary \cs{@@_compile_class:TFNN} also
%   checks for a leading |]| which has a special meaning.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_class_normal:w
  {
    % First argument: code opening a positive class; second argument:
    % code opening a negative class.
    \@@_compile_class:TFNN
      { \@@_class:NnnnN \c_true_bool }
      { \@@_class:NnnnN \c_false_bool }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_class_catcode:w}
%   This function is called for a left bracket in modes $2$ or $6$
%   (catcode test, and catcode test within a class). In mode $2$ the
%   whole construction needs to be put in a class (like a single
%   character). Then determine if the class is positive or negative,
%   inserting \cs{@@_item_catcode:nT} or the \texttt{reverse} variant
%   as appropriate, each with the current catcodes bitmap |#1| as an
%   argument, and reset the catcodes.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_class_catcode:w #1;
  {
    % #1: the catcodes value in digits, delimited by ";".
    % In catcode mode, first open a positive class; the argument
    % "{ \if_false: } \fi:" is brace-balanced but stands for a lone
    % opening brace.
    \if_int_compare:w \l_@@_mode_int = \c_@@_catcode_mode_int
      \tl_build_put_right:Nn \l_@@_build_tl
        { \@@_class:NnnnN \c_true_bool { \if_false: } \fi: }
    \fi:
    % #1 was read before this reset, so the stored test keeps the
    % catcodes that were current when the bracket was seen.
    \int_set_eq:NN \l_@@_catcodes_int \l_@@_default_catcodes_int
    \@@_compile_class:TFNN
      { \@@_item_catcode:nT {#1} }
      { \@@_item_catcode_reverse:nT {#1} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {\@@_compile_class:TFNN, \@@_compile_class:NN}
%   If the first character is |^|, then the class is negative (use
%   |#2|), otherwise it is positive (use |#1|). If the next character
%   is a right bracket, then it should be changed to a raw one.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_class:TFNN #1#2#3#4
  {
    % #1: code for a positive class; #2: code for a negative class.
    % #3#4: the pair of tokens following the left bracket.
    % The new mode is the old one with the digit 3 appended to its
    % decimal representation.
    \l_@@_mode_int = \int_value:w \l_@@_mode_int 3 \exp_stop_f:
    \@@_two_if_eq:NNNNTF #3 #4 \@@_compile_special:N ^
      {
        % Leading caret: consumed.  In both branches the chosen code
        % is followed by a lone opening brace (balanced here through
        % \if_false:).
        \tl_build_put_right:Nn \l_@@_build_tl { #2 { \if_false: } \fi: }
        \@@_compile_class:NN
      }
      {
        \tl_build_put_right:Nn \l_@@_build_tl { #1 { \if_false: } \fi: }
        \@@_compile_class:NN #3 #4
      }
  }
\cs_new_protected:Npn \@@_compile_class:NN #1#2
  {
    % #1#2: the first pair of tokens inside the class.  A right
    % bracket in that position is compiled as a raw character,
    % whatever #1 is; any other pair is put back unchanged.
    \token_if_eq_charcode:NNTF #2 ]
      { \@@_compile_raw:N #2 }
      { #1 #2 }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_compile_class_posix_test:w,
%     \@@_compile_class_posix:NNNNw,
%     \@@_compile_class_posix_loop:w,
%     \@@_compile_class_posix_end:w
%   }
%   Here we check for a syntax such as |[:alpha:]|. We also detect |[=|
%   and |[.| which have a meaning in \textsc{posix} regular expressions,
%   but are not implemented in \pkg{l3regex}. In case we see |[:|, grab
%   raw characters until hopefully reaching |:]|. If that's missing, or
%   the \textsc{posix} class is unknown, abort. If all is right, add the
%   test to the current class, with an extra \cs{@@_item_reverse:n}
%   for negative classes.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_class_posix_test:w #1#2
  {
    % #1#2: the pair of tokens following a left bracket found inside
    % a class.
    \token_if_eq_meaning:NNT \@@_compile_special:N #1
      {
        \str_case:nn { #2 }
          {
            % When inserted, \@@_compile_class_posix:NNNNw takes the
            % four tokens "\@@_compile_raw:N [ #1 #2" left below as
            % its first four arguments.
            : { \@@_compile_class_posix:NNNNw }
            = {
                \__kernel_msg_warning:nnx { regex }
                  { posix-unsupported } { = }
              }
            . {
                \__kernel_msg_warning:nnx { regex }
                  { posix-unsupported } { . }
              }
          }
      }
    % In every other case the bracket is compiled as a raw character
    % and #1#2 are put back.
    \@@_compile_raw:N [ #1 #2
  }
\cs_new_protected:Npn \@@_compile_class_posix:NNNNw #1#2#3#4#5#6
  {
    % #1 to #4 are absorbed and not used.  #5#6: the pair of tokens
    % after the colon; a special caret there is consumed and recorded
    % by setting \l_@@_internal_bool to false.
    % The x-type assignment to \l_@@_internal_a_tl is opened with
    % "{ \if_false: } \fi:", which leaves its brace group open; the
    % matching closing brace is supplied by the expandable loop.
    \@@_two_if_eq:NNNNTF #5 #6 \@@_compile_special:N ^
      {
        \bool_set_false:N \l_@@_internal_bool
        \__kernel_tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi:
          \@@_compile_class_posix_loop:w
      }
      {
        \bool_set_true:N \l_@@_internal_bool
        \__kernel_tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi:
          \@@_compile_class_posix_loop:w #5 #6
      }
  }
\cs_new:Npn \@@_compile_class_posix_loop:w #1#2
  {
    % Expandable, since it runs inside an x-type assignment.  A raw
    % character #2 is left in the result and the loop goes on; any
    % other pair closes the assignment ("\if_false: { \fi: }" stands
    % for the closing brace) and is handed to the end function.
    \token_if_eq_meaning:NNTF \@@_compile_raw:N #1
      { #2 \@@_compile_class_posix_loop:w }
      { \if_false: { \fi: } \@@_compile_class_posix_end:w #1 #2 }
  }
\cs_new_protected:Npn \@@_compile_class_posix_end:w #1#2#3#4
  {
    % #1#2 and #3#4: the two pairs of tokens following the collected
    % class name (held in \l_@@_internal_a_tl); they must be a special
    % colon followed by a special right bracket.  The first test
    % leaves either the second test or \use_ii:nn, and that selects
    % one of the two large brace groups below.
    \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N :
      { \@@_two_if_eq:NNNNTF #3 #4 \@@_compile_special:N ] }
      { \use_ii:nn }
      {
        % Properly closed: use the test if the class is known, with
        % \@@_item_reverse:n in front for a negated class.
        \cs_if_exist:cTF { @@_posix_ \l_@@_internal_a_tl : }
          {
            \@@_compile_one:n
              {
                \bool_if:NF \l_@@_internal_bool \@@_item_reverse:n
                \exp_not:c { @@_posix_ \l_@@_internal_a_tl : }
              }
          }
          {
            % Unknown class: warn and re-insert the text that was
            % read, including the caret of a negated class.
            \__kernel_msg_warning:nnx { regex } { posix-unknown }
              { \l_@@_internal_a_tl }
            \@@_compile_abort_tokens:x
              {
                [: \bool_if:NF \l_@@_internal_bool { ^ }
                \l_@@_internal_a_tl :]
              }
          }
      }
      {
        % Missing closing colon and bracket: report it, re-insert the
        % text read so far and put the four tokens back.  The caret
        % of a negated class was consumed earlier, so it is restored
        % both in the message and in the re-inserted text, as in the
        % unknown-class case.
        \__kernel_msg_error:nnxx { regex } { posix-missing-close }
          {
            [: \bool_if:NF \l_@@_internal_bool { ^ }
            \l_@@_internal_a_tl
          }
          { #2 #4 }
        \@@_compile_abort_tokens:x
          {
            [: \bool_if:NF \l_@@_internal_bool { ^ }
            \l_@@_internal_a_tl
          }
        #1 #2 #3 #4
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Groups and alternations}
%
% \begin{macro}{\@@_compile_group_begin:N, \@@_compile_group_end:}
%   The contents of a regex group are turned into compiled code in
%   \cs{l_@@_build_tl}, which ends up with items of the form
%   \cs{@@_branch:n} \Arg{concatenation}. This construction is done
%   using \cs[no-index]{tl_build_\ldots{}} functions within a \TeX{} group, which automatically
%   makes sure that options (case-sensitivity and default catcode) are
%   reset at the end of the group. The argument |#1| is
%   \cs{@@_group:nnnN} or a variant thereof. A small subtlety to
%   support |\cL(abc)| as a shorthand for |(\cLa\cLb\cLc)|: exit any
%   pending catcode test, save the category code at the start of the
%   group as the default catcode for that group, and make sure that the
%   catcode is restored to the default outside the group.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_group_begin:N #1
  {
    % #1: the group function put in the compiled regex, followed by a
    % lone opening brace (balanced here through \if_false:).
    \tl_build_put_right:Nn \l_@@_build_tl { #1 { \if_false: } \fi: }
    \@@_mode_quit_c:
    % Everything below is local to the TeX group opened here: a
    % fresh build variable, the current catcodes saved as the default
    % ones, the incremented group level and the first branch.
    \group_begin:
      \tl_build_begin:N \l_@@_build_tl
      \int_set_eq:NN \l_@@_default_catcodes_int \l_@@_catcodes_int
      \int_incr:N \l_@@_group_level_int
      \tl_build_put_right:Nn \l_@@_build_tl
        { \@@_branch:n { \if_false: } \fi: }
  }
\cs_new_protected:Npn \@@_compile_group_end:
  {
    \if_int_compare:w \l_@@_group_level_int > 0 \exp_stop_f:
        % Close the last branch and finish the inner build.  The
        % x-type last argument is expanded before \group_end: runs,
        % so the inner result can be appended to the outer build
        % variable once the TeX group has ended.
        \tl_build_put_right:Nn \l_@@_build_tl { \if_false: { \fi: } }
        \tl_build_end:N \l_@@_build_tl
        \exp_args:NNNx
      \group_end:
      \tl_build_put_right:Nn \l_@@_build_tl { \l_@@_build_tl }
      \int_set_eq:NN \l_@@_catcodes_int \l_@@_default_catcodes_int
      % \exp_after:wN ends the conditional before the next function
      % starts reading what follows in the regex.
      \exp_after:wN \@@_compile_quantifier:w
    \else:
      % No group is open: warn and compile the parenthesis as a raw
      % character.
      \__kernel_msg_warning:nn { regex } { extra-rparen }
      \exp_after:wN \@@_compile_raw:N \exp_after:wN )
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_(:}
%   In a class, parentheses are not special.  In a catcode test inside a
%   class, a left parenthesis gives an error, to catch |[a\cL(bcd)e]|.
%   Otherwise check for a |?|, denoting special groups, and run the code
%   for the corresponding special group.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_(: }
  {
    \@@_if_in_class:TF { \@@_compile_raw:N ( }
      {
        % When the mode is \c_@@_catcode_in_class_mode_int: error,
        % then a raw parenthesis.  Otherwise go on to inspect what
        % follows.  \exp_after:wN closes the conditional first in
        % both branches.
        \if_int_compare:w \l_@@_mode_int =
          \c_@@_catcode_in_class_mode_int
          \__kernel_msg_error:nn { regex } { c-lparen-in-class }
          \exp_after:wN \@@_compile_raw:N \exp_after:wN (
        \else:
          \exp_after:wN \@@_compile_lparen:w
        \fi:
      }
  }
\cs_new_protected:Npn \@@_compile_lparen:w #1#2#3#4
  {
    % #1#2 and #3#4: the two pairs of tokens following the left
    % parenthesis.
    \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N ?
      {
        % Special group: run the function named after the character
        % #4 if it exists (both pairs are then consumed).
        \cs_if_exist_use:cF
          { @@_compile_special_group_\token_to_str:N #4 :w }
          {
            % Unknown: warn, open a normal group, and compile the
            % question mark as a raw character followed by #3#4.
            \__kernel_msg_warning:nnx { regex } { special-group-unknown }
              { (? #4 }
            \@@_compile_group_begin:N \@@_group:nnnN
              \@@_compile_raw:N ? #3 #4
          }
      }
      {
        % Normal group: both pairs are put back.
        \@@_compile_group_begin:N \@@_group:nnnN
          #1 #2 #3 #4
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}+\@@_compile_|:+
%   In a class, the pipe is not special. Otherwise, end the current
%   branch and open another one.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_|: }
  {
    \@@_if_in_class:TF { \@@_compile_raw:N | }
      {
        % A lone closing brace, then \@@_branch:n and a lone opening
        % brace, each written in brace-balanced form with \if_false:.
        \tl_build_put_right:Nn \l_@@_build_tl
          { \if_false: { \fi: } \@@_branch:n { \if_false: } \fi: }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_):}
%   Within a class, parentheses are not special. Outside, close a group.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_): }
  {
    % Raw character inside a class, end of group otherwise.
    \@@_if_in_class:TF { \@@_compile_raw:N ) }
      { \@@_compile_group_end: }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_special_group_::w}
% \begin{macro}+\@@_compile_special_group_|:w+
%   Non-capturing, and resetting groups are easy to take care of during
%   compilation; for those groups, the harder parts come when building.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_special_group_::w }
  { \@@_compile_group_begin:N \@@_group_no_capture:nnnN } % for "(?:"
\cs_new_protected:cpn { @@_compile_special_group_|:w }
  { \@@_compile_group_begin:N \@@_group_resetting:nnnN } % for "(?|"
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}
%   {\@@_compile_special_group_i:w, \@@_compile_special_group_-:w}
%   The match can be made case-insensitive by setting the option with
%   \texttt{(?i)}; the original behaviour is restored by \texttt{(?-i)}.
%   This is the only supported option.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_special_group_i:w #1#2
  {
    % #1#2: the pair of tokens following "(?i", which must be a
    % special right parenthesis.
    \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N )
      {
        % Local redefinitions: items compiled from now on use the
        % caseless tests.
        \cs_set:Npn \@@_item_equal:n
          { \@@_item_caseless_equal:n }
        \cs_set:Npn \@@_item_range:nn
          { \@@_item_caseless_range:nn }
      }
      {
        % Unknown option: warn, compile the three characters already
        % read as raw characters, and put #1#2 back.
        \__kernel_msg_warning:nnx { regex } { unknown-option } { (?i #2 }
        \@@_compile_raw:N (
        \@@_compile_raw:N ?
        \@@_compile_raw:N i
        #1 #2
      }
  }
\cs_new_protected:cpn { @@_compile_special_group_-:w } #1#2#3#4
  {
    % #1#2 and #3#4: the two pairs of tokens following "(?-"; they
    % must be a raw "i" and a special right parenthesis.  The first
    % test leaves either the second test or \use_ii:nn, and that
    % selects one of the two brace groups below.
    \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_raw:N i
      { \@@_two_if_eq:NNNNTF #3 #4 \@@_compile_special:N ) }
      { \use_ii:nn }
      {
        % Local redefinitions: back to the caseful tests.
        \cs_set:Npn \@@_item_equal:n
          { \@@_item_caseful_equal:n }
        \cs_set:Npn \@@_item_range:nn
          { \@@_item_caseful_range:nn }
      }
      {
        % Unknown option: warn, compile the three characters already
        % read as raw characters, and put both pairs back.
        \__kernel_msg_warning:nnx { regex } { unknown-option } { (?-#2#4 }
        \@@_compile_raw:N (
        \@@_compile_raw:N ?
        \@@_compile_raw:N -
        #1 #2 #3 #4
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Catcodes and csnames}
%
% \begin{macro}{\@@_compile_/c:, \@@_compile_c_test:NN}
%   The |\c| escape sequence can be followed by a capital letter
%   representing a character category, by a left bracket which starts a
%   list of categories, or by a brace group holding a regular expression
%   for a control sequence name. Otherwise, raise an error.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_/c: }
  { \@@_chk_c_allowed:T { \@@_compile_c_test:NN } } % test run only if the check passes
\cs_new_protected:Npn \@@_compile_c_test:NN #1#2
  {
    % #1#2: the pair of tokens following the escape.  Each of the two
    % branches ends with a test lacking its last argument: the brace
    % group after the branches (the error) is the false code of
    % whichever test was started.
    \token_if_eq_meaning:NNTF #1 \@@_compile_raw:N
      {
        % Raw character: it must name a known category.
        \int_if_exist:cTF { c_@@_catcode_#2_int }
          {
            \int_set_eq:Nc \l_@@_catcodes_int
              { c_@@_catcode_#2_int }
            % Mode 0 selects the first \if_case:w branch, any other
            % mode the second one.
            \l_@@_mode_int
              = \if_case:w \l_@@_mode_int
                  \c_@@_catcode_mode_int
                \else:
                  \c_@@_catcode_in_class_mode_int
                \fi:
            % The category C gets an extra check of what follows.
            \token_if_eq_charcode:NNT C #2 { \@@_compile_c_C:NN }
          }
      }
      { \cs_if_exist_use:cF { @@_compile_c_#2:w } }
          {
            \__kernel_msg_error:nnx { regex } { c-missing-category } {#2}
            #1 #2
          }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_c_C:NN}
%   If |\cC| is not followed by |.| or |(...)| then complain because
%   that construction cannot match anything, except in cases like
%   |\cC[\c{...}]|, where it has no effect.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_c_C:NN #1#2
  {
    % #1#2: the pair of tokens following the category C.  The tests
    % only decide the fate of the error in the brace group after
    % them: it is dropped (\use_none:n) for a special dot, skipped
    % for a special left parenthesis (it is then the false code of a
    % test that is true), and run in every other case.  #1#2 are put
    % back in all cases.
    \token_if_eq_meaning:NNTF #1 \@@_compile_special:N
      {
        \token_if_eq_charcode:NNTF #2 .
          { \use_none:n }
          { \token_if_eq_charcode:NNF #2 ( } % )
      }
      { \use:n }
    { \__kernel_msg_error:nnn { regex } { c-C-invalid } {#2} }
    #1 #2
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_compile_c_[:w,
%     \@@_compile_c_lbrack_loop:NN,
%     \@@_compile_c_lbrack_add:N,
%     \@@_compile_c_lbrack_end:,
%   }
%   When encountering |\c[|, the task is to collect uppercase letters
%   representing character categories. First check for |^| which negates
%   the list of category codes.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_c_[:w } #1#2
  {
    % #1#2: the pair of tokens following the left bracket.
    % Mode 0 selects the first \if_case:w branch, any other mode the
    % second one.
    \l_@@_mode_int
      = \if_case:w \l_@@_mode_int
          \c_@@_catcode_mode_int
        \else:
          \c_@@_catcode_in_class_mode_int
        \fi:
    % Start from an empty set of categories.
    \int_zero:N \l_@@_catcodes_int
    \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N ^
      {
        % A leading caret is consumed and recorded as a false
        % \l_@@_catcodes_bool.
        \bool_set_false:N \l_@@_catcodes_bool
        \@@_compile_c_lbrack_loop:NN
      }
      {
        \bool_set_true:N \l_@@_catcodes_bool
        \@@_compile_c_lbrack_loop:NN
        #1 #2
      }
  }
\cs_new_protected:Npn \@@_compile_c_lbrack_loop:NN #1#2
  {
    % #1#2: the next pair of tokens.  Each of the two branches ends
    % with a test lacking its last argument: the brace group after
    % the branches (the error) is the false code of whichever test
    % was started.
    \token_if_eq_meaning:NNTF #1 \@@_compile_raw:N
      {
        % Raw character naming a known category: add it and loop.
        \int_if_exist:cTF { c_@@_catcode_#2_int }
          {
            \exp_args:Nc \@@_compile_c_lbrack_add:N
              { c_@@_catcode_#2_int }
            \@@_compile_c_lbrack_loop:NN
          }
      }
      {
        % Not a raw character: a right bracket ends the list.
        \token_if_eq_charcode:NNTF #2 ]
          { \@@_compile_c_lbrack_end: }
      }
          {
            % Error recovery: end the list and put #1#2 back.
            \__kernel_msg_error:nnx { regex } { c-missing-rbrack } {#2}
            \@@_compile_c_lbrack_end:
            #1 #2
          }
  }
\cs_new_protected:Npn \@@_compile_c_lbrack_add:N #1
  {
    % #1: the integer constant of one category.  It is added to
    % \l_@@_catcodes_int unless the quotient of the two is odd, so
    % that a category listed twice is only counted once.
    % NOTE(review): \int_eval:n division rounds; this test presumably
    % relies on how the category constants are spaced (they are
    % defined elsewhere) -- confirm.
    \if_int_odd:w \int_eval:n { \l_@@_catcodes_int / #1 } \exp_stop_f:
    \else:
      \int_add:Nn \l_@@_catcodes_int {#1}
    \fi:
  }
\cs_new_protected:Npn \@@_compile_c_lbrack_end:
  {
    % For a negated list (false \l_@@_catcodes_bool), replace the
    % collected value by its difference from \c_@@_all_catcodes_int.
    \if_meaning:w \c_false_bool \l_@@_catcodes_bool
      \int_set:Nn \l_@@_catcodes_int
        { \c_@@_all_catcodes_int - \l_@@_catcodes_int }
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}+\@@_compile_c_{:+
%   The case of a left brace is easy, based on what we have done so far:
%   in a group, compile the regular expression, after changing the mode
%   to forbid nesting |\c|. Additionally, disable submatch tracking
%   since groups don't escape the scope of |\c{...}|.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_c_ \c_left_brace_str :w }
  {
    % The tokens placed after \@@_compile:w are presumably run as
    % part of the nested compilation that it starts (it is defined
    % elsewhere) -- confirm.  Mode 0 selects the first \if_case:w
    % branch, any other mode the second one.
    \@@_compile:w
      \@@_disable_submatches:
      \l_@@_mode_int
        = \if_case:w \l_@@_mode_int
            \c_@@_cs_mode_int
          \else:
            \c_@@_cs_in_class_mode_int
          \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}+\@@_compile_}:+
% \begin{macro}{\@@_compile_end_cs:}
% \begin{macro}[EXP]{\@@_compile_cs_aux:Nn, \@@_compile_cs_aux:NNnnnN}
%   Non-escaped right braces are only special if they appear when
%   compiling the regular expression for a csname, but not within a
%   class: |\c{[{}]}| matches the control sequences |\{| and |\}|.  So,
%   end compiling the inner regex (this closes any dangling class or
%   group).  Then insert the corresponding test in the outer regex.  As
%   an optimization, if the control sequence test simply consists of
%   several explicit possibilities (branches) then use
%   \cs{@@_item_exact_cs:n} with an argument consisting of all
%   possibilities separated by \cs{scan_stop:}.
%    \begin{macrocode}
\flag_new:n { @@_cs }
\cs_new_protected:cpn { @@_compile_ \c_right_brace_str : }
  {
    % Outside a csname regex the brace is a raw character;
    % \exp_after:wN turns \c_right_brace_str into that character
    % before \@@_compile_raw:N reads its argument.
    \@@_if_in_cs:TF
      { \@@_compile_end_cs: }
      { \exp_after:wN \@@_compile_raw:N \c_right_brace_str }
  }
\cs_new_protected:Npn \@@_compile_end_cs:
  {
    \@@_compile_end:
    % Walk through \l_@@_internal_regex expandably: the auxiliaries
    % either build a list of explicit names in \l_@@_internal_a_tl,
    % or raise the flag when the regex is not of that simple form.
    \flag_clear:n { @@_cs }
    \__kernel_tl_set:Nx \l_@@_internal_a_tl
      {
        \exp_after:wN \@@_compile_cs_aux:Nn \l_@@_internal_regex
        \q_@@_nil \q_@@_nil \q_@@_recursion_stop
      }
    \exp_args:Nx \@@_compile_one:n
      {
        \flag_if_raised:nTF { @@_cs }
          { \@@_item_cs:n { \exp_not:o \l_@@_internal_regex } }
          {
            % Every name in the list is preceded by \scan_stop:, so
            % \tl_tail:N removes the leading one.
            \@@_item_exact_cs:n
              { \tl_tail:N \l_@@_internal_a_tl }
          }
      }
  }
\cs_new:Npn \@@_compile_cs_aux:Nn #1#2
  {
    % #1: a function of the compiled regex; #2: its braced argument.
    \cs_if_eq:NNTF #1 \@@_branch:n
      {
        % One branch: leave \scan_stop: in the result, convert the
        % branch contents #2 (followed by quarks marking its end),
        % then go on with the next branch.
        \scan_stop:
        \@@_compile_cs_aux:NNnnnN #2
        \q_@@_nil \q_@@_nil \q_@@_nil
        \q_@@_nil \q_@@_nil \q_@@_nil \q_@@_recursion_stop
        \@@_compile_cs_aux:Nn
      }
      {
        % Not a branch: stop.  Unless #1 is the quark marking the end
        % of the regex, the regex is not a simple one, so raise the
        % flag.
        \@@_quark_if_nil:NF #1 { \flag_raise_if_clear:n { @@_cs } }
        \@@_use_none_delimit_by_q_recursion_stop:w
      }
  }
\cs_new:Npn \@@_compile_cs_aux:NNnnnN #1#2#3#4#5#6
  {
    % #1 to #6: six consecutive items of a branch.  They are accepted
    % when #1 is \@@_class:NnnnN, the boolean #2 is true, #3 consists
    % of exactly two items starting with \@@_item_caseful_equal:n,
    % and #5 is 0.
    \bool_lazy_all:nTF
      {
        { \cs_if_eq_p:NN #1 \@@_class:NnnnN }
        {#2}
        { \tl_if_head_eq_meaning_p:nN {#3} \@@_item_caseful_equal:n }
        { \int_compare_p:nNn { \tl_count:n {#3} } = { 2 } }
        { \int_compare_p:nNn {#5} = { 0 } }
      }
      {
        % Leave #4 copies of the character whose code is the second
        % item of #3, with category code 12, then go on.
        \prg_replicate:nn {#4}
          { \char_generate:nn { \use_ii:nn #3 } {12} }
        \@@_compile_cs_aux:NNnnnN
      }
      {
        % Anything else ends this loop.  When #1 is not the quark
        % ending the branch, also raise the flag; the extra function
        % then presumably discards up to the next stop marker as
        % well, ending the outer loop too (helpers defined
        % elsewhere) -- confirm.
        \@@_quark_if_nil:NF #1
          {
            \flag_raise_if_clear:n { @@_cs }
            \@@_use_i_delimit_by_q_recursion_stop:nw
          }
        \@@_use_none_delimit_by_q_recursion_stop:w
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsubsection{Raw token lists with \cs{u}}
%
% \begin{macro}{\@@_compile_/u:}
% \begin{macro}[EXP]{\@@_compile_u_loop:NN}
%   The |\u| escape is invalid in classes and directly following a
%   catcode test. Otherwise, it must be followed by a left brace. We
%   then collect the characters for the argument of |\u| within an
%   \texttt{x}-expanding assignment. In principle we could just wait to
%   encounter a right brace, but this is unsafe: if the right brace was
%   missing, then we would reach the end-markers of the regex, and
%   continue, leading to obscure fatal errors. Instead, we only allow
%   raw and special characters, and stop when encountering a special
%   right brace, any escaped character, or the end-marker.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_/u: } #1#2
  {
    % #1#2: the pair of tokens following the escape.
    \@@_if_in_class_or_catcode:TF
      { \@@_compile_raw_error:N u #1 #2 }
      {
        \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N \c_left_brace_str
          {
            % Open an x-type assignment with "{ \if_false: } \fi:",
            % which leaves its brace group open; the expandable loop
            % fills it and supplies the closing brace.
            \__kernel_tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi:
            \@@_compile_u_loop:NN
          }
          {
            % No left brace: error, then a raw "u"; #1#2 are put
            % back.
            \__kernel_msg_error:nn { regex } { u-missing-lbrace }
            \@@_compile_raw:N u #1 #2
          }
      }
  }
\cs_new:Npn \@@_compile_u_loop:NN #1#2
  {
    % Expandable, since it runs inside an x-type assignment.
    % #1#2: the next pair of tokens.  A raw character is left in the
    % result and the loop goes on.
    \token_if_eq_meaning:NNTF #1 \@@_compile_raw:N
      { #2 \@@_compile_u_loop:NN }
      {
        \token_if_eq_meaning:NNTF #1 \@@_compile_special:N
          {
            % Special character: a right brace closes the assignment
            % ("\if_false: { \fi: }" stands for the closing brace)
            % and ends the loop; any other one is collected.
            \exp_after:wN \token_if_eq_charcode:NNTF \c_right_brace_str #2
              { \if_false: { \fi: } \@@_compile_u_end: }
              { #2 \@@_compile_u_loop:NN }
          }
          {
            % Anything else: close the assignment, report the missing
            % brace, finish as usual and put #1#2 back.
            \if_false: { \fi: }
            \__kernel_msg_error:nnx { regex } { u-missing-rbrace } {#2}
            \@@_compile_u_end:
            #1 #2
          }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_compile_u_end:}
%   Once we have extracted the variable's name, we store the contents of
%   that variable in \cs{l_@@_internal_a_tl}. The behaviour of |\u|
%   then depends on whether we are within a |\c{...}| escape (in this
%   case, the variable is turned to a string), or not.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_u_end:
  {
    % v-type argument: the current contents of \l_@@_internal_a_tl
    % are used as the name of a variable, whose value is then stored
    % in \l_@@_internal_a_tl itself.
    \tl_set:Nv \l_@@_internal_a_tl { \l_@@_internal_a_tl }
    % The outer mode uses the token-by-token version, any other mode
    % the version for control sequence names.
    \if_int_compare:w \l_@@_mode_int = \c_@@_outer_mode_int
      \@@_compile_u_not_cs:
    \else:
      \@@_compile_u_in_cs:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_u_in_cs:}
%   When |\u| appears within a control sequence, we convert the variable
%   to a string with escaped spaces. Then for each character insert a
%   class matching exactly that character, once.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_u_in_cs:
  {
    % Convert the contents of \l_@@_internal_a_tl (expanded once)
    % with \__kernel_str_to_other_fast:n and store the result
    % globally.
    \__kernel_tl_gset:Nx \g_@@_internal_tl
      {
        \exp_args:No \__kernel_str_to_other_fast:n
          { \l_@@_internal_a_tl }
      }
    % Then add one class per item of that result.
    \tl_build_put_right:Nx \l_@@_build_tl
      {
        \tl_map_function:NN \g_@@_internal_tl
          \@@_compile_u_in_cs_aux:n
      }
  }
\cs_new:Npn \@@_compile_u_in_cs_aux:n #1
  {
    % #1: one character.  Leaves a positive class holding a caseful
    % test on its character code, followed by the quantifier
    % arguments 1 and 0 and a false boolean (false is the value used
    % for greedy quantifiers).
    \@@_class:NnnnN \c_true_bool
      { \@@_item_caseful_equal:n { \int_value:w `#1 } }
      { 1 } { 0 } \c_false_bool
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_u_not_cs:}
%   In mode $0$, the |\u| escape adds one state to the NFA for each
%   token in \cs{l_@@_internal_a_tl}.  The token list is analysed with
%   \cs{tl_analysis_map_inline:Nn}, whose inline code receives the
%   token as |#1|, its character code as |#2| and its category code (a
%   hexadecimal digit) as |#3|.  If a given \meta{token} is a
%   control sequence (category code $0$), then insert a string
%   comparison test \cs{@@_item_exact_cs:n} on its name, otherwise,
%   \cs{@@_item_exact:nn} which compares catcode and character code.
%   Each test is wrapped in a positive class to be matched exactly
%   once.  The conditional is evaluated when the material is added to
%   \cs{l_@@_build_tl}, so only the relevant test is stored.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_compile_u_not_cs:
  {
    \tl_analysis_map_inline:Nn \l_@@_internal_a_tl
      {
        \tl_build_put_right:Nx \l_@@_build_tl
          {
            \@@_class:NnnnN \c_true_bool
              {
                \if_int_compare:w "##3 = 0 \exp_stop_f:
                  \@@_item_exact_cs:n
                    { \exp_after:wN \cs_to_str:N ##1 }
                \else:
                  \@@_item_exact:nn { \int_value:w "##3 } { ##2 }
                \fi:
              }
              { 1 } { 0 } \c_false_bool
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Other}
%
% \begin{macro}{\@@_compile_/K:}
%   The |\K| control sequence is currently the only \enquote{command},
%   which performs some action, rather than matching something. It is
%   allowed in the same contexts as |\b|, namely in the outer mode only;
%   anywhere else it is reported through \cs{@@_compile_raw_error:N}.
%   At the compilation stage, we leave it as a single control sequence,
%   \cs{@@_command_K:}, defined later.
%    \begin{macrocode}
\cs_new_protected:cpn { @@_compile_/K: }
  {
    \int_compare:nTF { \l_@@_mode_int = \c_@@_outer_mode_int }
      { \tl_build_put_right:Nn \l_@@_build_tl { \@@_command_K: } }
      { \@@_compile_raw_error:N K }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Showing regexes}
%
% \begin{macro}{\@@_show:N}
%   Within a group and within \cs{tl_build_begin:N} \ldots{} \cs{tl_build_end:N} we
%   redefine all the functions that can appear in a compiled regex, then
%   run the regex |#1|.  Each redefined function describes itself
%   through \cs{@@_show_one:n}, possibly in a deeper level of nesting
%   (\cs{@@_show_scope:nn}), instead of building or matching anything.
%   The redefined \cs{@@_branch:n} temporarily removes the last prefix
%   item while printing its \texttt{+-branch} line, then runs the
%   contents of the branch.  At the end, \cs{exp_args:NNNo} expands
%   \cs{l_@@_build_tl} before the group is closed, so that the result
%   survives the group: it is stored in \cs{l_@@_internal_a_tl} and is
%   then meant to be shown.
%   ^^A NOTE(review): the caseless range line starts with "Range"
%   ^^A while the caseful one starts with "range" -- confirm whether
%   ^^A this difference in capitalisation is intended.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show:N #1
  {
    \group_begin:
      \tl_build_begin:N \l_@@_build_tl
      \cs_set_protected:Npn \@@_branch:n
        {
          \seq_pop_right:NN \l_@@_show_prefix_seq
            \l_@@_internal_a_tl
          \@@_show_one:n { +-branch }
          \seq_put_right:No \l_@@_show_prefix_seq
            \l_@@_internal_a_tl
          \use:n
        }
      \cs_set_protected:Npn \@@_group:nnnN
        { \@@_show_group_aux:nnnnN { } }
      \cs_set_protected:Npn \@@_group_no_capture:nnnN
        { \@@_show_group_aux:nnnnN { ~(no~capture) } }
      \cs_set_protected:Npn \@@_group_resetting:nnnN
        { \@@_show_group_aux:nnnnN { ~(resetting) } }
      \cs_set_eq:NN \@@_class:NnnnN \@@_show_class:NnnnN
      \cs_set_protected:Npn \@@_command_K:
        { \@@_show_one:n { reset~match~start~(\iow_char:N\\K) } }
      \cs_set_protected:Npn \@@_assertion:Nn ##1##2
        {
          \@@_show_one:n
            { \bool_if:NF ##1 { negative~ } assertion:~##2 }
        }
      \cs_set:Npn \@@_b_test: { word~boundary }
      \cs_set:Npn \@@_Z_test: { anchor~at~end~(\iow_char:N\\Z) }
      \cs_set:Npn \@@_A_test: { anchor~at~start~(\iow_char:N\\A) }
      \cs_set:Npn \@@_G_test: { anchor~at~start~of~match~(\iow_char:N\\G) }
      \cs_set_protected:Npn \@@_item_caseful_equal:n ##1
        { \@@_show_one:n { char~code~\int_eval:n{##1} } }
      \cs_set_protected:Npn \@@_item_caseful_range:nn ##1##2
        {
          \@@_show_one:n
            { range~[\int_eval:n{##1}, \int_eval:n{##2}] }
        }
      \cs_set_protected:Npn \@@_item_caseless_equal:n ##1
        { \@@_show_one:n { char~code~\int_eval:n{##1}~(caseless) } }
      \cs_set_protected:Npn \@@_item_caseless_range:nn ##1##2
        {
          \@@_show_one:n
            { Range~[\int_eval:n{##1}, \int_eval:n{##2}]~(caseless) }
        }
      \cs_set_protected:Npn \@@_item_catcode:nT
        { \@@_show_item_catcode:NnT \c_true_bool }
      \cs_set_protected:Npn \@@_item_catcode_reverse:nT
        { \@@_show_item_catcode:NnT \c_false_bool }
      \cs_set_protected:Npn \@@_item_reverse:n
        { \@@_show_scope:nn { Reversed~match } }
      \cs_set_protected:Npn \@@_item_exact:nn ##1##2
        { \@@_show_one:n { char~##2,~catcode~##1 } }
      \cs_set_eq:NN \@@_item_exact_cs:n \@@_show_item_exact_cs:n
      \cs_set_protected:Npn \@@_item_cs:n
        { \@@_show_scope:nn { control~sequence } }
      \cs_set:cpn { @@_prop_.: } { \@@_show_one:n { any~token } }
      \seq_clear:N \l_@@_show_prefix_seq
      \@@_show_push:n { ~ }
      \cs_if_exist_use:N #1
      \tl_build_end:N \l_@@_build_tl
      \exp_args:NNNo
    \group_end:
    \tl_set:Nn \l_@@_internal_a_tl { \l_@@_build_tl }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_show_one:n}
%   Every part of the final message goes through this function, which
%   adds one line to the output, with the appropriate prefix.  The line
%   counter \cs{l_@@_show_lines_int} is incremented, then a line is
%   appended to \cs{l_@@_build_tl}: a (not yet expanded)
%   \cs{iow_newline:}, followed by all items of
%   \cs{l_@@_show_prefix_seq} and by |#1|, both of which are
%   \texttt{x}-expanded at this point.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show_one:n #1
  {
    \int_incr:N \l_@@_show_lines_int
    \tl_build_put_right:Nx \l_@@_build_tl
      {
        \exp_not:N \iow_newline:
        \seq_map_function:NN \l_@@_show_prefix_seq \use:n
        #1
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {\@@_show_push:n, \@@_show_pop:, \@@_show_scope:nn}
%   Enter and exit levels of nesting.  The \texttt{push} function
%   appends to \cs{l_@@_show_prefix_seq} an item consisting of |#1|
%   (\texttt{x}-expanded) followed by a space; the \texttt{pop}
%   function removes the last item, which is dumped into
%   \cs{l_@@_internal_a_tl}.  The \texttt{scope} function prints
%   its first argument as an \enquote{introduction}, then performs its
%   second argument in a deeper level of nesting, whose prefix is
%   extended by two spaces.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show_push:n #1
  { \seq_put_right:Nx \l_@@_show_prefix_seq { #1 ~ } }
\cs_new_protected:Npn \@@_show_pop:
  { \seq_pop_right:NN \l_@@_show_prefix_seq \l_@@_internal_a_tl }
\cs_new_protected:Npn \@@_show_scope:nn #1#2
  {
    \@@_show_one:n {#1}
    \@@_show_push:n { ~ }
    #2
    \@@_show_pop:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_show_group_aux:nnnnN}
%   We display all groups in the same way, simply adding a message,
%   \texttt{(no capture)} or \texttt{(resetting)}, to special groups.
%   The arguments are the message |#1|, the contents |#2| of the group,
%   and the repetition data |#3|, |#4|, |#5| which are passed on to
%   \cs{@@_msg_repeated:nnN} for the closing line.  The contents are
%   shown with a prefix extended by a vertical bar.
%   The odd \cs{use_ii:nn} avoids printing a spurious \texttt{+-branch}
%   for the first branch: it discards the leading \cs{@@_branch:n} and
%   runs the contents of the first branch directly.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show_group_aux:nnnnN #1#2#3#4#5
  {
    \@@_show_one:n { ,-group~begin #1 }
    \@@_show_push:n { | }
    \use_ii:nn #2
    \@@_show_pop:
    \@@_show_one:n
      { `-group~end \@@_msg_repeated:nnN {#3} {#4} #5 }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_show_class:NnnnN}
%   I'm entirely unhappy about this function: I couldn't find a way to
%   test if a class is a single test. Instead, collect the
%   representation of the tests |#2| in the class, within a group and
%   with a fresh \cs{l_@@_build_tl}, counting the lines produced in
%   \cs{l_@@_show_lines_int}.  There are three cases.
%   \begin{itemize}
%     \item If no line was produced, the class is empty: discard the
%       group and print \texttt{Fail} for a positive class (|#1| true),
%       \texttt{Pass} for a negative one.
%     \item If a positive class produced exactly one line, discard the
%       group and evaluate the representation of the tests again (since
%       the prefix used within the group is incorrect), then append the
%       repeating information given by |#3|, |#4|, |#5|. That's clunky,
%       but not too expensive, since it's only one test.
%     \item Otherwise, carry the collected lines out of the group in
%       \cs{l_@@_internal_a_tl}, write \texttt{Match} or
%       \texttt{Don't match} on its own line, with the repeating
%       information if any, then append the various tests, which are
%       on lines of their own.
%   \end{itemize}
%   This function replaces the protected \cs{@@_class:NnnnN} while
%   showing a regex and it performs assignments, hence it must be
%   protected as well.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show_class:NnnnN #1#2#3#4#5
  {
    \group_begin:
      \tl_build_begin:N \l_@@_build_tl
      \int_zero:N \l_@@_show_lines_int
      \@@_show_push:n {~}
      #2
    \int_compare:nTF { \l_@@_show_lines_int = 0 }
      {
        \group_end:
        \@@_show_one:n { \bool_if:NTF #1 { Fail } { Pass } }
      }
      {
        \bool_if:nTF
          { #1 && \int_compare_p:n { \l_@@_show_lines_int = 1 } }
          {
            \group_end:
            #2
            \tl_build_put_right:Nn \l_@@_build_tl
              { \@@_msg_repeated:nnN {#3} {#4} #5 }
          }
          {
              \tl_build_end:N \l_@@_build_tl
              \exp_args:NNNo
            \group_end:
            \tl_set:Nn \l_@@_internal_a_tl \l_@@_build_tl
            \@@_show_one:n
              {
                \bool_if:NTF #1 { Match } { Don't~match }
                \@@_msg_repeated:nnN {#3} {#4} #5
              }
            \tl_build_put_right:Nx \l_@@_build_tl
              { \exp_not:o \l_@@_internal_a_tl }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_show_item_catcode:NnT}
%   Produce a sequence of categories which the catcode bitmap |#2|
%   contains, and show it, indenting the tests on which this catcode
%   constraint applies.  A category letter is kept when the quotient of
%   |#2| by the corresponding constant
%   \cs[no-index]{c_@@_catcode_\meta{letter}_int} is odd.  The boolean
%   |#1| is false for a negative class.  The \texttt{T} argument (the
%   tests) is not grabbed here: it is picked up as the second argument
%   of \cs{@@_show_scope:nn}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show_item_catcode:NnT #1#2
  {
    \seq_set_split:Nnn \l_@@_internal_seq { } { CBEMTPUDSLOA }
    \seq_set_filter:NNn \l_@@_internal_seq \l_@@_internal_seq
      { \int_if_odd_p:n { #2 / \int_use:c { c_@@_catcode_##1_int } } }
    \@@_show_scope:nn
      {
        categories~
        \seq_map_function:NN \l_@@_internal_seq \use:n
        , ~
        \bool_if:NF #1 { negative~ } class
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_show_item_exact_cs:n}
%   The argument |#1| is a list of control sequence names separated by
%   \cs{scan_stop:}.  Split it at these separators, put a backslash in
%   front of each name, and show the names on a single line, separated
%   by \enquote{or}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_show_item_exact_cs:n #1
  {
    \seq_set_split:Nnn \l_@@_internal_seq { \scan_stop: } {#1}
    \seq_set_map_x:NNn \l_@@_internal_seq
      \l_@@_internal_seq { \iow_char:N\\##1 }
    \@@_show_one:n
      { control~sequence~ \seq_use:Nn \l_@@_internal_seq { ~or~ } }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Building}
%
% \subsubsection{Variables used while building}
%
% \begin{variable}{\l_@@_min_state_int, \l_@@_max_state_int}
%   The last state that was allocated is
%   $\cs{l_@@_max_state_int}-1$, so that \cs{l_@@_max_state_int} always
%   points to a free state.  The \texttt{min_state} variable is
%   $1$ to begin with, but gets shifted in nested calls to the matching
%   code, namely in |\c{...}| constructions: there,
%   \cs{@@_build_for_cs:n} sets it to the current \texttt{max_state}.
%    \begin{macrocode}
\int_new:N  \l_@@_min_state_int
\int_set:Nn \l_@@_min_state_int { 1 }
\int_new:N  \l_@@_max_state_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_left_state_int, \l_@@_right_state_int}
% \begin{variable}{\l_@@_left_state_seq, \l_@@_right_state_seq}
%   Alternatives are implemented by branching from a \texttt{left} state
%   into the various choices, then merging those into a \texttt{right}
%   state. We store information about those states in two sequences,
%   used as stacks by \cs{@@_push_lr_states:} and
%   \cs{@@_pop_lr_states:}.
%   Those states are also used to implement group quantifiers. Most
%   often, the left and right pointers only differ by~$1$.
%    \begin{macrocode}
\int_new:N  \l_@@_left_state_int
\int_new:N  \l_@@_right_state_int
\seq_new:N  \l_@@_left_state_seq
\seq_new:N  \l_@@_right_state_seq
%    \end{macrocode}
% \end{variable}
% \end{variable}
%
% \begin{variable}{\l_@@_capturing_group_int}
%   \cs{l_@@_capturing_group_int} is the next \textsc{id} number to
%   be assigned to a capturing group. This starts
%   at $0$ for the group enclosing the full regular expression, and
%   groups are counted in the order of their left parenthesis, except
%   when encountering \texttt{resetting} groups, where the count
%   restarts from the same value at each branch.
%    \begin{macrocode}
\int_new:N  \l_@@_capturing_group_int
%    \end{macrocode}
% \end{variable}
%
% \subsubsection{Framework}
%
% This phase is about going from a compiled regex to an \textsc{nfa}.
% Each state of the \textsc{nfa} is stored in a \tn{toks}. The
% operations which can appear in the \tn{toks} are
% \begin{itemize}
%   \item \cs{@@_action_start_wildcard:N} \meta{boolean} inserted at the
%     start of the regular expression, where a \texttt{true}
%     \meta{boolean} makes it unanchored.
%   \item \cs{@@_action_success:} marks the exit state of the
%     \textsc{nfa}.
%   \item \cs{@@_action_cost:n} \Arg{shift} is a transition from the
%     current \meta{state} to $\meta{state}+\meta{shift}$, which
%     consumes the current character: the target state is saved and will
%     be considered again when matching at the next position.
%   \item \cs{@@_action_free:n} \Arg{shift}, and
%     \cs{@@_action_free_group:n} \Arg{shift} are free transitions,
%     which immediately perform the actions for the state
%     $\meta{state}+\meta{shift}$ of the \textsc{nfa}. They differ in
%     how they detect and avoid infinite loops. For now, we just need to
%     know that the \texttt{group} variant must be used for transitions
%     back to the start of a group.
%   \item \cs{@@_action_submatch:nN} \Arg{group} \meta{key} where the
%     \meta{key} is |<| or |>| for the beginning or end of group
%     numbered \meta{group}.  This causes the current position in the
%     query to be stored as the \meta{key} submatch boundary.
%   \item One of these actions, within a conditional.
% \end{itemize}
%
% We strive to preserve the following properties while building.
% \begin{itemize}
%   \item The current capturing group is
%     $\text{\texttt{capturing_group}}-1$, and if a group opened now
%     it would be labelled \texttt{capturing_group}.
%   \item The last allocated state is $\text{\texttt{max_state}}-1$, so
%     \texttt{max_state} is a free state.
%   \item The \texttt{left_state} points to a state to the left of the
%     current group or of the last class.
%   \item The \texttt{right_state} points to a newly created,
%     empty state, with some transitions leading to it.
%   \item The \texttt{left/right} sequences hold a list of the
%     corresponding end-points of nested groups.
% \end{itemize}
%
% \begin{macro}{\@@_build:n, \@@_build_aux:Nn, \@@_build:N, \@@_build_aux:NN}
%   The \texttt{n}-type function first compiles its argument, while the
%   \texttt{N}-type function receives an already compiled regex
%   variable. Reset some variables: the escape character, the capturing
%   group counter, and \texttt{max_state} which restarts from
%   \texttt{min_state}. Allocate two states, and put a wildcard in the
%   first of them, namely the \texttt{left} state (the wildcard has
%   transitions to the following state and to its own state). Then build
%   the regex within a (capturing) group numbered $0$ (current
%   value of \texttt{capturing_group}). Finally, if the match reaches the
%   last state, it is successful.  A \texttt{false} boolean for argument
%   |#1| for the auxiliaries will suppress the wildcard and make the
%   match anchored: used for \cs{peek_regex:nTF} and similar.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_build:n
  { \@@_build_aux:Nn \c_true_bool }
\cs_new_protected:Npn \@@_build:N
  { \@@_build_aux:NN \c_true_bool }
\cs_new_protected:Npn \@@_build_aux:Nn #1#2
  {
    \@@_compile:n {#2}
    \@@_build_aux:NN #1 \l_@@_internal_regex
  }
\cs_new_protected:Npn \@@_build_aux:NN #1#2
  {
    \@@_standard_escapechar:
    \int_zero:N \l_@@_capturing_group_int
    \int_set_eq:NN \l_@@_max_state_int \l_@@_min_state_int
    \@@_build_new_state:
    \@@_build_new_state:
    \@@_toks_put_right:Nn \l_@@_left_state_int
      { \@@_action_start_wildcard:N #1 }
    \@@_group:nnnN {#2} { 1 } { 0 } \c_false_bool
    \@@_toks_put_right:Nn \l_@@_right_state_int
      { \@@_action_success: }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_build_for_cs:n}
%   The matching code relies on some global intarray variables, but only
%   uses a range of their entries.  Specifically,
%   \begin{itemize}
%   \item \cs{g_@@_state_active_intarray} from \cs{l_@@_min_state_int}
%     to $\cs{l_@@_max_state_int}-1$;
%   \end{itemize}
%   Here, in this nested call to the
%   matching code, we need the new versions of this range to involve
%   completely new entries of the intarray variables, so we begin by
%   setting (the new) \cs{l_@@_min_state_int} to (the old)
%   \cs{l_@@_max_state_int} to use higher entries.
%
%   When using a regex to match a cs, we don't insert a wildcard, we
%   anchor at the end, and since we ignore submatches, there is no need
%   to surround the expression with a group. However, for branches to
%   work properly at the outer level, we need to put the appropriate
%   \texttt{left} and \texttt{right} states in their sequence.  The
%   anchoring at the end is done by the final state, which only
%   succeeds when \cs{l_@@_curr_char_int} is $-2$.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_build_for_cs:n #1
  {
    \int_set_eq:NN \l_@@_min_state_int \l_@@_max_state_int
    \@@_build_new_state:
    \@@_build_new_state:
    \@@_push_lr_states:
    #1
    \@@_pop_lr_states:
    \@@_toks_put_right:Nn \l_@@_right_state_int
      {
        \if_int_compare:w -2 = \l_@@_curr_char_int
          \exp_after:wN \@@_action_success:
        \fi:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Helpers for building an \textsc{nfa}}
%
% \begin{macro}{\@@_push_lr_states:, \@@_pop_lr_states:}
%   \TeX{}'s grouping is of no help to keep track of the left-end and
%   right-end of each group while building the regular expression, so
%   we use two stacks instead.  Pushing saves the current values of
%   \cs{l_@@_left_state_int} and \cs{l_@@_right_state_int}; popping
%   restores them, going through \cs{l_@@_internal_a_tl}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_push_lr_states:
  {
    \seq_push:NV \l_@@_left_state_seq  \l_@@_left_state_int
    \seq_push:NV \l_@@_right_state_seq \l_@@_right_state_int
  }
\cs_new_protected:Npn \@@_pop_lr_states:
  {
    \seq_pop:NN \l_@@_left_state_seq \l_@@_internal_a_tl
    \int_set:Nn \l_@@_left_state_int { \l_@@_internal_a_tl }
    \seq_pop:NN \l_@@_right_state_seq \l_@@_internal_a_tl
    \int_set:Nn \l_@@_right_state_int { \l_@@_internal_a_tl }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%   {
%     \@@_build_transition_left:NNN,
%     \@@_build_transition_right:nNn
%   }
%   Add a transition from |#2| to |#3| using the function |#1|: what is
%   stored in the \tn{toks} of state |#2| is |#1| followed by the
%   relative shift $|#3|-|#2|$, evaluated at this point. The
%   \texttt{left} function is used for higher priority transitions, and
%   the \texttt{right} function for lower priority transitions (which
%   should be performed later). The signatures differ to reflect the
%   differing usage later on. Both functions could be optimized.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_build_transition_left:NNN #1#2#3
  { \@@_toks_put_left:Nx  #2 { #1 { \int_eval:n { #3 - #2 } } } }
\cs_new_protected:Npn \@@_build_transition_right:nNn #1#2#3
  { \@@_toks_put_right:Nx #2 { #1 { \int_eval:n { #3 - #2 } } } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_build_new_state:}
%   Add a new empty state to the \textsc{nfa}: the \tn{toks} numbered
%   \texttt{max_state} is cleared. Then update the
%   \texttt{left}, \texttt{right}, and \texttt{max} states, so that the
%   \texttt{right} state is the new empty state, and the \texttt{left}
%   state points to the previously \enquote{current} state (the old
%   \texttt{right} state).  The order of the assignments matters.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_build_new_state:
  {
    \@@_toks_clear:N \l_@@_max_state_int
    \int_set_eq:NN \l_@@_left_state_int \l_@@_right_state_int
    \int_set_eq:NN \l_@@_right_state_int \l_@@_max_state_int
    \int_incr:N \l_@@_max_state_int
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_build_transitions_lazyness:NNNNN}
%   Create a new state, then append two transitions to the old current
%   state (which has become the \texttt{left} state): one through the
%   function |#2| towards the state |#3|, the other through |#4|
%   towards |#5|.  Targets are stored as shifts relative to the
%   \texttt{left} state, computed after the new state is created.  The
%   boolean |#1| controls the order: when it is true (lazy quantifiers)
%   the transition |#2| comes first, when it is false (greedy
%   quantifiers) the transition |#4| comes first.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_build_transitions_lazyness:NNNNN #1#2#3#4#5
  {
    \@@_build_new_state:
    \bool_if:NTF #1
      {
        \@@_toks_put_right:Nx \l_@@_left_state_int
          {
            #2 { \int_eval:n { #3 - \l_@@_left_state_int } }
            #4 { \int_eval:n { #5 - \l_@@_left_state_int } }
          }
      }
      {
        \@@_toks_put_right:Nx \l_@@_left_state_int
          {
            #4 { \int_eval:n { #5 - \l_@@_left_state_int } }
            #2 { \int_eval:n { #3 - \l_@@_left_state_int } }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Building classes}
%
% \begin{macro}{\@@_class:NnnnN}
% \begin{macro}[rEXP]{\@@_tests_action_cost:n}
%   The arguments are: \meta{boolean} \Arg{tests} \Arg{min} \Arg{more}
%   \meta{lazyness}. First store the tests with a trailing
%   \cs{@@_action_cost:n}, in the true branch of
%   \cs{@@_break_point:TF} for positive classes, or the false branch
%   for negative classes: this is done by redefining
%   \cs{@@_tests_action_cost:n}, with two nested \cs{exp_not:n} so that
%   the tests survive both this definition and one later expansion.
%   The integer \meta{more} is $0$ for fixed
%   repetitions, $-1$ for unbounded repetitions, and
%   $\meta{max}-\meta{min}$ for a range of repetitions; its opposite
%   selects which of the three \texttt{class_repeat} auxiliaries is
%   called.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_class:NnnnN #1#2#3#4#5
  {
    \cs_set:Npx \@@_tests_action_cost:n ##1
      {
        \exp_not:n { \exp_not:n {#2} }
        \bool_if:NTF #1
          { \@@_break_point:TF { \@@_action_cost:n {##1} } { } }
          { \@@_break_point:TF { } { \@@_action_cost:n {##1} } }
      }
    \if_case:w - #4 \exp_stop_f:
           \@@_class_repeat:n   {#3}
    \or:   \@@_class_repeat:nN  {#3}      #5
    \else: \@@_class_repeat:nnN {#3} {#4} #5
    \fi:
  }
\cs_new:Npn \@@_tests_action_cost:n { \@@_action_cost:n }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_class_repeat:n}
%   This is used for a fixed number of repetitions. Build one state for
%   each repetition, with a transition from the previous state to the
%   new one, controlled by the tests that we
%   have collected in \cs{@@_tests_action_cost:n}. That works just fine
%   for |#1|${}=0$ repetitions: nothing is built.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_class_repeat:n #1
  {
    \prg_replicate:nn {#1}
      {
        \@@_build_new_state:
        \@@_build_transition_right:nNn \@@_tests_action_cost:n
          \l_@@_left_state_int \l_@@_right_state_int
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_class_repeat:nN}
%   Unbounded repetitions of a single class (\emph{e.g.} the |*| and
%   |+| quantifiers), at least |#1| times, with lazyness |#2|.  When
%   the minimum |#1| is $0$, the current state gets a transition to
%   itself governed by the tests, and a free transition to a new state
%   (hence skipping the tests).  Otherwise, \cs{@@_class_repeat:n}
%   builds the states matching |#1| repetitions, then the last state
%   gets two free transitions: one back to the previous state, one to
%   a new state.  In both cases the order of the two transitions is
%   left to \cs{@@_build_transitions_lazyness:NNNNN}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_class_repeat:nN #1#2
  {
    \int_compare:nNnTF {#1} = { 0 }
      {
        \@@_build_transitions_lazyness:NNNNN #2
          \@@_action_free:n       \l_@@_right_state_int
          \@@_tests_action_cost:n \l_@@_left_state_int
      }
      {
        \@@_class_repeat:n {#1}
        \int_set_eq:NN \l_@@_internal_a_int \l_@@_left_state_int
        \@@_build_transitions_lazyness:NNNNN #2
          \@@_action_free:n \l_@@_right_state_int
          \@@_action_free:n \l_@@_internal_a_int
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_class_repeat:nnN}
%   We want to build the code to match from |#1| to $|#1|+|#2|$
%   repetitions, with lazyness |#3|. Match |#1| repetitions (can be
%   $0$). Compute the final
%   state of the next construction as \texttt{a}. Build $|#2|>0$ states,
%   each with a transition to the next state governed by the tests, and
%   a free transition to the final state \texttt{a}. The computation of
%   \texttt{a} is safe because states are allocated in order, starting
%   from \texttt{max_state}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_class_repeat:nnN #1#2#3
  {
    \@@_class_repeat:n {#1}
    \int_set:Nn \l_@@_internal_a_int
      { \l_@@_max_state_int + #2 - 1 }
    \prg_replicate:nn { #2 }
      {
        \@@_build_transitions_lazyness:NNNNN #3
          \@@_action_free:n       \l_@@_internal_a_int
          \@@_tests_action_cost:n \l_@@_right_state_int
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Building groups}
%
% \begin{macro}{\@@_group_aux:nnnnN}
%   Arguments: \Arg{label} \Arg{contents} \Arg{min} \Arg{more}
%   \meta{lazyness}. If \meta{min} is $0$, we need to add a state before
%   building the group, so that the thread which skips the group does
%   not also set the start-point of the submatch. After adding one more
%   state, the \texttt{left_state} is the left end of the group, from
%   which all branches stem, and the \texttt{right_state} is the
%   right end of the group, and all branches end their course in that
%   state. We store those two integers to be queried for each branch, we
%   build the \textsc{nfa} states for the contents |#2| of the group,
%   and we forget about the two integers. Once this is done, perform the
%   repetition: either exactly |#3| times (\meta{more} is $0$), or |#3|
%   or more times (\meta{more} is $-1$), or
%   between |#3| and $|#3|+|#4|$ times, with lazyness |#5|. The
%   \meta{label} |#1| is used for submatch tracking. Each of the three
%   auxiliaries expects \texttt{left_state} and \texttt{right_state} to
%   be set properly.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_aux:nnnnN #1#2#3#4#5
  {
      \if_int_compare:w #3 = 0 \exp_stop_f:
        \@@_build_new_state:
%<assert>\assert_int:n { \l_@@_max_state_int = \l_@@_right_state_int + 1 }
        \@@_build_transition_right:nNn \@@_action_free_group:n
          \l_@@_left_state_int \l_@@_right_state_int
      \fi:
      \@@_build_new_state:
      \@@_push_lr_states:
      #2
      \@@_pop_lr_states:
      \if_case:w - #4 \exp_stop_f:
             \@@_group_repeat:nn   {#1} {#3}
      \or:   \@@_group_repeat:nnN  {#1} {#3}      #5
      \else: \@@_group_repeat:nnnN {#1} {#3} {#4} #5
      \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group:nnnN, \@@_group_no_capture:nnnN}
%   Both functions hand to \cs{@@_group_aux:nnnnN} a label and the
%   group itself; the remaining arguments are taken by the auxiliary
%   directly.  For a capturing group the label is the current value of
%   \cs{l_@@_capturing_group_int}, and that counter is incremented
%   before the contents of the group are built.  For a non-capturing
%   group the label is $-1$.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group:nnnN #1
  {
    \exp_args:NV \@@_group_aux:nnnnN \l_@@_capturing_group_int
      {
        \int_incr:N \l_@@_capturing_group_int
        #1
      }
  }
\cs_new_protected:Npn \@@_group_no_capture:nnnN
  { \@@_group_aux:nnnnN { -1 } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_resetting:nnnN}
% \begin{macro}{\@@_group_resetting_loop:nnNn}
%   Again, hand the label $-1$ to \cs{@@_group_aux:nnnnN}, but this
%   time we work a little bit harder to keep track of the maximum group
%   label at the end of any branch, and to reset the group number at
%   each branch. This relies on the fact that a compiled regex always is
%   a sequence of items of the form \cs{@@_branch:n} \Arg{branch}.
%
%   The loop receives the maximum label so far as |#1|, the label at
%   the start of the group as |#2|, and one item of the regex as |#3|
%   and |#4|.  For a normal item, |#3| is the single token
%   \cs{@@_branch:n}, so \cs{use_none:nn} discards it together with the
%   braced assignment which follows; the group number is reset to |#2|,
%   the branch is built, and the loop continues with an updated
%   maximum.  The end marker has two question marks in front of
%   \cs{prg_break:n}: \cs{use_none:nn} then only removes the question
%   marks, and \cs{prg_break:n} ends the loop after setting the group
%   number to the maximum |#1|.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_resetting:nnnN #1
  {
    \@@_group_aux:nnnnN { -1 }
      {
        \exp_args:Noo \@@_group_resetting_loop:nnNn
          { \int_use:N \l_@@_capturing_group_int }
          { \int_use:N \l_@@_capturing_group_int }
          #1
          { ?? \prg_break:n } { }
        \prg_break_point:
      }
  }
\cs_new_protected:Npn \@@_group_resetting_loop:nnNn #1#2#3#4
  {
    \use_none:nn #3 { \int_set:Nn \l_@@_capturing_group_int {#1} }
    \int_set:Nn \l_@@_capturing_group_int {#2}
    #3 {#4}
    \exp_args:Nf \@@_group_resetting_loop:nnNn
      { \int_max:nn {#1} { \l_@@_capturing_group_int } }
      {#2}
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_branch:n}
%   Add a free transition from the left state of the current group to a
%   brand new state, starting point of this branch. Once the branch |#1|
%   is built, add a free transition from its last state to the right
%   state of the group. The left and right states of the group are
%   extracted from the relevant sequences, without removing them since
%   later branches of the same group need them too.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_branch:n #1
  {
    \@@_build_new_state:
    \seq_get:NN \l_@@_left_state_seq \l_@@_internal_a_tl
    \int_set:Nn \l_@@_left_state_int \l_@@_internal_a_tl
    \@@_build_transition_right:nNn \@@_action_free:n
      \l_@@_left_state_int \l_@@_right_state_int
    #1
    \seq_get:NN \l_@@_right_state_seq \l_@@_internal_a_tl
    \@@_build_transition_right:nNn \@@_action_free:n
      \l_@@_right_state_int \l_@@_internal_a_tl
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_repeat:nn}
%   Repeat the group labelled |#1| a fixed number of times |#2|.  If
%   that number is $0$ the group is removed altogether, by moving
%   \texttt{max_state} back to just before the left state of the group
%   (the \texttt{capturing_group} label is not reset).  Otherwise,
%   \cs{@@_group_repeat_aux:n} copies |#2| times the \tn{toks} for
%   the group, and leaves \texttt{internal_a} pointing to the left end
%   of the last repetition; submatch information is only recorded at
%   that last repetition.  In both cases a state is added at the end
%   (when the group is kept, the transition to it has been taken care
%   of by the replicating auxiliary).
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_repeat:nn #1#2
  {
    \int_compare:nNnTF {#2} = { 0 }
      {
        \int_set:Nn \l_@@_max_state_int
          { \l_@@_left_state_int - 1 }
      }
      {
        \@@_group_repeat_aux:n {#2}
        \@@_group_submatches:nNN {#1}
          \l_@@_internal_a_int \l_@@_right_state_int
      }
    \@@_build_new_state:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_submatches:nNN}
%   Unless the label |#1| is $-1$ (or less), which inhibits submatch
%   tracking, insert at the start of state |#2| the code recording the
%   beginning (|<|) of the submatch for group |#1|, and at the start of
%   state |#3| the code recording its end (|>|).
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_submatches:nNN #1#2#3
  {
    \int_compare:nNnT {#1} > { -1 }
      {
        \@@_toks_put_left:Nx #2 { \@@_action_submatch:nN {#1} < }
        \@@_toks_put_left:Nx #3 { \@@_action_submatch:nN {#1} > }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_repeat_aux:n}
%   Here we repeat \tn{toks} ranging from \texttt{left_state} to
%   \texttt{max_state}, $|#1|>0$ times. First add a transition so that
%   the copies \enquote{chain} properly: a free transition from the
%   \texttt{right_state} to \texttt{max_state}. Record the left end of
%   the group as \texttt{a} and the current \texttt{max_state} as
%   \texttt{b}. If more than one copy is wanted, compute the shift
%   \texttt{c} between the original copy and the last copy we
%   want, namely $|#1|-1$ times the size $\texttt{b}-\texttt{a}$ of the
%   group. Shift the \texttt{right_state} and \texttt{max_state} to their
%   final values. We then want to perform \texttt{c} copy operations,
%   which is done by \cs{@@_toks_memcpy:NNn}. At
%   the end, \texttt{b} is equal to the \texttt{max_state}, and
%   \texttt{a} points to the left of the last copy of the group.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_repeat_aux:n #1
  {
    \@@_build_transition_right:nNn \@@_action_free:n
      \l_@@_right_state_int \l_@@_max_state_int
    \int_set_eq:NN \l_@@_internal_a_int \l_@@_left_state_int
    \int_set_eq:NN \l_@@_internal_b_int \l_@@_max_state_int
    \if_int_compare:w \int_eval:n {#1} > 1 \exp_stop_f:
      \int_set:Nn \l_@@_internal_c_int
        {
          ( #1 - 1 )
          * ( \l_@@_internal_b_int - \l_@@_internal_a_int )
        }
      \int_add:Nn \l_@@_right_state_int { \l_@@_internal_c_int }
      \int_add:Nn \l_@@_max_state_int   { \l_@@_internal_c_int }
      \@@_toks_memcpy:NNn
        \l_@@_internal_b_int
        \l_@@_internal_a_int
        \l_@@_internal_c_int
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_repeat:nnN}
%   Repeat the group labelled |#1| at least $n=|#2|$ times, with
%   lazyness |#3|; the cases $n=0$ and $n>0$ are very different.
%
%   For $n=0$, insert submatch tracking information at the start and
%   end of the group, then add a free transition from the right end to
%   the \enquote{true} left state \texttt{a} (remember: in this case an
%   extra state had been added before the left state).  This forms the
%   loop.  To break away from it, create a new state and add a free
%   transition from \texttt{a} to it, placed before the existing
%   transitions of \texttt{a} for a lazy quantifier, after them for a
%   greedy one.
%
%   For $n>0$, repeat the group $n$ times, chaining the various copies
%   with a free transition, and add submatch tracking to the last copy
%   only.  Then add a free transition from the right end back to the
%   left end of the last copy, either after (lazy) or before (greedy)
%   the transition which moves on towards the rest of the
%   \textsc{nfa}.  This transition can end up before submatch tracking,
%   but that is irrelevant since it only does so when going again
%   through the group, recording new matches.  Finally, add a state; we
%   already have a transition pointing to it from
%   \cs{@@_group_repeat_aux:n}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_repeat:nnN #1#2#3
  {
    \int_compare:nNnTF {#2} = { 0 }
      {
        \@@_group_submatches:nNN {#1}
          \l_@@_left_state_int \l_@@_right_state_int
        \int_set:Nn \l_@@_internal_a_int
          { \l_@@_left_state_int - 1 }
        \@@_build_transition_right:nNn \@@_action_free:n
          \l_@@_right_state_int \l_@@_internal_a_int
        \@@_build_new_state:
        \bool_if:NTF #3
          {
            \@@_build_transition_left:NNN \@@_action_free:n
              \l_@@_internal_a_int \l_@@_right_state_int
          }
          {
            \@@_build_transition_right:nNn \@@_action_free:n
              \l_@@_internal_a_int \l_@@_right_state_int
          }
      }
      {
        \@@_group_repeat_aux:n {#2}
        \@@_group_submatches:nNN {#1}
          \l_@@_internal_a_int \l_@@_right_state_int
        \bool_if:NTF #3
          {
            \@@_build_transition_right:nNn \@@_action_free_group:n
              \l_@@_right_state_int \l_@@_internal_a_int
          }
          {
            \@@_build_transition_left:NNN \@@_action_free_group:n
              \l_@@_right_state_int \l_@@_internal_a_int
          }
        \@@_build_new_state:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_repeat:nnnN}
%   We wish to repeat the group between |#2| and $|#2|+|#3|$ times, with
%   a laziness controlled by |#4|. We insert submatch tracking up front:
%   in principle, we could avoid recording submatches for the first |#2|
%   copies of the group, but that forces us to treat specially the case
%   $|#2|=0$. Repeat that group with submatch tracking $|#2|+|#3|$ times
%   (the maximum number of repetitions). Then our goal is to add |#3|
%   transitions from the end of the |#2|-th group, and each subsequent
%   group, to the end. For a lazy quantifier, we add those transitions
%   to the left states, before submatch tracking. For the greedy case,
%   we add the transitions to the right states, after submatch tracking
%   and the transitions which go on with more repetitions. In the greedy
%   case with $|#2|=0$, the transition which skips over all copies of
%   the group must be added separately, because its starting state does
%   not follow the normal pattern: we had to add it \enquote{by hand}
%   earlier.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_repeat:nnnN #1#2#3#4
  {
    % #1 is forwarded to \@@_group_submatches:nNN; the group is repeated
    % between #2 and #2 + #3 times; #4 is true for the branch that inserts
    % exit transitions on the left.
    % Submatch tracking is added before copying.
    \@@_group_submatches:nNN {#1}
      \l_@@_left_state_int \l_@@_right_state_int
    % Build the maximum number of copies.
    \@@_group_repeat_aux:n { #2 + #3 }
    \if_meaning:w \c_true_bool #4
      % Starting from max_state, step back #3 times by
      % internal_b - internal_a (presumably the length of one copy, as
      % set by \@@_group_repeat_aux:n) and add at each such state a free
      % transition to max_state, inserted on the left.
      \int_set_eq:NN \l_@@_left_state_int \l_@@_max_state_int
      \prg_replicate:nn { #3 }
        {
          \int_sub:Nn \l_@@_left_state_int
            { \l_@@_internal_b_int - \l_@@_internal_a_int }
          \@@_build_transition_left:NNN \@@_action_free:n
            \l_@@_left_state_int \l_@@_max_state_int
        }
    \else:
      % Same walk from the right state for the first #3 - 1 exits,
      % inserted on the right.
      \prg_replicate:nn { #3 - 1 }
        {
          \int_sub:Nn \l_@@_right_state_int
            { \l_@@_internal_b_int - \l_@@_internal_a_int }
          \@@_build_transition_right:nNn \@@_action_free:n
            \l_@@_right_state_int \l_@@_max_state_int
        }
      % The last exit starts from the state before the left state when
      % #2 = 0 (skipping every copy), and from one more step back
      % otherwise.
      \if_int_compare:w #2 = 0 \exp_stop_f:
        \int_set:Nn \l_@@_right_state_int
          { \l_@@_left_state_int - 1 }
      \else:
        \int_sub:Nn \l_@@_right_state_int
          { \l_@@_internal_b_int - \l_@@_internal_a_int }
      \fi:
      \@@_build_transition_right:nNn \@@_action_free:n
        \l_@@_right_state_int \l_@@_max_state_int
    \fi:
    \@@_build_new_state:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Others}
%
% \begin{macro}{\@@_assertion:Nn, \@@_b_test:, \@@_A_test:, \@@_G_test:, \@@_Z_test:}
%   Usage: \cs{@@_assertion:Nn} \meta{boolean} \Arg{test}, where the
%   \meta{test} is one of the other functions listed here. Add a free
%   transition to a new state, conditionally to the assertion test. The
%   \cs{@@_b_test:} test is used by the |\b| and |\B| escape: check
%   if the last character was a word character or not, and do the same
%   to the current character. The boundary-markers of the string are
%   non-word characters for this purpose.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_assertion:Nn #1#2
  {
    % #1: boolean fixing which outcome of the test lets the thread go on;
    % #2: the test code, stored unexpanded in the left state.
    \@@_build_new_state:
    \@@_toks_put_right:Nx \l_@@_left_state_int
      {
        \exp_not:n {#2}
        % The two \bool_if tests are expanded now, while the state is
        % being built: exactly one of them leaves an empty brace group,
        % so the stored code is \@@_break_point:TF {<free>} { } when #1
        % is true and \@@_break_point:TF { } {<free>} when #1 is false.
        \@@_break_point:TF
          \bool_if:NF #1 { { } }
          {
            % Free transition to the right state, stored as a shift
            % computed at build time.
            \@@_action_free:n
              {
                \int_eval:n
                  { \l_@@_right_state_int - \l_@@_left_state_int }
              }
          }
          \bool_if:NT #1 { { } }
      }
  }
\cs_new_protected:Npn \@@_b_test:
  {
    % Run \@@_prop_w: on the previous character first: curr_char is
    % temporarily replaced by last_char inside a group, and both branches
    % close that group before testing the real current character.
    \group_begin:
      \int_set_eq:NN \l_@@_curr_char_int \l_@@_last_char_int
      \@@_prop_w:
      \@@_break_point:TF
        % True branch: the test on the current character is wrapped in
        % \@@_item_reverse:n (presumably inverting it; confirm there).
        { \group_end: \@@_item_reverse:n \@@_prop_w: }
        { \group_end: \@@_prop_w: }
  }
\cs_new_protected:Npn \@@_Z_test:
  {
    % Succeed when the current character code is -2, the value passed
    % for the extra step performed after the last token of the query.
    \if_int_compare:w \l_@@_curr_char_int = -2 \exp_stop_f:
      \exp_after:wN \@@_break_true:w
    \fi:
  }
\cs_new_protected:Npn \@@_A_test:
  {
    % Succeed when the previous character code is the marker value -2.
    \if_int_compare:w \l_@@_last_char_int = -2 \exp_stop_f:
      \exp_after:wN \@@_break_true:w
    \fi:
  }
\cs_new_protected:Npn \@@_G_test:
  {
    % Succeed when we are at the position where this match attempt
    % started.
    \if_int_compare:w \l_@@_start_pos_int = \l_@@_curr_pos_int
      \exp_after:wN \@@_break_true:w
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_command_K:}
%   Change the starting point of the $0$-th submatch (full match), and
%   transition to a new state, pretending that this is a fresh thread.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_command_K:
  {
    \@@_build_new_state:
    \@@_toks_put_right:Nx \l_@@_left_state_int
      {
        % Record the current position as the start (<) of submatch 0.
        \@@_action_submatch:nN { 0 } <
        % Follow the free transition to the right state with the
        % fresh_thread flag set, then clear the flag again.
        \bool_set_true:N \l_@@_fresh_thread_bool
        \@@_action_free:n
          {
            \int_eval:n
              { \l_@@_right_state_int - \l_@@_left_state_int }
          }
        \bool_set_false:N \l_@@_fresh_thread_bool
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Matching}
%
% We search for matches by running all the execution threads through the
% \textsc{nfa} in parallel, reading one token of the query at each step.
% The \textsc{nfa} contains \enquote{free} transitions to other states,
% and transitions which \enquote{consume} the current token.  For free
% transitions, the instruction at the new state of the \textsc{nfa} is
% performed immediately.  When a transition consumes a character, the
% new state is appended to a list of \enquote{active states}, stored in
% \cs{g_@@_thread_info_intarray} (together with submatch information):
% this thread is made active again when the next
% token is read from the query.  At every step (for each token in the
% query), we unpack that list of active states and the corresponding
% submatch props, and empty those.
%
% If two paths through the \textsc{nfa} \enquote{collide} in the sense
% that they reach the same state after reading a given token, then they
% only differ in how they previously matched, and any future execution
% would be identical for both. (Note that this would be wrong in the
% presence of back-references.) Hence, we only need to keep one of the
% two threads: the thread with the highest priority. Our \textsc{nfa} is
% built in such a way that higher priority actions always come before
% lower priority actions, which makes things work.
%
% The explanation in the previous paragraph may make us think that we
% simply need to keep track of which states were visited at a given
% step: after all, the loop generated when matching |(a?)*| against |a|
% is broken, isn't it? No. The group first matches |a|, as it should,
% then repeats; it attempts to match |a| again but fails; it skips |a|,
% and finds out that this state has already been seen at this position
% in the query: the match stops. The capturing group is (wrongly) |a|.
% What went wrong is that a thread collided with itself, and the later
%   version, which has gone through the group one more time with an empty
% match, should have a higher priority than not going through the group.
%
% We solve this by distinguishing \enquote{normal} free transitions
% \cs{@@_action_free:n} from transitions
% \cs{@@_action_free_group:n} which go back to the start of the
% group. The former keeps threads unless they have been visited by a
% \enquote{completed} thread, while the latter kind of transition also
% prevents going back to a state visited by the current thread.
%
% \subsubsection{Variables used when matching}
%
% \begin{variable}
%   {
%     \l_@@_min_pos_int,
%     \l_@@_max_pos_int,
%     \l_@@_curr_pos_int,
%     \l_@@_start_pos_int,
%     \l_@@_success_pos_int,
%   }
%   The tokens in the query are indexed from \texttt{min_pos} for the
%   first to $\texttt{max_pos}-1$ for the last, and their information is
%   stored in several arrays and \tn{toks} registers with those numbers.
%   We match
%   without backtracking, keeping all threads in lockstep at the
%   \texttt{curr_pos} in the query. The starting point of the current
%   match attempt is \texttt{start_pos}, and \texttt{success_pos},
%   updated whenever a thread succeeds, is used as the next starting
%   position.
%    \begin{macrocode}
\int_new:N \l_@@_min_pos_int % index of the first token of the query
\int_new:N \l_@@_max_pos_int % one more than the index of the last token
\int_new:N \l_@@_curr_pos_int % position all threads are currently at
\int_new:N \l_@@_start_pos_int % where the current match attempt started
\int_new:N \l_@@_success_pos_int % updated on success; next start position
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}
%   {
%     \l_@@_curr_char_int,
%     \l_@@_curr_catcode_int,
%     \l_@@_curr_token_tl,
%     \l_@@_last_char_int,
%     \l_@@_last_char_success_int,
%     \l_@@_case_changed_char_int
%   }
%   The character and category codes of the token at the current
%   position and a token list expanding to that token; the character
%   code of the token at the previous position;
%   the character code of the token just before a successful match;
%   and the character code of the result of changing the case of the
%   current token (|A-Z|$\leftrightarrow$|a-z|). This last integer is
%   only computed when necessary, and is otherwise \cs{c_max_int}.  The
%   \texttt{curr_char} variable is also used in various other phases
%   to hold a character code.
%    \begin{macrocode}
\int_new:N \l_@@_curr_char_int % character code of the current token
\int_new:N \l_@@_curr_catcode_int % category code of the current token
\tl_new:N \l_@@_curr_token_tl % token list expanding to the current token
\int_new:N \l_@@_last_char_int % character code at the previous position
\int_new:N \l_@@_last_char_success_int % character code just before a match
\int_new:N \l_@@_case_changed_char_int % \c_max_int until actually computed
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_curr_state_int}
%   For every character in the token list, each of the active states is
%   considered in turn.  The variable \cs{l_@@_curr_state_int}
%   holds the state of the \textsc{nfa} which is currently considered:
%   transitions are then given as shifts relative to the current state.
%    \begin{macrocode}
\int_new:N \l_@@_curr_state_int % NFA state currently being considered
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}
%   {\l_@@_curr_submatches_tl, \l_@@_success_submatches_tl}
%   The submatches for the thread which is currently active are stored
%   in the \texttt{curr_submatches} list, which is almost a comma list,
%   but ends with a comma. This list is stored by \cs{@@_store_state:n}
%   into an intarray variable, to be retrieved when matching at the next
%   position. When a thread succeeds, this list is copied to
%   \cs{l_@@_success_submatches_tl}: only the last successful thread
%   remains there.
%    \begin{macrocode}
\tl_new:N \l_@@_curr_submatches_tl % comma-terminated list, active thread
\tl_new:N \l_@@_success_submatches_tl % copy from the last successful thread
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_step_int}
%   This integer, always even, is increased every time a character in
%   the query is read, and not reset when doing multiple matches.  We
%   store in \cs{g_@@_state_active_intarray} the last step in which each
%   \meta{state} in the \textsc{nfa} was encountered. This lets us break
%   infinite loops by not visiting the same state twice in the same
%   step. In fact, the step we store is equal to \texttt{step} when we
%   have started performing the operations of \tn{toks}\meta{state}, but
%   not finished yet. However, once we finish, we store
%   $\text{\texttt{step}}+1$ in \cs{g_@@_state_active_intarray}.  This is
%   needed to track submatches
%   properly (see building phase). The \texttt{step} is also used to
%   attach each set of submatch information to a given iteration (and
%   automatically discard it when it corresponds to a past step).
%    \begin{macrocode}
\int_new:N \l_@@_step_int % increased by 2 for every token read
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_min_thread_int, \l_@@_max_thread_int}
%   All the currently active threads are kept in order of precedence in
%   \cs{g_@@_thread_info_intarray} together with the corresponding
%   submatch information.  Data in this intarray is organized as blocks
%   from \texttt{min_thread} (included) to \texttt{max_thread}
%   (excluded).  At the start of every step, the whole array is
%   unpacked, so that the space can immediately be reused, and
%   \texttt{max_thread} is reset to \texttt{min_thread}, effectively
%   clearing the array.
%    \begin{macrocode}
\int_new:N \l_@@_min_thread_int % first stored thread (included)
\int_new:N \l_@@_max_thread_int % end of the stored threads (excluded)
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_state_active_intarray, \g_@@_thread_info_intarray}
%   \cs{g_@@_state_active_intarray} stores the last \meta{step} in which
%   each \meta{state} was active.  \cs{g_@@_thread_info_intarray} stores
%   threads to be considered in the next step, more precisely the
%   states in which these threads are.
%    \begin{macrocode}
\intarray_new:Nn \g_@@_state_active_intarray { 65536 } % last step per state
\intarray_new:Nn \g_@@_thread_info_intarray { 65536 } % threads for next step
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_matched_analysis_tl, \l_@@_curr_analysis_tl}
%   The list \cs{l_@@_curr_analysis_tl} consists of a brace group
%   containing three brace groups corresponding to the current token,
%   with the same syntax as \cs{tl_analysis_map_inline:nn}.  The list
%   \cs{l_@@_matched_analysis_tl} (constructed under the
%   \texttt{tl\_build} machinery) has one item for each token that has
%   already been treated so far in a given match attempt: each item
%   consists of three brace groups with the same syntax as
%   \cs{tl_analysis_map_inline:nn}.
%    \begin{macrocode}
\tl_new:N \l_@@_matched_analysis_tl % tl_build list: tokens already treated
\tl_new:N \l_@@_curr_analysis_tl % one brace group for the current token
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_every_match_tl}
%   Every time a match is found, this token list is used.  For single
%   matching, the token list is empty. For multiple matching, the token
%   list is set to repeat the matching, after performing some operation
%   which depends on the user function. See \cs{@@_single_match:} and
%   \cs{@@_multi_match:n}.
%    \begin{macrocode}
\tl_new:N \l_@@_every_match_tl % run by \@@_match_one_token:nnN after a step
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_fresh_thread_bool, \l_@@_empty_success_bool}
% \begin{macro}{\@@_if_two_empty_matches:F}
%   When doing multiple matches, we need to avoid infinite loops where
%   each iteration matches the same empty token list. When an empty
%   token list is matched, the next successful match of the same empty
%   token list is suppressed. We detect empty matches by setting
%   \cs{l_@@_fresh_thread_bool} to \texttt{true} for threads which
%   directly come from the start of the regex or from the |\K| command,
%   and testing that boolean whenever a thread succeeds. The function
%   \cs{@@_if_two_empty_matches:F} is redefined at every match
%   attempt, depending on whether the previous match was empty or not:
%   if it was, then the function must cancel a purported success if it
%   is empty and at the same spot as the previous match; otherwise, we
%   definitely don't have two identical empty matches, so the function
%   is \cs{use:n}.
%    \begin{macrocode}
\bool_new:N \l_@@_fresh_thread_bool % true while following a fresh thread
\bool_new:N \l_@@_empty_success_bool % whether the last success was empty
% Initial meaning only: \@@_match_once_init: redefines this function.
\cs_new_eq:NN \@@_if_two_empty_matches:F \use:n
%    \end{macrocode}
% \end{macro}
% \end{variable}
%
% \begin{variable}
%   {
%     \g_@@_success_bool,
%     \l_@@_saved_success_bool,
%     \l_@@_match_success_bool
%   }
%   The boolean \cs{l_@@_match_success_bool} is true if the current
%   match attempt was successful, and \cs{g_@@_success_bool} is true
%   if there was at least one successful match. This is the only global
%   variable in this whole module, but we would need it to be local when
%   matching a control sequence with |\c{...}|. This is done by saving
%   the global variable into \cs{l_@@_saved_success_bool}, which is
%   local, hence not affected by the changes due to inner regex
%   functions.
%    \begin{macrocode}
\bool_new:N \g_@@_success_bool % at least one match succeeded
\bool_new:N \l_@@_saved_success_bool % local copy of the global boolean
\bool_new:N \l_@@_match_success_bool % the current attempt succeeded
%    \end{macrocode}
% \end{variable}
%
% \subsubsection{Matching: framework}
%
% \begin{macro}{\@@_match:n, \@@_match_cs:n}
% \begin{macro}{\@@_match_init:}
%   Initialize the variables that should
%   be set once for each user function (even for multiple
%   matches). Namely, the overall matching is not yet successful; none of
%   the states should be marked as visited (\cs{g_@@_state_active_intarray}), and
%   we start at step $0$; we pretend that there was a previous match
%   ending at the start of the query, which was not empty (to avoid
%   smothering an empty match at the start). Once all this is set up, we
%   are ready for the ride. Find the first match.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_match:n #1
  {
    % #1: the token list to match against.
    \@@_match_init:
    \@@_match_once_init:
    % One matching step per token of #1.
    \tl_analysis_map_inline:nn {#1}
      { \@@_match_one_token:nnN {##1} {##2} ##3 }
    % One extra step with character code -2 once all tokens are read.
    \@@_match_one_token:nnN { } { -2 } F
    % \@@_maplike_break: jumps here to leave the loop early.
    \prg_break_point:Nn \@@_maplike_break: { }
  }
\cs_new_protected:Npn \@@_match_cs:n #1
  {
    % #1: the string to match against, mapped over character by
    % character.
    % Threads stored from here on go after those already stored
    % (presumably to preserve the threads of an enclosing match; confirm
    % against the caller).
    \int_set_eq:NN \l_@@_min_thread_int \l_@@_max_thread_int
    \@@_match_init:
    \@@_match_once_init:
    \str_map_inline:nn {#1}
      {
        % The third argument is a hexadecimal category code: A (10) for
        % a blank item, C (12) otherwise.
        \tl_if_blank:nTF {##1}
          { \@@_match_one_token:nnN {##1} {`##1} A }
          { \@@_match_one_token:nnN {##1} {`##1} C }
      }
    % One extra step with character code -2 once all characters are read.
    \@@_match_one_token:nnN { } { -2 } F
    \prg_break_point:Nn \@@_maplike_break: { }
  }
\cs_new_protected:Npn \@@_match_init:
  {
    % Nothing has matched yet, and there is no previous empty match.
    \bool_gset_false:N \g_@@_success_bool
    \bool_set_false:N \l_@@_empty_success_bool
    % Positions start at 2; pretend that a previous match ended there,
    % preceded by the marker character code -2.
    \int_set:Nn \l_@@_min_pos_int { 2 }
    \int_set_eq:NN \l_@@_success_pos_int \l_@@_min_pos_int
    \int_set:Nn \l_@@_last_char_success_int { -2 }
    % Submatch storage starts at 1.
    \int_set:Nn \l_@@_min_submatch_int { 1 }
    \int_set_eq:NN \l_@@_submatch_int \l_@@_min_submatch_int
    % No token has been analysed yet.
    \tl_build_begin:N \l_@@_matched_analysis_tl
    \tl_clear:N \l_@@_curr_analysis_tl
    % Start at step 0 and mark every state as last seen at step 1: the
    % first real step is 2, so no state counts as visited.
    \int_set:Nn \l_@@_step_int { 0 }
    \int_step_inline:nnn
      \l_@@_min_state_int { \l_@@_max_state_int - 1 }
      {
        \__kernel_intarray_gset:Nnn
          \g_@@_state_active_intarray {##1} { 1 }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_match_once_init:}
%   This function resets various variables used when finding one match.
%   It is called before the loop through characters, and every time we
%   find a match, before searching for another match (this is controlled
%   by the \texttt{every_match} token list).
%
%   First initialize some variables: set the
%   conditional which detects identical empty matches; this match
%   attempt starts at the previous \texttt{success_pos}, is not yet
%   successful, and has no submatches yet; clear the array of active
%   threads, and put the starting state $0$ in it. We are then almost
%   ready to read our first token in the query, but we actually start
%   one position earlier than the start because
%   \cs{@@_match_one_token:nnN} increments \cs{l_@@_curr_pos_int} and
%   saves \cs{l_@@_curr_char_int} as the \texttt{last_char} so that word
%   boundaries can be correctly identified.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_match_once_init:
  {
    % If the previous match was empty, a success is only accepted when
    % the current position differs from the start position; otherwise
    % every success is accepted.
    \if_meaning:w \c_true_bool \l_@@_empty_success_bool
      \cs_set:Npn \@@_if_two_empty_matches:F
        {
          \int_compare:nNnF
            \l_@@_start_pos_int = \l_@@_curr_pos_int
        }
    \else:
      \cs_set_eq:NN \@@_if_two_empty_matches:F \use:n
    \fi:
    % This attempt starts where the last success ended.
    \int_set_eq:NN \l_@@_start_pos_int \l_@@_success_pos_int
    \bool_set_false:N \l_@@_match_success_bool
    % All submatch entries start out as 0.
    \tl_set:Nx \l_@@_curr_submatches_tl
      { \prg_replicate:nn { 2 * \l_@@_capturing_group_int } { 0 , } }
    % Empty the list of stored threads, then store the initial state.
    \int_set_eq:NN \l_@@_max_thread_int \l_@@_min_thread_int
    \@@_store_state:n { \l_@@_min_state_int }
    % Start one position early: \@@_match_one_token:nnN increments the
    % position and copies curr_char into last_char.
    \int_set:Nn \l_@@_curr_pos_int
      { \l_@@_start_pos_int - 1 }
    \int_set_eq:NN \l_@@_curr_char_int \l_@@_last_char_success_int
    \tl_build_get:NN \l_@@_matched_analysis_tl \l_@@_internal_a_tl
    % The f-expansion inserts the contents of both token lists into the
    % argument of \tl_map_inline:nn before \@@_match_once_init_aux:
    % clears them; the mapping then replays those tokens one by one.
    \exp_args:NNf \@@_match_once_init_aux:
    \tl_map_inline:nn
      { \exp_after:wN \l_@@_internal_a_tl \l_@@_curr_analysis_tl }
      { \@@_match_one_token:nnN ##1 }
    \prg_break_point:Nn \@@_maplike_break: { }
  }
\cs_new_protected:Npn \@@_match_once_init_aux:
  {
    % Forget the tokens analysed so far; \@@_match_once_init: calls this
    % after it has copied them into the mapping that replays them.
    \tl_build_clear:N \l_@@_matched_analysis_tl
    \tl_clear:N \l_@@_curr_analysis_tl
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_single_match:, \@@_multi_match:n}
%   For a single match, the overall success is determined by whether the
%   only match attempt is a success. When doing multiple matches, the
%   overall matching is successful as soon as any match
%   succeeds. Perform the action |#1|, then find the next match.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_single_match:
  {
    % Set up the code used once the match attempt is over: the overall
    % result is that of this single attempt, then stop the loop over the
    % query.
    \tl_set:Nn \l_@@_every_match_tl
      {
        \bool_gset_eq:NN
          \g_@@_success_bool
          \l_@@_match_success_bool
        \@@_maplike_break:
      }
  }
\cs_new_protected:Npn \@@_multi_match:n #1
  {
    % #1: code performed for every match that is found.
    \tl_set:Nn \l_@@_every_match_tl
      {
        % A failed attempt ends the loop over the query.
        \if_meaning:w \c_false_bool \l_@@_match_success_bool
          \exp_after:wN \@@_maplike_break:
        \fi:
        % Otherwise record the overall success, run the user action, and
        % start looking for the next match.
        \bool_gset_true:N \g_@@_success_bool
        #1
        \@@_match_once_init:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_match_one_token:nnN}
% \begin{macro}[rEXP]{\@@_match_one_active:n}
%   At each new position, set some variables and get the new character
%   and category from the query. Then unpack the array of active
%   threads, and clear it by resetting its length
%   (\texttt{max_thread}). This results in a sequence of
%   \cs{@@_use_state_and_submatches:w} \meta{state}|,|\meta{submatch-clist}|;| and
%   we consider those states one by one in order. As soon as a thread
%   succeeds, exit the step, and, if there are threads to consider at the
%   next position, and we have not reached the end of the string,
%   repeat the loop. Otherwise, the last thread that succeeded is the
%   match.  We explain the \texttt{fresh_thread} business when
%   describing \cs{@@_action_start_wildcard:N}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_match_one_token:nnN #1#2#3
  {
    % #1: tokens representing the current token, #2: its character code,
    % #3: its category code as a hexadecimal digit.
    \int_add:Nn \l_@@_step_int { 2 }
    \int_incr:N \l_@@_curr_pos_int
    % Keep the previous character code before overwriting curr_char.
    \int_set_eq:NN \l_@@_last_char_int \l_@@_curr_char_int
    \int_set_eq:NN \l_@@_case_changed_char_int \c_max_int
    \tl_set:Nn \l_@@_curr_token_tl {#1}
    \int_set:Nn \l_@@_curr_char_int {#2}
    \int_set:Nn \l_@@_curr_catcode_int { "#3 }
    % Append the previous token's analysis to the list of treated
    % tokens, then remember the analysis of this token.
    \tl_build_put_right:Nx \l_@@_matched_analysis_tl
      { \exp_not:o \l_@@_curr_analysis_tl }
    \tl_set:Nn \l_@@_curr_analysis_tl { { {#1} {#2} #3 } }
    % Inside \use:x the step function is expanded with the old value of
    % max_thread, unpacking every stored thread, while the assignment is
    % left untouched by the expansion and only runs afterwards, emptying
    % the storage before the unpacked threads are processed.
    \use:x
      {
        \int_set_eq:NN \l_@@_max_thread_int \l_@@_min_thread_int
        \int_step_function:nnN
          { \l_@@_min_thread_int }
          { \l_@@_max_thread_int - 1 }
          \@@_match_one_active:n
      }
    % \@@_action_success: jumps here with \prg_break:.
    \prg_break_point:
    \bool_set_false:N \l_@@_fresh_thread_bool
    % If threads remain for the next position and this is not the final
    % step (character code -2), \use_none:n discards
    % \l_@@_every_match_tl; otherwise that token list is run.
    \if_int_compare:w \l_@@_max_thread_int > \l_@@_min_thread_int
      \if_int_compare:w -2 < \l_@@_curr_char_int
        \exp_after:wN \exp_after:wN \exp_after:wN \use_none:n
      \fi:
    \fi:
    \l_@@_every_match_tl
  }
\cs_new:Npn \@@_match_one_active:n #1
  {
    % #1: index of a stored thread.  Its block of
    % 2 * capturing_group + 1 entries is read back as a comma list and
    % handed, terminated by a semicolon, to
    % \@@_use_state_and_submatches:w.
    \@@_use_state_and_submatches:w
    \__kernel_intarray_range_to_clist:Nnn
      \g_@@_thread_info_intarray
      { 1 + #1 * (\l_@@_capturing_group_int * 2 + 1) }
      { (1 + #1) * (\l_@@_capturing_group_int * 2 + 1) }
    ;
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{Using states of the \textsc{nfa}}
%
% \begin{macro}{\@@_use_state:}
%   Use the current \textsc{nfa} instruction. The state is initially
%   marked as belonging to the current \texttt{step}: this allows normal
%   free transitions to repeat, but group-repeating transitions
%   won't. Once we are done exploring all the branches it spawned, the
%   state is marked as $\texttt{step}+1$: any thread hitting it at that
%   point will be terminated.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_use_state:
  {
    % Mark the state with the current step while its instructions run.
    \__kernel_intarray_gset:Nnn \g_@@_state_active_intarray
      { \l_@@_curr_state_int } { \l_@@_step_int }
    \@@_toks_use:w \l_@@_curr_state_int
    % Once they are done, mark it with step + 1.
    \__kernel_intarray_gset:Nnn \g_@@_state_active_intarray
      { \l_@@_curr_state_int }
      { \int_eval:n { \l_@@_step_int + 1 } }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_use_state_and_submatches:w}
%   This function is called as one item in the array of active threads
%   after that array has been unpacked for a new step. Update the
%   \texttt{curr_state} and \texttt{curr_submatches} and use the
%   state if it has not yet been encountered at this step.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_use_state_and_submatches:w #1 , #2 ;
  {
    % #1: the state, #2: the comma-separated submatch entries.
    \int_set:Nn \l_@@_curr_state_int {#1}
    % Only use the state if its mark is older than the current step.
    \if_int_compare:w
        \__kernel_intarray_item:Nn \g_@@_state_active_intarray
          { \l_@@_curr_state_int }
                      < \l_@@_step_int
      % The trailing comma restores the comma-terminated format.
      \tl_set:Nn \l_@@_curr_submatches_tl { #2 , }
      \exp_after:wN \@@_use_state:
    \fi:
    \scan_stop:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Actions when matching}
%
% \begin{macro}{\@@_action_start_wildcard:N}
%   For an unanchored match, state $0$ has a free transition to the next
%   and a costly one to itself, to repeat at the next position. To catch
%   repeated identical empty matches, we need to know if a successful
%   thread corresponds to an empty match. The instruction resetting
%   \cs{l_@@_fresh_thread_bool} may be skipped by a successful
%   thread, hence we had to add it to \cs{@@_match_one_token:nnN} too.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_action_start_wildcard:N #1
  {
    % #1: boolean; when true the state is also stored again for the next
    % position.
    % Follow the free transition to the next state with the fresh_thread
    % flag set, then clear the flag.
    \bool_set_true:N \l_@@_fresh_thread_bool
    \@@_action_free:n {1}
    \bool_set_false:N \l_@@_fresh_thread_bool
    % Costly transition with shift 0, that is, to this same state.
    \bool_if:NT #1 { \@@_action_cost:n {0} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_action_free:n, \@@_action_free_group:n}
% \begin{macro}{\@@_action_free_aux:nn}
%   These functions copy a thread after checking that the \textsc{nfa}
%   state has not already been used at this position. If not, store
%   submatches in the new state, and insert the instructions for that
%   state in the input stream.  Then restore the old value of
%   \cs{l_@@_curr_state_int} and of the current submatches.  The
%   two types of free transitions differ by how they test that the state
%   has not been encountered yet: the \texttt{group} version is
%   stricter, and will not use a state if it was used earlier in the
%   current thread, hence forcefully breaking the loop, while the
%   \enquote{normal} version will revisit a state even within the thread
%   itself.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_action_free:n
  { \@@_action_free_aux:nn { > \l_@@_step_int \else: } }
\cs_new_protected:Npn \@@_action_free_group:n
  { \@@_action_free_aux:nn { < \l_@@_step_int } }
\cs_new_protected:Npn \@@_action_free_aux:nn #1#2
  {
    % #1: the end of the integer comparison deciding whether the target
    % state is used.  The normal version uses it unless its mark is
    % greater than the current step; the group version only if its mark
    % is less than the current step.
    % #2: shift from the current state to the target state.
    \use:x
      {
        % Left as is by the expansion, so the shift is applied when the
        % result is executed.
        \int_add:Nn \l_@@_curr_state_int {#2}
        \exp_not:n
          {
            \if_int_compare:w
                \__kernel_intarray_item:Nn \g_@@_state_active_intarray
                  { \l_@@_curr_state_int }
                #1
              \exp_after:wN \@@_use_state:
            \fi:
          }
        % \int_use:N and \exp_not:o are expanded now, so these two
        % assignments restore the values held before the transition.
        \int_set:Nn \l_@@_curr_state_int
          { \int_use:N \l_@@_curr_state_int }
        \tl_set:Nn \exp_not:N \l_@@_curr_submatches_tl
          { \exp_not:o \l_@@_curr_submatches_tl }
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_action_cost:n}
%   A transition which consumes the current character and shifts the
%   state by |#1|.  The resulting state is stored in the appropriate array
%   for use at the next position, and we also store the current
%   submatches.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_action_cost:n #1
  {
    % #1: shift from the current state.  The target state is evaluated
    % to a number before being passed to \@@_store_state:n.
    \exp_args:Nx \@@_store_state:n
      { \int_eval:n { \l_@@_curr_state_int + #1 } }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_store_state:n}
% \begin{macro}{\@@_store_submatches:nn}
%   Put the given state and current submatch information in
%   \cs{g_@@_thread_info_intarray}, and increment the length of the
%   array.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_store_state:n #1
  {
    % #1: the state to store.  The current submatches are expanded once
    % and passed as the first argument, the state as the second.
    \exp_args:No \@@_store_submatches:nn
      \l_@@_curr_submatches_tl {#1}
    % One more stored thread.
    \int_incr:N \l_@@_max_thread_int
  }
\cs_new_protected:Npn \@@_store_submatches:nn #1#2
  {
    % #1: submatch entries, #2: state.  The state followed by the
    % submatch entries is written starting at the first entry of the
    % block with index max_thread, each block having
    % 2 * capturing_group + 1 entries.
    \__kernel_intarray_gset_range_from_clist:Nnn
      \g_@@_thread_info_intarray
      {
        \@@_int_eval:w
        1 + \l_@@_max_thread_int *
        (\l_@@_capturing_group_int * 2 + 1)
      }
      { #2 , #1 }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_disable_submatches:}
%   Some user functions don't require tracking submatches.
%   We get a performance improvement by simply defining the
%   relevant functions to remove their argument and do nothing
%   with it.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_disable_submatches:
  {
    % NOTE(review): this redefines \@@_store_submatches:n (one argument),
    % whereas \@@_store_state:n calls \@@_store_submatches:nn (two
    % arguments), so this line looks like it has no effect on what gets
    % stored -- confirm that no other code uses the one-argument name.
    \cs_set_protected:Npn \@@_store_submatches:n ##1 { }
    % Recording submatch boundaries becomes a no-op.
    \cs_set_protected:Npn \@@_action_submatch:nN ##1##2 { }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_action_submatch:nN, \@@_action_submatch_aux:w, \@@_action_submatch_auxii:w, \@@_action_submatch_auxiii:w, \@@_action_submatch_auxiv:w}
%   Update the current submatches with the information from the current
%   position. Maybe a bottleneck.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_action_submatch:nN #1#2
  {
    % #1: group number, #2: < or >.  The current submatches are expanded
    % and placed in front of a semicolon for the auxiliary to grab.
    \exp_after:wN \@@_action_submatch_aux:w
    \l_@@_curr_submatches_tl ; {#1} #2
  }
\cs_new_protected:Npn \@@_action_submatch_aux:w #1 ; #2#3
  {
    % #1: submatch entries, #2: group number, #3: < or >.
    \tl_set:Nx \l_@@_curr_submatches_tl
      {
        % The entry to replace has index #2, plus the number of
        % capturing groups when #3 is >: one copy of auxii per entry to
        % skip.
        \prg_replicate:nn
          { #2 \if_meaning:w > #3 + \l_@@_capturing_group_int \fi: }
          { \@@_action_submatch_auxii:w }
        \@@_action_submatch_auxiii:w
        #1
      }
  }
% Move one comma-separated entry from after auxiii to the front.
\cs_new:Npn \@@_action_submatch_auxii:w
    #1 \@@_action_submatch_auxiii:w #2 ,
  { #2 , #1 \@@_action_submatch_auxiii:w }
% Replace the next entry by the current position.
\cs_new:Npn \@@_action_submatch_auxiii:w #1 ,
  { \int_use:N \l_@@_curr_pos_int , }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_action_success:}
%   There is a successful match when an execution path reaches the last
%   state in the \textsc{nfa}, unless this marks a second identical
%   empty match. Then mark that there was a successful match; it is
%   empty if it is \enquote{fresh}; and we store the current position
%   and submatches. The current step is then interrupted with
%   \cs{prg_break:}, and only paths with higher precedence are
%   pursued further. The values stored here may be overwritten by a
%   later success of a path with higher precedence.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_action_success:
  {
    % The body is skipped when this would be a second identical empty
    % match.
    \@@_if_two_empty_matches:F
      {
        \bool_set_true:N \l_@@_match_success_bool
        % The match is empty exactly when the thread is still fresh.
        \bool_set_eq:NN \l_@@_empty_success_bool
          \l_@@_fresh_thread_bool
        % Remember where the match ends and which character precedes
        % that position.
        \int_set_eq:NN \l_@@_success_pos_int \l_@@_curr_pos_int
        \int_set_eq:NN \l_@@_last_char_success_int \l_@@_last_char_int
        \tl_build_clear:N \l_@@_matched_analysis_tl
        \tl_set_eq:NN \l_@@_success_submatches_tl
          \l_@@_curr_submatches_tl
        % Jump to the \prg_break_point: in \@@_match_one_token:nnN,
        % skipping the rest of this step.
        \prg_break:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Replacement}
%
% \subsubsection{Variables and helpers used in replacement}
%
% \begin{variable}{\l_@@_replacement_csnames_int}
%   The behaviour of closing braces inside a replacement text depends on
%   whether a sequence |\c{| or |\u{| has been encountered. The number
%   of \enquote{open} such sequences that should be closed by |}| is
%   stored in \cs{l_@@_replacement_csnames_int}, and decreased by
%   $1$ by each |}|.
%    \begin{macrocode}
\int_new:N \l_@@_replacement_csnames_int % open \c{ or \u{ awaiting a }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_replacement_category_tl, \l_@@_replacement_category_seq}
%   This sequence of letters is used to correctly restore categories in
%   nested constructions such as |\cL(abc\cD(_)d)|.
%    \begin{macrocode}
\tl_new:N \l_@@_replacement_category_tl % presumably the current category
\seq_new:N \l_@@_replacement_category_seq % categories to restore on nesting
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_balance_tl}
%   This token list holds the replacement text for
%   \cs{@@_replacement_balance_one_match:n} while it is being built
%   incrementally.
%    \begin{macrocode}
\tl_new:N \l_@@_balance_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}[rEXP]{\@@_replacement_balance_one_match:n}
%   This expects as an argument the first index of a set of entries in
%   \cs{g_@@_submatch_begin_intarray} (and related arrays) which hold the
%   submatch information for a given match. It
%   can be used within an integer expression to obtain the brace balance
%   incurred by performing the replacement on that match. This combines
%   the braces lost by removing the match, braces added by all the
%   submatches appearing in the replacement, and braces appearing
%   explicitly in the replacement. Even though it is always redefined
%   before use, we initialize it as for an empty replacement, for which
%   only the braces lost by removing the match contribute. An
%   important property is that concatenating several calls to that
%   function must result in a valid integer expression: this is why
%   every definition starts with a sign, namely |-| in the initial
%   definition below and |+| in the definition set up by
%   \cs{@@_replacement:n}.
%    \begin{macrocode}
\cs_new:Npn \@@_replacement_balance_one_match:n #1
  { - \@@_submatch_balance:n {#1} }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[rEXP]{\@@_replacement_do_one_match:n}
%   The input is the same as \cs{@@_replacement_balance_one_match:n}.
%   This function is redefined to expand to the part of the token list
%   from the end of the previous match to a given match, followed by the
%   replacement text. Hence concatenating the result of this function
%   with all possible arguments (one call for each match), as well as
%   the range from the end of the last match to the end of the string,
%   produces the fully replaced token list. The initialization does
%   not matter, but (as an example) we set it as for an empty
%   replacement: it only unpacks the range between the positions stored
%   for this match in \cs{g_@@_submatch_prev_intarray} and in
%   \cs{g_@@_submatch_begin_intarray}.
%    \begin{macrocode}
\cs_new:Npn \@@_replacement_do_one_match:n #1
  {
    \@@_query_range:nn
      { \__kernel_intarray_item:Nn \g_@@_submatch_prev_intarray {#1} }
      { \__kernel_intarray_item:Nn \g_@@_submatch_begin_intarray {#1} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_exp_not:N}
%   This function takes a single token and passes it, braced, to
%   \cs{exp_not:n}.  It lets us navigate around the fact that the
%   primitive \cs{exp_not:n} requires a braced argument. As far as I can
%   tell, it is only needed if the user tries to include in the
%   replacement text a control sequence set equal to a macro parameter
%   character, such as \cs{c_parameter_token}. Indeed, within an
%   \texttt{x}-expanding assignment, \cs{exp_not:N}~|#| behaves as a
%   single |#|, whereas \cs{exp_not:n}~|{#}| behaves as a doubled |##|.
%    \begin{macrocode}
\cs_new:Npn \@@_replacement_exp_not:N #1 { \exp_not:n {#1} }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_exp_not:V}
%   This is used for the implementation of~|\u|, and it gets redefined
%   for \cs{peek_regex_replace_once:nnTF}.  Initially it is a copy of
%   \cs{exp_not:V}.
%    \begin{macrocode}
\cs_new_eq:NN \@@_replacement_exp_not:V \exp_not:V
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Query and brace balance}
%
% \begin{macro}[rEXP]{\@@_query_range:nn}
% \begin{macro}[rEXP]{\@@_query_range_loop:ww}
%   When it is time to extract submatches from the token list, the
%   various tokens are stored in \tn{toks} registers numbered from
%   \cs{l_@@_min_pos_int} inclusive to \cs{l_@@_max_pos_int}
%   exclusive. The function \cs{@@_query_range:nn} \Arg{min}
%   \Arg{max} unpacks registers from the position \meta{min} to the
%   position $\meta{max}-1$ included. Once this is expanded, a second
%   \texttt{x}-expansion results in the actual tokens from the
%   query. That second expansion is only done by user functions at the
%   very end of their operation, after checking (and correcting) the
%   brace balance first.
%
%   Both arguments are first evaluated as integer expressions.  The
%   loop auxiliary receives the current position and \meta{max},
%   delimited by semicolons: as long as the position is less than
%   \meta{max} it uses the corresponding register and calls itself with
%   the position increased by~$1$, otherwise it stops by jumping to the
%   \cs{prg_break_point:} through \cs{prg_break:}.
%    \begin{macrocode}
\cs_new:Npn \@@_query_range:nn #1#2
  {
    \exp_after:wN \@@_query_range_loop:ww
    \int_value:w \@@_int_eval:w #1 \exp_after:wN ;
    \int_value:w \@@_int_eval:w #2 ;
    \prg_break_point:
  }
\cs_new:Npn \@@_query_range_loop:ww #1 ; #2 ;
  {
    \if_int_compare:w #1 < #2 \exp_stop_f:
    \else:
      \exp_after:wN \prg_break:
    \fi:
    \@@_toks_use:w #1 \exp_stop_f:
    \exp_after:wN \@@_query_range_loop:ww
      \int_value:w \@@_int_eval:w #1 + 1 ; #2 ;
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_query_submatch:n}
%   Find the start and end positions for a given submatch (of a given
%   match), and unpack the corresponding range of the query with
%   \cs{@@_query_range:nn}.  The argument is the index at which the
%   submatch is stored in \cs{g_@@_submatch_begin_intarray} and
%   \cs{g_@@_submatch_end_intarray}.
%    \begin{macrocode}
\cs_new:Npn \@@_query_submatch:n #1
  {
    \@@_query_range:nn
      { \__kernel_intarray_item:Nn \g_@@_submatch_begin_intarray {#1} }
      { \__kernel_intarray_item:Nn \g_@@_submatch_end_intarray {#1} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_submatch_balance:n}
%   Every user function must result in a balanced token list (unbalanced
%   token lists cannot be stored by \TeX{}). When we unpacked the query, we
%   kept track of the brace balance, hence the contribution from a given
%   range is the difference between the brace balances at the
%   \meta{max~pos} and \meta{min~pos}.  These two positions are found in
%   the corresponding \enquote{submatch} arrays; the balance at each
%   position is then looked up in \cs{g_@@_balance_intarray}, with $0$
%   given as the last argument of \cs{@@_intarray_item:NnF}
%   (presumably the value used when there is no such entry).
%   The function is protected, so that it is left untouched by the
%   \texttt{x}-expanding definition of
%   \cs{@@_replacement_balance_one_match:n} in \cs{@@_replacement:n},
%   and only expands later on, inside an integer expression.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_submatch_balance:n #1
  {
    \int_eval:n
      {
        \@@_intarray_item:NnF \g_@@_balance_intarray
          {
            \__kernel_intarray_item:Nn
              \g_@@_submatch_end_intarray {#1}
          }
          { 0 }
        -
        \@@_intarray_item:NnF \g_@@_balance_intarray
          {
            \__kernel_intarray_item:Nn
              \g_@@_submatch_begin_intarray {#1}
          }
          { 0 }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Framework}
%
% \begin{macro}{\@@_replacement:n}
% \begin{macro}{\@@_replacement_aux:n}
%   The replacement text is built incrementally. We keep track in
%   \cs{l_@@_balance_int} of the balance of explicit begin- and
%   end-group tokens and we store in \cs{l_@@_balance_tl} some
%   code to compute the brace balance from submatches (see its
%   description). Detect unescaped right braces, and escaped characters,
%   with trailing \cs{prg_do_nothing:} because some of the later
%   functions look ahead. Once the whole replacement text has been
%   parsed, make sure that there is no open csname: otherwise an error
%   is raised and the missing \cs{cs_end:} are added.  Similarly, raise
%   an error if some category group opened by a left parenthesis has not
%   been closed.  Finally, define the
%   \texttt{balance_one_match} and \texttt{do_one_match} functions.
%   The former is defined globally by an \texttt{x}-expanding
%   assignment performed inside the group.  The latter is defined by
%   the auxiliary \cs{@@_replacement_aux:n}, which is called after the
%   group has ended and receives the contents of \cs{l_@@_build_tl},
%   expanded just before the group ends.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement:n #1
  {
    \group_begin:
      \tl_build_begin:N \l_@@_build_tl
      \int_zero:N \l_@@_balance_int
      \tl_clear:N \l_@@_balance_tl
      \@@_escape_use:nnnn
        {
          \if_charcode:w \c_right_brace_str ##1
            \@@_replacement_rbrace:N
          \else:
            \@@_replacement_normal:n
          \fi:
          ##1
        }
        { \@@_replacement_escaped:N ##1 }
        { \@@_replacement_normal:n ##1 }
        {#1}
      \prg_do_nothing: \prg_do_nothing:
      \if_int_compare:w \l_@@_replacement_csnames_int > 0 \exp_stop_f:
        \__kernel_msg_error:nnx { regex } { replacement-missing-rbrace }
          { \int_use:N \l_@@_replacement_csnames_int }
        \tl_build_put_right:Nx \l_@@_build_tl
          { \prg_replicate:nn \l_@@_replacement_csnames_int \cs_end: }
      \fi:
      \seq_if_empty:NF \l_@@_replacement_category_seq
        {
          \__kernel_msg_error:nnx { regex } { replacement-missing-rparen }
            { \seq_count:N \l_@@_replacement_category_seq }
          \seq_clear:N \l_@@_replacement_category_seq
        }
      \cs_gset:Npx \@@_replacement_balance_one_match:n ##1
        {
          + \int_use:N \l_@@_balance_int
          \l_@@_balance_tl
          - \@@_submatch_balance:n {##1}
        }
      \tl_build_end:N \l_@@_build_tl
      \exp_args:NNo
    \group_end:
    \@@_replacement_aux:n \l_@@_build_tl
  }
\cs_new_protected:Npn \@@_replacement_aux:n #1
  {
    \cs_set:Npn \@@_replacement_do_one_match:n ##1
      {
        \@@_query_range:nn
          {
            \__kernel_intarray_item:Nn
              \g_@@_submatch_prev_intarray {##1}
          }
          {
            \__kernel_intarray_item:Nn
              \g_@@_submatch_begin_intarray {##1}
          }
        #1
      }
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_replacement_put:n}
%   Append the argument, without expanding it, to the replacement being
%   built in \cs{l_@@_build_tl}.  The argument is not grabbed by this
%   function itself but by \cs{tl_build_put_right:Nn}.
%   This gets redefined for \cs{peek_regex_replace_once:nnTF}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_put:n
  { \tl_build_put_right:Nn \l_@@_build_tl }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_normal:n, \@@_replacement_normal_aux:N}
%   Most characters are simply sent to the output by
%   \cs{tl_build_put_right:Nn}, unless a particular category code has been
%   requested: then \cs{@@_replacement_c_A:w} or a similar auxiliary is
%   called.  One exception is right parentheses, which restore the
%   category code in place before the group started.  Note that the
%   sequence is non-empty there: it contains an empty entry
%   corresponding to the initial value of
%   \cs{l_@@_replacement_category_tl}.
%   The argument |#1| is a single character (including the case of a catcode-other space).
%   Inside a |\c{...}| or |\u{...}| construction the character is
%   stored as a string, whatever the category code requested: category
%   codes are irrelevant in a control sequence name, and a token such
%   as an active character (protected from expansion) would not be
%   accepted between \cs{cs:w} and \cs{cs_end:}.
%   Otherwise, in case no specific catcode is requested, we take into
%   account the
%   current catcode regime (at the time the replacement is performed)
%   as much as reasonable, with all impossible catcodes (escape,
%   newline, etc.) being mapped to \enquote{other}.
%   In \cs{@@_replacement_normal_aux:N} the two \cs{exp_after:wN}
%   select the relevant branch of the \cs{if_case:w} construction and
%   remove the rest of the conditional before the selected auxiliary
%   grabs its arguments |?| and |#1|.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_normal:n #1
  {
    \int_compare:nNnTF { \l_@@_replacement_csnames_int } > 0
      { \exp_args:No \@@_replacement_put:n { \token_to_str:N #1 } }
      {
        \tl_if_empty:NTF \l_@@_replacement_category_tl
          { \@@_replacement_normal_aux:N #1 }
          { % (
            \token_if_eq_charcode:NNTF #1 )
              {
                \seq_pop:NN \l_@@_replacement_category_seq
                  \l_@@_replacement_category_tl
              }
              {
                \use:c { @@_replacement_c_ \l_@@_replacement_category_tl :w }
                ? #1
              }
          }
      }
  }
\cs_new_protected:Npn \@@_replacement_normal_aux:N #1
  {
    \token_if_eq_charcode:NNTF #1 \c_space_token
      { \@@_replacement_c_S:w }
      {
        \exp_after:wN \exp_after:wN
        \if_case:w \tex_catcode:D `#1 \exp_stop_f:
             \@@_replacement_c_O:w
        \or: \@@_replacement_c_B:w
        \or: \@@_replacement_c_E:w
        \or: \@@_replacement_c_M:w
        \or: \@@_replacement_c_T:w
        \or: \@@_replacement_c_O:w
        \or: \@@_replacement_c_P:w
        \or: \@@_replacement_c_U:w
        \or: \@@_replacement_c_D:w
        \or: \@@_replacement_c_O:w
        \or: \@@_replacement_c_S:w
        \or: \@@_replacement_c_L:w
        \or: \@@_replacement_c_O:w
        \or: \@@_replacement_c_A:w
        \else: \@@_replacement_c_O:w
        \fi:
      }
    ? #1
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_escaped:N}
%   As in parsing a regular expression, we use an auxiliary built from
%   |#1| if defined. Otherwise, check for escaped digits (standing for
%   submatches from $0$ to $9$): anything else is a raw character.
%   The test for a digit relies on the fact that $1<1$|#1| holds in
%   that case, for instance $1<15$ for the digit~$5$.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_escaped:N #1
  {
    \cs_if_exist_use:cF { @@_replacement_#1:w }
      {
        \if_int_compare:w 1 < 1#1 \exp_stop_f:
          \@@_replacement_put_submatch:n {#1}
        \else:
          \@@_replacement_normal:n {#1}
        \fi:
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Submatches}
%
% \begin{macro}{\@@_replacement_put_submatch:n, \@@_replacement_put_submatch_aux:n}
%   Insert a submatch in the replacement text.  Nothing is inserted
%   unless the submatch number is less than
%   \cs{l_@@_capturing_group_int}.  The auxiliary appends to
%   \cs{l_@@_build_tl} a call to \cs{@@_query_submatch:n} and, unless
%   the submatch appears inside a |\c{...}| or |\u{...}| construction,
%   it also appends to \cs{l_@@_balance_tl} the code which takes that
%   submatch into account in the brace balance.
%   Later on, |##1| will be replaced by a pointer to the $0$-th submatch
%   for a given match.  There is an \cs{exp_not:N} here as at the
%   point-of-use of \cs{l_@@_balance_tl} there is an \texttt{x}-type
%   expansion which is needed to get |##1| in correctly.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_put_submatch:n #1
  {
    \int_compare:nNnT {#1} < { \l_@@_capturing_group_int }
      { \@@_replacement_put_submatch_aux:n {#1} }
  }
\cs_new_protected:Npn \@@_replacement_put_submatch_aux:n #1
  {
    \tl_build_put_right:Nn \l_@@_build_tl
      { \@@_query_submatch:n { \int_eval:n { #1 + ##1 } } }
    \int_compare:nNnT { \l_@@_replacement_csnames_int } = { 0 }
      {
        \tl_put_right:Nn \l_@@_balance_tl
          {
            + \@@_submatch_balance:n
              { \exp_not:N \int_eval:n { #1 + ##1 } }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_g:w}
% \begin{macro}[rEXP]{\@@_replacement_g_digits:NN}
%   Grab digits for the |\g| escape sequence in a primitive assignment
%   to the integer \cs{l_@@_internal_a_int}.  At the end of the run of
%   digits, check that it ends with a right brace.
%
%   The first function checks that |\g| is followed by an unescaped
%   left brace and otherwise raises an error.  The second function is
%   expanded by \TeX{} while it looks for the number to assign: each of
%   its calls receives a pair of tokens |#1#2|, as for the other
%   functions used when parsing the replacement.  A raw digit is left
%   in the input and the next pair is examined.  Any other pair ends
%   the number with \cs{exp_stop_f:}; if |#1| is
%   \cs{@@_replacement_rbrace:N} the submatch is inserted with
%   \cs{@@_replacement_put_submatch:n} and the pair is discarded,
%   otherwise an error is raised and the pair is processed normally.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_g:w #1#2
  {
    \@@_two_if_eq:NNNNTF
      #1 #2 \@@_replacement_normal:n \c_left_brace_str
      { \l_@@_internal_a_int = \@@_replacement_g_digits:NN }
      { \@@_replacement_error:NNN g #1 #2 }
  }
\cs_new:Npn \@@_replacement_g_digits:NN #1#2
  {
    \token_if_eq_meaning:NNTF #1 \@@_replacement_normal:n
      {
        \if_int_compare:w 1 < 1#2 \exp_stop_f:
          #2
          \exp_after:wN \use_i:nnn
          \exp_after:wN \@@_replacement_g_digits:NN
        \else:
          \exp_stop_f:
          \exp_after:wN \@@_replacement_error:NNN
          \exp_after:wN g
        \fi:
      }
      {
        \exp_stop_f:
        \if_meaning:w \@@_replacement_rbrace:N #1
          \exp_args:No \@@_replacement_put_submatch:n
            { \int_use:N \l_@@_internal_a_int }
          \exp_after:wN \use_none:nn
        \else:
          \exp_after:wN \@@_replacement_error:NNN
          \exp_after:wN g
        \fi:
      }
    #1 #2
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsubsection{Csnames in replacement}
%
% \begin{macro}{\@@_replacement_c:w}
%   |\c| may only be followed by an unescaped character.  If followed by
%   a left brace, start a control sequence by calling an auxiliary
%   common with |\u|.  Otherwise test whether the category is known,
%   namely whether a function \cs[no-index]{@@_replacement_c_\meta{char}:w}
%   exists for the character |#2|; if it is not, complain.
%   In both error cases the two tokens |#1#2| are put back in the input
%   by \cs{@@_replacement_error:NNN}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_c:w #1#2
  {
    \token_if_eq_meaning:NNTF #1 \@@_replacement_normal:n
      {
        \exp_after:wN \token_if_eq_charcode:NNTF \c_left_brace_str #2
          { \@@_replacement_cu_aux:Nw \@@_replacement_exp_not:N }
          {
            \cs_if_exist:cTF { @@_replacement_c_#2:w }
              { \@@_replacement_cat:NNN #2 }
              { \@@_replacement_error:NNN c #1#2 }
          }
      }
      { \@@_replacement_error:NNN c #1#2 }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_cu_aux:Nw}
%   Start a control sequence with \cs{cs:w}, protected
%   from expansion by |#1| (either \cs{@@_replacement_exp_not:N} or
%   \cs{@@_replacement_exp_not:V}), or turned to a string by
%   \cs{tl_to_str:V} if inside
%   another csname construction |\c| or |\u|.  We use \cs{tl_to_str:V}
%   rather than \cs{tl_to_str:N} to deal with integers and other
%   registers.  In both cases the number of open csname constructions,
%   \cs{l_@@_replacement_csnames_int}, is then increased by~$1$.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_cu_aux:Nw #1
  {
    \if_case:w \l_@@_replacement_csnames_int
      \tl_build_put_right:Nn \l_@@_build_tl
        { \exp_not:n { \exp_after:wN #1 \cs:w } }
    \else:
      \tl_build_put_right:Nn \l_@@_build_tl
        { \exp_not:n { \exp_after:wN \tl_to_str:V \cs:w } }
    \fi:
    \int_incr:N \l_@@_replacement_csnames_int
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_u:w}
%   Check that |\u| is followed by a left brace. If so, start a control
%   sequence with \cs{cs:w}, which is then unpacked either with
%   \cs{@@_replacement_exp_not:V} or \cs{tl_to_str:V} depending on the
%   current context.  Otherwise raise an error and put the two tokens
%   |#1#2| back in the input.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_u:w #1#2
  {
    \@@_two_if_eq:NNNNTF
      #1 #2 \@@_replacement_normal:n \c_left_brace_str
      { \@@_replacement_cu_aux:Nw \@@_replacement_exp_not:V }
      { \@@_replacement_error:NNN u #1#2 }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_rbrace:N}
%   When at least one |\c{...}| or |\u{...}| construction is open, a
%   right brace closes the innermost one: the count of open
%   constructions is decreased and \cs{cs_end:} is added to the
%   replacement.  Otherwise, the right brace is a raw character, handed
%   to \cs{@@_replacement_normal:n}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_rbrace:N #1
  {
    \int_compare:nNnTF { \l_@@_replacement_csnames_int } > { 0 }
      {
        \int_decr:N \l_@@_replacement_csnames_int
        \tl_build_put_right:Nn \l_@@_build_tl { \cs_end: }
      }
      { \@@_replacement_normal:n {#1} }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Characters in replacement}
%
% \begin{macro}{\@@_replacement_cat:NNN}
%   Here, |#1| is a letter among |BEMTPUDSLOA| (or |C|, for which an
%   auxiliary is defined as well) and |#2#3| denote the
%   next character.  Complain if we reach the end of the replacement or
%   if the construction appears inside |\c{|\ldots{}|}| or
%   |\u{|\ldots{}|}|, and detect the case of a parenthesis.  In that
%   case, store the current category in a sequence and switch to a new
%   one.  Otherwise the auxiliary
%   \cs[no-index]{@@_replacement_c_\meta{letter}:w} is applied to the
%   next character; if that character is escaped and alphanumeric an
%   error is raised beforehand.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_cat:NNN #1#2#3
  {
    \token_if_eq_meaning:NNTF \prg_do_nothing: #3
      { \__kernel_msg_error:nn { regex } { replacement-catcode-end } }
      {
        \int_compare:nNnTF { \l_@@_replacement_csnames_int } > 0
          {
            \__kernel_msg_error:nnnn
              { regex } { replacement-catcode-in-cs } {#1} {#3}
            #2 #3
          }
          {
            \@@_two_if_eq:NNNNTF #2 #3 \@@_replacement_normal:n (
              {
                \seq_push:NV \l_@@_replacement_category_seq
                  \l_@@_replacement_category_tl
                \tl_set:Nn \l_@@_replacement_category_tl {#1}
              }
              {
                \token_if_eq_meaning:NNT #2 \@@_replacement_escaped:N
                  {
                    \@@_char_if_alphanumeric:NTF #3
                      {
                        \__kernel_msg_error:nnnn
                          { regex } { replacement-catcode-escaped }
                          {#1} {#3}
                      }
                      { }
                  }
                \use:c { @@_replacement_c_#1:w } #2 #3
              }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% We now need to change the category code of the null character many
% times, hence work in a group. The catcode-specific macros below are
% defined in alphabetical order; if you are trying to understand the
% code, start from the end of the alphabet as those categories are
% simpler than active or begin-group.
%    \begin{macrocode}
\group_begin:
%    \end{macrocode}
%
% \begin{macro}{\@@_replacement_char:nNN}
%   The only way to produce an arbitrary character--catcode pair is to
%   use the \tn{lowercase} or \tn{uppercase} primitives. This is a
%   wrapper for our purposes. The first argument is the null character
%   with various catcodes. The second and third arguments are grabbed
%   from the input stream: |#2| is ignored, and |#3| is the character
%   whose character code to reproduce.  The lowercase code of the null
%   character is set to that character code, and the first argument is
%   lowercased before being passed to \cs{@@_replacement_put:n}.
%   We could use \cs{char_generate:nn} but only for some
%   catcodes (active characters and spaces are not supported).
%    \begin{macrocode}
  \cs_new_protected:Npn \@@_replacement_char:nNN #1#2#3
    {
      \tex_lccode:D 0 = `#3 \scan_stop:
      \tex_lowercase:D { \@@_replacement_put:n {#1} }
    }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_A:w}
%   For an active character, expansion must be avoided, twice because we
%   later do two \texttt{x}-expansions, to unpack \tn{toks} for the
%   query, and to expand their contents to tokens of the query.
%   The null character is made active just before the definition, so
%   that the |^^@| appearing in it is an active character.
%    \begin{macrocode}
  \char_set_catcode_active:N \^^@
  \cs_new_protected:Npn \@@_replacement_c_A:w
    { \@@_replacement_char:nNN { \exp_not:n { \exp_not:N ^^@ } } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_B:w}
%   An explicit begin-group token increases the balance
%   \cs{l_@@_balance_int}, unless within a
%   |\c{...}| or |\u{...}| construction. Add the desired begin-group
%   character, using the standard \cs{if_false:} trick. We eventually
%   \texttt{x}-expand twice. The first time must yield a balanced token
%   list, and the second one gives the bare begin-group token. The
%   \cs{exp_after:wN} is not strictly needed, but is more consistent
%   with \pkg{l3tl-analysis}.
%    \begin{macrocode}
  \char_set_catcode_group_begin:N \^^@
  \cs_new_protected:Npn \@@_replacement_c_B:w
    {
      \if_int_compare:w \l_@@_replacement_csnames_int = 0 \exp_stop_f:
        \int_incr:N \l_@@_balance_int
      \fi:
      \@@_replacement_char:nNN
        { \exp_not:n { \exp_after:wN ^^@ \if_false: } \fi: } }
    }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_C:w}
%   This is not quite catcode-related: when the user requests a
%   character with category \enquote{control sequence}, the
%   one-character control symbol is returned. As for the active
%   character, we prepare for two \texttt{x}-expansions.
%   The first argument is ignored and the second one is the character
%   from which the control symbol is built.
%    \begin{macrocode}
  \cs_new_protected:Npn \@@_replacement_c_C:w #1#2
    {
      \tl_build_put_right:Nn \l_@@_build_tl
        { \exp_not:N \@@_replacement_exp_not:N \exp_not:c {#2} }
    }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_D:w}
%   Subscripts fit the mould: \tn{lowercase} the null byte with the
%   correct category.  The null byte is given the category code of a
%   math subscript just before the definition, and the remaining work
%   is done by \cs{@@_replacement_char:nNN}.
%    \begin{macrocode}
  \char_set_catcode_math_subscript:N \^^@
  \cs_new_protected:Npn \@@_replacement_c_D:w
    { \@@_replacement_char:nNN { ^^@ } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_E:w}
%   Similar to the begin-group case, the second \texttt{x}-expansion
%   produces the bare end-group token.  An explicit end-group token
%   decreases the balance \cs{l_@@_balance_int}, unless within a
%   |\c{...}| or |\u{...}| construction.
%    \begin{macrocode}
  \char_set_catcode_group_end:N \^^@
  \cs_new_protected:Npn \@@_replacement_c_E:w
    {
      \if_int_compare:w \l_@@_replacement_csnames_int = 0 \exp_stop_f:
        \int_decr:N \l_@@_balance_int
      \fi:
      \@@_replacement_char:nNN
        { \exp_not:n { \if_false: { \fi:  ^^@ } }
    }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_L:w}
%   Simply \tn{lowercase} a letter null byte to produce an arbitrary
%   letter.  The null byte is made a letter just before the definition.
%    \begin{macrocode}
  \char_set_catcode_letter:N \^^@
  \cs_new_protected:Npn \@@_replacement_c_L:w
    { \@@_replacement_char:nNN { ^^@ } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_M:w}
%   No surprise here, we lowercase the null math toggle: the null byte
%   is made a math toggle just before the definition.
%    \begin{macrocode}
  \char_set_catcode_math_toggle:N \^^@
  \cs_new_protected:Npn \@@_replacement_c_M:w
    { \@@_replacement_char:nNN { ^^@ } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_O:w}
%   Lowercase an other null byte: the null byte is given the category
%   code \enquote{other} just before the definition.
%    \begin{macrocode}
  \char_set_catcode_other:N \^^@
  \cs_new_protected:Npn \@@_replacement_c_O:w
    { \@@_replacement_char:nNN { ^^@ } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_P:w}
%   For macro parameters, expansion is a tricky issue. We need to
%   prepare for two \texttt{x}-expansions and passing through various
%   macro definitions: this is why four macro parameter characters are
%   stored, within two nested \cs{exp_not:n}. Note that we cannot
%   replace one \cs{exp_not:n} by
%   doubling the macro parameter characters because this would misbehave
%   if a mischievous user asks for |\c{\cP\#}|, since that macro
%   parameter character would be doubled.
%    \begin{macrocode}
  \char_set_catcode_parameter:N \^^@
  \cs_new_protected:Npn \@@_replacement_c_P:w
    {
      \@@_replacement_char:nNN
        { \exp_not:n { \exp_not:n { ^^@^^@^^@^^@ } } }
    }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_S:w}
%   Spaces are normalized on input by \TeX{} to have character code
%   $32$. It is in fact impossible to get a token with character code
%   $0$ and category code $10$. Hence we use $32$ instead of $0$ as our
%   base character, and we raise an error if the requested character
%   code is~$0$.  The first argument is ignored; the second one is the
%   character whose character code to reproduce.
%    \begin{macrocode}
  \cs_new_protected:Npn \@@_replacement_c_S:w #1#2
    {
      \if_int_compare:w `#2 = 0 \exp_stop_f:
        \__kernel_msg_error:nn { regex } { replacement-null-space }
      \fi:
      \tex_lccode:D `\ = `#2 \scan_stop:
      \tex_lowercase:D { \@@_replacement_put:n {~} }
    }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_T:w}
%   No surprise for alignment tabs here: the null byte is made an
%   alignment tab just before the definition, and it is lowercased by
%   \cs{@@_replacement_char:nNN}. Those are surrounded by the
%   appropriate braces whenever necessary, hence they don't cause
%   trouble in alignment settings.
%    \begin{macrocode}
  \char_set_catcode_alignment:N \^^@
  \cs_new_protected:Npn \@@_replacement_c_T:w
    { \@@_replacement_char:nNN { ^^@ } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replacement_c_U:w}
%   Simple call to \cs{@@_replacement_char:nNN} which lowercases the
%   math superscript |^^@|: the null byte is made a math superscript
%   just before the definition.
%    \begin{macrocode}
  \char_set_catcode_math_superscript:N \^^@
  \cs_new_protected:Npn \@@_replacement_c_U:w
    { \@@_replacement_char:nNN { ^^@ } }
%    \end{macrocode}
% \end{macro}
%
% Restore the catcode of the null byte.
%    \begin{macrocode}
\group_end:
%    \end{macrocode}
%
% \subsubsection{An error}
%
% \begin{macro}{\@@_replacement_error:NNN}
%   Simple error reporting by calling one of the messages
%   \texttt{replacement-c}, \texttt{replacement-g}, or
%   \texttt{replacement-u}, selected by the letter |#1| and given |#3|
%   as an argument.  The two tokens |#2#3| are then put back in the
%   input so that they are processed as if the faulty escape sequence
%   had not been there.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replacement_error:NNN #1#2#3
  {
    \__kernel_msg_error:nnx { regex } { replacement-#1 } {#3}
    #2 #3
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{User functions}
%
% \begin{macro}{\regex_new:N}
%   Before being assigned a sensible value, a regex variable matches
%   nothing: it is created as a copy of \cs{c_@@_no_match_regex}.
%    \begin{macrocode}
\cs_new_protected:Npn \regex_new:N #1
  { \cs_new_eq:NN #1 \c_@@_no_match_regex }
%    \end{macrocode}
% \end{macro}
%
% \begin{variable}{\l_tmpa_regex, \l_tmpb_regex, \g_tmpa_regex, \g_tmpb_regex}
%   The usual scratch space: two local and two global regex variables.
%    \begin{macrocode}
\regex_new:N \l_tmpa_regex
\regex_new:N \l_tmpb_regex
\regex_new:N \g_tmpa_regex
\regex_new:N \g_tmpb_regex
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\regex_set:Nn, \regex_gset:Nn, \regex_const:Nn}
%   Compile, then store the result in the user variable with the
%   appropriate assignment function.  After compilation the result is
%   taken from \cs{l_@@_internal_regex}: it is copied locally or
%   globally for the first two functions, while the constant is created
%   with the contents of that variable, protected from further
%   expansion.
%    \begin{macrocode}
\cs_new_protected:Npn \regex_set:Nn #1#2
  {
    \@@_compile:n {#2}
    \tl_set_eq:NN #1 \l_@@_internal_regex
  }
\cs_new_protected:Npn \regex_gset:Nn #1#2
  {
    \@@_compile:n {#2}
    \tl_gset_eq:NN #1 \l_@@_internal_regex
  }
\cs_new_protected:Npn \regex_const:Nn #1#2
  {
    \@@_compile:n {#2}
    \tl_const:Nx #1 { \exp_not:o \l_@@_internal_regex }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\regex_show:N, \regex_show:n}
%   User functions: the \texttt{n} variant requires compilation first.
%   Then show the variable with some appropriate text. The auxiliary
%   \cs{@@_show:N} is defined in a different section; the text shown is
%   then taken from \cs{l_@@_internal_a_tl}.  The message receives
%   either the regular expression converted to a string (\texttt{n}
%   variant) or the name of the variable (\texttt{N} variant), the
%   other of these two arguments being empty.  The \texttt{N} variant
%   does nothing more than what \cs{__kernel_chk_defined:NT} does when
%   that function does not use its true branch.
%    \begin{macrocode}
\cs_new_protected:Npn \regex_show:n #1
  {
    \@@_compile:n {#1}
    \@@_show:N \l_@@_internal_regex
    \__kernel_msg_show:nnxxxx { regex } { show }
      { \tl_to_str:n {#1} } { }
      { \l_@@_internal_a_tl } { }
  }
\cs_new_protected:Npn \regex_show:N #1
  {
    \__kernel_chk_defined:NT #1
      {
        \@@_show:N #1
        \__kernel_msg_show:nnxxxx { regex } { show }
          { } { \token_to_str:N #1 }
          { \l_@@_internal_a_tl } { }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[TF]{\regex_match:nn, \regex_match:Nn}
%   Those conditionals are based on a common auxiliary defined
%   later. Its first argument builds the \textsc{nfa} corresponding to
%   the regex (with \cs{@@_build:n} for a regular expression given
%   explicitly and \cs{@@_build:N} for a regex variable), and the
%   second argument is the query token list. Once we
%   have performed the match, \cs{@@_return:} converts the resulting
%   boolean to \cs{prg_return_true:} or \texttt{false}.
%    \begin{macrocode}
\prg_new_protected_conditional:Npnn \regex_match:nn #1#2 { T , F , TF }
  {
    \@@_if_match:nn { \@@_build:n {#1} } {#2}
    \@@_return:
  }
\prg_new_protected_conditional:Npnn \regex_match:Nn #1#2 { T , F , TF }
  {
    \@@_if_match:nn { \@@_build:N #1 } {#2}
    \@@_return:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\regex_count:nnN, \regex_count:NnN}
%   Again, use an auxiliary whose first argument builds the \textsc{nfa}.
%   Only the regex argument is grabbed here: the remaining two
%   arguments are left in the input for \cs{@@_count:nnN}.
%    \begin{macrocode}
\cs_new_protected:Npn \regex_count:nnN #1
  { \@@_count:nnN { \@@_build:n {#1} } }
\cs_new_protected:Npn \regex_count:NnN #1
  { \@@_count:nnN { \@@_build:N #1 } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[noTF]
%   {
%     \regex_extract_once:nnN, \regex_extract_once:NnN,
%     \regex_extract_all:nnN,  \regex_extract_all:NnN,
%     \regex_replace_once:nnN, \regex_replace_once:NnN,
%     \regex_replace_all:nnN,  \regex_replace_all:NnN,
%     \regex_split:nnN,        \regex_split:NnN
%   }
%   We define here $40$ user functions, following a common pattern in
%   terms of \texttt{:nnN} auxiliaries, defined in the coming
%   subsections.  The auxiliary is handed \cs{@@_build:n} or
%   \cs{@@_build:N} with the appropriate regex argument, then all
%   other necessary arguments (replacement text, token list,
%   \emph{etc.}).
%   The conditionals call \cs{@@_return:} to return either
%   \texttt{true} or \texttt{false} once matching has been performed.
%   The temporary function \cs{@@_tmp:w} receives the auxiliary and the
%   names of the two user functions based on it: for each name it
%   defines the function itself, which only grabs the regex argument,
%   and the corresponding \texttt{T}, \texttt{F} and \texttt{TF}
%   conditionals, which grab all three arguments.
%    \begin{macrocode}
\cs_set_protected:Npn \@@_tmp:w #1#2#3
  {
    \cs_new_protected:Npn #2 ##1 { #1 { \@@_build:n {##1} } }
    \cs_new_protected:Npn #3 ##1 { #1 { \@@_build:N  ##1  } }
    \prg_new_protected_conditional:Npnn #2 ##1##2##3 { T , F , TF }
      { #1 { \@@_build:n {##1} } {##2} ##3 \@@_return: }
    \prg_new_protected_conditional:Npnn #3 ##1##2##3 { T , F , TF }
      { #1 { \@@_build:N  ##1  } {##2} ##3 \@@_return: }
  }
\@@_tmp:w \@@_extract_once:nnN
  \regex_extract_once:nnN \regex_extract_once:NnN
\@@_tmp:w \@@_extract_all:nnN
  \regex_extract_all:nnN \regex_extract_all:NnN
\@@_tmp:w \@@_replace_once:nnN
  \regex_replace_once:nnN \regex_replace_once:NnN
\@@_tmp:w \@@_replace_all:nnN
  \regex_replace_all:nnN \regex_replace_all:NnN
\@@_tmp:w \@@_split:nnN \regex_split:nnN \regex_split:NnN
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Variables and helpers for user functions}
%
% \begin{variable}{\l_@@_match_count_int}
%   The number of matches found so far is stored
%   in \cs{l_@@_match_count_int}. This is only used
%   in the \cs{regex_count:nnN} functions.
%    \begin{macrocode}
\int_new:N \l_@@_match_count_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{@@_begin, @@_end}
%   Those flags are raised to indicate extra begin-group
%   or end-group tokens when extracting submatches.
%    \begin{macrocode}
\flag_new:n { @@_begin }
\flag_new:n { @@_end }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_min_submatch_int, \l_@@_submatch_int, \l_@@_zeroth_submatch_int}
%   The end-points of each submatch are stored in two arrays whose index
%   \meta{submatch} ranges
%   from \cs{l_@@_min_submatch_int} (inclusive) to
%   \cs{l_@@_submatch_int} (exclusive). Each successful match comes
%   with a $0$-th submatch (the full match), and one submatch for each
%   capturing group: submatches corresponding to the last successful
%   match are labelled starting at \texttt{zeroth_submatch}. The entry
%   \cs{l_@@_zeroth_submatch_int} in \cs{g_@@_submatch_prev_intarray} holds
%   the position at which that match attempt started: this is used for
%   splitting and replacements.
%    \begin{macrocode}
\int_new:N \l_@@_min_submatch_int
\int_new:N \l_@@_submatch_int
\int_new:N \l_@@_zeroth_submatch_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_submatch_prev_intarray, \g_@@_submatch_begin_intarray, \g_@@_submatch_end_intarray}
%   Hold the place where the match attempt began and the end-points of
%   each submatch.  Each array is created with $65536$ entries.
%    \begin{macrocode}
\intarray_new:Nn \g_@@_submatch_prev_intarray { 65536 }
\intarray_new:Nn \g_@@_submatch_begin_intarray { 65536 }
\intarray_new:Nn \g_@@_submatch_end_intarray { 65536 }
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_balance_intarray}
%   The first thing we do when matching is to store the balance of
%   begin-group/end-group characters into \cs{g_@@_balance_intarray},
%   which is also created with $65536$ entries.
%    \begin{macrocode}
\intarray_new:Nn \g_@@_balance_intarray { 65536 }
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\@@_return:}
%   This function triggers \cs{prg_return_true:} if a match was found,
%   namely if \cs{g_@@_success_bool} is true, and
%   \cs{prg_return_false:} otherwise.  It is used by all user
%   conditionals.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_return:
  {
    \if_meaning:w \c_true_bool \g_@@_success_bool
      \prg_return_true:
    \else:
      \prg_return_false:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_query_set:n, \@@_query_set_aux:nN}
%   To easily extract subsets of the input once we found the positions
%   at which to cut, store the input tokens one by one into successive
%   \tn{toks} registers.  Also store the brace balance (used to check
%   for overall brace balance) in an array.  The query is surrounded by
%   two empty entries, stored before and after the loop over |#1|, and
%   \cs{l_@@_max_pos_int} is set to the position of the last of these.
%
%   The auxiliary receives the tokens to store as |#1| and, as |#2|, a
%   hexadecimal digit: the third argument provided by
%   \cs{tl_analysis_map_inline:nn}, or |F| for the two empty entries.
%   It records in \cs{g_@@_balance_intarray} the balance reached
%   \emph{before} the current item, then increments the balance if |#2|
%   is~$1$, decrements it if |#2| is~$2$, and leaves it unchanged
%   otherwise.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_query_set:n #1
  {
    \int_zero:N \l_@@_balance_int
    \int_zero:N \l_@@_curr_pos_int
    \@@_query_set_aux:nN { } F
    \tl_analysis_map_inline:nn {#1}
      { \@@_query_set_aux:nN {##1} ##3 }
    \@@_query_set_aux:nN { } F
    \int_set_eq:NN \l_@@_max_pos_int \l_@@_curr_pos_int
  }
\cs_new_protected:Npn \@@_query_set_aux:nN #1#2
  {
    \int_incr:N \l_@@_curr_pos_int
    \@@_toks_set:Nn \l_@@_curr_pos_int {#1}
    \__kernel_intarray_gset:Nnn \g_@@_balance_intarray
      { \l_@@_curr_pos_int } { \l_@@_balance_int }
    \if_case:w "#2 \exp_stop_f:
    \or: \int_incr:N \l_@@_balance_int
    \or: \int_decr:N \l_@@_balance_int
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Matching}
%
% \begin{macro}{\@@_if_match:nn}
%   We don't track submatches, and stop after a single match. Build the
%   \textsc{nfa} with |#1|, and perform the match on the query |#2|.
%   All of this is done inside a group, and this function does not
%   itself store any result.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_if_match:nn #1#2
  {
    \group_begin:
      \@@_disable_submatches:
      \@@_single_match:
      #1
      \@@_match:n {#2}
    \group_end:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_count:nnN}
%   Again, we don't care about submatches. Instead of aborting after the
%   first \enquote{longest match} is found, we search for multiple
%   matches, incrementing \cs{l_@@_match_count_int} every time to
%   record the number of matches. Build the \textsc{nfa} and match. At
%   the end, store the result in the user's variable~|#3|.  The
%   \cs{exp_args:NNNo} placed before \cs{group_end:} expands the count
%   to its value while the group is still open, so that the assignment
%   to |#3|, performed after the group is closed, receives the number
%   of matches.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_count:nnN #1#2#3
  {
    \group_begin:
      \@@_disable_submatches:
      \int_zero:N \l_@@_match_count_int
      \@@_multi_match:n { \int_incr:N \l_@@_match_count_int }
      #1
      \@@_match:n {#2}
      \exp_args:NNNo
    \group_end:
    \int_set:Nn #3 { \int_use:N \l_@@_match_count_int }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Extracting submatches}
%
% \begin{macro}{\@@_extract_once:nnN, \@@_extract_all:nnN}
%   Match once or multiple times. After each match (or after the only
%   match), extract the submatches using \cs{@@_extract:}: the
%   \texttt{once} function calls it explicitly after matching, while
%   the \texttt{all} function passes it to \cs{@@_multi_match:n}.  The
%   query |#2| is stored into \tn{toks} registers by
%   \cs{@@_query_set:n} only after the matching step.  At the end, store
%   the sequence containing all the submatches into the user variable
%   |#3| after closing the group, both of which are done by
%   \cs{@@_group_end_extract_seq:N}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_extract_once:nnN #1#2#3
  {
    \group_begin:
      \@@_single_match:
      #1
      \@@_match:n {#2}
      \@@_extract:
      \@@_query_set:n {#2}
    \@@_group_end_extract_seq:N #3
  }
\cs_new_protected:Npn \@@_extract_all:nnN #1#2#3
  {
    \group_begin:
      \@@_multi_match:n { \@@_extract: }
      #1
      \@@_match:n {#2}
      \@@_query_set:n {#2}
    \@@_group_end_extract_seq:N #3
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_split:nnN}
%   Splitting at submatches is a bit more tricky. For each match,
%   extract all submatches, and replace the zeroth submatch by the part
%   of the query between the start of the match attempt and the start of
%   the zeroth submatch: its end-point becomes the former begin-point,
%   its begin-point becomes \cs{l_@@_start_pos_int}, and its entry in
%   \cs{g_@@_submatch_prev_intarray} is reset to~$0$. This is inhibited
%   if the delimiter matched an empty token list at the start of this
%   match attempt, namely unless \cs{l_@@_start_pos_int} is less than
%   \cs{l_@@_success_pos_int}.  After the last
%   match, store the last part of the token list, which ranges from the
%   start of the match attempt to the end of the query. This step is
%   inhibited if the last match was empty and at the very end: decrement
%   \cs{l_@@_submatch_int}, which controls which matches will be used.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_split:nnN #1#2#3
  {
    \group_begin:
      \@@_multi_match:n
        {
          \if_int_compare:w
            \l_@@_start_pos_int < \l_@@_success_pos_int
            \@@_extract:
            \__kernel_intarray_gset:Nnn \g_@@_submatch_prev_intarray
              { \l_@@_zeroth_submatch_int } { 0 }
            \__kernel_intarray_gset:Nnn \g_@@_submatch_end_intarray
              { \l_@@_zeroth_submatch_int }
              {
                \__kernel_intarray_item:Nn \g_@@_submatch_begin_intarray
                  { \l_@@_zeroth_submatch_int }
              }
            \__kernel_intarray_gset:Nnn \g_@@_submatch_begin_intarray
              { \l_@@_zeroth_submatch_int }
              { \l_@@_start_pos_int }
          \fi:
        }
      #1
      \@@_match:n {#2}
      \@@_query_set:n {#2}
      \__kernel_intarray_gset:Nnn \g_@@_submatch_prev_intarray
        { \l_@@_submatch_int } { 0 }
      \__kernel_intarray_gset:Nnn \g_@@_submatch_end_intarray
        { \l_@@_submatch_int }
        { \l_@@_max_pos_int }
      \__kernel_intarray_gset:Nnn \g_@@_submatch_begin_intarray
        { \l_@@_submatch_int }
        { \l_@@_start_pos_int }
      \int_incr:N \l_@@_submatch_int
      \if_meaning:w \c_true_bool \l_@@_empty_success_bool
        \if_int_compare:w \l_@@_start_pos_int = \l_@@_max_pos_int
          \int_decr:N \l_@@_submatch_int
        \fi:
      \fi:
    \@@_group_end_extract_seq:N #3
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_end_extract_seq:N}
%   The end-points of submatches are stored as entries of two arrays
%   from \cs{l_@@_min_submatch_int} to
%   \cs{l_@@_submatch_int} (exclusive). Extract the relevant ranges
%   into \cs{l_@@_internal_seq}, one item per submatch, each item being
%   produced by \cs{@@_extract_seq_aux:n}. We detect unbalanced results
%   using the two flags \texttt{__regex_begin} and \texttt{__regex_end},
%   raised whenever we see too many begin-group or end-group tokens in a
%   submatch: if either was raised, an error is issued.  The items are
%   then \texttt{x}-expanded once more by \cs{seq_set_map_x:NNn}.
%   Finally, \cs{exp_args:NNNo} expands \cs{l_@@_internal_seq} while the
%   group is still open, so that the assignment to the user's
%   variable~|#1| takes place after \cs{group_end:}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_end_extract_seq:N #1
  {
      \flag_clear:n { @@_begin }
      \flag_clear:n { @@_end }
      \seq_set_from_function:NnN \l_@@_internal_seq
        {
          \int_step_function:nnN { \l_@@_min_submatch_int }
            { \l_@@_submatch_int - 1 }
        }
        \@@_extract_seq_aux:n
      \int_compare:nNnF
        {
          \flag_height:n { @@_begin } +
          \flag_height:n { @@_end }
        }
          = 0
        {
          \__kernel_msg_error:nnxxx { regex } { result-unbalanced }
            { splitting~or~extracting~submatches }
            { \flag_height:n { @@_end } }
            { \flag_height:n { @@_begin } }
        }
      \seq_set_map_x:NNn \l_@@_internal_seq \l_@@_internal_seq {##1}
      \exp_args:NNNo
      \group_end:
      \tl_set:Nn #1 { \l_@@_internal_seq }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_extract_seq_aux:n, \@@_extract_seq_aux:ww}
%   The \texttt{:n} auxiliary builds one item of the sequence of
%   submatches. First compute the brace balance of the submatch, then
%   extract the submatch from the query, adding the appropriate braces
%   and raising a flag if the submatch is not balanced.  The
%   \texttt{:ww} auxiliary receives the balance as |#1| and the
%   submatch number as |#2|.  For a negative balance it raises the
%   \texttt{__regex_end} flag and puts $-|#1|$ begin-group tokens
%   before the submatch; for a positive balance it raises the
%   \texttt{__regex_begin} flag and puts |#1| end-group tokens after
%   the submatch.  These braces are written with the \cs{if_false:}
%   trick and protected by \cs{exp_not:n}.
%    \begin{macrocode}
\cs_new:Npn \@@_extract_seq_aux:n #1
  {
    \exp_after:wN \@@_extract_seq_aux:ww
    \int_value:w \@@_submatch_balance:n {#1} ; #1;
  }
\cs_new:Npn \@@_extract_seq_aux:ww #1; #2;
  {
    \if_int_compare:w #1 < 0 \exp_stop_f:
      \flag_raise:n { @@_end }
      \prg_replicate:nn {-#1} { \exp_not:n { { \if_false: } \fi: } }
    \fi:
    \@@_query_submatch:n {#2}
    \if_int_compare:w #1 > 0 \exp_stop_f:
      \flag_raise:n { @@_begin }
      \prg_replicate:nn {#1} { \exp_not:n { \if_false: { \fi: } } }
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_extract:}
%   Our task here is to take the list of end-points of submatches and
%   store them in appropriate array entries, from
%   \cs{l_@@_zeroth_submatch_int} upwards.  Nothing is done unless the
%   match succeeded.  First, reserve \cs{l_@@_capturing_group_int}
%   entries, setting their item in \cs{g_@@_submatch_prev_intarray}
%   to~$0$ and advancing \cs{l_@@_submatch_int} accordingly, then store
%   in the zeroth of these entries the position at which the match
%   attempt started.  We extract the rest from the comma list
%   \cs{l_@@_success_submatches_tl}, which starts with entries to be
%   stored in \cs{g_@@_submatch_begin_intarray} and continues with
%   entries for \cs{g_@@_submatch_end_intarray}: the counter
%   \cs{l_@@_internal_a_int}, compared to
%   \cs{l_@@_capturing_group_int}, tells which of the two arrays the
%   current item belongs to.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_extract:
  {
    \if_meaning:w \c_true_bool \g_@@_success_bool
      \int_set_eq:NN \l_@@_zeroth_submatch_int \l_@@_submatch_int
      \prg_replicate:nn \l_@@_capturing_group_int
        {
          \__kernel_intarray_gset:Nnn \g_@@_submatch_prev_intarray
            { \l_@@_submatch_int } { 0 }
          \int_incr:N \l_@@_submatch_int
        }
      \__kernel_intarray_gset:Nnn \g_@@_submatch_prev_intarray
        { \l_@@_zeroth_submatch_int } { \l_@@_start_pos_int }
      \int_zero:N \l_@@_internal_a_int
      \clist_map_inline:Nn \l_@@_success_submatches_tl
        {
          \if_int_compare:w \l_@@_internal_a_int < \l_@@_capturing_group_int
            \__kernel_intarray_gset:Nnn \g_@@_submatch_begin_intarray
              { \@@_int_eval:w \l_@@_zeroth_submatch_int + \l_@@_internal_a_int } {##1}
          \else:
            \__kernel_intarray_gset:Nnn \g_@@_submatch_end_intarray
              { \@@_int_eval:w \l_@@_zeroth_submatch_int + \l_@@_internal_a_int - \l_@@_capturing_group_int } {##1}
          \fi:
          \int_incr:N \l_@@_internal_a_int
        }
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Replacement}
%
% \begin{macro}{\@@_replace_once:nnN}
%   Build the \textsc{nfa} and the replacement functions, then find a
%   single match in the contents of the variable~|#3|.  If the match
%   failed, simply exit the group, leaving |#3| untouched. Otherwise,
%   we do the replacement. Extract submatches. Compute
%   the brace balance corresponding to replacing this match by the
%   replacement (this depends on submatches). Prepare the replaced token
%   list: the replacement function produces the tokens from the start of
%   the query to the start of the match and the replacement text for
%   this match; we need to add the tokens from the end of the match to
%   the end of the query. Finally, store the result in the user's
%   variable after closing the group: this step involves an additional
%   \texttt{x}-expansion, and checks that braces are balanced in the
%   final result.  Note that the group opened here is closed in one of
%   the two branches of the conditional, either directly or through
%   \cs{@@_group_end_replace:N}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replace_once:nnN #1#2#3
  {
    \group_begin:
      \@@_single_match:
      #1
      \exp_args:No \@@_match:n {#3}
      \if_meaning:w \c_false_bool \g_@@_success_bool
        \group_end:
      \else:
        \@@_extract:
        \exp_args:No \@@_query_set:n {#3}
        \@@_replacement:n {#2}
        \int_set:Nn \l_@@_balance_int
          {
            \@@_replacement_balance_one_match:n
              { \l_@@_zeroth_submatch_int }
          }
        \__kernel_tl_set:Nx \l_@@_internal_a_tl
          {
            \@@_replacement_do_one_match:n
              { \l_@@_zeroth_submatch_int }
            \@@_query_range:nn
              {
                \__kernel_intarray_item:Nn \g_@@_submatch_end_intarray
                  { \l_@@_zeroth_submatch_int }
              }
              { \l_@@_max_pos_int }
          }
        \@@_group_end_replace:N #3
      \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_replace_all:nnN}
%   Match multiple times, and for every match, extract submatches and
%   additionally store the position at which the match attempt started.
%   The entries from \cs{l_@@_min_submatch_int} to
%   \cs{l_@@_submatch_int} hold information about submatches of every
%   match in order; each match corresponds to
%   \cs{l_@@_capturing_group_int} consecutive entries, hence the step
%   used in the two \cs{int_step_function:nnnN} loops.
%   Compute the brace balance corresponding to doing all the
%   replacements: this is the sum of brace balances for replacing each
%   match. Join together the replacement texts for each match (including
%   the part of the query before the match), and the end of the query,
%   which ranges from \cs{l_@@_start_pos_int} to
%   \cs{l_@@_max_pos_int}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_replace_all:nnN #1#2#3
  {
    \group_begin:
      \@@_multi_match:n { \@@_extract: }
      #1
      \exp_args:No \@@_match:n {#3}
      \exp_args:No \@@_query_set:n {#3}
      \@@_replacement:n {#2}
      \int_set:Nn \l_@@_balance_int
        {
          0
          \int_step_function:nnnN
            { \l_@@_min_submatch_int }
            \l_@@_capturing_group_int
            { \l_@@_submatch_int - 1 }
            \@@_replacement_balance_one_match:n
        }
      \__kernel_tl_set:Nx \l_@@_internal_a_tl
        {
          \int_step_function:nnnN
            { \l_@@_min_submatch_int }
            \l_@@_capturing_group_int
            { \l_@@_submatch_int - 1 }
            \@@_replacement_do_one_match:n
          \@@_query_range:nn
            \l_@@_start_pos_int \l_@@_max_pos_int
        }
    \@@_group_end_replace:N #3
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_group_end_replace:N}
%   If the brace balance \cs{l_@@_balance_int} is not $0$, raise an
%   error.  Then end the group and set the user's
%   variable |#1| to the \texttt{x}-expansion of
%   \cs{l_@@_internal_a_tl}, adding the appropriate braces to produce
%   a balanced result: begin-group tokens are added before it when the
%   balance is negative, end-group tokens after it when the balance is
%   positive.  The argument of \cs{use:x} is expanded while the group is
%   still open, so the local values of \cs{l_@@_balance_int} and
%   \cs{l_@@_internal_a_tl} are used, while the resulting
%   \cs{group_end:} and assignment to |#1| are only performed
%   afterwards, the assignment thus taking place outside the group.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_group_end_replace:N #1
  {
    \if_int_compare:w \l_@@_balance_int = 0 \exp_stop_f:
    \else:
      \__kernel_msg_error:nnxxx { regex } { result-unbalanced }
        { replacing }
        { \int_max:nn { - \l_@@_balance_int } { 0 } }
        { \int_max:nn { \l_@@_balance_int } { 0 } }
    \fi:
    \use:x
      {
        \group_end:
        \tl_set:Nn \exp_not:N #1
          {
            \if_int_compare:w \l_@@_balance_int < 0 \exp_stop_f:
              \prg_replicate:nn { - \l_@@_balance_int }
                { { \if_false: } \fi: }
            \fi:
            \l_@@_internal_a_tl
            \if_int_compare:w \l_@@_balance_int > 0 \exp_stop_f:
              \prg_replicate:nn { \l_@@_balance_int }
                { \if_false: { \fi: } }
            \fi:
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsubsection{Peeking ahead}
%
% \begin{variable}{\l_@@_peek_true_tl, \l_@@_peek_false_tl}
%   True/false code arguments of \cs{peek_regex:nTF} or similar.  They
%   are set by \cs{@@_peek_aux:nnTF}, which puts \cs{group_end:} in
%   front of the user's code.
%    \begin{macrocode}
\tl_new:N \l_@@_peek_true_tl
\tl_new:N \l_@@_peek_false_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_replacement_tl}
%   When peeking in \cs{peek_regex_replace_once:nnTF} we need to store
%   the replacement text: it is saved by \cs{@@_peek_replace:nnTF} and
%   only analysed in \cs{@@_peek_replace_end:}, once the match has
%   succeeded.
%    \begin{macrocode}
\tl_new:N \l_@@_replacement_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_input_tl}
% \begin{macro}{\@@_input_item:n}
%   Stores each token found as \cs{@@_input_item:n} \Arg{tokens}, where
%   the \meta{tokens} \texttt{o}-expand to the token found, as for
%   \cs{tl_analysis_map_inline:nn}.  The function
%   \cs{@@_input_item:n} only receives a placeholder definition here:
%   it is set equal to \cs{@@_reinsert_item:n} or to
%   \cs{@@_query_set_item:n} before \cs{l_@@_input_tl} is used.
%    \begin{macrocode}
\tl_new:N \l_@@_input_tl
\cs_new_eq:NN \@@_input_item:n ?
%    \end{macrocode}
% \end{macro}
% \end{variable}
%
% \begin{macro}[TF]
%   {\peek_regex:n, \peek_regex:N, \peek_regex_remove_once:n, \peek_regex_remove_once:N}
%   The |T| and |F| functions just call the corresponding |TF| function,
%   supplying an empty branch for the missing one.
%   The four |TF| functions differ along two axes: whether to remove the
%   token or not, distinguished by using \cs{@@_peek_end:} or
%   \cs{@@_peek_remove_end:n} (the latter case needs an argument, as we
%   will see), and whether the regex has to be compiled or is already in
%   an |N|-type variable, distinguished by calling \cs{@@_build_aux:Nn}
%   or \cs{@@_build_aux:NN}.  The first argument of these functions is
%   \cs{c_false_bool} to indicate that there should be no implicit
%   insertion of a wildcard at the start of the pattern: otherwise the
%   code would keep looking further into the input stream until matching
%   the regex.  Each |TF| function only grabs the regex: the true and
%   false branches that follow in the input are grabbed by
%   \cs{@@_peek:nnTF}.  In all wrappers, |N|-type arguments are passed
%   on unbraced.
%    \begin{macrocode}
\cs_new_protected:Npn \peek_regex:nTF #1
  {
    \@@_peek:nnTF
      { \@@_build_aux:Nn \c_false_bool {#1} }
      { \@@_peek_end: }
  }
\cs_new_protected:Npn \peek_regex:nT #1#2
  { \peek_regex:nTF {#1} {#2} { } }
\cs_new_protected:Npn \peek_regex:nF #1 { \peek_regex:nTF {#1} { } }
\cs_new_protected:Npn \peek_regex:NTF #1
  {
    \@@_peek:nnTF
      { \@@_build_aux:NN \c_false_bool #1 }
      { \@@_peek_end: }
  }
\cs_new_protected:Npn \peek_regex:NT #1#2
  { \peek_regex:NTF #1 {#2} { } }
\cs_new_protected:Npn \peek_regex:NF #1 { \peek_regex:NTF #1 { } }
\cs_new_protected:Npn \peek_regex_remove_once:nTF #1
  {
    \@@_peek:nnTF
      { \@@_build_aux:Nn \c_false_bool {#1} }
      { \@@_peek_remove_end:n {##1} }
  }
\cs_new_protected:Npn \peek_regex_remove_once:nT #1#2
  { \peek_regex_remove_once:nTF {#1} {#2} { } }
\cs_new_protected:Npn \peek_regex_remove_once:nF #1
  { \peek_regex_remove_once:nTF {#1} { } }
\cs_new_protected:Npn \peek_regex_remove_once:NTF #1
  {
    \@@_peek:nnTF
      { \@@_build_aux:NN \c_false_bool #1 }
      { \@@_peek_remove_end:n {##1} }
  }
\cs_new_protected:Npn \peek_regex_remove_once:NT #1#2
  { \peek_regex_remove_once:NTF #1 {#2} { } }
\cs_new_protected:Npn \peek_regex_remove_once:NF #1
  { \peek_regex_remove_once:NTF #1 { } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek:nnTF, \@@_peek_aux:nnTF}
%   Store the user's true/false codes (plus \cs{group_end:}) into two
%   token lists.  Then build the automaton with |#1|, without submatch
%   tracking, and aiming for a single match.  Then start matching by
%   setting up a few variables as for any regex matching such as
%   \cs{regex_match:nnTF}, with the addition of \cs{l_@@_input_tl}
%   that keeps track of the tokens seen, to reinsert them at the
%   end.  Instead of \cs{tl_analysis_map_inline:nn} on the input, we
%   call \cs{peek_analysis_map_inline:n} to go through tokens in the
%   input stream.  Since \cs{@@_match_one_token:nnN} calls
%   \cs{@@_maplike_break:} we need to catch that and break the
%   \cs{peek_analysis_map_inline:n} loop instead: this is done by the
%   \cs{prg_break_point:Nn} placed at the end of the loop body, whose
%   code runs the ending code~|#2| through
%   \cs{peek_analysis_map_break:n}.  When no break occurs, the
%   preceding \cs{use_none:nnn} removes that break point together with
%   its two arguments.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek:nnTF #1
  {
    \@@_peek_aux:nnTF
      {
        \@@_disable_submatches:
        #1
      }
  }
\cs_new_protected:Npn \@@_peek_aux:nnTF #1#2#3#4
  {
    \group_begin:
      \tl_set:Nn \l_@@_peek_true_tl { \group_end: #3 }
      \tl_set:Nn \l_@@_peek_false_tl { \group_end: #4 }
      \@@_single_match:
      #1
      \@@_match_init:
      \tl_build_clear:N \l_@@_input_tl
      \@@_match_once_init:
      \peek_analysis_map_inline:n
        {
          \tl_build_put_right:Nn \l_@@_input_tl
            { \@@_input_item:n {##1} }
          \@@_match_one_token:nnN {##1} {##2} ##3
          \use_none:nnn
          \prg_break_point:Nn \@@_maplike_break:
            { \peek_analysis_map_break:n {#2} }
        }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_end:, \@@_peek_remove_end:n}
%   Once the regex matches (or permanently fails to match) we call
%   \cs{@@_peek_end:}, or \cs{@@_peek_remove_end:n} with argument the
%   last token seen.  For \cs{peek_regex:nTF} we reinsert tokens seen by
%   calling \cs{@@_peek_reinsert:N} regardless of the result of the
%   match, the only difference being which of the true and false codes
%   is inserted before them.  For \cs{peek_regex_remove_once:nTF} we
%   reinsert the tokens seen only if the match failed; otherwise we just
%   reinsert the tokens~|#1|, with one expansion.  To be more precise,
%   |#1| consists
%   of tokens that \texttt{o}-expand and \texttt{x}-expand to the last
%   token seen, for example it is \cs{exp_not:N} \meta{cs} for a control
%   sequence.  This means that just doing \cs{exp_after:wN}
%   \cs{l_@@_peek_true_tl} |#1| would be unsafe because the expansion of
%   \meta{cs} would be suppressed.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_end:
  {
    \bool_if:NTF \g_@@_success_bool
      { \@@_peek_reinsert:N \l_@@_peek_true_tl }
      { \@@_peek_reinsert:N \l_@@_peek_false_tl }
  }
\cs_new_protected:Npn \@@_peek_remove_end:n #1
  {
    \bool_if:NTF \g_@@_success_bool
      { \exp_args:NNo \use:nn \l_@@_peek_true_tl {#1} }
      { \@@_peek_reinsert:N \l_@@_peek_false_tl }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_reinsert:N, \@@_reinsert_item:n}
%   Insert the true/false code |#1|, followed by the tokens found, which
%   were stored in \cs{l_@@_input_tl}.  For this, first finish building
%   that token list and make \cs{@@_input_item:n} a synonym of
%   \cs{@@_reinsert_item:n}, then loop through the
%   token list using \cs{@@_reinsert_item:n}, which expands |#1| once to
%   get a single token, and jumps over it to expand what follows, with
%   suitable \cs{exp:w} and \cs{exp_end:}.  We cannot just use
%   \cs{use:e} on the whole token list because the result may be
%   unbalanced, which would stop the primitive prematurely, or let it
%   continue beyond where we would like.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_reinsert:N #1
  {
    \tl_build_end:N \l_@@_input_tl
    \cs_set_eq:NN \@@_input_item:n \@@_reinsert_item:n
    \exp_after:wN #1 \exp:w \l_@@_input_tl \exp_end:
  }
\cs_new_protected:Npn \@@_reinsert_item:n #1
  {
    \exp_after:wN \exp_after:wN
    \exp_after:wN \exp_end:
    \exp_after:wN \exp_after:wN
    #1
    \exp:w
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[noTF]
%   {\peek_regex_replace_once:nn, \peek_regex_replace_once:Nn}
%   Similar to \cs{peek_regex:nTF} above.  The |TF| functions only grab
%   the regex and pass the code building it to
%   \cs{@@_peek_replace:nnTF}, which grabs the replacement text and
%   leaves the two branches to \cs{@@_peek_aux:nnTF}.  The |T|, |F| and
%   suffix-less functions call the corresponding |TF| function with
%   empty code for the missing branches.
%    \begin{macrocode}
\cs_new_protected:Npn \peek_regex_replace_once:nnTF #1
  { \@@_peek_replace:nnTF { \@@_build_aux:Nn \c_false_bool {#1} } }
\cs_new_protected:Npn \peek_regex_replace_once:nnT #1#2#3
  { \peek_regex_replace_once:nnTF {#1} {#2} {#3} { } }
\cs_new_protected:Npn \peek_regex_replace_once:nnF #1#2
  { \peek_regex_replace_once:nnTF {#1} {#2} { } }
\cs_new_protected:Npn \peek_regex_replace_once:nn #1#2
  { \peek_regex_replace_once:nnTF {#1} {#2} { } { } }
\cs_new_protected:Npn \peek_regex_replace_once:NnTF #1
  { \@@_peek_replace:nnTF { \@@_build_aux:NN \c_false_bool #1 } }
\cs_new_protected:Npn \peek_regex_replace_once:NnT #1#2#3
  { \peek_regex_replace_once:NnTF #1 {#2} {#3} { } }
\cs_new_protected:Npn \peek_regex_replace_once:NnF #1#2
  { \peek_regex_replace_once:NnTF #1 {#2} { } }
\cs_new_protected:Npn \peek_regex_replace_once:Nn #1#2
  { \peek_regex_replace_once:NnTF #1 {#2} { } { } }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_replace:nnTF}
%   Same as \cs{@@_peek:nnTF} (used for \cs{peek_regex:nTF} above), but
%   without disabling submatches, and with a different end, namely
%   \cs{@@_peek_replace_end:}.  The
%   replacement text |#2| is stored in \cs{l_@@_replacement_tl}, to be
%   analyzed later.  The true and false branches are not grabbed here
%   but by \cs{@@_peek_aux:nnTF}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_replace:nnTF #1#2
  {
    \tl_set:Nn \l_@@_replacement_tl {#2}
    \@@_peek_aux:nnTF {#1} { \@@_peek_replace_end: }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_replace_end:}
%   If the match failed \cs{@@_peek_reinsert:N} reinserts the tokens
%   found, after the false code.  Otherwise, finish storing the submatch
%   information using
%   \cs{@@_extract:}, and store the input into \tn{toks}.  Redefine a
%   few auxiliaries to change slightly their expansion behaviour as
%   explained below.  Analyse the replacement text with
%   \cs{@@_replacement:n}, which as usual defines
%   \cs{@@_replacement_do_one_match:n} to insert the tokens from the
%   start of the match attempt to the beginning of the match, followed
%   by the replacement text.  The \cs{use:x} expands for instance the
%   trailing \cs{@@_query_range:nn} down to a sequence of
%   \cs{@@_reinsert_item:n} \Arg{tokens} where \meta{tokens}
%   \texttt{o}-expand to a single token that we want to insert.  After
%   \texttt{x}-expansion, \cs{use:x} does \cs{use:n}, so we have
%   \cs{exp_after:wN} \cs{l_@@_peek_true_tl} \cs{exp:w} \ldots{}
%   \cs{exp_end:}.  This is set up so as to obtain
%   \cs{l_@@_peek_true_tl} followed by the replaced tokens (possibly
%   unbalanced) in the input stream.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_replace_end:
  {
    \bool_if:NTF \g_@@_success_bool
      {
        \@@_extract:
        \@@_query_set_from_input_tl:
        \cs_set_eq:NN \@@_replacement_put:n \@@_peek_replacement_put:n
        \cs_set_eq:NN \@@_replacement_put_submatch_aux:n
          \@@_peek_replacement_put_submatch_aux:n
        \cs_set_eq:NN \@@_input_item:n \@@_reinsert_item:n
        \cs_set_eq:NN \@@_replacement_exp_not:N \@@_peek_replacement_token:n
        \cs_set_eq:NN \@@_replacement_exp_not:V \@@_peek_replacement_var:N
        \exp_args:No \@@_replacement:n { \l_@@_replacement_tl }
        \use:x
          {
            \exp_not:n { \exp_after:wN \l_@@_peek_true_tl \exp:w }
            \@@_replacement_do_one_match:n
              { \l_@@_zeroth_submatch_int }
            \@@_query_range:nn
              {
                \__kernel_intarray_item:Nn \g_@@_submatch_end_intarray
                  { \l_@@_zeroth_submatch_int }
              }
              { \l_@@_max_pos_int }
            \exp_end:
          }
      }
      { \@@_peek_reinsert:N \l_@@_peek_false_tl }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_query_set_from_input_tl:, \@@_query_set_item:n}
%   The input was stored into \cs{l_@@_input_tl} as successive items
%   \cs{@@_input_item:n} \Arg{tokens}.  Store that in successive
%   \tn{toks}: after finishing to build the token list,
%   \cs{@@_input_item:n} is made a synonym of \cs{@@_query_set_item:n}
%   and the token list is simply run.  Each \tn{toks} receives
%   \cs{@@_input_item:n} \Arg{tokens} rather than the bare
%   \meta{tokens}.  It's not clear whether the empty entries before and
%   after are both useful.  At the end, \cs{l_@@_max_pos_int} is set to
%   the position of the last entry stored.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_query_set_from_input_tl:
  {
    \tl_build_end:N \l_@@_input_tl
    \int_zero:N \l_@@_curr_pos_int
    \cs_set_eq:NN \@@_input_item:n \@@_query_set_item:n
    \@@_query_set_item:n { }
    \l_@@_input_tl
    \@@_query_set_item:n { }
    \int_set_eq:NN \l_@@_max_pos_int \l_@@_curr_pos_int
  }
\cs_new_protected:Npn \@@_query_set_item:n #1
  {
    \int_incr:N \l_@@_curr_pos_int
    \@@_toks_set:Nn \l_@@_curr_pos_int { \@@_input_item:n {#1} }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_replacement_put:n}
%   While building the replacement function
%   \cs{@@_replacement_do_one_match:n}, we often want to put simple
%   material, given as |#1|, whose \texttt{x}-expansion
%   \texttt{o}-expands to a single token.  Normally we can just add the
%   token to \cs{l_@@_build_tl}, but for
%   \cs{peek_regex_replace_once:nnTF} we eventually want to do some
%   strange expansion that is basically using \cs{exp_after:wN} to jump
%   through numerous tokens (we cannot use \texttt{x}-expansion like for
%   \cs{regex_replace_once:nnNTF} because it is ok for the result to be
%   unbalanced since we insert it in the input stream rather than
%   storing it).  Hence, outside csnames (when
%   \cs{l_@@_replacement_csnames_int} is zero), the material is wrapped
%   as an argument of \cs{@@_reinsert_item:n}.  When within a csname we
%   don't do any such shenanigan
%   because \cs{cs:w} \ldots{} \cs{cs_end:} does all the expansion we
%   need.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_replacement_put:n #1
  {
    \if_case:w \l_@@_replacement_csnames_int
      \tl_build_put_right:Nn \l_@@_build_tl
        { \exp_not:N \@@_reinsert_item:n {#1} }
    \else:
      \tl_build_put_right:Nn \l_@@_build_tl {#1}
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_replacement_token:n}
%   When hit with \cs{exp:w}, \cs{@@_peek_replacement_token:n}
%   \Arg{token} stops that expansion with \cs{exp_end:} and does
%   \cs{exp_after:wN} \meta{token} \cs{exp:w} to continue expansion
%   after the \meta{token}, which is itself left unexpanded.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_replacement_token:n #1
  { \exp_after:wN \exp_end: \exp_after:wN #1 \exp:w }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_replacement_put_submatch_aux:n}
%   While analyzing the replacement we also have to insert submatches
%   found in the query.  Since query items \cs{@@_input_item:n}
%   \Arg{tokens} expand correctly only when surrounded by \cs{exp:w}
%   \ldots{} \cs{exp_end:}, and since these expansion controls are not
%   there within csnames (because \cs{cs:w} \ldots{} \cs{cs_end:} make
%   them unnecessary in most cases), we have to put \cs{exp:w} and
%   \cs{exp_end:} by hand here.  Outside csnames (when
%   \cs{l_@@_replacement_csnames_int} is zero) the call to
%   \cs{@@_query_submatch:n} is stored as is.  In both cases the
%   submatch number |#1| is added to the argument |##1| of the
%   function being built.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_replacement_put_submatch_aux:n #1
  {
    \if_case:w \l_@@_replacement_csnames_int
      \tl_build_put_right:Nn \l_@@_build_tl
        { \@@_query_submatch:n { \int_eval:n { #1 + ##1 } } }
    \else:
      \tl_build_put_right:Nn \l_@@_build_tl
        { \exp:w \@@_query_submatch:n { \int_eval:n { #1 + ##1 } } \exp_end: }
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_peek_replacement_var:N}
%   This is used for |\u| outside csnames.  It makes sure to continue
%   expansion with \cs{exp:w} before expanding the variable~|#1| and
%   stopping the \cs{exp:w} that precedes.  The contents of~|#1| are
%   inserted unbraced after \cs{exp_end:} by
%   \cs{exp_last_unbraced:NV}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_peek_replacement_var:N #1
  {
    \exp_after:wN \exp_last_unbraced:NV
    \exp_after:wN \exp_end:
    \exp_after:wN #1
    \exp:w
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Messages}
%
% Messages for the preparsing phase: a trailing escape character, a
% missing closing brace after |\x{|, and a character code that is too
% large in |\x{...}|.  The three definitions are wrapped in
% \cs{use:x} so that \cs{iow_char:N} is expanded when the messages are
% defined; for the same reason the message arguments are written with
% doubled hash signs here.
%    \begin{macrocode}
\use:x
  {
    \__kernel_msg_new:nnn { regex } { trailing-backslash }
      { Trailing~escape~char~'\iow_char:N\\'~in~regex~or~replacement. }
    \__kernel_msg_new:nnn { regex } { x-missing-rbrace }
      {
        Missing~brace~'\iow_char:N\}'~in~regex~
        '...\iow_char:N\\x\iow_char:N\{...##1'.
      }
    \__kernel_msg_new:nnn { regex } { x-overflow }
      {
        Character~code~##1~too~large~in~
        \iow_char:N\\x\iow_char:N\{##2\iow_char:N\}~regex.
      }
  }
%    \end{macrocode}
%
% Invalid quantifier.  The message receives the braced quantifier read
% so far as |#1| and the offending character as |#2|.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { invalid-quantifier }
  { Braced~quantifier~'#1'~may~not~be~followed~by~'#2'. }
  {
    The~character~'#2'~is~invalid~in~the~braced~quantifier~'#1'.~
    The~only~valid~quantifiers~are~'*',~'?',~'+',~'{<int>}',~
    '{<min>,}'~and~'{<min>,<max>}',~optionally~followed~by~'?'.
  }
%    \end{macrocode}
%
% Messages for missing or extra closing brackets and parentheses, with
% some fancy singular/plural handling for the case of parentheses: the
% \texttt{missing-rparen} message receives as |#1| the number of
% missing right parentheses.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { missing-rbrack }
  { Missing~right~bracket~inserted~in~regular~expression. }
  {
    LaTeX~was~given~a~regular~expression~where~a~character~class~
    was~started~with~'[',~but~the~matching~']'~is~missing.
  }
\__kernel_msg_new:nnnn { regex } { missing-rparen }
  {
    Missing~right~
    \int_compare:nTF { #1 = 1 } { parenthesis } { parentheses } ~
    inserted~in~regular~expression.
  }
  {
    LaTeX~was~given~a~regular~expression~with~\int_eval:n {#1} ~
    more~left~parentheses~than~right~parentheses.
  }
\__kernel_msg_new:nnnn { regex } { extra-rparen }
  { Extra~right~parenthesis~ignored~in~regular~expression. }
  {
    LaTeX~came~across~a~closing~parenthesis~when~no~submatch~group~
    was~open.~The~parenthesis~will~be~ignored.
  }
%    \end{macrocode}
%
% Some escaped alphanumerics are not allowed everywhere.  The wording
% of the message depends on the context, tested with
% \cs{@@_if_in_cs:TF} and \cs{@@_if_in_class:TF}: within a control
% sequence test, in a character class, or following a category test.
% The escaped character is given as |#1|.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { bad-escape }
  {
    Invalid~escape~'\iow_char:N\\#1'~
    \@@_if_in_cs:TF { within~a~control~sequence. }
      {
        \@@_if_in_class:TF
          { in~a~character~class. }
          { following~a~category~test. }
      }
  }
  {
    The~escape~sequence~'\iow_char:N\\#1'~may~not~appear~
    \@@_if_in_cs:TF
      {
        within~a~control~sequence~test~introduced~by~
        '\iow_char:N\\c\iow_char:N\{'.
      }
      {
        \@@_if_in_class:TF
          { within~a~character~class~ }
          { following~a~category~test~such~as~'\iow_char:N\\cL'~ }
        because~it~does~not~match~exactly~one~character.
      }
  }
%    \end{macrocode}
%
% Range errors: an invalid end-point, or end-points given in the wrong
% order.  In both messages |#1| and |#2| are the two end-points of the
% range; the second message also displays their character codes.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { range-missing-end }
  { Invalid~end-point~for~range~'#1-#2'~in~character~class. }
  {
    The~end-point~'#2'~of~the~range~'#1-#2'~may~not~serve~as~an~
    end-point~for~a~range:~alphanumeric~characters~should~not~be~
    escaped,~and~non-alphanumeric~characters~should~be~escaped.
  }
\__kernel_msg_new:nnnn { regex } { range-backwards }
  { Range~'[#1-#2]'~out~of~order~in~character~class. }
  {
    In~ranges~of~characters~'[x-y]'~appearing~in~character~classes,~
    the~first~character~code~must~not~be~larger~than~the~second.~
    Here,~'#1'~has~character~code~\int_eval:n {`#1},~while~
    '#2'~has~character~code~\int_eval:n {`#2}.
  }
%    \end{macrocode}
%
% Errors triggered by misuse of the |\c| and |\u| escape sequences.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { c-bad-mode }
  {
    Invalid~nested~'\iow_char:N\\c'~escape~
    in~regular~expression.
  }
  {
    The~'\iow_char:N\\c'~escape~cannot~be~used~
    within~a~control~sequence~test~'\iow_char:N\\c{...}'~
    nor~another~category~test.~
    To~combine~several~category~tests,~
    use~'\iow_char:N\\c[...]'.
  }
\__kernel_msg_new:nnnn { regex } { c-C-invalid }
  {
    '\iow_char:N\\cC'~should~be~followed~by~
    '.'~or~'(',~not~'#1'.
  }
  {
    The~'\iow_char:N\\cC'~construction~restricts~
    the~next~item~to~be~a~control~sequence~
    or~the~next~group~to~be~made~of~control~sequences.~
    It~only~makes~sense~to~follow~it~by~'.'~or~by~a~group.
  }
%    \end{macrocode}
%
% A category test applied to a group, such as |\cL(abc)|, is rejected
% inside a character class.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { c-lparen-in-class }
  { Catcode~test~cannot~apply~to~group~in~character~class. }
  {
    Constructions~such~as~'\iow_char:N\\cL(abc)'~are~not~allowed~inside~a~
    class~'[...]'~because~classes~do~not~match~multiple~characters~at~once.
  }
%    \end{macrocode}
%
% Missing closing delimiters after |\c|, an invalid category, a
% trailing category escape, and a missing left brace after |\u|.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { c-missing-rbrace }
  {
    Missing~right~brace~inserted~
    for~'\iow_char:N\\c'~escape.
  }
  {
    LaTeX~was~given~a~regular~expression~
    where~a~'\iow_char:N\\c\iow_char:N\{...'~construction~
    was~not~ended~with~a~closing~brace~'\iow_char:N\}'.
  }
\__kernel_msg_new:nnnn { regex } { c-missing-rbrack }
  {
    Missing~right~bracket~inserted~
    for~'\iow_char:N\\c'~escape.
  }
  {
    A~construction~'\iow_char:N\\c[...'~
    appears~in~a~regular~expression,~
    but~the~closing~']'~is~not~present.
  }
\__kernel_msg_new:nnnn { regex } { c-missing-category }
  {
    Invalid~character~'#1'~
    following~'\iow_char:N\\c'~escape.
  }
  {
    In~regular~expressions,~
    the~'\iow_char:N\\c'~escape~sequence~may~only~be~followed~
    by~a~left~brace,~a~left~bracket,~
    or~a~capital~letter~representing~a~character~category,~
    namely~one~of~'ABCDELMOPSTU'.
  }
\__kernel_msg_new:nnnn { regex } { c-trailing }
  { Trailing~category~code~escape~'\iow_char:N\\c'... }
  {
    A~regular~expression~ends~with~'\iow_char:N\\c'~
    followed~by~a~letter.~
    It~will~be~ignored.
  }
\__kernel_msg_new:nnnn { regex } { u-missing-lbrace }
  {
    Missing~left~brace~
    following~'\iow_char:N\\u'~escape.
  }
  {
    The~'\iow_char:N\\u'~escape~sequence~must~be~followed~
    by~a~brace~group~with~the~name~of~the~variable~to~use.
  }
%    \end{macrocode}
%
% The argument of |\u| was not closed.  When |#2| is empty the message
% reports that the end of the string was reached, otherwise it reports
% the escaped character |\#2| that was found.  Spaces in the message
% text must be written as |~| since ordinary spaces are ignored here.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { u-missing-rbrace }
  { Missing~right~brace~inserted~for~'\iow_char:N\\u'~escape. }
  {
    LaTeX~
    \str_if_eq:eeTF { } {#2}
      { reached~the~end~of~the~string~ }
      { encountered~an~escaped~alphanumeric~character~'\iow_char:N\\#2'~ }
    when~parsing~the~argument~of~an~
    '\iow_char:N\\u\iow_char:N\{...\}'~escape.
  }
%    \end{macrocode}
%
% Errors raised for \textsc{posix}-style bracket expressions: the
% unsupported collating syntaxes, unknown class names in |[:...:]|, and
% a class name which is not followed by |:]|.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { posix-unsupported }
  {
    POSIX~collating~element~'[#1 ~ #1]'~
    not~supported.
  }
  {
    The~'[.foo.]'~and~'[=bar=]'~syntaxes~
    have~a~special~meaning~in~POSIX~regular~expressions.~
    This~is~not~supported~by~LaTeX.~
    Maybe~you~forgot~to~escape~a~left~bracket~in~a~character~class?
  }
\__kernel_msg_new:nnnn { regex } { posix-unknown }
  { POSIX~class~'[:#1:]'~unknown. }
  {
    '[:#1:]'~is~not~among~the~known~POSIX~classes~
    '[:alnum:]',~'[:alpha:]',~'[:ascii:]',~
    '[:blank:]',~'[:cntrl:]',~'[:digit:]',~
    '[:graph:]',~'[:lower:]',~'[:print:]',~
    '[:punct:]',~'[:space:]',~'[:upper:]',~
    '[:word:]',~and~'[:xdigit:]'.
  }
\__kernel_msg_new:nnnn { regex } { posix-missing-close }
  { Missing~closing~':]'~for~POSIX~class. }
  {
    The~POSIX~syntax~'#1'~must~be~followed~by~':]',~
    not~'#2'.
  }
%    \end{macrocode}
%
% The token list resulting from a \pkg{l3regex} operation can be
% unbalanced, in which case it is re-balanced by adding begin-group or
% end-group character tokens.  This message reports the operation
% (|#1|) and the number of braces inserted on the left (|#2|) and on
% the right (|#3|).
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { result-unbalanced }
  { Missing~brace~inserted~when~#1. }
  {
    LaTeX~was~asked~to~do~some~regular~expression~operation,~
    and~the~resulting~token~list~would~not~have~
    the~same~number~of~begin-group~and~end-group~tokens.~
    Braces~were~inserted:~#2~left,~#3~right.
  }
%    \end{macrocode}
%
% Error messages for unknown options and for unknown special groups
% starting with a left parenthesis and a question mark.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { unknown-option }
  {
    Unknown~option~'#1'~
    for~regular~expressions.
  }
  {
    The~only~available~option~is~'case-insensitive',~
    toggled~by~'(?i)'~and~'(?-i)'.
  }
\__kernel_msg_new:nnnn { regex } { special-group-unknown }
  {
    Unknown~special~group~'#1~...'~
    in~a~regular~expression.
  }
  {
    The~only~valid~constructions~starting~with~'(?'~are~
    '(?:~...~)',~'(?|~...~)',~
    '(?i)',~and~'(?-i)'.
  }
%    \end{macrocode}
%
% Errors detected while parsing the replacement text, starting with
% misuse of the |\c| and |\u| escapes.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { replacement-c }
  {
    Misused~'\iow_char:N\\c'~command~
    in~a~replacement~text.
  }
  {
    In~a~replacement~text,~
    the~'\iow_char:N\\c'~escape~sequence~can~be~followed~
    by~one~of~the~letters~'ABCDELMOPSTU'~or~a~brace~group,~
    not~by~'#1'.
  }
\__kernel_msg_new:nnnn { regex } { replacement-u }
  {
    Misused~'\iow_char:N\\u'~command~
    in~a~replacement~text.
  }
  {
    In~a~replacement~text,~
    the~'\iow_char:N\\u'~escape~sequence~must~be~followed~
    by~a~brace~group~holding~the~name~of~the~variable~to~use.
  }
%    \end{macrocode}
%
% Missing brace after |\g| in a replacement text.  Within message text
% a bare |\\| forces a new line, hence every literal backslash shown
% to the user is produced with \cs{iow_char:N}.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { replacement-g }
  {
    Missing~brace~for~the~'\iow_char:N\\g'~construction~
    in~a~replacement~text.
  }
  {
    In~the~replacement~text~for~a~regular~expression~search,~
    submatches~are~represented~either~as~'\iow_char:N \\g{dd..d}',~
    or~'\iow_char:N \\d',~where~'d'~are~single~digits.~
    Here,~a~brace~is~missing.
  }
%    \end{macrocode}
%
% Further errors in the replacement text: problems with category codes
% given through |\c|, and missing closing braces or parentheses (for
% the latter two, |#1| is the number of missing delimiters).
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { replacement-catcode-end }
  {
    Missing~character~for~the~
    '\iow_char:N\\c<category><character>'~construction~
    in~a~replacement~text.
  }
  {
    In~a~replacement~text,~
    the~'\iow_char:N\\c'~escape~sequence~can~be~followed~
    by~one~of~the~letters~'ABCDELMOPSTU'~
    representing~the~character~category.~
    Then,~a~character~must~follow.~
    LaTeX~reached~the~end~of~the~replacement~when~looking~for~that.
  }
\__kernel_msg_new:nnnn { regex } { replacement-catcode-escaped }
  { Escaped~letter~or~digit~after~category~code~in~replacement~text. }
  {
    In~a~replacement~text,~
    the~'\iow_char:N\\c'~escape~sequence~can~be~followed~
    by~one~of~the~letters~'ABCDELMOPSTU'~
    representing~the~character~category.~
    Then,~a~character~must~follow,~not~'\iow_char:N\\#2'.
  }
\__kernel_msg_new:nnnn { regex } { replacement-catcode-in-cs }
  {
    Category~code~'\iow_char:N\\c#1#3'~ignored~
    inside~'\iow_char:N\\c\{...\}'~
    in~a~replacement~text.
  }
  {
    In~a~replacement~text,~
    the~category~codes~of~the~argument~of~'\iow_char:N\\c\{...\}'~
    are~ignored~when~building~the~control~sequence~name.
  }
\__kernel_msg_new:nnnn { regex } { replacement-null-space }
  { TeX~cannot~build~a~space~token~with~character~code~0. }
  {
    You~asked~for~a~character~token~with~category~space,~
    and~character~code~0,~
    for~instance~through~'\iow_char:N\\cS\iow_char:N\\x00'.~
    This~specific~case~is~impossible~
    and~will~be~replaced~by~a~normal~space.
  }
\__kernel_msg_new:nnnn { regex } { replacement-missing-rbrace }
  { Missing~right~brace~inserted~in~replacement~text. }
  {
    There~
    \int_compare:nTF { #1 = 1 } { was } { were } ~
    #1~missing~right~
    \int_compare:nTF { #1 = 1 } { brace } { braces } .
  }
\__kernel_msg_new:nnnn { regex } { replacement-missing-rparen }
  { Missing~right~parenthesis~inserted~in~replacement~text. }
  {
    There~
    \int_compare:nTF { #1 = 1 } { was } { were } ~
    #1~missing~right~
    \int_compare:nTF { #1 = 1 } { parenthesis } { parentheses } .
  }
%    \end{macrocode}
%
% Error for a quantifier whose two values, shown as |{#1,#2}| in the
% message, are not given in order.
%    \begin{macrocode}
\__kernel_msg_new:nnnn { regex } { backwards-quantifier }
  { Quantifier~"{#1,#2}"~is~backwards. }
  { The~values~given~in~a~quantifier~must~be~in~order. }
%    \end{macrocode}
%
% Used when showing a regex.  If |#1| is empty the heading names the
% variable |#2|, otherwise it displays |#1| in braces; the description
% |#3| follows the colon.
%    \begin{macrocode}
\__kernel_msg_new:nnn { regex } { show }
  {
    >~Compiled~regex~
    \tl_if_empty:nTF {#1}
      { variable~ #2 }
      { {#1} }
    :
    #3
  }
%    \end{macrocode}
%
% \begin{macro}{\@@_msg_repeated:nnN}
%   Although this is not technically a message, it is related enough
%   to live here.  It describes a quantifier in words: |#1| is the
%   minimum number of repetitions, |#2| is the number of allowed extra
%   repetitions ($-1$ for an infinite number), and |#3| is a boolean
%   telling us about laziness.  Nothing is produced when |#1| and |#2|
%   are $1$ and $0$, namely for exactly one repetition.
%    \begin{macrocode}
\cs_new:Npn \@@_msg_repeated:nnN #1#2#3
  {
    \str_if_eq:eeF { #1 #2 } { 1 0 }
      {
        , ~ repeated ~
        \int_compare:nNnTF {#2} = { -1 }
          { #1~or~more~times,~\bool_if:NTF #3 { lazy } { greedy } }
          {
            \int_compare:nNnTF {#2} = { 0 }
              { #1~times }
              {
                between~#1~and~\int_eval:n {#1+#2}~times,~
                \bool_if:NTF #3 { lazy } { greedy }
              }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Code for tracing}
%
% There is a more extensive implementation of tracing in the l3trial
% package \pkg{l3trace}.  Function names are a bit different but could
% be merged.
%
% \begin{macro}
%   {\@@_trace_push:nnN, \@@_trace_pop:nnN, \@@_trace:nnx}
%   In these three functions |#1| is the module name (\texttt{regex})
%   and |#2| is a level, typically~1.  Nothing is shown if the module's
%   current tracing level is less than |#2|.  Otherwise
%   \cs{@@_trace:nnx} writes |#3| to the terminal, prefixed by
%   \enquote{Trace:}, while the \texttt{push} and \texttt{pop} variants
%   report entering or leaving the function |#3|.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_trace:nnx #1#2#3
  {
    \int_compare:nNnF
      {#2} > { \int_use:c { g_@@_trace_#1_int } }
      { \iow_term:x { Trace:~#3 } }
  }
\cs_new_protected:Npn \@@_trace_push:nnN #1#2#3
  {
    \@@_trace:nnx {#1} {#2}
      { entering~ \token_to_str:N #3 }
  }
\cs_new_protected:Npn \@@_trace_pop:nnN #1#2#3
  {
    \@@_trace:nnx {#1} {#2}
      { leaving~ \token_to_str:N #3 }
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{variable}{\g_@@_trace_regex_int}
%   Tracing level of the \texttt{regex} module: \cs{@@_trace:nnx}
%   compares it with the level requested by each tracing call and only
%   writes to the terminal when it is at least that level.
%   No tracing (for calls requesting a positive level) when that is zero.
%    \begin{macrocode}
\int_new:N \g_@@_trace_regex_int
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}{\@@_trace_states:n}
%   This function lists the contents of all states of the \textsc{nfa},
%   stored in \tn{toks} from \cs{l_@@_min_state_int} to
%   \cs{l_@@_max_state_int} (excluded).  The argument |#1| is the
%   tracing level passed on to \cs{@@_trace:nnx}.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_trace_states:n #1
  {
    \int_step_inline:nnn
      { \l_@@_min_state_int }
      { \l_@@_max_state_int - 1 }
      {
        \@@_trace:nnx { regex } {#1}
          {
            \iow_char:N \\toks ##1 =
            { \@@_toks_use:w ##1 }
          }
      }
  }
%    \end{macrocode}
% \end{macro}
%
%    \begin{macrocode}
%</package>
%    \end{macrocode}
%
% \end{implementation}
%
% \PrintIndex
% \endinput
%^^A NOT IMPLEMENTED
%^^A    \p{xx}     a character with the xx property
%^^A    \P{xx}     a character without the xx property
%^^A    (?=...)    positive look ahead
%^^A    (?!...)    negative look ahead
%^^A    (?<=...)   positive look behind
%^^A    (?<!...)   negative look behind
%^^A    (?<name>...) or (?'name'...) or (?P<name>...)
%^^A               named capturing group
%^^A    \R         a newline sequence
%^^A    \X         an extended Unicode sequence
%^^A    (?C) or (?Cn)   callout with data n
%^^A    (?R)            recurse whole pattern
%^^A    (?[+-]n) or \g<[+-]n> or (?&name) or (?P>name) or \g<name>
%^^A                    call subpattern
%^^A    (?([+-]n)... or (?(<name>)...
%^^A                    reference condition
%^^A    (?(R)... or (?(Rn)... or (?(R&name)...
%^^A                    recursion condition
%^^A    (?(DEFINE)...   define subpattern for reference
%^^A    (?(assert)...   assertion condition
%^^A    (*ACCEPT)       force successful match
%^^A    (*FAIL)         force backtrack; synonym (*F)
%^^A    (*COMMIT)       overall failure, no advance of starting point
%^^A    (*PRUNE)        advance to next starting character
%^^A    (*SKIP)         advance start to current matching position
%^^A    (*THEN)         local failure, backtrack to next alternation
%^^A    (*CR) or (*LF) or (*CRLF) or (*ANYCRLF) or (*ANY)
%^^A                    newline convention
%^^A    (*BSR_ANYCRLF) or (*BSR_UNICODE)
%^^A                    change what \R matches.
%^^A
%^^A    \cx             "control-x", where x is any ASCII character
%^^A    \C              one byte, even in UTF-8 mode (best avoided)
%^^A    +               possessive quantifiers
%^^A    (?>...)         atomic, non-capturing group
%^^A    (?#....)        comment (not nestable)
%^^A    (?JmsUx)        options (duplicate names; multiline; single line;
%^^A                      ungreedy; extended)
%^^A    (*NO_START_OPT) no start-match optimization (PCRE_NO_START_OPTIMIZE)
%^^A    (*UTF8)         set UTF-8 mode (PCRE_UTF8)
%^^A    (*UCP)          set PCRE_UCP (use Unicode properties for \d etc)
%^^A    \n or \gn or \g{[-]n} or \g{name} or (?P=name)
%^^A    or \k<name> or \k'name' or \k{name}
%^^A                    back-references