Skip to content

Commit

Permalink
Avoid overeager tokenization by \peek_analysis_map_inline:n (see #1434)
Browse files Browse the repository at this point in the history
Now the peeking loop only ever looks at the next token and not the
one after.  This gives wrong results in rare cases (implicit catcode
1,2,10 char whose csname starts with another csname with the same
meaning) instead of only doing so in extra-rare cases (same with
extra conditions on the next-next token).  We prioritize having the
simplest effect on tokenization as some uses require that.
  • Loading branch information
blefloch committed Feb 6, 2024
1 parent 778f988 commit 8e6610c
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 64 deletions.
2 changes: 2 additions & 0 deletions l3kernel/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ this project uses date-based 'snapshot' version identifiers.
### Fixed
- Inconsistent local/global assignments in `\vcoffin_gset:Nnn` and
`\vcoffin_gset:Nnw`
- Tokenization by `\peek_analysis_map_inline:n` of one additional
character after any space or brace

## [2024-01-22]

Expand Down
96 changes: 36 additions & 60 deletions l3kernel/l3tl-analysis.dtx
Original file line number Diff line number Diff line change
Expand Up @@ -196,18 +196,15 @@
% \end{variable}
%
% \begin{variable}
% {\l_@@_analysis_token, \l_@@_analysis_char_token, \l_@@_analysis_next_token}
% {\l_@@_analysis_token, \l_@@_analysis_char_token}
% The tokens in the token list are probed with the \TeX{} primitive
% \tn{futurelet}. We use \cs{l_@@_analysis_token} in that
% construction. In some cases, we convert the following token to a
% string before probing it: then the token variable used is
% \cs{l_@@_analysis_char_token}. When getting tokens from the input
% stream we may need to look two tokens ahead, for which we use
% \cs{l_@@_analysis_next_token}.
% \cs{l_@@_analysis_char_token}.
% \begin{macrocode}
\cs_new_eq:NN \l_@@_analysis_token ?
\cs_new_eq:NN \l_@@_analysis_char_token ?
\cs_new_eq:NN \l_@@_analysis_next_token ?
% \end{macrocode}
% \end{variable}
%
Expand Down Expand Up @@ -1223,13 +1220,12 @@
% \@@_peek_analysis_nonexp:N, \@@_peek_analysis_cs:N,
% \@@_peek_analysis_char:N, \@@_peek_analysis_char:w,
% \@@_peek_analysis_special:, \@@_peek_analysis_retest:,
% \@@_peek_analysis_next:, \@@_peek_analysis_nextii:,
% \@@_peek_analysis_str:,
% \@@_peek_analysis_str:w, \@@_peek_analysis_str:n,
% \@@_peek_analysis_active_str:n, \@@_peek_analysis_explicit:n,
% \@@_peek_analysis_escape:, \@@_peek_analysis_collect:w,
% \@@_peek_analysis_collect:n, \@@_peek_analysis_collect_loop:,
% \@@_peek_analysis_collect_test:, \@@_peek_analysis_collect_end:NNN
% \@@_peek_analysis_collect_test:, \@@_peek_analysis_collect_end:NNNN
% }
% Save the user's code in a control sequence that is suitable for
% nested maps. We may wish to pass to this function an \tn{outer}
Expand Down Expand Up @@ -1463,7 +1459,7 @@
\if_meaning:w \l_@@_analysis_token \scan_stop:
\exp_after:wN \@@_peek_analysis_normal:N
\else:
\exp_after:wN \@@_peek_analysis_next:
\exp_after:wN \@@_peek_analysis_str:
\fi:
}
% \end{macrocode}
Expand All @@ -1472,41 +1468,22 @@
% begin-group or end-group token (catcode $1$ or~$2$), and we excluded
% a few cases that would be difficult later (empty control sequence,
% active character with the same character code as its meaning or as
% the escape character). Now look at the \meta{next token} following
% it using a combination of \tn{afterassignment} and \tn{futurelet}.
% (In fact look twice to reset an internal \TeX{} flag in case the
% \meta{next token} had been hit with \cs{exp_not:N}.)
% The syntax of this primitive is \tn{futurelet} \meta{peek token}
% \meta{first token} \meta{next token}, and it sets \meta{peek token}
% equal to \meta{next token}. Traditionally, one takes \meta{first
% token} to be some macro that regains control of the code and, e.g.,
% analyses \meta{peek token}. Here, both \meta{first token} and
% \meta{next token} are mostly unknown tokens in the input stream (but
% we know the \meta{first token} has catcode $1$, $2$ or $10$), where
% \meta{first token} was already stored as \cs{l_peek_token}, and we
% regain control using \tn{afterassignment}, which inserts its
% argument after the assignment, hence after \meta{peek token} but
% before \meta{first token}.
% \begin{macrocode}
\cs_new_protected:Npn \@@_peek_analysis_next:
{
\tl_if_empty:oT { \tex_the:D \tex_everyeof:D }
{ \tex_everyeof:D { \scan_stop: } }
\tex_afterassignment:D \@@_peek_analysis_nextii:
\tex_futurelet:D \l_@@_analysis_next_token
}
\cs_new_protected:Npn \@@_peek_analysis_nextii:
{
\tex_afterassignment:D \@@_peek_analysis_str:
\tex_futurelet:D \l_@@_analysis_next_token
}
% \end{macrocode}
% We then hit the \meta{first token} with \cs{token_to_str:N} and grab
% characters until finding \cs{l_@@_analysis_next_token}. More
% the escape character). The idea is to apply \cs{token_to_str:N} to
% the \meta{token} then grab characters (of category code~$12$ except
% for spaces that have category code~$10$) to reconstruct it. In
% earlier versions of the code we would peek at the \meta{next token}
% that lies after \meta{token} in the input stream, which would help
% us reconstruct the \meta{token} more accurately in edge
% cases (mentioned below), but this had the side effect of tokenizing
% the input stream (turning characters into tokens) farther ahead than
% needed.
%
% We hit the \meta{token} with \cs{token_to_str:N} and start grabbing
% characters. More
% precisely, by looking at the first character in the string
% representation of the \meta{first token} we distinguish three cases:
% representation of the \meta{token} we distinguish three cases:
% a stringified control sequence starts with the escape character; for
% an explicit character we find that same character; for an explicit
% an explicit character we find that same character; for an active
% character we find anything else (we made sure to exclude the case of
% an active character whose string representation coincides with the
% other two cases).
Expand Down Expand Up @@ -1594,14 +1571,11 @@
% know that until we had run all the various tests including
% stringifying the token. We are thus left with the hard work of
% picking up one by one the characters in the csname (being careful
% about spaces), until finding a token that matches the \meta{next
% token} picked up earlier (which was not stringified), such that the
% control sequence that we found so far indeed has the expected
% meaning \cs{l_peek_token}. This comparison with \cs{l_peek_token}
% catches a reasonably common case like \cs{c_group_begin_token} |_|
% in which the trailing |_| has category code other: without
% comparison of the constructed csname with \cs{l_peek_token}
% collection would stop at \cs[no-index]{c}, which is wrong.
% about spaces), until the constructed csname has the expected
% meaning. This fails if someone defines a token like
% \cs[no-index]{bgroup@my} whose string representation starts with
% that of another token having the same meaning, when that meaning is
% an implicit character token of category code $1$, $2$, or $10$:
% collection then stops too early, at the shorter csname.
% \begin{macrocode}
\cs_new_protected:Npn \@@_peek_analysis_escape:
{
Expand All @@ -1618,25 +1592,27 @@
}
\cs_new_protected:Npn \@@_peek_analysis_collect_loop:
{
\tex_futurelet:D \l_@@_analysis_token
\@@_peek_analysis_collect_test:
}
\cs_new_protected:Npn \@@_peek_analysis_collect_test:
{
\if_meaning:w \l_@@_analysis_token \l_@@_analysis_next_token
\exp_after:wN \if_meaning:w \cs:w \l_@@_internal_a_tl \cs_end: \l_peek_token
\@@_peek_analysis_collect_end:NNN
\exp_after:wN \if_meaning:w
\cs:w
\if_cs_exist:w \l_@@_internal_a_tl \cs_end:
\l_@@_internal_a_tl
\else:
c_one % anything short
\fi:
\cs_end:
\l_peek_token
\@@_peek_analysis_collect_end:NNNN
\fi:
\@@_peek_analysis_collect:w
\tex_futurelet:D \l_@@_analysis_token
\@@_peek_analysis_collect:w
}
% \end{macrocode}
% End by calling the user code with suitable arguments (here |#1| is
% \cs{fi:}, and |#2|, |#3|, |#4| are the tokens that would otherwise
% continue the collection loop), which closes the group begun early on.
% \begin{macrocode}
\cs_new_protected:Npn \@@_peek_analysis_collect_end:NNN #1#2#3
\cs_new_protected:Npn \@@_peek_analysis_collect_end:NNNN #1#2#3#4
{
#1 #2
#1
\tl_put_right:Ne \l_@@_peek_code_tl
{
{ \exp_not:N \exp_not:n { \exp_not:c { \l_@@_internal_a_tl } } }
Expand Down
8 changes: 7 additions & 1 deletion l3kernel/l3token.dtx
Original file line number Diff line number Diff line change
Expand Up @@ -962,7 +962,7 @@
% (as appropriate to the result of the test).
% \end{function}
%
% \begin{function}[added = 2020-12-03, updated = 2022-10-03]
% \begin{function}[added = 2020-12-03, updated = 2024-02-07]
% {\peek_analysis_map_inline:n}
% \begin{syntax}
% \cs{peek_analysis_map_inline:n} \Arg{inline function}
Expand Down Expand Up @@ -1003,6 +1003,12 @@
% effect after the loop. Within the code, \cs{l_peek_token} is set
% equal (as a token, not a token list) to the token under
% consideration.
% \begin{texnote}
% In case the input stream has not yet been tokenized (converted
% from characters to tokens), characters are tokenized one by one as
% needed by \cs{peek_analysis_map_inline:n} using the current
% category code regime.
% \end{texnote}
% \end{function}
%
% \begin{function}[added = 2020-12-03]
Expand Down
20 changes: 18 additions & 2 deletions l3kernel/testfiles/m3peek003.lvt
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,30 @@
}
\exp_after:wN \test: \exp_not:N \prg_do_nothing: \stop

% Avoid tokenizing ahead after a space
\cs_set_protected:Npn \test:
{
\group_begin:
\char_set_catcode_other:N \\
\peek_analysis_map_inline:n
{
\TYPE { \tl_to_str:n {##1} , ##2 , ##3 }
\if_catcode:w + ##1
\exp_after:wN \use_none:n
\else:
\exp_after:wN \use:n
\fi:
{ \peek_analysis_map_break:n { \group_end: \test_end:w } }
}
}
\cs_set_protected:Npn \test_end:w #1 . { \tl_show:n {#1} }
\test: * ~ \scan_stop: .

% Original report from E.G.
\cs_new_eq:NN \prro \peek_regex_replace_once:nn
\ExplSyntaxOff
\prro{abc}{\c{TRUE}}abc



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\END
7 changes: 6 additions & 1 deletion l3kernel/testfiles/m3peek003.tlg
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ macro parameter character #,-1,0
\outer macro:->,-1,0
TRUE
\exp_not:n {\c_group_begin_token },-1,0
\__kernel_exp_not:w \exp_after:wN {\exp_not:N \prg_do_nothing: },-1,0
\exp_not:n {\prg_do_nothing: },-1,0
*,42,C
,32,A
> \scan_stop: .
<recently read> }
l. ...\test: * ~ \scan_stop: .
Defining \prro on line ...
TRUE

0 comments on commit 8e6610c

Please sign in to comment.