diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md index 0555893325..37ca376675 100644 --- a/l3kernel/CHANGELOG.md +++ b/l3kernel/CHANGELOG.md @@ -13,6 +13,7 @@ this project uses date-based 'snapshot' version identifiers. - Color export in comma-separated format ### Changed +- Use prevailing catcodes instead of string in regex replacement (issue #621) - `\__kernel_file_name_sanitize:n` now uses a faster `\csname`-based approach to expand the file name. - `\pdf_version_gset:n` for `dvips`. diff --git a/l3kernel/l3regex.dtx b/l3kernel/l3regex.dtx index 5569afdd86..b25a659a8f 100644 --- a/l3kernel/l3regex.dtx +++ b/l3kernel/l3regex.dtx @@ -426,10 +426,17 @@ % the last match is used in the replacement text. Submatches always keep % the same category codes as in the original token list. % -% The characters inserted by the replacement have category code $12$ -% (other) by default, with the exception of space characters. Spaces -% inserted through \verb*|\ | have category code $10$, while spaces -% inserted through |\x20| or |\x{20}| have category code $12$. +% By default, the category codes of characters inserted by the +% replacement are determined by the prevailing category code regime at +% the time when the replacement is made, with two exceptions: +% \begin{itemize} +% \item space characters (with character code $32$) inserted with +% \verb*|\ | or |\x20| or |\x{20}| have category code~$10$ regardless +% of the prevailing category code regime; +% \item if the category code would be $0$~(escape), $5$~(newline), +% $9$~(ignore), $14$~(comment) or $15$~(invalid), it is replaced by +% $12$~(other) instead. +% \end{itemize} % The escape sequence |\c| allows to insert characters % with arbitrary category codes, as well as control sequences. 
% \begin{l3regex-syntax} @@ -5304,7 +5311,7 @@ % \end{macrocode} % \end{macro} % -% \begin{macro}{\@@_replacement_normal:n} +% \begin{macro}{\@@_replacement_normal:n, \@@_replacement_normal_aux:N} % Most characters are simply sent to the output by % \cs{tl_build_put_right:Nn}, unless a particular category code has been % requested: then \cs{@@_replacement_c_A:w} or a similar auxiliary is @@ -5313,13 +5320,16 @@ % sequence is non-empty there: it contains an empty entry % corresponding to the initial value of % \cs{l_@@_replacement_category_tl}. -% The argument |#1| can be a space, otherwise it is a single -% character. +% The argument |#1| is a single character (including the case of a catcode-other space). +% In case no specific catcode is requested, we take into account the +% current catcode regime (at the time the replacement is performed) +% as much as reasonable, with all impossible catcodes (escape, +% newline, etc.) being mapped to \enquote{other}. % \begin{macrocode} \cs_new_protected:Npn \@@_replacement_normal:n #1 { \tl_if_empty:NTF \l_@@_replacement_category_tl - { \@@_replacement_put:n {#1} } + { \@@_replacement_normal_aux:N #1 } { % ( \token_if_eq_charcode:NNTF #1 ) { @@ -5327,15 +5337,37 @@ \l_@@_replacement_category_tl } { - \use:c - { - @@_replacement_c_ - \l_@@_replacement_category_tl :w - } - \@@_replacement_normal:n {#1} + \use:c { @@_replacement_c_ \l_@@_replacement_category_tl :w } + ? 
#1 } } } +\cs_new_protected:Npn \@@_replacement_normal_aux:N #1 + { + \token_if_eq_charcode:NNTF #1 \c_space_token + { \@@_replacement_c_S:w } + { + \exp_after:wN \exp_after:wN + \if_case:w \tex_catcode:D `#1 \exp_stop_f: + \@@_replacement_c_O:w + \or: \@@_replacement_c_B:w + \or: \@@_replacement_c_E:w + \or: \@@_replacement_c_M:w + \or: \@@_replacement_c_T:w + \or: \@@_replacement_c_O:w + \or: \@@_replacement_c_P:w + \or: \@@_replacement_c_U:w + \or: \@@_replacement_c_D:w + \or: \@@_replacement_c_O:w + \or: \@@_replacement_c_S:w + \or: \@@_replacement_c_L:w + \or: \@@_replacement_c_O:w + \or: \@@_replacement_c_A:w + \else: \@@_replacement_c_O:w + \fi: + } + ? #1 + } % \end{macrocode} % \end{macro} % @@ -5343,7 +5375,6 @@ % As in parsing a regular expression, we use an auxiliary built from % |#1| if defined. Otherwise, check for escaped digits (standing from % submatches from $0$ to $9$): anything else is a raw character. -% We use \cs{token_to_str:N} to give spaces the right category code. % \begin{macrocode} \cs_new_protected:Npn \@@_replacement_escaped:N #1 { @@ -5352,8 +5383,7 @@ \if_int_compare:w 1 < 1#1 \exp_stop_f: \@@_replacement_put_submatch:n {#1} \else: - \exp_args:No \@@_replacement_normal:n - { \token_to_str:N #1 } + \@@_replacement_normal:n {#1} \fi: } } diff --git a/l3kernel/testfiles/m3regex005.luatex.tlg b/l3kernel/testfiles/m3regex005.luatex.tlg index 0e3cf39a31..32057e3e57 100644 --- a/l3kernel/testfiles/m3regex005.luatex.tlg +++ b/l3kernel/testfiles/m3regex005.luatex.tlg @@ -180,9 +180,9 @@ The token list \l_tmpa_tl contains the tokens: } l. ... } The token list \l_tmpa_tl contains the tokens: -> x (the character x) +> x (the letter x) > \c_parameter_token (control sequence=macro parameter character #) -> x (the character x). +> x (the letter x). } l. ... 
} ============================================================ @@ -281,6 +281,11 @@ TEST 10: Caseless matching and cs TEST 11: Braces ============================================================ |\{}| +The token list \l_tmpa_tl contains the tokens: +> \{ (control sequence=\protected macro:->\ifmmode \lbrace \else \textb\ETC.) +> } (the letter }). + } +l. ... } ! LaTeX3 Error: Missing right brace inserted in replacement text. For immediate help type H . ... @@ -297,8 +302,8 @@ TEST 12: More tests of cs TEST 13: Replaced space catcode ============================================================ blank space -the character -the character +blank space +blank space ============================================================ ============================================================ TEST 14: Catcode group in replacement @@ -312,7 +317,7 @@ In a replacement text, the '\c' escape sequence can be followed by one of the letters 'ABCDELMOPSTU' representing the character category. Then, a character must follow, not '\1'. The token list \l_tmpa_tl contains the tokens: -> q (the character q) +> q (the letter q) > e (subscript character e) > t (superscript character t) > a (the letter a) @@ -321,8 +326,8 @@ The token list \l_tmpa_tl contains the tokens: > 1 (math shift character 1) > p (superscript character p) > s (subscript character s) -> f (the character f) -> q (the character q) +> f (the letter f) +> q (the letter q) > e (subscript character e) > t (superscript character t) > a (the character a) @@ -331,7 +336,7 @@ The token list \l_tmpa_tl contains the tokens: > 1 (math shift character 1) > p (superscript character p) > s (subscript character s) -> f (the character f). +> f (the letter f). } l. ... } ! LaTeX3 Error: Missing right parenthesis inserted in replacement text. @@ -347,3 +352,30 @@ l. ... } There were 2 missing right parentheses. > \l_tmpb_tl=. 
============================================================ +============================================================ +TEST 15: Catcode used by default +============================================================ +\g__cctab_next_cctab=\catcodetable... +The token list \l_tmpa_tl contains the tokens: +> ^^M (the character ^^M) +> ! (the character !) +> @ (the character @) +> # (macro parameter character #) +> # (macro parameter character #) +> # (macro parameter character #) +> $ (math shift character $) +> % (the character %) +> $ (math shift character $) +> ^ (superscript character ^) +> & (alignment tab character &) +> * (the character *) +> { (begin-group character {) +> (blank space ) +> } (end-group character }) +> : (the character :) +> _ (subscript character _) +> ~ (active character=macro:->\nobreakspace {}) +> \ (the character \). + } +l. ... } +============================================================ diff --git a/l3kernel/testfiles/m3regex005.lvt b/l3kernel/testfiles/m3regex005.lvt index 9dd59873bb..602a50efae 100644 --- a/l3kernel/testfiles/m3regex005.lvt +++ b/l3kernel/testfiles/m3regex005.lvt @@ -141,8 +141,9 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \TEST { Braces } { - \regex_replace_once:nnN { .* } { \c{{}} } \l_tmpa_tl + \regex_replace_once:nnN { .* } { \c{{}\cL} } \l_tmpa_tl \TYPE { | \tl_to_str:N \l_tmpa_tl | } + \tl_analysis_show:N \l_tmpa_tl \exp_args:Nnx \regex_replace_once:nnN { .* } { \iow_char:N\\c\iow_char:N\{ } \l_tmpa_tl \TYPE { | \tl_to_str:N \l_tmpa_tl | } @@ -188,5 +189,15 @@ \tl_log:N \l_tmpb_tl } +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\TEST { Catcode~used~by~default } + { + \cctab_begin:N \c_document_cctab + \tl_clear:N \l_tmpa_tl + \regex_replace_all:nnN { } { \x0d!@#\#$\%$^&*{\ }:_\~\\ } \l_tmpa_tl + \tl_analysis_show:N \l_tmpa_tl + \cctab_end: + } + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \END diff --git 
a/l3kernel/testfiles/m3regex005.tlg b/l3kernel/testfiles/m3regex005.tlg index 29d78f86aa..cc6c44cbaa 100644 --- a/l3kernel/testfiles/m3regex005.tlg +++ b/l3kernel/testfiles/m3regex005.tlg @@ -180,9 +180,9 @@ The token list \l_tmpa_tl contains the tokens: } l. ... } The token list \l_tmpa_tl contains the tokens: -> x (the character x) +> x (the letter x) > \c_parameter_token (control sequence=macro parameter character #) -> x (the character x). +> x (the letter x). } l. ... } ============================================================ @@ -281,6 +281,11 @@ TEST 10: Caseless matching and cs TEST 11: Braces ============================================================ |\{}| +The token list \l_tmpa_tl contains the tokens: +> \{ (control sequence=\protected macro:->\ifmmode \lbrace \else \textb\ETC.) +> } (the letter }). + } +l. ... } ! LaTeX3 Error: Missing right brace inserted in replacement text. For immediate help type H . ... @@ -297,8 +302,8 @@ TEST 12: More tests of cs TEST 13: Replaced space catcode ============================================================ blank space -the character -the character +blank space +blank space ============================================================ ============================================================ TEST 14: Catcode group in replacement @@ -312,7 +317,7 @@ In a replacement text, the '\c' escape sequence can be followed by one of the letters 'ABCDELMOPSTU' representing the character category. Then, a character must follow, not '\1'. 
The token list \l_tmpa_tl contains the tokens: -> q (the character q) +> q (the letter q) > e (subscript character e) > t (superscript character t) > a (the letter a) @@ -321,8 +326,8 @@ The token list \l_tmpa_tl contains the tokens: > 1 (math shift character 1) > p (superscript character p) > s (subscript character s) -> f (the character f) -> q (the character q) +> f (the letter f) +> q (the letter q) > e (subscript character e) > t (superscript character t) > a (the character a) @@ -331,7 +336,7 @@ The token list \l_tmpa_tl contains the tokens: > 1 (math shift character 1) > p (superscript character p) > s (subscript character s) -> f (the character f). +> f (the letter f). } l. ... } ! LaTeX3 Error: Missing right parenthesis inserted in replacement text. @@ -347,3 +352,30 @@ l. ... } There were 2 missing right parentheses. > \l_tmpb_tl=. ============================================================ +============================================================ +TEST 15: Catcode used by default +============================================================ +Defining \g__cctab_1_cctab on line ... +The token list \l_tmpa_tl contains the tokens: +> ^^M (the character ^^M) +> ! (the character !) +> @ (the character @) +> # (macro parameter character #) +> # (macro parameter character #) +> # (macro parameter character #) +> $ (math shift character $) +> % (the character %) +> $ (math shift character $) +> ^ (superscript character ^) +> & (alignment tab character &) +> * (the character *) +> { (begin-group character {) +> (blank space ) +> } (end-group character }) +> : (the character :) +> _ (subscript character _) +> ~ (active character=macro:->\nobreakspace {}) +> \ (the character \). + } +l. ... 
} +============================================================ diff --git a/l3kernel/testfiles/m3regex005.xetex.tlg b/l3kernel/testfiles/m3regex005.xetex.tlg index 0e3cf39a31..b74de5b573 100644 --- a/l3kernel/testfiles/m3regex005.xetex.tlg +++ b/l3kernel/testfiles/m3regex005.xetex.tlg @@ -180,9 +180,9 @@ The token list \l_tmpa_tl contains the tokens: } l. ... } The token list \l_tmpa_tl contains the tokens: -> x (the character x) +> x (the letter x) > \c_parameter_token (control sequence=macro parameter character #) -> x (the character x). +> x (the letter x). } l. ... } ============================================================ @@ -281,6 +281,11 @@ TEST 10: Caseless matching and cs TEST 11: Braces ============================================================ |\{}| +The token list \l_tmpa_tl contains the tokens: +> \{ (control sequence=\protected macro:->\ifmmode \lbrace \else \textb\ETC.) +> } (the letter }). + } +l. ... } ! LaTeX3 Error: Missing right brace inserted in replacement text. For immediate help type H . ... @@ -297,8 +302,8 @@ TEST 12: More tests of cs TEST 13: Replaced space catcode ============================================================ blank space -the character -the character +blank space +blank space ============================================================ ============================================================ TEST 14: Catcode group in replacement @@ -312,7 +317,7 @@ In a replacement text, the '\c' escape sequence can be followed by one of the letters 'ABCDELMOPSTU' representing the character category. Then, a character must follow, not '\1'. 
The token list \l_tmpa_tl contains the tokens: -> q (the character q) +> q (the letter q) > e (subscript character e) > t (superscript character t) > a (the letter a) @@ -321,8 +326,8 @@ The token list \l_tmpa_tl contains the tokens: > 1 (math shift character 1) > p (superscript character p) > s (subscript character s) -> f (the character f) -> q (the character q) +> f (the letter f) +> q (the letter q) > e (subscript character e) > t (superscript character t) > a (the character a) @@ -331,7 +336,7 @@ The token list \l_tmpa_tl contains the tokens: > 1 (math shift character 1) > p (superscript character p) > s (subscript character s) -> f (the character f). +> f (the letter f). } l. ... } ! LaTeX3 Error: Missing right parenthesis inserted in replacement text. @@ -347,3 +352,30 @@ l. ... } There were 2 missing right parentheses. > \l_tmpb_tl=. ============================================================ +============================================================ +TEST 15: Catcode used by default +============================================================ +Defining \g__cctab_1_cctab on line ... +The token list \l_tmpa_tl contains the tokens: +> ^^M (the character ^^M) +> ! (the character !) +> @ (the character @) +> # (macro parameter character #) +> # (macro parameter character #) +> # (macro parameter character #) +> $ (math shift character $) +> % (the character %) +> $ (math shift character $) +> ^ (superscript character ^) +> & (alignment tab character &) +> * (the character *) +> { (begin-group character {) +> (blank space ) +> } (end-group character }) +> : (the character :) +> _ (subscript character _) +> ~ (active character=macro:->\nobreakspace {}) +> \ (the character \). + } +l. ... } +============================================================