Change catcode produced by default by l3regex replacement (fixes #621)

latex3 · Apr 24, 2021 · 0555577 · 0555577
1 parent c76ca6c
commit 0555577
Show file tree

Hide file tree

Showing 6 changed files with 180 additions and 42 deletions.
diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
@@ -13,6 +13,7 @@ this project uses date-based 'snapshot' version identifiers.
 - Color export in comma-separated format
 
 ### Changed
+- Use prevailing catcodes instead of string catcodes in regex replacement (issue #621)
 - `\__kernel_file_name_sanitize:n` now uses a faster `\csname`-based
   approach to expand the file name.
 - `\pdf_version_gset:n` for `dvips`.  

diff --git a/l3kernel/l3regex.dtx b/l3kernel/l3regex.dtx
@@ -426,10 +426,17 @@
 % the last match is used in the replacement text. Submatches always keep
 % the same category codes as in the original token list.
 %
-% The characters inserted by the replacement have category code $12$
-% (other) by default, with the exception of space characters.  Spaces
-% inserted through \verb*|\ | have category code $10$, while spaces
-% inserted through |\x20| or |\x{20}| have category code $12$.
+% By default, the category codes of characters inserted by the
+% replacement are determined by the prevailing category code regime at
+% the time when the replacement is made, with two exceptions:
+% \begin{itemize}
+% \item space characters (with character code $32$) inserted with
+%   \verb*|\ | or |\x20| or |\x{20}| have category code~$10$ regardless
+%   of the prevailing category code regime;
+% \item if the category code would be $0$~(escape), $5$~(newline),
+%   $9$~(ignore), $14$~(comment) or $15$~(invalid), it is replaced by
+%   $12$~(other) instead.
+% \end{itemize}
 % The escape sequence |\c| allows inserting characters
 % with arbitrary category codes, as well as control sequences.
 % \begin{l3regex-syntax}
@@ -5304,7 +5311,7 @@
 %    \end{macrocode}
 % \end{macro}
 %
-% \begin{macro}{\@@_replacement_normal:n}
+% \begin{macro}{\@@_replacement_normal:n, \@@_replacement_normal_aux:N}
 %   Most characters are simply sent to the output by
 %   \cs{tl_build_put_right:Nn}, unless a particular category code has been
 %   requested: then \cs{@@_replacement_c_A:w} or a similar auxiliary is
@@ -5313,37 +5320,61 @@
 %   sequence is non-empty there: it contains an empty entry
 %   corresponding to the initial value of
 %   \cs{l_@@_replacement_category_tl}.
-%   The argument |#1| can be a space, otherwise it is a single
-%   character.
+%   The argument |#1| is a single character (including the case of a
+%   catcode-other space).
+%   In case no specific catcode is requested, we take into account the
+%   current catcode regime (at the time the replacement is performed)
+%   as much as reasonable, with all impossible catcodes (escape,
+%   newline, etc.) being mapped to \enquote{other}.
 %    \begin{macrocode}
 \cs_new_protected:Npn \@@_replacement_normal:n #1
   {
     \tl_if_empty:NTF \l_@@_replacement_category_tl
-      { \@@_replacement_put:n {#1} }
+      { \@@_replacement_normal_aux:N #1 }
       { % (
         \token_if_eq_charcode:NNTF #1 )
           {
             \seq_pop:NN \l_@@_replacement_category_seq
               \l_@@_replacement_category_tl
           }
           {
-            \use:c
-              {
-                @@_replacement_c_
-                \l_@@_replacement_category_tl :w
-              }
-              \@@_replacement_normal:n {#1}
+            \use:c { @@_replacement_c_ \l_@@_replacement_category_tl :w }
+            ? #1
           }
       }
   }
+\cs_new_protected:Npn \@@_replacement_normal_aux:N #1
+  {
+    % Select the \@@_replacement_c_<letter>:w auxiliary that matches the
+    % category code currently assigned to the character #1.
+    \token_if_eq_charcode:NNTF #1 \c_space_token
+      { \@@_replacement_c_S:w } % character code 32: space auxiliary, whatever the catcode
+      {
+        % The two \exp_after:wN make TeX expand the \or:, \else: or \fi:
+        % that follows the selected auxiliary, closing the conditional
+        % before that auxiliary runs and reads the "? #1" placed below.
+        \exp_after:wN \exp_after:wN
+        \if_case:w \tex_catcode:D `#1 \exp_stop_f:
+             \@@_replacement_c_O:w % 0 (escape): mapped to other
+        \or: \@@_replacement_c_B:w % 1: begin-group
+        \or: \@@_replacement_c_E:w % 2: end-group
+        \or: \@@_replacement_c_M:w % 3: math shift
+        \or: \@@_replacement_c_T:w % 4: alignment tab
+        \or: \@@_replacement_c_O:w % 5 (end of line): mapped to other
+        \or: \@@_replacement_c_P:w % 6: macro parameter
+        \or: \@@_replacement_c_U:w % 7: superscript
+        \or: \@@_replacement_c_D:w % 8: subscript
+        \or: \@@_replacement_c_O:w % 9 (ignored): mapped to other
+        \or: \@@_replacement_c_S:w % 10: space
+        \or: \@@_replacement_c_L:w % 11: letter
+        \or: \@@_replacement_c_O:w % 12: other
+        \or: \@@_replacement_c_A:w % 13: active
+        \else: \@@_replacement_c_O:w % 14 (comment), 15 (invalid): mapped to other
+        \fi:
+      }
+    % NOTE(review): the "?" presumably fills an argument slot that the
+    % \@@_replacement_c_<letter>:w auxiliaries discard; confirm against
+    % their definitions, which are not visible here.
+    ? #1
+  }
 %    \end{macrocode}
 % \end{macro}
 %
 % \begin{macro}{\@@_replacement_escaped:N}
 %   As in parsing a regular expression, we use an auxiliary built from
 %   |#1| if defined. Otherwise, check for escaped digits (standing for
 %   submatches from $0$ to $9$): anything else is a raw character.
-%   We use \cs{token_to_str:N} to give spaces the right category code.
 %    \begin{macrocode}
 \cs_new_protected:Npn \@@_replacement_escaped:N #1
   {
@@ -5352,8 +5383,7 @@
         \if_int_compare:w 1 < 1#1 \exp_stop_f:
           \@@_replacement_put_submatch:n {#1}
         \else:
-          \exp_args:No \@@_replacement_normal:n
-            { \token_to_str:N #1 }
+          \@@_replacement_normal:n {#1}
         \fi:
       }
   }

diff --git a/l3kernel/testfiles/m3regex005.luatex.tlg b/l3kernel/testfiles/m3regex005.luatex.tlg
@@ -180,9 +180,9 @@ The token list \l_tmpa_tl contains the tokens:
 <recently read> }
 l. ...  }
 The token list \l_tmpa_tl contains the tokens:
->  x (the character x)
+>  x (the letter x)
 >  \c_parameter_token (control sequence=macro parameter character #)
->  x (the character x).
+>  x (the letter x).
 <recently read> }
 l. ...  }
 ============================================================
@@ -281,6 +281,11 @@ TEST 10: Caseless matching and cs
 TEST 11: Braces
 ============================================================
 |\{}|
+The token list \l_tmpa_tl contains the tokens:
+>  \{ (control sequence=\protected macro:->\ifmmode \lbrace \else \textb\ETC.)
+>  } (the letter }).
+<recently read> }
+l. ...  }
 ! LaTeX3 Error: Missing right brace inserted in replacement text.
 For immediate help type H <return>.
  ...                                              
@@ -297,8 +302,8 @@ TEST 12: More tests of cs
 TEST 13: Replaced space catcode
 ============================================================
 blank space  
-the character  
-the character  
+blank space  
+blank space  
 ============================================================
 ============================================================
 TEST 14: Catcode group in replacement
@@ -312,7 +317,7 @@ In a replacement text, the '\c' escape sequence can be followed by one of the
 letters 'ABCDELMOPSTU' representing the character category. Then, a character
 must follow, not '\1'.
 The token list \l_tmpa_tl contains the tokens:
->  q (the character q)
+>  q (the letter q)
 >  e (subscript character e)
 >  t (superscript character t)
 >  a (the letter a)
@@ -321,8 +326,8 @@ The token list \l_tmpa_tl contains the tokens:
 >  1 (math shift character 1)
 >  p (superscript character p)
 >  s (subscript character s)
->  f (the character f)
->  q (the character q)
+>  f (the letter f)
+>  q (the letter q)
 >  e (subscript character e)
 >  t (superscript character t)
 >  a (the character a)
@@ -331,7 +336,7 @@ The token list \l_tmpa_tl contains the tokens:
 >  1 (math shift character 1)
 >  p (superscript character p)
 >  s (subscript character s)
->  f (the character f).
+>  f (the letter f).
 <recently read> }
 l. ...  }
 ! LaTeX3 Error: Missing right parenthesis inserted in replacement text.
@@ -347,3 +352,30 @@ l. ...  }
 There were 2 missing right parentheses.
 > \l_tmpb_tl=.
 ============================================================
+============================================================
+TEST 15: Catcode used by default
+============================================================
+\g__cctab_next_cctab=\catcodetable...
+The token list \l_tmpa_tl contains the tokens:
+>  ^^M (the character ^^M)
+>  ! (the character !)
+>  @ (the character @)
+>  # (macro parameter character #)
+>  # (macro parameter character #)
+>  # (macro parameter character #)
+>  $ (math shift character $)
+>  % (the character %)
+>  $ (math shift character $)
+>  ^ (superscript character ^)
+>  & (alignment tab character &)
+>  * (the character *)
+>  { (begin-group character {)
+>    (blank space  )
+>  } (end-group character })
+>  : (the character :)
+>  _ (subscript character _)
+>  ~ (active character=macro:->\nobreakspace {})
+>  \ (the character \).
+<recently read> }
+l. ...  }
+============================================================
diff --git a/l3kernel/testfiles/m3regex005.lvt b/l3kernel/testfiles/m3regex005.lvt
@@ -141,8 +141,9 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \TEST { Braces }
   {
-    \regex_replace_once:nnN { .* } { \c{{}} } \l_tmpa_tl
+    \regex_replace_once:nnN { .* } { \c{{}\cL} } \l_tmpa_tl
     \TYPE { | \tl_to_str:N \l_tmpa_tl | }
+    \tl_analysis_show:N \l_tmpa_tl
     \exp_args:Nnx \regex_replace_once:nnN
       { .* } { \iow_char:N\\c\iow_char:N\{ } \l_tmpa_tl
     \TYPE { | \tl_to_str:N \l_tmpa_tl | }
@@ -188,5 +189,15 @@
     \tl_log:N \l_tmpb_tl
   }
 
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\TEST { Catcode~used~by~default }
+  {
+    \cctab_begin:N \c_document_cctab
+    \tl_clear:N \l_tmpa_tl
+    \regex_replace_all:nnN { } { \x0d!@#\#$\%$^&*{\ }:_\~\\ } \l_tmpa_tl
+    \tl_analysis_show:N \l_tmpa_tl
+    \cctab_end:
+  }
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \END
diff --git a/l3kernel/testfiles/m3regex005.tlg b/l3kernel/testfiles/m3regex005.tlg
@@ -180,9 +180,9 @@ The token list \l_tmpa_tl contains the tokens:
 <recently read> }
 l. ...  }
 The token list \l_tmpa_tl contains the tokens:
->  x (the character x)
+>  x (the letter x)
 >  \c_parameter_token (control sequence=macro parameter character #)
->  x (the character x).
+>  x (the letter x).
 <recently read> }
 l. ...  }
 ============================================================
@@ -281,6 +281,11 @@ TEST 10: Caseless matching and cs
 TEST 11: Braces
 ============================================================
 |\{}|
+The token list \l_tmpa_tl contains the tokens:
+>  \{ (control sequence=\protected macro:->\ifmmode \lbrace \else \textb\ETC.)
+>  } (the letter }).
+<recently read> }
+l. ...  }
 ! LaTeX3 Error: Missing right brace inserted in replacement text.
 For immediate help type H <return>.
  ...                                              
@@ -297,8 +302,8 @@ TEST 12: More tests of cs
 TEST 13: Replaced space catcode
 ============================================================
 blank space  
-the character  
-the character  
+blank space  
+blank space  
 ============================================================
 ============================================================
 TEST 14: Catcode group in replacement
@@ -312,7 +317,7 @@ In a replacement text, the '\c' escape sequence can be followed by one of the
 letters 'ABCDELMOPSTU' representing the character category. Then, a character
 must follow, not '\1'.
 The token list \l_tmpa_tl contains the tokens:
->  q (the character q)
+>  q (the letter q)
 >  e (subscript character e)
 >  t (superscript character t)
 >  a (the letter a)
@@ -321,8 +326,8 @@ The token list \l_tmpa_tl contains the tokens:
 >  1 (math shift character 1)
 >  p (superscript character p)
 >  s (subscript character s)
->  f (the character f)
->  q (the character q)
+>  f (the letter f)
+>  q (the letter q)
 >  e (subscript character e)
 >  t (superscript character t)
 >  a (the character a)
@@ -331,7 +336,7 @@ The token list \l_tmpa_tl contains the tokens:
 >  1 (math shift character 1)
 >  p (superscript character p)
 >  s (subscript character s)
->  f (the character f).
+>  f (the letter f).
 <recently read> }
 l. ...  }
 ! LaTeX3 Error: Missing right parenthesis inserted in replacement text.
@@ -347,3 +352,30 @@ l. ...  }
 There were 2 missing right parentheses.
 > \l_tmpb_tl=.
 ============================================================
+============================================================
+TEST 15: Catcode used by default
+============================================================
+Defining \g__cctab_1_cctab on line ...
+The token list \l_tmpa_tl contains the tokens:
+>  ^^M (the character ^^M)
+>  ! (the character !)
+>  @ (the character @)
+>  # (macro parameter character #)
+>  # (macro parameter character #)
+>  # (macro parameter character #)
+>  $ (math shift character $)
+>  % (the character %)
+>  $ (math shift character $)
+>  ^ (superscript character ^)
+>  & (alignment tab character &)
+>  * (the character *)
+>  { (begin-group character {)
+>    (blank space  )
+>  } (end-group character })
+>  : (the character :)
+>  _ (subscript character _)
+>  ~ (active character=macro:->\nobreakspace {})
+>  \ (the character \).
+<recently read> }
+l. ...  }
+============================================================