New \codepoint_(str_)generate:n(n) functions

latex3 · Oct 9, 2022 · d4f3d9b · d4f3d9b
1 parent 3da1507
commit d4f3d9b
Show file tree

Hide file tree

Showing 7 changed files with 241 additions and 56 deletions.
diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
@@ -7,6 +7,9 @@ this project uses date-based 'snapshot' version identifiers.
 
 ## [Unreleased]
 
+### Added
+- `\codepoint_generate:nn`, `\codepoint_str_generate:n`
+
 ### Changed
 - Usage of `\exp_not:n`/`\exp_not:N` in `\peek_analysis_map_inline:n` output
 

diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
@@ -1828,51 +1828,13 @@
 \cs_new:Npn \@@_to_nfd:nnnn #1#2#3#4 % Emit #1 directly, or recurse on #1 and #2.
   {
     \int_compare:nNnTF {#1} = {#3} % NOTE(review): #3 is presumably the codepoint being decomposed; confirm against the caller.
-      { \@@_to_nfd_generate:nn {#1} {#4} }
+      { \codepoint_generate:nn {#1} {#4} } % #1 = #3: generate the character(s) for #1, with #4 as the catcode argument.
       {
         \@@_to_nfd:nn {#1} {#4} % Otherwise pass #1 through \@@_to_nfd:nn again ...
         \tl_if_blank:nF {#2} % ... and then #2, but only when #2 is not blank.
           { \@@_to_nfd:nn {#2} {#4} }
       }
   }
-\bool_lazy_or:nnTF
-  { \sys_if_engine_luatex_p: }
-  { \sys_if_engine_xetex_p: }
-  {
-    \cs_new:Npn \@@_to_nfd_generate:nn
-      { \char_generate:nn }
-  }
-  {
-    \cs_new:Npn \@@_to_nfd_generate:nn #1#2
-      {
-        \exp_args:Ne \@@_to_nfd_generate:n
-          { \char_to_utfviii_bytes:n {#1} }
-      }
-    \cs_new:Npn \@@_to_nfd_generate:n #1
-      { \@@_to_nfd_generate:nnnn #1 }
-     \cs_new:Npn \@@_to_nfd_generate:nnnn #1#2#3#4
-        {
-          \int_compare:nNnTF {#1} < { "80 }
-            { \char_generate:nn {#1} { \char_value_catcode:n {#1} } }
-            {
-              \exp_after:wN \exp_after:wN \exp_after:wN
-                \exp_not:N \char_generate:nn {#1} { 13 }
-              \exp_after:wN \exp_after:wN \exp_after:wN
-                \exp_not:N \char_generate:nn {#2} { 13 }
-              \tl_if_blank:nF {#3}
-                {
-                  \exp_after:wN \exp_after:wN \exp_after:wN
-                    \exp_not:N \char_generate:nn {#3} { 13 }
-                  \tl_if_blank:nF {#4}
-                    {
-                      \exp_after:wN \exp_after:wN \exp_after:wN
-                        \exp_not:N \char_generate:nn {#4} { 13 }
-                    }
-                }
-            }
-
-        }
-  }
 %    \end{macrocode}
 % \end{macro}
 % \end{macro}

diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx
@@ -50,8 +50,54 @@
 % \begin{documentation}
 %
 % This module provides Unicode-specific functions along with loading data
-% from a range of Unicode Consortium files. At present, it provides no
-% public functions.
+% from a range of Unicode Consortium files. Most of the code here is
+% internal, but there is a small set of public functions. These work with
+% Unicode \meta{codepoints} and are designed to give useable results with
+% both Unicode-aware and $8$-bit engines.
+%
+% \begin{function}[EXP, added = 2022-10-09]
+%   {\codepoint_generate:nn}
+%   \begin{syntax}
+%      \cs{codepoint_generate:nn} \Arg{codepoint} \Arg{catcode}
+%   \end{syntax}
+%   Generates one or more character tokens representing the \meta{codepoint}.
+%   With Unicode engines, exactly one character token will be generated, and
+%   this will have the \meta{catcode} specified as the second argument:
+%   \begin{itemize}
+%     \item $1$ (begin group)
+%     \item $2$ (end group)
+%     \item $3$ (math toggle)
+%     \item $4$ (alignment)
+%     \item $6$ (parameter)
+%     \item $7$ (math superscript)
+%     \item $8$ (math subscript)
+%     \item $10$ (space)
+%     \item $11$ (letter)
+%     \item $12$ (other)
+%     \item $13$ (active)
+%   \end{itemize}
+%   For $8$-bit engines, between one and four character tokens will be
+%   produced: these will be the bytes of the UTF-8 representation of the
+%   \meta{codepoint}. For all codepoints outside of the classical ASCII
+%   range, the generated character tokens will be active (category code
+%   $13$); the \meta{catcode} argument is only used for codepoints in the
+%   ASCII range. To allow the result of this function to be used inside an
+%   expansion context, the result is protected by \cs{exp_not:n}.
+% \end{function}
+%
+% \begin{function}[EXP, added = 2022-10-09]
+%   {\codepoint_str_generate:n}
+%   \begin{syntax}
+%      \cs{codepoint_str_generate:n} \Arg{codepoint}
+%   \end{syntax}
+%   Generates one or more character tokens representing the \meta{codepoint}.
+%   With Unicode engines, exactly one character token will be generated.
+%   For $8$-bit engines, between one and four character tokens will be
+%   produced: these will be the bytes of the UTF-8 representation of the
+%   \meta{codepoint}. All of the generated character tokens will be of
+%   category code $12$, except any spaces (codepoint $32$), which will be
+%   category code $10$.
+% \end{function}
 %
 % \end{documentation}
 %
@@ -101,8 +147,11 @@
 % they are Unicode or $8$-bit internally. Parsing is therefore done by common
 % functions, with some data storage using engine-specific auxiliaries.
 %
-% \begin{macro}[EXP]{\@@_generate_str:n}
-% \begin{macro}[EXP]{\@@_generate_str:nnnn}
+% \begin{macro}[EXP]{\codepoint_str_generate:n}
+% \begin{macro}[EXP]{\@@_str_generate:nnnn}
+% \begin{macro}[EXP]{\codepoint_generate:nn}
+% \begin{macro}[EXP]{\@@_generate:nnnn}
+% \begin{macro}[EXP]{\@@_generate:n}
 %   Conversion of a codepoint to a character (Unicode engines) or to one
 %   or more bytes ($8$-bit engines) is required. For loading the data,
 %   all that is needed is the form which creates strings: these are outside
@@ -115,23 +164,36 @@
   { \sys_if_engine_luatex_p: }
   { \sys_if_engine_xetex_p: }
   {
-    \cs_new:Npn \@@_generate_str:n #1
+    \cs_new:Npn \codepoint_str_generate:n #1 % LuaTeX/XeTeX branch: #1 = codepoint, result is a single token.
       {
         \int_compare:nNnTF {#1} = { `\  }
           { ~ } % Codepoint 32 is returned as an ordinary space token.
           { \char_generate:nn {#1} { 12 } } % Every other codepoint becomes one catcode-12 character.
       }
+   \cs_new:Npn \codepoint_generate:nn #1#2 % LuaTeX/XeTeX branch: #1 = codepoint, #2 = catcode.
+      {
+        \int_compare:nNnTF {#1} = { `\  }
+          { ~ } % Codepoint 32 always gives a space token; #2 is not consulted.
+          {
+            \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN % Expands \char_generate:nn twice before \exp_not reads its braced argument.
+              { \char_generate:nn {#1} {#2} }
+          }
+      }
   }
   {
-    \cs_new:Npn \@@_generate_str:n #1
+    \cs_new:Npn \codepoint_str_generate:n #1 % 8-bit branch: #1 = codepoint, result is a space or the UTF-8 bytes.
       {
-        \use:e
+        \int_compare:nNnTF {#1} = { `\  }
+          { ~ } % Codepoint 32 is returned as an ordinary space token.
           {
-            \exp_not:N \@@_generate_str:nnnn
-              \char_to_utfviii_bytes:n {#1}
+            \use:e % Expand \char_to_utfviii_bytes:n first; \exp_not:N holds the auxiliary back until then.
+              {
+                \exp_not:N \@@_str_generate:nnnn
+                  \char_to_utfviii_bytes:n {#1} % NOTE(review): presumably leaves four brace groups, matching the :nnnn signature.
+              }
           }
       }
-    \cs_new:Npn \@@_generate_str:nnnn #1#2#3#4
+    \cs_new:Npn \@@_str_generate:nnnn #1#2#3#4
       {
         \char_generate:nn {#1} { 12 }
         \tl_if_blank:nF {#2}
@@ -145,10 +207,54 @@
               }
           }
       }
+    \cs_new:Npn \codepoint_generate:nn #1#2 % 8-bit branch: #1 = codepoint, #2 = catcode (used only below "80).
+      {
+        \int_compare:nNnTF {#1} = { `\  }
+          { ~ } % Codepoint 32 always gives a space token; #2 is not consulted.
+          {
+            \int_compare:nNnTF {#1} < { "80 } % Below "80: one character carrying the requested catcode.
+              {
+                \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN % Expands \char_generate:nn twice before \exp_not reads its braced argument.
+                  { \char_generate:nn {#1} {#2} }
+              }
+              {
+                \use:e % From "80 upwards: expand the byte list first; \exp_not:N holds \@@_generate:nnnn back until then.
+                  {
+                    \exp_not:N \@@_generate:nnnn
+                      \char_to_utfviii_bytes:n {#1} % NOTE(review): presumably leaves four brace groups, matching the :nnnn signature.
+                  }
+              }
+          }
+      }
+    \cs_new:Npn \@@_generate:nnnn #1#2#3#4 % Builds the characters for bytes #1-#4; blank #3/#4 are skipped.
+      {
+        \__kernel_exp_not:w \exp_after:wN % Trigger \tex_expanded:D first, then protect everything it leaves behind.
+          {
+            \tex_expanded:D
+              {
+                \@@_generate:n {#1} % #1 and #2 are generated unconditionally.
+                \@@_generate:n {#2}
+                \tl_if_blank:nF {#3} % #3 only when it is not blank ...
+                  {
+                    \@@_generate:n {#3}
+                    \tl_if_blank:nF {#4} % ... and #4 only when #3 was present and #4 is not blank.
+                      { \@@_generate:n {#4} }
+                  }
+              }
+          }
+      }
+     \cs_new:Npn \@@_generate:n #1 % One byte #1 -> one active (catcode 13) character, wrapped in \exp_not.
+       {
+         \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN % Expands \char_generate:nn twice before \exp_not reads its braced argument.
+           { \char_generate:nn {#1} { 13 } }
+       }
   }
 %    \end{macrocode}
 % \end{macro}
 % \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
 %
 % As only the data needs to remain at the end of this process, everything
 % is set up inside a group. The only thing that is outside is creating a
@@ -218,7 +324,7 @@
   \cs_set_protected:Npn \@@_data_auxii:w #1 ; #2 ~ #3 \q_stop
     {
       \tl_const:cx
-        { c_@@_nfd_ \@@_generate_str:n {"#1} _tl }
+        { c_@@_nfd_ \codepoint_str_generate:n {"#1} _tl }
         {
           {"#2}
           { \tl_if_blank:nF {#3} {"#3} }
@@ -270,7 +376,7 @@
       \int_compare:nNnF {#3} = { \@@_data_offset:nn {#1} {#5} }
         {
           \tl_const:cx
-            { c_@@_titlecase_ \@@_generate_str:n {"#1} _tl }
+            { c_@@_titlecase_ \codepoint_str_generate:n {"#1} _tl }
             { {"#5} { } { } }
         }
       \tl_set:Nx \l_@@_next_codepoint_fint_tl
@@ -504,7 +610,7 @@
           \int_eval:n { \__kernel_codepoint_data:nn { lowercase } {"#1} + "#1 }
             = "#3 ~
           \tl_const:cx
-            { c_@@_casefold_ \@@_generate_str:n {"#1} _tl }
+            { c_@@_casefold_ \codepoint_str_generate:n {"#1} _tl }
             { {"#3} { } { } }
         \fi:
       \else:
@@ -519,7 +625,7 @@
 %    \begin{macrocode}
   \cs_set_protected:Npn \@@_data_auxii:w #1 ~ #2 ~ #3 ~ #4 \q_stop
     {
-      \tl_const:cx { c_@@_casefold_ \@@_generate_str:n {"#1} _tl }
+      \tl_const:cx { c_@@_casefold_ \codepoint_str_generate:n {"#1} _tl }
         {
           {"#2}
           {"#3}
@@ -552,7 +658,7 @@
     {
       \tl_if_empty:nF {#4}
         {
-          \tl_const:cx { c_@@_ #2 case_ \@@_generate_str:n {"#1} _tl }
+          \tl_const:cx { c_@@_ #2 case_ \codepoint_str_generate:n {"#1} _tl }
             {
               {"#3}
               {"#4}
@@ -588,7 +694,7 @@
 \cs_new:Npn \__kernel_codepoint_case:nn #1#2 % Hands \@@_case:nnn the string form of #2, then #1 and #2 unchanged.
   {
     \exp_args:Ne \@@_case:nnn % Only the first argument is expanded before the call.
-      { \@@_generate_str:n {#2} } {#1} {#2}
+      { \codepoint_str_generate:n {#2} } {#1} {#2} % NOTE(review): the string presumably selects a c_@@_<#1>_<string>_tl constant; confirm in \@@_case:nnn.
   }
 \cs_new:Npn \@@_case:nnn #1#2#3
   {
@@ -621,7 +727,7 @@
 %   A simple interface.
 %    \begin{macrocode}
 \cs_new:Npn \__kernel_codepoint_nfd:n #1 % Hands \@@_nfd:nn the string form of #1 plus #1 itself.
-  { \exp_args:Ne \@@_nfd:nn { \@@_generate_str:n {#1} } {#1} }
+  { \exp_args:Ne \@@_nfd:nn { \codepoint_str_generate:n {#1} } {#1} } % The string form becomes part of the c_@@_nfd_..._tl name tested in \@@_nfd:nn.
 \cs_new:Npn \@@_nfd:nn #1#2
   {
     \tl_if_exist:cTF { c_@@_nfd_ #1 _tl }

diff --git a/l3kernel/testfiles/m3unicode001.luatex.tlg b/l3kernel/testfiles/m3unicode001.luatex.tlg
@@ -0,0 +1,23 @@
+This is a generated file for the LaTeX (2e + expl3) validation system.
+Don't change this file in any respect.
+Author: Joseph Wright
+============================================================
+TEST 1: Codepoint to chars
+============================================================
+AA
+AA
+^^ad^^ad
+įį
+ͰͰ
+ԠԠ
+પપ
+ᄤᄤ
+𐀉𐀉
+============================================================
+============================================================
+TEST 2: Spaces
+============================================================
+X X
+X X
+XaX
+============================================================
diff --git a/l3kernel/testfiles/m3unicode001.lvt b/l3kernel/testfiles/m3unicode001.lvt
@@ -0,0 +1,45 @@
+%
+% Copyright (C) 2022 The LaTeX Project
+%
+\documentclass{minimal}
+\input{regression-test}
+\RequirePackage[enable-debug]{expl3}
+\ExplSyntaxOn
+\debug_on:n { check-declarations , deprecation , log-functions }
+\ExplSyntaxOff
+
+\begin{document}
+
+\START
+\AUTHOR{Joseph Wright}
+\ExplSyntaxOn
+
+\OMIT
+\cs_set:Npn \test:nn #1#2 % Test helper: #1 = codepoint, #2 = catcode.
+  {
+    \codepoint_generate:nn {#1} {#2} % Both public functions are emitted back-to-back ...
+    \codepoint_str_generate:n {#1} % ... so every expected log line shows the character twice.
+  }
+\TIMO
+
+\TESTEXP { Codepoint~to~chars } % Byte counts below are those recorded in the 8-bit reference log.
+  {
+    \test:nn { "0041 }  { 11 } \NEWLINE % ASCII: a single character
+    \test:nn { "0041 }  { 12 } \NEWLINE % Same codepoint, different catcode: identical log output
+    \test:nn { "00AD }  { 12 } \NEWLINE % Two UTF-8 bytes on 8-bit engines
+    \test:nn { "012F }  { 11 } \NEWLINE  % Two bytes
+    \test:nn { "0370 }  { 11 } \NEWLINE % Two bytes
+    \test:nn { "0520 }  { 11 } \NEWLINE % Two bytes
+    \test:nn { "0AAA }  { 11 } \NEWLINE % Three bytes
+    \test:nn { "1124 }  { 11 } \NEWLINE % Three bytes
+    \test:nn { "10009 } { 11 } \NEWLINE % Four bytes
+  }
+
+\TESTEXP { Spaces } % Checks the special handling of codepoint 32.
+  {
+    X \codepoint_generate:nn { 32 } { 11 } X \NEWLINE % Catcode 11 requested, a space is still produced
+    X \codepoint_generate:nn { 32 } { 12 } X \NEWLINE % Catcode 12 requested, a space is still produced
+    X \codepoint_generate:nn { 97 } { 10 } X \NEWLINE % Codepoint 97 with catcode 10 still logs as "a"
+  }
+
+\END
diff --git a/l3kernel/testfiles/m3unicode001.tlg b/l3kernel/testfiles/m3unicode001.tlg
@@ -0,0 +1,23 @@
+This is a generated file for the LaTeX (2e + expl3) validation system.
+Don't change this file in any respect.
+Author: Joseph Wright
+============================================================
+TEST 1: Codepoint to chars
+============================================================
+AA
+AA
+^^c2^^ad^^c2^^ad
+^^c4^^af^^c4^^af
+^^cd^^b0^^cd^^b0
+^^d4^^a0^^d4^^a0
+^^e0^^aa^^aa^^e0^^aa^^aa
+^^e1^^84^^a4^^e1^^84^^a4
+^^f0^^90^^80^^89^^f0^^90^^80^^89
+============================================================
+============================================================
+TEST 2: Spaces
+============================================================
+X X
+X X
+XaX
+============================================================