Enable CJK token handling for (u)pTeX (fixes #1171)

latex3 · Feb 13, 2023 · commit a027168
1 parent 428132c
Show file tree

Hide file tree

Showing 13 changed files with 822 additions and 98 deletions.
diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
@@ -7,6 +7,10 @@ this project uses date-based 'snapshot' version identifiers.
 
 ## [Unreleased]
 
+### Fixed
+- CJK character handling for (u)pTeX (issue
+  [\#1171](https://github.com/latex3/latex3/issues/1171))
+
 ## [2023-02-07]
 
 ### Changed

diff --git a/l3kernel/l3str.dtx b/l3kernel/l3str.dtx
@@ -1917,7 +1917,8 @@
 % \begin{macro}[EXP]{\@@_change_case_end:nw}
 % \begin{macro}[EXP]{\@@_change_case_loop:nw}
 % \begin{macro}[EXP]{\@@_change_case_space:n}
-% \begin{macro}[EXP]{\@@_change_case_char:nN, \@@_change_case_char_aux:nN}
+% \begin{macro}[EXP]
+%   {\@@_change_case_char:nN, \@@_change_case_char_auxi:nN, \@@_change_case_char_auxii:nN}
 % \begin{macro}[EXP]{\@@_change_case_codepoint:nN}
 % \begin{macro}[EXP]{\@@_change_case_codepoint:nNN}
 % \begin{macro}[EXP]{\@@_change_case_codepoint:nNNN}
@@ -1977,22 +1978,33 @@
   \cs_new:Npn \@@_change_case_codepoint:nN #1#2
     { \@@_change_case_char:fnn { \int_eval:n {`#2} } {#1} {#2} }
 \else:
-    \cs_new:Npn \@@_change_case_codepoint:nN #1#2
+    \cs_new:Npx \@@_change_case_codepoint:nN #1#2
       {
-        \int_compare:nNnTF {`#2} > { "80 }
+        \exp_not:N \int_compare:nNnTF {`#2} > { "80 }
           {
-            \int_compare:nNnTF {`#2} < { "E0 }
-              { \@@_change_case_codepoint:nNN }
+            \cs_if_exist:NTF \tex_pdftexversion:D
+              { \exp_not:N \@@_change_case_char_auxi:nN }
               {
-                 \int_compare:nNnTF {`#2} < { "F0 }
-                   { \@@_change_case_codepoint:nNNN }
-                   { \@@_change_case_codepoint:nNNNNN }
+                \exp_not:N \int_compare:nNnTF {`#2} > { "FF }
+                  { \exp_not:N \@@_change_case_char_auxii:nN }
+                  { \exp_not:N \@@_change_case_char_auxi:nN }
               }
           }
-          { \@@_change_case_char_aux:nN }
+          { \exp_not:N \@@_change_case_char_auxii:nN }
+            {#1} #2
+      }
+    \cs_new:Npn \@@_change_case_char_auxi:nN #1#2
+      {
+        \int_compare:nNnTF {`#2} < { "E0 }
+          { \@@_change_case_codepoint:nNN }
+          {
+             \int_compare:nNnTF {`#2} < { "F0 }
+               { \@@_change_case_codepoint:nNNN }
+               { \@@_change_case_codepoint:nNNNNN }
+          }
             {#1} #2
       }
-    \cs_new:Npn \@@_change_case_char_aux:nN #1#2
+    \cs_new:Npn \@@_change_case_char_auxii:nN #1#2
       { \@@_change_case_char:fnn { \int_eval:n {`#2} } {#1} {#2} }
     \cs_new:Npn \@@_change_case_codepoint:nNN #1#2#3
       {

diff --git a/l3kernel/l3text.dtx b/l3kernel/l3text.dtx
@@ -633,7 +633,7 @@
 %
 % For working with codepoints in an engine-neutral way.
 %
-% \begin{macro}[EXP]{\@@_codepoint_process:nN}
+% \begin{macro}[EXP]{\@@_codepoint_process:nN, \@@_codepoint_process_aux:nN}
 % \begin{macro}[EXP]{\@@_codepoint_process:nNN}
 % \begin{macro}[EXP]{\@@_codepoint_process:nNNN}
 % \begin{macro}[EXP]{\@@_codepoint_process:nNNNN}
@@ -647,43 +647,38 @@
     \cs_new:Npn \@@_codepoint_process:nN #1#2 { #1 {#2} }
   }
   {
-    \cs_new:Npn \@@_codepoint_process:nN #1#2
+    \cs_new:Npx \@@_codepoint_process:nN #1#2
       {
-        \int_compare:nNnTF { `#2 } > { "80 }
+        \exp_not:N \int_compare:nNnTF {`#2} > { "80 }
           {
-            \int_compare:nNnTF { `#2 } < { "E0 }
-              { \@@_codepoint_process:nNN }
+            \sys_if_engine_pdftex:TF
+              { \exp_not:N \@@_codepoint_process_aux:nN }
               {
-                 \int_compare:nNnTF { `#2 } < { "F0 }
-                   { \@@_codepoint_process:nNNN }
-                   { \@@_codepoint_process:nNNNN }
+                \exp_not:N \int_compare:nNnTF {`#2} > { "FF }
+                  { \exp_not:N \use:n }
+                  { \exp_not:N \@@_codepoint_process_aux:nN }
               }
           }
-          { \use:n }
+          { \exp_not:N \use:n }
             {#1} #2
       }
-    \cs_new:Npn \@@_codepoint_process:nNN #1#2#3
-      { #1 {#2#3} }
-    \sys_if_engine_ptex:TF
+    \cs_new:Npn \@@_codepoint_process_aux:nN #1#2
       {
-        \cs_gset:Npn \@@_codepoint_process:nN #1#2
+        \int_compare:nNnTF { `#2 } < { "E0 }
+          { \@@_codepoint_process:nNN }
           {
-            \int_compare:nNnTF { `#2 } > { "80 }
-              {
-                \int_compare:nNnTF { `#2 } < { "E0 }
-                  { \@@_codepoint_process:nNN }
-                  { \use:n }
-              }
-          { \use:n }
+             \int_compare:nNnTF { `#2 } < { "F0 }
+               { \@@_codepoint_process:nNNN }
+               { \@@_codepoint_process:nNNNN }
+          }
             {#1} #2
         }
-      }
-      {
-        \cs_new:Npn \@@_codepoint_process:nNNN #1#2#3#4
-          { #1 {#2#3#4} }
-        \cs_new:Npn \@@_codepoint_process:nNNNN #1#2#3#4#5
-          { #1 {#2#3#4#5} }
-      }
+    \cs_new:Npn \@@_codepoint_process:nNN #1#2#3
+      { #1 {#2#3} }
+    \cs_new:Npn \@@_codepoint_process:nNNN #1#2#3#4
+      { #1 {#2#3#4} }
+    \cs_new:Npn \@@_codepoint_process:nNNNN #1#2#3#4#5
+      { #1 {#2#3#4#5} }
   }
 %    \end{macrocode}
 % \end{macro}
@@ -692,7 +687,8 @@
 % \end{macro}
 %
 % \begin{macro}[EXP, pTF]{\@@_codepoint_compare:nNn}
-% \begin{macro}[EXP]{\@@_codepoint_from_chars:Nw}
+% \begin{macro}[EXP]
+%   {\@@_codepoint_from_chars:Nw, \@@_codepoint_from_chars_aux:Nw}
 % \begin{macro}[EXP]{\@@_codepoint_from_chars:N}
 % \begin{macro}[EXP]{\@@_codepoint_from_chars:NN}
 % \begin{macro}[EXP]{\@@_codepoint_from_chars:NNN}
@@ -720,62 +716,56 @@
             #2 {#3}
           \prg_return_true: \prg_return_false:
       }
-    \cs_new:Npn \@@_codepoint_from_chars:Nw #1
+    \cs_new:Npx \@@_codepoint_from_chars:Nw #1
+      {
+        \exp_not:N \if_int_compare:w `#1 > "80 \exp_not:N \exp_stop_f:
+          \sys_if_engine_pdftex:TF
+            {
+              \exp_not:N \exp_after:wN
+                \exp_not:N \@@_codepoint_from_chars_aux:Nw
+            }
+            {
+              \exp_not:N \if_int_compare:w `#1 > "FF \exp_not:N \exp_stop_f:
+                \exp_not:N \exp_after:wN \exp_not:N \exp_after:wN
+                  \exp_not:N \exp_after:wN
+                  \exp_not:N \@@_codepoint_from_chars:N
+              \exp_not:N \else:
+                \exp_not:N \exp_after:wN \exp_not:N \exp_after:wN
+                  \exp_not:N \exp_after:wN
+                  \exp_not:N \@@_codepoint_from_chars_aux:Nw
+              \exp_not:N \fi:
+            }
+        \exp_not:N \else:
+          \exp_not:N \exp_after:wN \exp_not:N \@@_codepoint_from_chars:N
+        \exp_not:N \fi:
+          #1
+      }
+    \cs_new:Npn \@@_codepoint_from_chars_aux:Nw #1
       {
-        \if_int_compare:w `#1 > "80 \exp_stop_f:
-          \if_int_compare:w `#1 < "E0 \exp_stop_f:
+        \if_int_compare:w `#1 < "E0 \exp_stop_f:
+          \exp_after:wN \@@_codepoint_from_chars:NN
+        \else:
+          \if_int_compare:w `#1 < "F0 \exp_stop_f:
             \exp_after:wN \exp_after:wN \exp_after:wN
-              \@@_codepoint_from_chars:NN
+              \@@_codepoint_from_chars:NNN
           \else:
-            \if_int_compare:w `#1 < "F0 \exp_stop_f:
-              \exp_after:wN \exp_after:wN \exp_after:wN
-              \exp_after:wN \exp_after:wN \exp_after:wN
-              \exp_after:wN \@@_codepoint_from_chars:NNN
-            \else:
-              \exp_after:wN \exp_after:wN \exp_after:wN
-              \exp_after:wN \exp_after:wN \exp_after:wN
-              \exp_after:wN \@@_codepoint_from_chars:NNNN
-            \fi:
+            \exp_after:wN \exp_after:wN \exp_after:wN
+              \@@_codepoint_from_chars:NNNN
           \fi:
-        \else:
-          \exp_after:wN \@@_codepoint_from_chars:N
         \fi:
           #1
       }
-    \cs_new:Npn \@@_codepoint_from_chars:N #1 { `#1 }
+    \cs_new:Npn \@@_codepoint_from_chars:N #1 {`#1}
     \cs_new:Npn \@@_codepoint_from_chars:NN #1#2
       { (`#1 - "C0) * "40 + `#2 - "80 }
-    %    \end{macrocode}
-    %   Avoid high chars with p\TeX{}.
-    %    \begin{macrocode}
-    \sys_if_engine_ptex:TF
+    \cs_new:Npn \@@_codepoint_from_chars:NNN #1#2#3
+      { (`#1 - "E0) * "1000 + (`#2 - "80) * "40 + `#3 - "80 }
+    \cs_new:Npn \@@_codepoint_from_chars:NNNN #1#2#3#4
       {
-        \cs_gset:Npn \@@_codepoint_from_chars:Nw #1
-          {
-            \if_int_compare:w `#1 > "80 \exp_stop_f:
-              \if_int_compare:w `#1 < "E0 \exp_stop_f:
-                \exp_after:wN \exp_after:wN \exp_after:wN
-                  \@@_codepoint_from_chars:NN
-              \else:
-                \exp_after:wN \exp_after:wN \exp_after:wN
-                  \@@_codepoint_from_chars:N
-              \fi:
-            \else:
-              \exp_after:wN \@@_codepoint_from_chars:N
-            \fi:
-              #1
-          }
-      }
-      {
-        \cs_new:Npn \@@_codepoint_from_chars:NNN #1#2#3
-          { (`#1 - "E0) * "1000 + (`#2 - "80) * "40 + `#3 - "80 }
-        \cs_new:Npn \@@_codepoint_from_chars:NNNN #1#2#3#4
-          {
-              (`#1 - "F0) * "40000 
-            + (`#2 - "80) * "1000
-            + (`#3 - "80) * "40
-            + `#4 - "80
-          }
+          (`#1 - "F0) * "40000 
+        + (`#2 - "80) * "1000
+        + (`#3 - "80) * "40
+        + `#4 - "80
       }
   }
 %    \end{macrocode}

diff --git a/l3kernel/testfiles/m3str-convert005.lvt b/l3kernel/testfiles/m3str-convert005.lvt
@@ -11,7 +11,7 @@
 \ExplSyntaxOff
 
 \begin{document}
-\ifdefined\disablecjktoken\disablecjktoken\fi
+
 \START
 \AUTHOR{Joseph Wright}
 \ExplSyntaxOn

diff --git a/l3kernel/testfiles/m3str-convert005.uptex.tlg b/l3kernel/testfiles/m3str-convert005.uptex.tlg
@@ -0,0 +1,16 @@
+This is a generated file for the LaTeX (2e + expl3) validation system.
+Don't change this file in any respect.
+Author: Joseph Wright
+============================================================
+TEST 1: PDF names
+============================================================
+abczz
+brackets#28#29#5B#5D#7B#7D#3C#3Exxx
+gr#C3#BC#C3#9Fe##
+============================================================
+============================================================
+TEST 2: PDF names with spaces
+============================================================
+abc#20cde
+abc#20cde
+============================================================
diff --git a/l3kernel/testfiles/m3str002.lvt b/l3kernel/testfiles/m3str002.lvt
@@ -6,7 +6,7 @@
 \ExplSyntaxOn
 \debug_on:n { check-declarations , deprecation , log-functions }
 \ExplSyntaxOff
-\ifdefined\disablecjktoken\disablecjktoken\fi
+
 \START
 \AUTHOR{Joseph Wright}
 \ExplSyntaxOn
@@ -33,7 +33,7 @@
     \tl_set:Nx \l_tmpb_tl{ \str_casefold:n { ABC~123 } }
     \tl_if_eq:NNTF \l_tmpa_tl \l_tmpb_tl \TRUE \ERROR
   }
-\sys_if_engine_ptex:T { \END }
+
 \TESTEXP { Accented~characters,~etc. }
   {
     " \str_uppercase:n { Café } "

diff --git a/l3kernel/testfiles/m3str002.ptex.tlg b/l3kernel/testfiles/m3str002.ptex.tlg
@@ -14,3 +14,18 @@ TEST 2: Checking category codes
 FALSE
 TRUE
 ============================================================
+============================================================
+TEST 3: Accented characters, etc.
+============================================================
+"CAF^^c3^^89"
+"^^c4^^87^^c4^^97^^c9^^97^^e1^^b9^^91^^e1^^b9^^91"
+"^^e1^^bd^^a2^^ce^^b9ωΝ"
+"^^cf^^85^^cc^^88^^cc^^81^^cf^^85^^cc^^88^^cc^^80st"
+"^^ea^^9a^^89^^ea^^9a^^87"
+"Ｚ^^ea^^9d^^8f^^e2^^93^^a7"
+============================================================
+============================================================
+TEST 4: Characters with context-sensitive Unicode behaviour
+============================================================
+FALSE
+============================================================
diff --git a/l3kernel/testfiles/m3str002.uptex.tlg b/l3kernel/testfiles/m3str002.uptex.tlg
@@ -19,13 +19,13 @@ TEST 3: Accented characters, etc.
 ============================================================
 "CAF^^c3^^89"
 "^^c4^^87^^c4^^97^^c9^^97^^e1^^b9^^91^^e1^^b9^^91"
-"^^e1^^bd^^a2^^ce^^b9^^cf^^89^^ce^^bd"
+"^^e1^^bd^^a2^^ce^^b9ω^^ce^^bd"
 "^^cf^^85^^cc^^88^^cc^^81^^cf^^85^^cc^^88^^cc^^80st"
-"^^ea^^9a^^89^^ea^^9a^^87"
+"^^ea^^9a^^89ꚇ"
 "^^ef^^bd^^9a^^ea^^9d^^8f^^e2^^93^^a7"
 ============================================================
 ============================================================
 TEST 4: Characters with context-sensitive Unicode behaviour
 ============================================================
-TRUE
+FALSE
 ============================================================
diff --git a/l3kernel/testfiles/m3text002.lvt b/l3kernel/testfiles/m3text002.lvt
@@ -7,7 +7,7 @@
 \ExplSyntaxOn
 \debug_on:n { check-declarations , deprecation , log-functions }
 \ExplSyntaxOff
-\ifdefined\disablecjktoken\disablecjktoken\fi
+
 \START
 \AUTHOR{Joseph Wright}
 \ExplSyntaxOn
@@ -125,8 +125,6 @@
     \test:n { ABCÈ日本語}
   }
 
-\sys_if_engine_ptex:T { \END }
-
 \TESTEXP { Unicode~case~changing }
   {
     \test:n { åéîøὭдαƐ }