Split tl and str case data

This avoids needing __unicode but also makes the data more semantic: tokenized vs detokenized.
latex3 · Mar 16, 2018 · e5afb92 · e5afb92
1 parent 4b4efa0
commit e5afb92
Show file tree

Hide file tree

Showing 12 changed files with 3,533 additions and 2,798 deletions.
diff --git a/l3kernel/l3candidates.dtx b/l3kernel/l3candidates.dtx
@@ -3036,24 +3036,24 @@
   {
     \@@_change_case_output:fwn
       {
-        \cs_if_exist_use:cF { c__unicode_lower_ \token_to_str:N #1 _tl }
+        \cs_if_exist_use:cF { c_@@_lower_case_ \token_to_str:N #1 _tl }
           { \@@_change_case_char_auxii:nN { lower } #1 }
       }
   }
 \cs_new:Npn \@@_change_case_char_upper:N #1
   {
     \@@_change_case_output:fwn
       {
-        \cs_if_exist_use:cF { c__unicode_upper_ \token_to_str:N #1 _tl }
+        \cs_if_exist_use:cF { c_@@_upper_case_ \token_to_str:N #1 _tl }
           { \@@_change_case_char_auxii:nN { upper } #1 }
       }
   }
 \cs_new:Npn \@@_change_case_char_mixed:N #1
   {
-    \cs_if_exist:cTF { c__unicode_mixed_ \token_to_str:N #1 _tl }
+    \cs_if_exist:cTF { c_@@_mixed_case_ \token_to_str:N #1 _tl }
       {
         \@@_change_case_output:fwn
-          { \tl_use:c { c__unicode_mixed_ \token_to_str:N #1 _tl } }
+          { \tl_use:c { c_@@_mixed_case_ \token_to_str:N #1 _tl } }
       }
       { \@@_change_case_char_upper:N #1 }
   }
@@ -3083,10 +3083,10 @@
       { \@@_change_case_char_UTFviii:nnN {#1} {#2#4#5#6} #3 }
     \cs_new:Npn \@@_change_case_char_UTFviii:nnN #1#2#3
       {
-        \cs_if_exist:cTF { c__unicode_ #1 _ \tl_to_str:n {#2} _tl }
+        \cs_if_exist:cTF { c_@@_ #1 _case_ \tl_to_str:n {#2} _tl }
           {
             \@@_change_case_output:vwn
-              { c__unicode_ #1 _ \tl_to_str:n {#2} _tl }
+              { c_@@_ #1 _case_ \tl_to_str:n {#2} _tl }
           }
           { \@@_change_case_output:nwn {#2} }
         #3
@@ -3320,7 +3320,7 @@
   {
     \tl_if_head_is_N_type:nTF {#1}
       { \@@_change_case_lower_sigma:Nw #1 \q_recursion_stop }
-      { \c__unicode_final_sigma_tl }
+      { \c_@@_final_sigma_tl }
   }
 \cs_new:Npn \@@_change_case_lower_sigma:Nw #1#2 \q_recursion_stop
   {
@@ -3331,8 +3331,8 @@
       }
       {
         \token_if_letter:NTF #1
-          { \c__unicode_std_sigma_tl }
-          { \c__unicode_final_sigma_tl }
+          { \c_@@_std_sigma_tl }
+          { \c_@@_final_sigma_tl }
       }
   }
 %    \end{macrocode}
@@ -3380,7 +3380,7 @@
       {
         \tl_if_head_is_N_type:nTF {#2}
           { \@@_change_case_lower_tr_auxii:Nw #2 \q_recursion_stop }
-          { \@@_change_case_output:Vwn \c__unicode_dotless_i_tl }
+          { \@@_change_case_output:Vwn \c_@@_dotless_i_tl }
         #1 #2 \q_recursion_stop
       }
     \cs_new:Npn \@@_change_case_lower_tr_auxii:Nw #1#2 \q_recursion_stop
@@ -3394,7 +3394,7 @@
             \bool_lazy_or:nnTF
               { \token_if_cs_p:N #1 }
               { ! \int_compare_p:nNn { `#1 } = { "0307 } }
-              { \@@_change_case_output:Vwn \c__unicode_dotless_i_tl }
+              { \@@_change_case_output:Vwn \c_@@_dotless_i_tl }
               {
                 \@@_change_case_output:nwn { i }
                 \use_i:nn
@@ -3412,7 +3412,7 @@
     \cs_new:Npn \@@_change_case_lower_tr:Nnw #1#2
       {
         \int_compare:nNnTF { `#1 } = { "0049 }
-          { \@@_change_case_output:Vwn \c__unicode_dotless_i_tl }
+          { \@@_change_case_output:Vwn \c_@@_dotless_i_tl }
           {
             \int_compare:nNnTF { `#1 } = { 196 }
               { \@@_change_case_lower_tr_auxi:Nw #1 {#2} }
@@ -3438,7 +3438,7 @@
 \cs_new:Npn \@@_change_case_upper_tr:Nnw #1#2
   {
     \int_compare:nNnTF { `#1 } = { "0069 }
-      { \@@_change_case_output:Vwn \c__unicode_dotted_I_tl }
+      { \@@_change_case_output:Vwn \c_@@_dotted_I_tl }
       {#2}
   }
 %    \end{macrocode}
@@ -3476,7 +3476,7 @@
 \cs_new:Npn \@@_change_case_lower_lt:Nnw #1
   {
     \exp_args:Nf \@@_change_case_lower_lt:nNnw
-      { \str_case:nVF #1 \c__unicode_accents_lt_tl \exp_stop_f: }
+      { \str_case:nVF #1 \c_@@_accents_lt_tl \exp_stop_f: }
       #1
   }
 \cs_new:Npn \@@_change_case_lower_lt:nNnw #1#2
@@ -3489,7 +3489,7 @@
               {
                 { "0049 } i
                 { "004A } j
-                { "012E } \c__unicode_i_ogonek_tl
+                { "012E } \c_@@_i_ogonek_tl
               }
               \exp_stop_f:
           }
@@ -3536,7 +3536,7 @@
                 { \int_compare_p:nNn { `#2 } = { "0303 } }
               }
           }
-          { \@@_change_case_output:Vwn \c__unicode_dot_above_tl }
+          { \@@_change_case_output:Vwn \c_@@_dot_above_tl }
         #1 #2#3 \q_recursion_stop
       }
   }
@@ -3553,7 +3553,7 @@
           {
             { "0069 } I
             { "006A } J
-            { "012F } \c__unicode_I_ogonek_tl
+            { "012F } \c_@@_I_ogonek_tl
           }
           \exp_stop_f:
       }
@@ -3606,7 +3606,7 @@
 \cs_new:cpn { @@_change_case_upper_de-alt:Nnw } #1#2
   {
     \int_compare:nNnTF { `#1 } = { 223 }
-      { \@@_change_case_output:Vwn \c__unicode_upper_Eszett_tl }
+      { \@@_change_case_output:Vwn \c_@@_upper_Eszett_tl }
       {#2}
   }
 %    \end{macrocode}
@@ -3672,21 +3672,21 @@
 %
 % \begin{variable}
 %   {
-%     \c__unicode_std_sigma_tl    ,
-%     \c__unicode_final_sigma_tl  ,
-%     \c__unicode_accents_lt_tl   ,
-%     \c__unicode_dot_above_tl    ,
-%     \c__unicode_upper_Eszett_tl
+%     \c_@@_std_sigma_tl    ,
+%     \c_@@_final_sigma_tl  ,
+%     \c_@@_accents_lt_tl   ,
+%     \c_@@_dot_above_tl    ,
+%     \c_@@_upper_Eszett_tl
 %   }
 %   The above needs various special token lists containing pre-formed characters.
 %   This set is only available in Unicode engines, with no-op definitions
 %   for $8$-bit use.
 %    \begin{macrocode}
 \cs_if_exist:NTF \utex_char:D
   {
-    \tl_const:Nx \c__unicode_std_sigma_tl    { \utex_char:D "03C3 ~ }
-    \tl_const:Nx \c__unicode_final_sigma_tl  { \utex_char:D "03C2 ~ }
-    \tl_const:Nx \c__unicode_accents_lt_tl
+    \tl_const:Nx \c_@@_std_sigma_tl    { \utex_char:D "03C3 ~ }
+    \tl_const:Nx \c_@@_final_sigma_tl  { \utex_char:D "03C2 ~ }
+    \tl_const:Nx \c_@@_accents_lt_tl
       {
         \utex_char:D "00CC ~
           { \utex_char:D "0069 ~ \utex_char:D "0307 ~ \utex_char:D "0300 ~ }
@@ -3695,24 +3695,24 @@
         \utex_char:D "0128 ~
           { \utex_char:D "0069 ~ \utex_char:D "0307 ~ \utex_char:D "0303 ~ }
       }
-    \tl_const:Nx \c__unicode_dot_above_tl    { \utex_char:D "0307 ~ }
-    \tl_const:Nx \c__unicode_upper_Eszett_tl { \utex_char:D "1E9E ~ }
+    \tl_const:Nx \c_@@_dot_above_tl    { \utex_char:D "0307 ~ }
+    \tl_const:Nx \c_@@_upper_Eszett_tl { \utex_char:D "1E9E ~ }
   }
   {
-      \tl_const:Nn \c__unicode_std_sigma_tl    { }
-      \tl_const:Nn \c__unicode_final_sigma_tl  { }
-      \tl_const:Nn \c__unicode_accents_lt_tl   { }
-      \tl_const:Nn \c__unicode_dot_above_tl    { }
-      \tl_const:Nn \c__unicode_upper_Eszett_tl { }
+      \tl_const:Nn \c_@@_std_sigma_tl    { }
+      \tl_const:Nn \c_@@_final_sigma_tl  { }
+      \tl_const:Nn \c_@@_accents_lt_tl   { }
+      \tl_const:Nn \c_@@_dot_above_tl    { }
+      \tl_const:Nn \c_@@_upper_Eszett_tl { }
   }
 %    \end{macrocode}
 % \end{variable}
 % \begin{variable}
 %   {
-%     \c__unicode_dotless_i_tl    ,
-%     \c__unicode_dotted_I_tl     ,
-%     \c__unicode_i_ogonek_tl     ,
-%     \c__unicode_I_ogonek_tl     ,
+%     \c_@@_dotless_i_tl    ,
+%     \c_@@_dotted_I_tl     ,
+%     \c_@@_i_ogonek_tl     ,
+%     \c_@@_I_ogonek_tl     ,
 %   }
 %  For cases where there is an $8$-bit option in the |T1| font set up,
 %  a variant is provided in both cases.
@@ -3743,10 +3743,10 @@
           \group_end:
         }
     }
-  \@@_tmp:w \c__unicode_dotless_i_tl { 0131 }
-  \@@_tmp:w \c__unicode_dotted_I_tl  { 0130 }
-  \@@_tmp:w \c__unicode_i_ogonek_tl  { 012F }
-  \@@_tmp:w \c__unicode_I_ogonek_tl  { 012E }  
+  \@@_tmp:w \c_@@_dotless_i_tl { 0131 }
+  \@@_tmp:w \c_@@_dotted_I_tl  { 0130 }
+  \@@_tmp:w \c_@@_i_ogonek_tl  { 012F }
+  \@@_tmp:w \c_@@_I_ogonek_tl  { 012E }  
 \group_end:
 %    \end{macrocode}
 % \end{variable}
@@ -3777,7 +3777,7 @@
         {
           \tl_const:cx
             {
-              c__unicode_lower_
+              c_@@_lower_case_
               \char_generate:nn {#2} { 12 }
               \char_generate:nn {#3} { 12 }
               _tl
@@ -3790,7 +3790,7 @@
             }
           \tl_const:cx
             {
-              c__unicode_upper_
+              c_@@_upper_case_
               \char_generate:nn {#5} { 12 }
               \char_generate:nn {#6} { 12 }
               _tl
@@ -3911,7 +3911,7 @@
               {
                 \tl_const:cx
                   {
-                    c__unicode_ #3 _
+                    c_@@_ #3 _case_
                     \char_generate:nn {##2} { 12 }
                     \char_generate:nn {##3} { 12 }
                     _tl

diff --git a/l3kernel/l3str.dtx b/l3kernel/l3str.dtx
@@ -1926,10 +1926,10 @@
   {
     \quark_if_recursion_tail_stop_do:Nn #2
       { \@@_change_case_end:wn }
-    \cs_if_exist:cTF { c__unicode_ #1 _ #2 _tl }
+    \cs_if_exist:cTF { c__str_ #1 _case_ #2 _str }
       {
         \@@_change_case_output:fw
-          { \tl_to_str:c { c__unicode_ #1 _ #2 _tl } }
+          { \str_use:c { c__str_ #1 _case_ #2 _str } }
       }
       { \@@_change_case_char_aux:nN {#1} #2 }
     \@@_change_case_loop:nw {#1}
@@ -2037,9 +2037,8 @@
 % one-to-one situations and does not fully handle, for example, case folding.
 %
 % The data required for cross-module manipulations is loaded here: currently
-% this means for |str| and |tl| functions. As such, the prefix used is not
-% |str| but rather |unicode|. For performance (as the entire data set must
-% be read during each run) and as this code comes somewhat early in the
+% this means for |str| and |tl| functions. For performance (as the entire data
+% set must be read during each run) and as this code comes somewhat early in the
 % load process, there is quite a bit of low-level code here.
 %
 % As only the data needs to remain at the end of this process, everything
@@ -2123,22 +2122,6 @@
           \fi:
         }
 %    \end{macrocode}
-% Storing each exception is always done in the same way: create a constant
-% token list which expands to exactly the mapping. These have the
-% category codes \enquote{now} (so should be letters) but are later detokenized
-% for string use.
-%    \begin{macrocode}
-      \cs_set_protected:Npn \@@_store:nnnnn #1#2#3#4#5
-        {
-          \tl_const:cx { c_@@_ #2 _ \utex_char:D "#1 _tl }
-            {
-              \utex_char:D "#3 ~
-              \utex_char:D "#4 ~
-              \tl_if_blank:nF {#5}
-                { \utex_char:D "#5 }
-            }
-        }
-%    \end{macrocode}
 % Parse the main Unicode data file for title case exceptions (the one-to-one
 % lower and upper case mappings it contains are all covered by the \TeX{}
 % data).
@@ -2154,7 +2137,7 @@
               \if_int_compare:w \__str_if_eq_x:nn { #5 ~ } {#7} = 0 \exp_stop_f:
               \else:
                 \tl_const:cx
-                  { c_@@_mixed_ \utex_char:D "#1 _tl }
+                  { c__tl_mixed_case_ \utex_char:D "#1 _tl }
                   { \utex_char:D "#7 }
               \fi:
             }
@@ -2171,8 +2154,8 @@
           \if_int_compare:w \__str_if_eq_x:nn {#2} { C } = 0 \exp_stop_f:
             \if_int_compare:w \tex_lccode:D "#1 = "#3 \scan_stop:
             \else:
-              \tl_const:cx
-                { c_@@_fold_ \utex_char:D "#1 _tl }
+              \str_const:cx
+                { c__str_fold_case_ \utex_char:D "#1 _str }
                 { \utex_char:D "#3 ~ }
             \fi:
           \else:
@@ -2183,10 +2166,21 @@
         }
       \cs_set_protected:Npn \@@_parse_auxii:w #1 ~ #2 ~ #3 ~ #4 \q_stop
         { \@@_store:nnnnn {#1} { fold } {#2} {#3} {#4} }
+      \cs_set_protected:Npn \@@_store:nnnnn #1#2#3#4#5
+        {
+          \str_const:cx { c__str_fold_case_ \utex_char:D "#1 _str }
+            {
+              \utex_char:D "#3 ~
+              \utex_char:D "#4 ~
+              \tl_if_blank:nF {#5}
+                { \utex_char:D "#5 }
+            }
+        }
       \@@_map_inline:n { CaseFolding.txt }
 %    \end{macrocode}
 % For upper and lower casing special situations, there is a bit more to
-% do as we also have title casing to consider.
+% do as we also have title casing to consider. Here, we have both token list
+% and string data to save.
 %    \begin{macrocode}
       \cs_set_protected:Npn \@@_parse_auxi:w #1 ;~ #2 ;~ #3 ;~ #4 ; #5 \q_stop
         {
@@ -2202,6 +2196,21 @@
           \tl_if_empty:nF {#4}
             { \@@_store:nnnnn {#1} {#2} {#3} {#4} {#5} }
         }
+      \cs_set_protected:Npn \@@_store:nnnnn #1#2#3#4#5
+        {
+          \tl_const:cx { c__tl_ #2 _case_ \utex_char:D "#1 _tl }
+            {
+              \utex_char:D "#3 ~
+              \utex_char:D "#4 ~
+              \tl_if_blank:nF {#5}
+                { \utex_char:D "#5 }
+            }
+          \if_int_compare:w \__str_if_eq_x:nn {#2} { mixed } = 0 \exp_stop_f:
+          \else:
+            \str_const:cx { c__str_ #2 _case_ \utex_char:D "#1 _str }
+              { \tl_use:c { c__tl_ #2 _case_ \utex_char:D "#1 _tl } }
+          \fi:
+        }
       \@@_map_inline:n { SpecialCasing.txt }
     }
 %    \end{macrocode}
@@ -2217,9 +2226,11 @@
           \if_meaning:w \q_recursion_tail #2
             \exp_after:wN \use_none_delimit_by_q_recursion_stop:w
           \fi:
-          \tl_const:cn { c_@@_fold_  #1 _tl } {#2}
-          \tl_const:cn { c_@@_lower_ #1 _tl } {#2}
-          \tl_const:cn { c_@@_upper_ #2 _tl } {#1}
+          \str_const:cn { c__str_fold_case_  #1 _str } {#2}
+          \str_const:cn { c__str_lower_case_ #1 _str } {#2}
+          \str_const:cn { c__str_upper_case_ #2 _str } {#1}
+          \tl_const:cn { c__tl_lower_case_ #1 _tl } {#2}
+          \tl_const:cn { c__tl_upper_case_ #2 _tl } {#1}
           \@@_tmp:NN
         }
       \@@_tmp:NN