Skip to content

Commit

Permalink
Split tl and str case data
Browse files Browse the repository at this point in the history
This avoids needing __unicode but also makes the
data more semantic: tokenized vs detokenized.
  • Loading branch information
josephwright committed Mar 16, 2018
1 parent 4b4efa0 commit e5afb92
Show file tree
Hide file tree
Showing 12 changed files with 3,533 additions and 2,798 deletions.
88 changes: 44 additions & 44 deletions l3kernel/l3candidates.dtx
Expand Up @@ -3036,24 +3036,24 @@
{
\@@_change_case_output:fwn
{
\cs_if_exist_use:cF { c__unicode_lower_ \token_to_str:N #1 _tl }
\cs_if_exist_use:cF { c_@@_lower_case_ \token_to_str:N #1 _tl }
{ \@@_change_case_char_auxii:nN { lower } #1 }
}
}
% Upper-case handler for a single token #1.  The material handed to
% \@@_change_case_output:fwn is a per-character constant token list,
% looked up by name from the string form of #1, when such a control
% sequence exists; otherwise the fallback
% \@@_change_case_char_auxii:nN { upper } #1 is used instead.
\cs_new:Npn \@@_change_case_char_upper:N #1
{
\@@_change_case_output:fwn
{
% NOTE(review): the next two lines differ only in the name of the
% constant (c__unicode_upper_..._tl versus c_@@_upper_case_..._tl); they
% look like the removed and added sides of one diff hunk -- confirm that
% only one of them belongs in the real file.
\cs_if_exist_use:cF { c__unicode_upper_ \token_to_str:N #1 _tl }
\cs_if_exist_use:cF { c_@@_upper_case_ \token_to_str:N #1 _tl }
{ \@@_change_case_char_auxii:nN { upper } #1 }
}
}
% Mixed-case handler for a single token #1.  If a dedicated mixed-case
% constant exists for the string form of #1, its content is passed to
% \@@_change_case_output:fwn; if not, the token is handed to
% \@@_change_case_char_upper:N.
\cs_new:Npn \@@_change_case_char_mixed:N #1
{
% NOTE(review): each c__unicode_mixed_ line below is immediately followed
% by an otherwise identical c_@@_mixed_case_ line; these look like the
% removed/added sides of a diff hunk -- confirm which one is live.
\cs_if_exist:cTF { c__unicode_mixed_ \token_to_str:N #1 _tl }
\cs_if_exist:cTF { c_@@_mixed_case_ \token_to_str:N #1 _tl }
{
\@@_change_case_output:fwn
{ \tl_use:c { c__unicode_mixed_ \token_to_str:N #1 _tl } }
{ \tl_use:c { c_@@_mixed_case_ \token_to_str:N #1 _tl } }
}
% No mixed-case entry: reuse the upper-case handler.
{ \@@_change_case_char_upper:N #1 }
}
Expand Down Expand Up @@ -3083,10 +3083,10 @@
{ \@@_change_case_char_UTFviii:nnN {#1} {#2#4#5#6} #3 }
\cs_new:Npn \@@_change_case_char_UTFviii:nnN #1#2#3
{
\cs_if_exist:cTF { c__unicode_ #1 _ \tl_to_str:n {#2} _tl }
\cs_if_exist:cTF { c_@@_ #1 _case_ \tl_to_str:n {#2} _tl }
{
\@@_change_case_output:vwn
{ c__unicode_ #1 _ \tl_to_str:n {#2} _tl }
{ c_@@_ #1 _case_ \tl_to_str:n {#2} _tl }
}
{ \@@_change_case_output:nwn {#2} }
#3
Expand Down Expand Up @@ -3320,7 +3320,7 @@
{
\tl_if_head_is_N_type:nTF {#1}
{ \@@_change_case_lower_sigma:Nw #1 \q_recursion_stop }
{ \c__unicode_final_sigma_tl }
{ \c_@@_final_sigma_tl }
}
\cs_new:Npn \@@_change_case_lower_sigma:Nw #1#2 \q_recursion_stop
{
Expand All @@ -3331,8 +3331,8 @@
}
{
\token_if_letter:NTF #1
{ \c__unicode_std_sigma_tl }
{ \c__unicode_final_sigma_tl }
{ \c_@@_std_sigma_tl }
{ \c_@@_final_sigma_tl }
}
}
% \end{macrocode}
Expand Down Expand Up @@ -3380,7 +3380,7 @@
{
\tl_if_head_is_N_type:nTF {#2}
{ \@@_change_case_lower_tr_auxii:Nw #2 \q_recursion_stop }
{ \@@_change_case_output:Vwn \c__unicode_dotless_i_tl }
{ \@@_change_case_output:Vwn \c_@@_dotless_i_tl }
#1 #2 \q_recursion_stop
}
\cs_new:Npn \@@_change_case_lower_tr_auxii:Nw #1#2 \q_recursion_stop
Expand All @@ -3394,7 +3394,7 @@
\bool_lazy_or:nnTF
{ \token_if_cs_p:N #1 }
{ ! \int_compare_p:nNn { `#1 } = { "0307 } }
{ \@@_change_case_output:Vwn \c__unicode_dotless_i_tl }
{ \@@_change_case_output:Vwn \c_@@_dotless_i_tl }
{
\@@_change_case_output:nwn { i }
\use_i:nn
Expand All @@ -3412,7 +3412,7 @@
\cs_new:Npn \@@_change_case_lower_tr:Nnw #1#2
{
\int_compare:nNnTF { `#1 } = { "0049 }
{ \@@_change_case_output:Vwn \c__unicode_dotless_i_tl }
{ \@@_change_case_output:Vwn \c_@@_dotless_i_tl }
{
\int_compare:nNnTF { `#1 } = { 196 }
{ \@@_change_case_lower_tr_auxi:Nw #1 {#2} }
Expand All @@ -3438,7 +3438,7 @@
% Upper-casing hook with the "tr" suffix.  #1 is the token under
% inspection and #2 is the code to run when no special handling applies.
% When the character code of #1 equals "0069, the content of the
% dotted-I constant is output; in every other case #2 is executed
% unchanged.
\cs_new:Npn \@@_change_case_upper_tr:Nnw #1#2
{
\int_compare:nNnTF { `#1 } = { "0069 }
% NOTE(review): the two output lines below differ only in the constant's
% name (\c__unicode_... versus \c_@@_...) and look like the removed/added
% sides of a diff hunk -- confirm which one is live.
{ \@@_change_case_output:Vwn \c__unicode_dotted_I_tl }
{ \@@_change_case_output:Vwn \c_@@_dotted_I_tl }
{#2}
}
% \end{macrocode}
Expand Down Expand Up @@ -3476,7 +3476,7 @@
% Entry point for lower-casing with the "lt" suffix.  Only #1 (the token
% under inspection) is grabbed here; whatever follows in the input is left
% for \@@_change_case_lower_lt:nNnw.  Before that auxiliary runs, its
% first argument is f-expanded: a \str_case lookup of #1 in the accents
% constant, with \exp_stop_f: as the "no match" branch.  #1 itself is
% passed on as the next argument.
\cs_new:Npn \@@_change_case_lower_lt:Nnw #1
{
\exp_args:Nf \@@_change_case_lower_lt:nNnw
% NOTE(review): the two lookup lines below differ only in the constant's
% name (\c__unicode_accents_lt_tl versus \c_@@_accents_lt_tl) and look
% like the removed/added sides of a diff hunk -- confirm which is live.
{ \str_case:nVF #1 \c__unicode_accents_lt_tl \exp_stop_f: }
{ \str_case:nVF #1 \c_@@_accents_lt_tl \exp_stop_f: }
#1
}
\cs_new:Npn \@@_change_case_lower_lt:nNnw #1#2
Expand All @@ -3489,7 +3489,7 @@
{
{ "0049 } i
{ "004A } j
{ "012E } \c__unicode_i_ogonek_tl
{ "012E } \c_@@_i_ogonek_tl
}
\exp_stop_f:
}
Expand Down Expand Up @@ -3536,7 +3536,7 @@
{ \int_compare_p:nNn { `#2 } = { "0303 } }
}
}
{ \@@_change_case_output:Vwn \c__unicode_dot_above_tl }
{ \@@_change_case_output:Vwn \c_@@_dot_above_tl }
#1 #2#3 \q_recursion_stop
}
}
Expand All @@ -3553,7 +3553,7 @@
{
{ "0069 } I
{ "006A } J
{ "012F } \c__unicode_I_ogonek_tl
{ "012F } \c_@@_I_ogonek_tl
}
\exp_stop_f:
}
Expand Down Expand Up @@ -3606,7 +3606,7 @@
% Upper-casing hook with the "de-alt" suffix (defined with \cs_new:cpn
% because of the hyphen in the name).  #1 is the token under inspection
% and #2 the code to run otherwise.  When the character code of #1 is
% 223, the upper-Eszett constant is output; in every other case #2 is
% executed unchanged.
\cs_new:cpn { @@_change_case_upper_de-alt:Nnw } #1#2
{
\int_compare:nNnTF { `#1 } = { 223 }
% NOTE(review): the two output lines below differ only in the constant's
% name (\c__unicode_... versus \c_@@_...) and look like the removed/added
% sides of a diff hunk -- confirm which one is live.
{ \@@_change_case_output:Vwn \c__unicode_upper_Eszett_tl }
{ \@@_change_case_output:Vwn \c_@@_upper_Eszett_tl }
{#2}
}
% \end{macrocode}
Expand Down Expand Up @@ -3672,21 +3672,21 @@
%
% \begin{variable}
% {
% \c__unicode_std_sigma_tl ,
% \c__unicode_final_sigma_tl ,
% \c__unicode_accents_lt_tl ,
% \c__unicode_dot_above_tl ,
% \c__unicode_upper_Eszett_tl
% \c_@@_std_sigma_tl ,
% \c_@@_final_sigma_tl ,
% \c_@@_accents_lt_tl ,
% \c_@@_dot_above_tl ,
% \c_@@_upper_Eszett_tl
% }
% The above needs various special token lists containing pre-formed characters.
% This set is only available in Unicode engines, with no-op definitions
% for $8$-bit use.
% \begin{macrocode}
\cs_if_exist:NTF \utex_char:D
{
\tl_const:Nx \c__unicode_std_sigma_tl { \utex_char:D "03C3 ~ }
\tl_const:Nx \c__unicode_final_sigma_tl { \utex_char:D "03C2 ~ }
\tl_const:Nx \c__unicode_accents_lt_tl
\tl_const:Nx \c_@@_std_sigma_tl { \utex_char:D "03C3 ~ }
\tl_const:Nx \c_@@_final_sigma_tl { \utex_char:D "03C2 ~ }
\tl_const:Nx \c_@@_accents_lt_tl
{
\utex_char:D "00CC ~
{ \utex_char:D "0069 ~ \utex_char:D "0307 ~ \utex_char:D "0300 ~ }
Expand All @@ -3695,24 +3695,24 @@
\utex_char:D "0128 ~
{ \utex_char:D "0069 ~ \utex_char:D "0307 ~ \utex_char:D "0303 ~ }
}
\tl_const:Nx \c__unicode_dot_above_tl { \utex_char:D "0307 ~ }
\tl_const:Nx \c__unicode_upper_Eszett_tl { \utex_char:D "1E9E ~ }
\tl_const:Nx \c_@@_dot_above_tl { \utex_char:D "0307 ~ }
\tl_const:Nx \c_@@_upper_Eszett_tl { \utex_char:D "1E9E ~ }
}
{
\tl_const:Nn \c__unicode_std_sigma_tl { }
\tl_const:Nn \c__unicode_final_sigma_tl { }
\tl_const:Nn \c__unicode_accents_lt_tl { }
\tl_const:Nn \c__unicode_dot_above_tl { }
\tl_const:Nn \c__unicode_upper_Eszett_tl { }
\tl_const:Nn \c_@@_std_sigma_tl { }
\tl_const:Nn \c_@@_final_sigma_tl { }
\tl_const:Nn \c_@@_accents_lt_tl { }
\tl_const:Nn \c_@@_dot_above_tl { }
\tl_const:Nn \c_@@_upper_Eszett_tl { }
}
% \end{macrocode}
% \end{variable}
% \begin{variable}
% {
% \c__unicode_dotless_i_tl ,
% \c__unicode_dotted_I_tl ,
% \c__unicode_i_ogonek_tl ,
% \c__unicode_I_ogonek_tl ,
% \c_@@_dotless_i_tl ,
% \c_@@_dotted_I_tl ,
% \c_@@_i_ogonek_tl ,
% \c_@@_I_ogonek_tl ,
% }
% For cases where there is an $8$-bit option in the |T1| font set up,
% a variant is provided in both cases.
Expand Down Expand Up @@ -3743,10 +3743,10 @@
\group_end:
}
}
\@@_tmp:w \c__unicode_dotless_i_tl { 0131 }
\@@_tmp:w \c__unicode_dotted_I_tl { 0130 }
\@@_tmp:w \c__unicode_i_ogonek_tl { 012F }
\@@_tmp:w \c__unicode_I_ogonek_tl { 012E }
\@@_tmp:w \c_@@_dotless_i_tl { 0131 }
\@@_tmp:w \c_@@_dotted_I_tl { 0130 }
\@@_tmp:w \c_@@_i_ogonek_tl { 012F }
\@@_tmp:w \c_@@_I_ogonek_tl { 012E }
\group_end:
% \end{macrocode}
% \end{variable}
Expand Down Expand Up @@ -3777,7 +3777,7 @@
{
\tl_const:cx
{
c__unicode_lower_
c_@@_lower_case_
\char_generate:nn {#2} { 12 }
\char_generate:nn {#3} { 12 }
_tl
Expand All @@ -3790,7 +3790,7 @@
}
\tl_const:cx
{
c__unicode_upper_
c_@@_upper_case_
\char_generate:nn {#5} { 12 }
\char_generate:nn {#6} { 12 }
_tl
Expand Down Expand Up @@ -3911,7 +3911,7 @@
{
\tl_const:cx
{
c__unicode_ #3 _
c_@@_ #3 _case_
\char_generate:nn {##2} { 12 }
\char_generate:nn {##3} { 12 }
_tl
Expand Down
67 changes: 39 additions & 28 deletions l3kernel/l3str.dtx
Expand Up @@ -1926,10 +1926,10 @@
{
\quark_if_recursion_tail_stop_do:Nn #2
{ \@@_change_case_end:wn }
\cs_if_exist:cTF { c__unicode_ #1 _ #2 _tl }
\cs_if_exist:cTF { c__str_ #1 _case_ #2 _str }
{
\@@_change_case_output:fw
{ \tl_to_str:c { c__unicode_ #1 _ #2 _tl } }
{ \str_use:c { c__str_ #1 _case_ #2 _str } }
}
{ \@@_change_case_char_aux:nN {#1} #2 }
\@@_change_case_loop:nw {#1}
Expand Down Expand Up @@ -2037,9 +2037,8 @@
% one-to-one situations and does not fully handle for example case folding.
%
% The data required for cross-module manipulations is loaded here: currently
% this means for |str| and |tl| functions. As such, the prefix used is not
% |str| but rather |unicode|. For performance (as the entire data set must
% be read during each run) and as this code comes somewhat early in the
% this means for |str| and |tl| functions. For performance (as the entire data
% set must be read during each run) and as this code comes somewhat early in the
% load process, there is quite a bit of low-level code here.
%
% As only the data needs to remain at the end of this process, everything
Expand Down Expand Up @@ -2123,22 +2122,6 @@
\fi:
}
% \end{macrocode}
% Storing each exception is always done in the same way: create a constant
% token list which expands to exactly the mapping. These have the
% category codes \enquote{now} (so should be letters) but are later detokenized
% for string use.
% \begin{macrocode}
% Arguments: #1 = hex code point of the source character, #2 = mapping
% name used in the constant's name, #3/#4 = hex code points of the first
% two result characters, #5 = hex code point of an optional third result
% character (skipped when blank).
% NOTE(review): \@@_store:nnnnn is set again further down in this listing;
% this copy may be the removed side of the diff -- confirm before relying
% on it.
\cs_set_protected:Npn \@@_store:nnnnn #1#2#3#4#5
{
\tl_const:cx { c_@@_ #2 _ \utex_char:D "#1 _tl }
{
\utex_char:D "#3 ~
\utex_char:D "#4 ~
% Third character only when #5 is not blank.
\tl_if_blank:nF {#5}
{ \utex_char:D "#5 }
}
}
% \end{macrocode}
% Parse the main Unicode data file for title case exceptions (the one-to-one
% lower and upper case mappings it contains are all covered by the \TeX{}
% data).
Expand All @@ -2154,7 +2137,7 @@
\if_int_compare:w \__str_if_eq_x:nn { #5 ~ } {#7} = 0 \exp_stop_f:
\else:
\tl_const:cx
{ c_@@_mixed_ \utex_char:D "#1 _tl }
{ c__tl_mixed_case_ \utex_char:D "#1 _tl }
{ \utex_char:D "#7 }
\fi:
}
Expand All @@ -2171,8 +2154,8 @@
\if_int_compare:w \__str_if_eq_x:nn {#2} { C } = 0 \exp_stop_f:
\if_int_compare:w \tex_lccode:D "#1 = "#3 \scan_stop:
\else:
\tl_const:cx
{ c_@@_fold_ \utex_char:D "#1 _tl }
\str_const:cx
{ c__str_fold_case_ \utex_char:D "#1 _str }
{ \utex_char:D "#3 ~ }
\fi:
\else:
Expand All @@ -2183,10 +2166,21 @@
}
\cs_set_protected:Npn \@@_parse_auxii:w #1 ~ #2 ~ #3 ~ #4 \q_stop
{ \@@_store:nnnnn {#1} { fold } {#2} {#3} {#4} }
% Storage routine used while reading CaseFolding.txt.  It creates the
% string constant c__str_fold_case_<char>_str, where <char> is the
% character with hex code point #1.  The stored value is the characters
% with hex code points #3 and #4, followed by the one for #5 unless #5 is
% blank.  #2 is accepted but not used in this version: the name is fixed
% to "fold_case".
\cs_set_protected:Npn \@@_store:nnnnn #1#2#3#4#5
{
\str_const:cx { c__str_fold_case_ \utex_char:D "#1 _str }
{
\utex_char:D "#3 ~
\utex_char:D "#4 ~
% Third character only when #5 is not blank.
\tl_if_blank:nF {#5}
{ \utex_char:D "#5 }
}
}
\@@_map_inline:n { CaseFolding.txt }
% \end{macrocode}
% For upper and lower casing special situations, there is a bit more to
% do as we also have title casing to consider.
% do as we also have title casing to consider. Here, we have both token list
% and string data to save.
% \begin{macrocode}
\cs_set_protected:Npn \@@_parse_auxi:w #1 ;~ #2 ;~ #3 ;~ #4 ; #5 \q_stop
{
Expand All @@ -2202,6 +2196,21 @@
\tl_if_empty:nF {#4}
{ \@@_store:nnnnn {#1} {#2} {#3} {#4} {#5} }
}
% Storage routine used while reading SpecialCasing.txt.  Arguments:
% #1 = hex code point of the source character, #2 = mapping name used in
% the constant names, #3/#4 = hex code points of the first two result
% characters, #5 = hex code point of an optional third result character.
% A token list constant c__tl_<#2>_case_<char>_tl is always created.
% A matching string constant c__str_<#2>_case_<char>_str is created from
% it only on the \else: branch of the comparison of #2 with "mixed".
\cs_set_protected:Npn \@@_store:nnnnn #1#2#3#4#5
{
\tl_const:cx { c__tl_ #2 _case_ \utex_char:D "#1 _tl }
{
\utex_char:D "#3 ~
\utex_char:D "#4 ~
% Third character only when #5 is not blank.
\tl_if_blank:nF {#5}
{ \utex_char:D "#5 }
}
% Presumably \__str_if_eq_x:nn yields 0 when its arguments are equal, so
% the string copy is skipped for "mixed" -- TODO confirm against its
% definition.
\if_int_compare:w \__str_if_eq_x:nn {#2} { mixed } = 0 \exp_stop_f:
\else:
% The string constant takes its value from the token list just defined.
\str_const:cx { c__str_ #2 _case_ \utex_char:D "#1 _str }
{ \tl_use:c { c__tl_ #2 _case_ \utex_char:D "#1 _tl } }
\fi:
}
\@@_map_inline:n { SpecialCasing.txt }
}
% \end{macrocode}
Expand All @@ -2217,9 +2226,11 @@
\if_meaning:w \q_recursion_tail #2
\exp_after:wN \use_none_delimit_by_q_recursion_stop:w
\fi:
\tl_const:cn { c_@@_fold_ #1 _tl } {#2}
\tl_const:cn { c_@@_lower_ #1 _tl } {#2}
\tl_const:cn { c_@@_upper_ #2 _tl } {#1}
\str_const:cn { c__str_fold_case_ #1 _str } {#2}
\str_const:cn { c__str_lower_case_ #1 _str } {#2}
\str_const:cn { c__str_upper_case_ #2 _str } {#1}
\tl_const:cn { c__tl_lower_case_ #1 _tl } {#2}
\tl_const:cn { c__tl_upper_case_ #2 _tl } {#1}
\@@_tmp:NN
}
\@@_tmp:NN
Expand Down

0 comments on commit e5afb92

Please sign in to comment.