diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md index 6e55f71e03..0d32ba1765 100644 --- a/l3kernel/CHANGELOG.md +++ b/l3kernel/CHANGELOG.md @@ -8,6 +8,7 @@ this project uses date-based 'snapshot' version identifiers. ## [Unreleased] ### Added +- `\codepoint_to_bytes:n` - `\codepoint_str_generate:n` ### Changed @@ -19,6 +20,9 @@ this project uses date-based 'snapshot' version identifiers. tokens (issue [\#1110](https://github.com/latex3/latex3/issues/1110)), and an esoteric case (issue [\#1113](https://github.com/latex3/latex3/issues/1113)) +### Deprecated +- `\char_to_utfviii_bytes:n` + ## [2022-09-28] ### Added diff --git a/l3kernel/doc/l3obsolete.txt b/l3kernel/doc/l3obsolete.txt index 6f8844ce67..2377bb9854 100644 --- a/l3kernel/doc/l3obsolete.txt +++ b/l3kernel/doc/l3obsolete.txt @@ -22,6 +22,7 @@ Function Date deprecated \char_str_lower_case:N 2020-01-03 \char_str_mixed_case:N 2020-01-03 \char_str_upper_case:N 2020-01-03 +\char_to_utfviii_bytes:n 2022-10-09 \cs_argument_spec:N 2022-06-24 \l_keys_key_tl 2020-02-08 \l_keys_path_tl 2020-02-08 diff --git a/l3kernel/l3candidates.dtx b/l3kernel/l3candidates.dtx index 1bcbfc4ece..13a0e9b5c5 100644 --- a/l3kernel/l3candidates.dtx +++ b/l3kernel/l3candidates.dtx @@ -626,18 +626,6 @@ % (\enquote{active}), and character code $32$ (space). % \end{variable} % -% \begin{function}[added = 2020-01-09, EXP]{\char_to_utfviii_bytes:n} -% \begin{syntax} -% \cs{char_to_utfviii_bytes:n} \Arg{codepoint} -% \end{syntax} -% Converts the (Unicode) \meta{codepoint} to UTF-8 bytes. The expansion -% of this function comprises four brace groups, each of which will contain -% a hexadecimal value: the appropriate byte. As UTF-8 is a variable-length, -% one or more of the groups may be empty: the bytes read in the logical order, -% such that a two-byte codepoint will have groups |#1| and |#2| filled -% and |#3| and |#4| empty. 
-% \end{function} -% % \begin{function}[added = 2020-01-02, rEXP]{\char_to_nfd:N} % \begin{syntax} % \cs{char_to_nfd:N} \meta{char} diff --git a/l3kernel/l3deprecation.dtx b/l3kernel/l3deprecation.dtx index 02f353516f..85c5321137 100644 --- a/l3kernel/l3deprecation.dtx +++ b/l3kernel/l3deprecation.dtx @@ -549,6 +549,13 @@ % % \subsection{Deprecated \pkg{l3token} functions} % +% \begin{macro}[EXP]{\char_to_utfviii_bytes:n} +% \begin{macrocode} +\__kernel_patch_deprecation:nnNNpn { 2022-10-09 } { \codepoint_to_bytes:n } +\cs_gset:Npn \char_to_utfviii_bytes:n { \codepoint_to_bytes:n } +% \end{macrocode} +% \end{macro} +% % \begin{macro}[EXP] % { % \char_lower_case:N, \char_upper_case:N, diff --git a/l3kernel/l3str-convert.dtx b/l3kernel/l3str-convert.dtx index f2d3793461..3b5df6dda7 100644 --- a/l3kernel/l3str-convert.dtx +++ b/l3kernel/l3str-convert.dtx @@ -2635,7 +2635,7 @@ \cs_new:Npn \@@_convert_pdfname_bytes:n #1 { \exp_args:Ne \@@_convert_pdfname_bytes_aux:n - { \char_to_utfviii_bytes:n {`#1} } + { \codepoint_to_bytes:n {`#1} } } \cs_new:Npn \@@_convert_pdfname_bytes_aux:n #1 { \@@_convert_pdfname_bytes_aux:nnnn #1 } diff --git a/l3kernel/l3str.dtx b/l3kernel/l3str.dtx index 82e907b88c..c7237635c6 100644 --- a/l3kernel/l3str.dtx +++ b/l3kernel/l3str.dtx @@ -2056,7 +2056,7 @@ \use:e { \exp_not:N \@@_change_case_generate:nnnn - \char_to_utfviii_bytes:n {#1} + \codepoint_to_bytes:n {#1} } } \cs_new:Npn \@@_change_case_generate:nnnn #1#2#3#4 diff --git a/l3kernel/l3text-case.dtx b/l3kernel/l3text-case.dtx index cad0ed2954..18c2188e6c 100644 --- a/l3kernel/l3text-case.dtx +++ b/l3kernel/l3text-case.dtx @@ -1868,7 +1868,7 @@ } } \use:x - { \@@_tmp:w \char_to_utfviii_bytes:n { "#2 } } + { \@@_tmp:w \codepoint_to_bytes:n { "#2 } } \group_end: } \@@_tmp:w \c_@@_dotless_i_tl { 0131 } @@ -1902,8 +1902,8 @@ \use:x { \@@_tmp:w - \char_to_utfviii_bytes:n { "#1 } - \char_to_utfviii_bytes:n { "#2 } + \codepoint_to_bytes:n { "#1 } + \codepoint_to_bytes:n { "#2 } } \@@_loop:nn } 
@@ -2183,8 +2183,8 @@ \use:x { \@@_tmp:w - \char_to_utfviii_bytes:n { "#1 } - \char_to_utfviii_bytes:n { "#2 } + \codepoint_to_bytes:n { "#1 } + \codepoint_to_bytes:n { "#2 } } \group_end: } @@ -2238,7 +2238,7 @@ {#2} } \use:x - { \@@_tmp:w \char_to_utfviii_bytes:n { "#1 } } + { \@@_tmp:w \codepoint_to_bytes:n { "#1 } } \group_end: } \@@_tmp:w { 00DF } { SS } { upper } @@ -2463,8 +2463,8 @@ \use:x { \@@_tmp:w - \char_to_utfviii_bytes:n { "#1 } - \char_to_utfviii_bytes:n { "#2 } + \codepoint_to_bytes:n { "#1 } + \codepoint_to_bytes:n { "#2 } } \group_end: } diff --git a/l3kernel/l3text-purify.dtx b/l3kernel/l3text-purify.dtx index 7caaeb0b5e..dcec40bcab 100644 --- a/l3kernel/l3text-purify.dtx +++ b/l3kernel/l3text-purify.dtx @@ -486,7 +486,7 @@ \text_declare_purify_equivalent:Nx #1 { \exp_args:Ne \@@_tmp:n - { \char_to_utfviii_bytes:n { "#2 } } + { \codepoint_to_bytes:n { "#2 } } } \@@_loop:Nn } @@ -574,7 +574,7 @@ \cs_set:Npn \@@_tmp:n #1 { \exp_args:Ne \@@_tmp_aux:n - { \char_to_utfviii_bytes:n { "#1 } } + { \codepoint_to_bytes:n { "#1 } } } \cs_set:Npn \@@_tmp_aux:n #1 { \@@_tmp:nnnn #1 } \cs_set:Npn \@@_tmp:nnnn #1#2#3#4 diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx index 307cb8d085..356d88d57d 100644 --- a/l3kernel/l3token.dtx +++ b/l3kernel/l3token.dtx @@ -1690,111 +1690,6 @@ % \end{macro} % \end{macro} % -% \begin{macro}[EXP]{\char_to_utfviii_bytes:n} -% \begin{macro}[EXP]{\@@_to_utfviii_bytes_auxi:n} -% \begin{macro}[EXP]{\@@_to_utfviii_bytes_auxii:Nnn} -% \begin{macro}[EXP]{\@@_to_utfviii_bytes_auxiii:n} -% \begin{macro}[EXP] -% { -% \@@_to_utfviii_bytes_outputi:nw , -% \@@_to_utfviii_bytes_outputii:nw , -% \@@_to_utfviii_bytes_outputiii:nw , -% \@@_to_utfviii_bytes_outputiv:nw -% } -% \begin{macro}[EXP] -% {\@@_to_utfviii_bytes_output:nnn, \@@_to_utfviii_bytes_output:fnn} -% \begin{macro}[EXP]{\@@_to_utfviii_bytes_end:} -% This code converts a codepoint into the correct UTF-8 representation. 
-% In terms of the algorithm itself, see -% \url{https://en.wikipedia.org/wiki/UTF-8} for the octet pattern. -% \begin{macrocode} -\cs_new:Npn \char_to_utfviii_bytes:n #1 - { - \exp_args:Nf \@@_to_utfviii_bytes_auxi:n - { \int_eval:n {#1} } - } -\cs_new:Npn \@@_to_utfviii_bytes_auxi:n #1 - { - \if_int_compare:w #1 > "80 \exp_stop_f: - \if_int_compare:w #1 < "800 \exp_stop_f: - \@@_to_utfviii_bytes_outputi:nw - { \@@_to_utfviii_bytes_auxii:Nnn C {#1} { 64 } } - \@@_to_utfviii_bytes_outputii:nw - { \@@_to_utfviii_bytes_auxiii:n {#1} } - \else: - \if_int_compare:w #1 < "10000 \exp_stop_f: - \@@_to_utfviii_bytes_outputi:nw - { \@@_to_utfviii_bytes_auxii:Nnn E {#1} { 64 * 64 } } - \@@_to_utfviii_bytes_outputii:nw - { - \@@_to_utfviii_bytes_auxiii:n - { \int_div_truncate:nn {#1} { 64 } } - } - \@@_to_utfviii_bytes_outputiii:nw - { \@@_to_utfviii_bytes_auxiii:n {#1} } - \else: - \@@_to_utfviii_bytes_outputi:nw - { - \@@_to_utfviii_bytes_auxii:Nnn F - {#1} { 64 * 64 * 64 } - } - \@@_to_utfviii_bytes_outputii:nw - { - \@@_to_utfviii_bytes_auxiii:n - { \int_div_truncate:nn {#1} { 64 * 64 } } - } - \@@_to_utfviii_bytes_outputiii:nw - { - \@@_to_utfviii_bytes_auxiii:n - { \int_div_truncate:nn {#1} { 64 } } - } - \@@_to_utfviii_bytes_outputiv:nw - { \@@_to_utfviii_bytes_auxiii:n {#1} } - \fi: - \fi: - \else: - \@@_to_utfviii_bytes_outputi:nw {#1} - \fi: - \@@_to_utfviii_bytes_end: { } { } { } { } - } -\cs_new:Npn \@@_to_utfviii_bytes_auxii:Nnn #1#2#3 - { "#10 + \int_div_truncate:nn {#2} {#3} } -\cs_new:Npn \@@_to_utfviii_bytes_auxiii:n #1 - { \int_mod:nn {#1} { 64 } + 128 } -\cs_new:Npn \@@_to_utfviii_bytes_outputi:nw - #1 #2 \@@_to_utfviii_bytes_end: #3 - { \@@_to_utfviii_bytes_output:fnn { \int_eval:n {#1} } { } {#2} } -\cs_new:Npn \@@_to_utfviii_bytes_outputii:nw - #1 #2 \@@_to_utfviii_bytes_end: #3#4 - { \@@_to_utfviii_bytes_output:fnn { \int_eval:n {#1} } { {#3} } {#2} } -\cs_new:Npn \@@_to_utfviii_bytes_outputiii:nw - #1 #2 \@@_to_utfviii_bytes_end: #3#4#5 - { - 
\@@_to_utfviii_bytes_output:fnn - { \int_eval:n {#1} } { {#3} {#4} } {#2} - } -\cs_new:Npn \@@_to_utfviii_bytes_outputiv:nw - #1 #2 \@@_to_utfviii_bytes_end: #3#4#5#6 - { - \@@_to_utfviii_bytes_output:fnn - { \int_eval:n {#1} } { {#3} {#4} {#5} } {#2} - } -\cs_new:Npn \@@_to_utfviii_bytes_output:nnn #1#2#3 - { - #3 - \@@_to_utfviii_bytes_end: #2 {#1} - } -\cs_generate_variant:Nn \@@_to_utfviii_bytes_output:nnn { f } -\cs_new:Npn \@@_to_utfviii_bytes_end: { } -% \end{macrocode} -% \end{macro} -% \end{macro} -% \end{macro} -% \end{macro} -% \end{macro} -% \end{macro} -% \end{macro} -% % \begin{macro}[EXP]{\char_to_nfd:N} % \begin{macro}[EXP]{\char_to_nfd:n} % \begin{macro}[EXP]{\@@_to_nfd:nn} diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx index 0974cae519..a4cd8ae96d 100644 --- a/l3kernel/l3unicode.dtx +++ b/l3kernel/l3unicode.dtx @@ -99,6 +99,18 @@ % category code $10$. % \end{function} % +% \begin{function}[added = 2022-10-09, EXP]{\codepoint_to_bytes:n} +% \begin{syntax} +% \cs{codepoint_to_bytes:n} \Arg{codepoint} +% \end{syntax} +% Converts the \meta{codepoint} to UTF-8 bytes. The expansion +% of this function comprises four brace groups, each of which will contain +% a hexadecimal value: the appropriate byte. As UTF-8 is a variable-length encoding, +% one or more of the groups may be empty: the bytes read in the logical order, +% such that a two-byte codepoint will have groups |#1| and |#2| filled +% and |#3| and |#4| empty. +% \end{function} +% % \end{documentation} % % \begin{implementation} @@ -113,39 +125,7 @@ %<@@=codepoint> % \end{macrocode} % -% Text operations requires data from the Unicode Consortium. Data read into -% Unicode engine formats is at best a small part of what we need, so there -% is a loader here to set up the appropriate data structures.
-% -% Where we need data for most or all of the Unicode range, we use the two-stage -% table approach recommended by the Unicode Consortium and demonstrated in a -% model implementation in Python in -% \url{https://www.strchr.com/multi-stage_tables}. This approach uses the -% \texttt{intarray} (\texttt{fontdimen}-based) data type as it is fast for -% random access and avoids significant hash table usage. In contrast, where -% only a small subset of codepoints are required, storage as macros is -% preferable. There is also some consideration of the effort needed to load -% data: see for example the grapheme breaking information, which would be -% problematic to convert into a two-stage table but which can be used with -% reasonable performance in a small number of comma lists (at the cost that -% breaking at higher codepoint Hangul characters will be slightly slow). -% -% \begin{variable}{\c_@@_block_size_int} -% Choosing the block size for the blocks in the two-stage approach is -% non-trivial: depending on the data stored, the optimal size for -% memory usage will vary. At the same time, for us there is also the -% question of load-time: larger blocks require longer comma lists -% as intermediates, so are slower. As this is going to be needed -% to use the data, we set it up outside of the group for clarity. -% \begin{macrocode} -\int_const:Nn \c_@@_block_size_int { 64 } -% \end{macrocode} -% \end{variable} -% -% Parsing the data files can be the same way for all engines, but where they -% are stored as character tokens, the construction method depends on whether -% they are Unicode or $8$-bit internally. Parsing is therefore done by common -% functions, with some data storage using engine-specific auxiliaries. 
+% \subsection{User functions} % % \begin{macro}[EXP]{\codepoint_str_generate:n} % \begin{macro}[EXP]{\@@_str_generate:nnnn} @@ -189,7 +169,7 @@ \use:e { \exp_not:N \@@_str_generate:nnnn - \char_to_utfviii_bytes:n {#1} + \codepoint_to_bytes:n {#1} } } } @@ -221,7 +201,7 @@ \use:e { \exp_not:N \@@_generate:nnnn - \char_to_utfviii_bytes:n {#1} + \codepoint_to_bytes:n {#1} } } } @@ -256,6 +236,147 @@ % \end{macro} % \end{macro} % +% \begin{macro}[EXP]{\codepoint_to_bytes:n} +% \begin{macro}[EXP]{\@@_to_bytes_auxi:n} +% \begin{macro}[EXP]{\@@_to_bytes_auxii:Nnn} +% \begin{macro}[EXP]{\@@_to_bytes_auxiii:n} +% \begin{macro}[EXP] +% { +% \@@_to_bytes_outputi:nw , +% \@@_to_bytes_outputii:nw , +% \@@_to_bytes_outputiii:nw , +% \@@_to_bytes_outputiv:nw +% } +% \begin{macro}[EXP] +% {\@@_to_bytes_output:nnn, \@@_to_bytes_output:fnn} +% \begin{macro}[EXP]{\@@_to_bytes_end:} +% This code converts a codepoint into the correct UTF-8 representation. +% In terms of the algorithm itself, see +% \url{https://en.wikipedia.org/wiki/UTF-8} for the octet pattern. 
+% \begin{macrocode} +\cs_new:Npn \codepoint_to_bytes:n #1 + { + \exp_args:Nf \@@_to_bytes_auxi:n + { \int_eval:n {#1} } + } +\cs_new:Npn \@@_to_bytes_auxi:n #1 + { + \if_int_compare:w #1 > "80 \exp_stop_f: + \if_int_compare:w #1 < "800 \exp_stop_f: + \@@_to_bytes_outputi:nw + { \@@_to_bytes_auxii:Nnn C {#1} { 64 } } + \@@_to_bytes_outputii:nw + { \@@_to_bytes_auxiii:n {#1} } + \else: + \if_int_compare:w #1 < "10000 \exp_stop_f: + \@@_to_bytes_outputi:nw + { \@@_to_bytes_auxii:Nnn E {#1} { 64 * 64 } } + \@@_to_bytes_outputii:nw + { + \@@_to_bytes_auxiii:n + { \int_div_truncate:nn {#1} { 64 } } + } + \@@_to_bytes_outputiii:nw + { \@@_to_bytes_auxiii:n {#1} } + \else: + \@@_to_bytes_outputi:nw + { + \@@_to_bytes_auxii:Nnn F + {#1} { 64 * 64 * 64 } + } + \@@_to_bytes_outputii:nw + { + \@@_to_bytes_auxiii:n + { \int_div_truncate:nn {#1} { 64 * 64 } } + } + \@@_to_bytes_outputiii:nw + { + \@@_to_bytes_auxiii:n + { \int_div_truncate:nn {#1} { 64 } } + } + \@@_to_bytes_outputiv:nw + { \@@_to_bytes_auxiii:n {#1} } + \fi: + \fi: + \else: + \@@_to_bytes_outputi:nw {#1} + \fi: + \@@_to_bytes_end: { } { } { } { } + } +\cs_new:Npn \@@_to_bytes_auxii:Nnn #1#2#3 + { "#10 + \int_div_truncate:nn {#2} {#3} } +\cs_new:Npn \@@_to_bytes_auxiii:n #1 + { \int_mod:nn {#1} { 64 } + 128 } +\cs_new:Npn \@@_to_bytes_outputi:nw + #1 #2 \@@_to_bytes_end: #3 + { \@@_to_bytes_output:fnn { \int_eval:n {#1} } { } {#2} } +\cs_new:Npn \@@_to_bytes_outputii:nw + #1 #2 \@@_to_bytes_end: #3#4 + { \@@_to_bytes_output:fnn { \int_eval:n {#1} } { {#3} } {#2} } +\cs_new:Npn \@@_to_bytes_outputiii:nw + #1 #2 \@@_to_bytes_end: #3#4#5 + { + \@@_to_bytes_output:fnn + { \int_eval:n {#1} } { {#3} {#4} } {#2} + } +\cs_new:Npn \@@_to_bytes_outputiv:nw + #1 #2 \@@_to_bytes_end: #3#4#5#6 + { + \@@_to_bytes_output:fnn + { \int_eval:n {#1} } { {#3} {#4} {#5} } {#2} + } +\cs_new:Npn \@@_to_bytes_output:nnn #1#2#3 + { + #3 + \@@_to_bytes_end: #2 {#1} + } +\cs_generate_variant:Nn \@@_to_bytes_output:nnn { f } +\cs_new:Npn 
\@@_to_bytes_end: { } +% \end{macrocode} +% \end{macro} +% \end{macro} +% \end{macro} +% \end{macro} +% \end{macro} +% \end{macro} +% \end{macro} +% +% \subsection{Data loader} +% +% Text operations require data from the Unicode Consortium. Data read into +% Unicode engine formats is at best a small part of what we need, so there +% is a loader here to set up the appropriate data structures. +% +% Where we need data for most or all of the Unicode range, we use the two-stage +% table approach recommended by the Unicode Consortium and demonstrated in a +% model implementation in Python in +% \url{https://www.strchr.com/multi-stage_tables}. This approach uses the +% \texttt{intarray} (\texttt{fontdimen}-based) data type as it is fast for +% random access and avoids significant hash table usage. In contrast, where +% only a small subset of codepoints are required, storage as macros is +% preferable. There is also some consideration of the effort needed to load +% data: see for example the grapheme breaking information, which would be +% problematic to convert into a two-stage table but which can be used with +% reasonable performance in a small number of comma lists (at the cost that +% breaking at higher codepoint Hangul characters will be slightly slow). +% +% \begin{variable}{\c_@@_block_size_int} +% Choosing the block size for the blocks in the two-stage approach is +% non-trivial: depending on the data stored, the optimal size for +% memory usage will vary. At the same time, for us there is also the +% question of load-time: larger blocks require longer comma lists +% as intermediates, so are slower. As this is going to be needed +% to use the data, we set it up outside of the group for clarity.
+% \begin{macrocode} +\int_const:Nn \c_@@_block_size_int { 64 } +% \end{macrocode} +% \end{variable} +% +% Parsing the data files can be the same way for all engines, but where they +% are stored as character tokens, the construction method depends on whether +% they are Unicode or $8$-bit internally. Parsing is therefore done by common +% functions, with some data storage using engine-specific auxiliaries. +% % As only the data needs to remain at the end of this process, everything % is set up inside a group. The only thing that is outside is creating a % stream: they are global anyway and it is best to force a stream for @@ -721,7 +842,6 @@ % \end{macro} % \end{macro} % -% % \begin{macro}[EXP]{\__kernel_codepoint_nfd:n} % \begin{macro}[EXP]{\@@_nfd:nn} % A simple interface. diff --git a/l3kernel/testfiles/m3char001.luatex.tlg b/l3kernel/testfiles/m3char001.luatex.tlg index 0b33b0a9d3..fe84822506 100644 --- a/l3kernel/testfiles/m3char001.luatex.tlg +++ b/l3kernel/testfiles/m3char001.luatex.tlg @@ -489,15 +489,7 @@ cell 2 C ============================================================ ============================================================ -TEST 7: \char_to_utfviii_bytes:n -============================================================ -{65}{}{}{} -{206}{169}{}{} -{225}{136}{128}{} -{240}{144}{128}{128} -============================================================ -============================================================ -TEST 8: Number of expansions +TEST 7: Number of expansions ============================================================ begin-group character A end-group character A @@ -511,7 +503,7 @@ the character A undefined ============================================================ ============================================================ -TEST 9: \char_ case:N +TEST 8: \char_ case:N ============================================================ The token list contains the tokens: > a (the letter a). 
@@ -595,7 +587,7 @@ The token list contains the tokens: l. ... } ============================================================ ============================================================ -TEST 10: \char_str_ case:N +TEST 9: \char_str_ case:N ============================================================ The token list contains the tokens: > a (the character a). @@ -679,7 +671,7 @@ The token list contains the tokens: l. ... } ============================================================ ============================================================ -TEST 11: Changing \lccode and \uccode +TEST 10: Changing \lccode and \uccode ============================================================ The token list contains the tokens: > q (the character q). diff --git a/l3kernel/testfiles/m3char001.lvt b/l3kernel/testfiles/m3char001.lvt index 3a217d4e4e..775ba9c862 100644 --- a/l3kernel/testfiles/m3char001.lvt +++ b/l3kernel/testfiles/m3char001.lvt @@ -143,14 +143,6 @@ } } -\TESTEXP { \char_to_utfviii_bytes:n } - { - \char_to_utfviii_bytes:n { `A } \NEWLINE - \char_to_utfviii_bytes:n { "03A9 } \NEWLINE - \char_to_utfviii_bytes:n { "1200 } \NEWLINE - \char_to_utfviii_bytes:n { "10000 } - } - \OMIT \cs_gset:Npn \test:nn #1#2 { diff --git a/l3kernel/testfiles/m3char001.ptex.tlg b/l3kernel/testfiles/m3char001.ptex.tlg index 446889d963..09526d1b50 100644 --- a/l3kernel/testfiles/m3char001.ptex.tlg +++ b/l3kernel/testfiles/m3char001.ptex.tlg @@ -518,15 +518,7 @@ cell 2 C ============================================================ ============================================================ -TEST 7: \char_to_utfviii_bytes:n -============================================================ -{65}{}{}{} -{206}{169}{}{} -{225}{136}{128}{} -{240}{144}{128}{128} -============================================================ -============================================================ -TEST 8: Number of expansions +TEST 7: Number of expansions 
============================================================ begin-group character A end-group character A @@ -540,7 +532,7 @@ the character A undefined ============================================================ ============================================================ -TEST 9: \char_ case:N +TEST 8: \char_ case:N ============================================================ The token list contains the tokens: > a (the letter a). @@ -1042,7 +1034,7 @@ The token list contains the tokens: l. ... } ============================================================ ============================================================ -TEST 10: \char_str_ case:N +TEST 9: \char_str_ case:N ============================================================ The token list contains the tokens: > a (the character a). @@ -1136,7 +1128,7 @@ The token list contains the tokens: l. ... } ============================================================ ============================================================ -TEST 11: Changing \lccode and \uccode +TEST 10: Changing \lccode and \uccode ============================================================ The token list contains the tokens: > q (the character q). 
diff --git a/l3kernel/testfiles/m3char001.tlg b/l3kernel/testfiles/m3char001.tlg index 446889d963..09526d1b50 100644 --- a/l3kernel/testfiles/m3char001.tlg +++ b/l3kernel/testfiles/m3char001.tlg @@ -518,15 +518,7 @@ cell 2 C ============================================================ ============================================================ -TEST 7: \char_to_utfviii_bytes:n -============================================================ -{65}{}{}{} -{206}{169}{}{} -{225}{136}{128}{} -{240}{144}{128}{128} -============================================================ -============================================================ -TEST 8: Number of expansions +TEST 7: Number of expansions ============================================================ begin-group character A end-group character A @@ -540,7 +532,7 @@ the character A undefined ============================================================ ============================================================ -TEST 9: \char_ case:N +TEST 8: \char_ case:N ============================================================ The token list contains the tokens: > a (the letter a). @@ -1042,7 +1034,7 @@ The token list contains the tokens: l. ... } ============================================================ ============================================================ -TEST 10: \char_str_ case:N +TEST 9: \char_str_ case:N ============================================================ The token list contains the tokens: > a (the character a). @@ -1136,7 +1128,7 @@ The token list contains the tokens: l. ... } ============================================================ ============================================================ -TEST 11: Changing \lccode and \uccode +TEST 10: Changing \lccode and \uccode ============================================================ The token list contains the tokens: > q (the character q). 
diff --git a/l3kernel/testfiles/m3char001.uptex.tlg b/l3kernel/testfiles/m3char001.uptex.tlg index 446889d963..09526d1b50 100644 --- a/l3kernel/testfiles/m3char001.uptex.tlg +++ b/l3kernel/testfiles/m3char001.uptex.tlg @@ -518,15 +518,7 @@ cell 2 C ============================================================ ============================================================ -TEST 7: \char_to_utfviii_bytes:n -============================================================ -{65}{}{}{} -{206}{169}{}{} -{225}{136}{128}{} -{240}{144}{128}{128} -============================================================ -============================================================ -TEST 8: Number of expansions +TEST 7: Number of expansions ============================================================ begin-group character A end-group character A @@ -540,7 +532,7 @@ the character A undefined ============================================================ ============================================================ -TEST 9: \char_ case:N +TEST 8: \char_ case:N ============================================================ The token list contains the tokens: > a (the letter a). @@ -1042,7 +1034,7 @@ The token list contains the tokens: l. ... } ============================================================ ============================================================ -TEST 10: \char_str_ case:N +TEST 9: \char_str_ case:N ============================================================ The token list contains the tokens: > a (the character a). @@ -1136,7 +1128,7 @@ The token list contains the tokens: l. ... } ============================================================ ============================================================ -TEST 11: Changing \lccode and \uccode +TEST 10: Changing \lccode and \uccode ============================================================ The token list contains the tokens: > q (the character q). 
diff --git a/l3kernel/testfiles/m3char001.xetex.tlg b/l3kernel/testfiles/m3char001.xetex.tlg index 8854c31f0b..5e42184bdf 100644 --- a/l3kernel/testfiles/m3char001.xetex.tlg +++ b/l3kernel/testfiles/m3char001.xetex.tlg @@ -489,15 +489,7 @@ cell 2 C ============================================================ ============================================================ -TEST 7: \char_to_utfviii_bytes:n -============================================================ -{65}{}{}{} -{206}{169}{}{} -{225}{136}{128}{} -{240}{144}{128}{128} -============================================================ -============================================================ -TEST 8: Number of expansions +TEST 7: Number of expansions ============================================================ begin-group character A end-group character A @@ -511,7 +503,7 @@ the character A undefined ============================================================ ============================================================ -TEST 9: \char_ case:N +TEST 8: \char_ case:N ============================================================ The token list contains the tokens: > a (the letter a). @@ -595,7 +587,7 @@ The token list contains the tokens: l. ... } ============================================================ ============================================================ -TEST 10: \char_str_ case:N +TEST 9: \char_str_ case:N ============================================================ The token list contains the tokens: > a (the character a). @@ -679,7 +671,7 @@ The token list contains the tokens: l. ... } ============================================================ ============================================================ -TEST 11: Changing \lccode and \uccode +TEST 10: Changing \lccode and \uccode ============================================================ The token list contains the tokens: > q (the character q). 
diff --git a/l3kernel/testfiles/m3text006.lvt b/l3kernel/testfiles/m3text006.lvt index 2357e6bb54..5c2d8b86a8 100644 --- a/l3kernel/testfiles/m3text006.lvt +++ b/l3kernel/testfiles/m3text006.lvt @@ -127,7 +127,7 @@ { \exp_args:Ne \test_generate_aux:n { - \exp_args:Ne \char_to_utfviii_bytes:n + \exp_args:Ne \codepoint_to_bytes:n { " \tl_trim_spaces:n {#1} } } } diff --git a/l3kernel/testfiles/m3token006.luatex.tlg b/l3kernel/testfiles/m3token006.luatex.tlg index a1c542abed..f5a12fbb09 100644 --- a/l3kernel/testfiles/m3token006.luatex.tlg +++ b/l3kernel/testfiles/m3token006.luatex.tlg @@ -2,17 +2,7 @@ This is a generated file for the LaTeX (2e + expl3) validation system. Don't change this file in any respect. Author: Joseph Wright ============================================================ -TEST 1: Byte_decomposition -============================================================ -{65}{}{}{} -{195}{142}{}{} -{206}{137}{}{} -{225}{182}{173}{} -{239}{191}{189}{} -{240}{144}{128}{128} -============================================================ -============================================================ -TEST 2: Character decomposition +TEST 1: Character decomposition ============================================================ A Î diff --git a/l3kernel/testfiles/m3token006.lvt b/l3kernel/testfiles/m3token006.lvt index 9aeea764c7..cdb8df767a 100644 --- a/l3kernel/testfiles/m3token006.lvt +++ b/l3kernel/testfiles/m3token006.lvt @@ -15,16 +15,6 @@ \ExplSyntaxOn -\TESTEXP { Byte_decomposition } - { - \char_to_utfviii_bytes:n { `A } \NEWLINE - \char_to_utfviii_bytes:n { "00CE } \NEWLINE - \char_to_utfviii_bytes:n { "0389 } \NEWLINE - \char_to_utfviii_bytes:n { "1DAD } \NEWLINE - \char_to_utfviii_bytes:n { "FFFD } \NEWLINE - \char_to_utfviii_bytes:n { "10000 } - } - \TESTEXP { Character~decomposition } { \char_to_nfd:n { `A } \NEWLINE diff --git a/l3kernel/testfiles/m3token006.tlg b/l3kernel/testfiles/m3token006.tlg index 45256798d7..7ab6b11e28 100644 --- 
a/l3kernel/testfiles/m3token006.tlg +++ b/l3kernel/testfiles/m3token006.tlg @@ -2,17 +2,7 @@ This is a generated file for the LaTeX (2e + expl3) validation system. Don't change this file in any respect. Author: Joseph Wright ============================================================ -TEST 1: Byte_decomposition -============================================================ -{65}{}{}{} -{195}{142}{}{} -{206}{137}{}{} -{225}{182}{173}{} -{239}{191}{189}{} -{240}{144}{128}{128} -============================================================ -============================================================ -TEST 2: Character decomposition +TEST 1: Character decomposition ============================================================ A I^^cc^^82 diff --git a/l3kernel/testfiles/m3token006.xetex.tlg b/l3kernel/testfiles/m3token006.xetex.tlg index a1c542abed..f5a12fbb09 100644 --- a/l3kernel/testfiles/m3token006.xetex.tlg +++ b/l3kernel/testfiles/m3token006.xetex.tlg @@ -2,17 +2,7 @@ This is a generated file for the LaTeX (2e + expl3) validation system. Don't change this file in any respect. 
Author: Joseph Wright ============================================================ -TEST 1: Byte_decomposition -============================================================ -{65}{}{}{} -{195}{142}{}{} -{206}{137}{}{} -{225}{182}{173}{} -{239}{191}{189}{} -{240}{144}{128}{128} -============================================================ -============================================================ -TEST 2: Character decomposition +TEST 1: Character decomposition ============================================================ A Î diff --git a/l3kernel/testfiles/m3unicode001.luatex.tlg b/l3kernel/testfiles/m3unicode001.luatex.tlg index a812f43452..0e243b747f 100644 --- a/l3kernel/testfiles/m3unicode001.luatex.tlg +++ b/l3kernel/testfiles/m3unicode001.luatex.tlg @@ -21,3 +21,13 @@ X X X X XaX ============================================================ +============================================================ +TEST 3: Byte decomposition +============================================================ +{65}{}{}{} +{195}{142}{}{} +{206}{137}{}{} +{225}{182}{173}{} +{239}{191}{189}{} +{240}{144}{128}{128} +============================================================ diff --git a/l3kernel/testfiles/m3unicode001.lvt b/l3kernel/testfiles/m3unicode001.lvt index ed724c47e6..73e6a41dfa 100644 --- a/l3kernel/testfiles/m3unicode001.lvt +++ b/l3kernel/testfiles/m3unicode001.lvt @@ -42,4 +42,14 @@ X \codepoint_generate:nn { 97 } { 10 } X \NEWLINE } +\TESTEXP { Byte~decomposition } + { + \codepoint_to_bytes:n { `A } \NEWLINE + \codepoint_to_bytes:n { "00CE } \NEWLINE + \codepoint_to_bytes:n { "0389 } \NEWLINE + \codepoint_to_bytes:n { "1DAD } \NEWLINE + \codepoint_to_bytes:n { "FFFD } \NEWLINE + \codepoint_to_bytes:n { "10000 } + } + \END \ No newline at end of file diff --git a/l3kernel/testfiles/m3unicode001.tlg b/l3kernel/testfiles/m3unicode001.tlg index 2924a5588c..21244eb8b0 100644 --- a/l3kernel/testfiles/m3unicode001.tlg +++ b/l3kernel/testfiles/m3unicode001.tlg @@ -21,3 
+21,13 @@ X X X X XaX ============================================================ +============================================================ +TEST 3: Byte decomposition +============================================================ +{65}{}{}{} +{195}{142}{}{} +{206}{137}{}{} +{225}{182}{173}{} +{239}{191}{189}{} +{240}{144}{128}{128} +============================================================ diff --git a/l3kernel/testfiles/m3unicode001.xetex.tlg b/l3kernel/testfiles/m3unicode001.xetex.tlg index a812f43452..0e243b747f 100644 --- a/l3kernel/testfiles/m3unicode001.xetex.tlg +++ b/l3kernel/testfiles/m3unicode001.xetex.tlg @@ -21,3 +21,13 @@ X X X X XaX ============================================================ +============================================================ +TEST 3: Byte decomposition +============================================================ +{65}{}{}{} +{195}{142}{}{} +{206}{137}{}{} +{225}{182}{173}{} +{239}{191}{189}{} +{240}{144}{128}{128} +============================================================