diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md index a5b1842d91..6e55f71e03 100644 --- a/l3kernel/CHANGELOG.md +++ b/l3kernel/CHANGELOG.md @@ -7,6 +7,9 @@ this project uses date-based 'snapshot' version identifiers. ## [Unreleased] +### Added +- `\codepoint_generate:nn`, `\codepoint_str_generate:n` + ### Changed - Usage of `\exp_not:n`/`\exp_not:N` in `\peek_analysis_map_inline:n` output diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx index b2b5e3468e..307cb8d085 100644 --- a/l3kernel/l3token.dtx +++ b/l3kernel/l3token.dtx @@ -1828,51 +1828,13 @@ \cs_new:Npn \@@_to_nfd:nnnn #1#2#3#4 { \int_compare:nNnTF {#1} = {#3} - { \@@_to_nfd_generate:nn {#1} {#4} } + { \codepoint_generate:nn {#1} {#4} } { \@@_to_nfd:nn {#1} {#4} \tl_if_blank:nF {#2} { \@@_to_nfd:nn {#2} {#4} } } } -\bool_lazy_or:nnTF - { \sys_if_engine_luatex_p: } - { \sys_if_engine_xetex_p: } - { - \cs_new:Npn \@@_to_nfd_generate:nn - { \char_generate:nn } - } - { - \cs_new:Npn \@@_to_nfd_generate:nn #1#2 - { - \exp_args:Ne \@@_to_nfd_generate:n - { \char_to_utfviii_bytes:n {#1} } - } - \cs_new:Npn \@@_to_nfd_generate:n #1 - { \@@_to_nfd_generate:nnnn #1 } - \cs_new:Npn \@@_to_nfd_generate:nnnn #1#2#3#4 - { - \int_compare:nNnTF {#1} < { "80 } - { \char_generate:nn {#1} { \char_value_catcode:n {#1} } } - { - \exp_after:wN \exp_after:wN \exp_after:wN - \exp_not:N \char_generate:nn {#1} { 13 } - \exp_after:wN \exp_after:wN \exp_after:wN - \exp_not:N \char_generate:nn {#2} { 13 } - \tl_if_blank:nF {#3} - { - \exp_after:wN \exp_after:wN \exp_after:wN - \exp_not:N \char_generate:nn {#3} { 13 } - \tl_if_blank:nF {#4} - { - \exp_after:wN \exp_after:wN \exp_after:wN - \exp_not:N \char_generate:nn {#4} { 13 } - } - } - } - - } - } % \end{macrocode} % \end{macro} % \end{macro} diff --git a/l3kernel/l3unicode.dtx b/l3kernel/l3unicode.dtx index c9d3f51f21..0974cae519 100644 --- a/l3kernel/l3unicode.dtx +++ b/l3kernel/l3unicode.dtx @@ -50,8 +50,54 @@ % \begin{documentation} % % This module provides Unicode-specific functions 
along with loading data -% from a range of Unicode Consortium files. At present, it provides no -% public functions. +% from a range of Unicode Consortium files. Most of the code here is +% internal, but there is a small set of public functions. These work with +% Unicode \meta{codepoints} and are designed to give usable results with +% both Unicode-aware and $8$-bit engines. +% +% \begin{function}[EXP, added = 2022-10-09] +% {\codepoint_generate:nn} +% \begin{syntax} +% \cs{codepoint_generate:nn} \Arg{codepoint} \Arg{catcode} +% \end{syntax} +% Generates one or more character tokens representing the \meta{codepoint}. +% With Unicode engines, exactly one character token will be generated, and +% this will have the \meta{catcode} specified as the second argument: +% \begin{itemize} +% \item $1$ (begin group) +% \item $2$ (end group) +% \item $3$ (math toggle) +% \item $4$ (alignment) +% \item $6$ (parameter) +% \item $7$ (math superscript) +% \item $8$ (math subscript) +% \item $10$ (space) +% \item $11$ (letter) +% \item $12$ (other) +% \item $13$ (active) +% \end{itemize} +% For $8$-bit engines, between one and four character tokens will be +% produced: these will be the bytes of the UTF-8 representation of the +% \meta{codepoint}. For all codepoints outside of the classical ASCII +% range, the generated character tokens will be active (category code +% $13$); the \meta{catcode} argument is only used for codepoints in the +% ASCII range. With any engine, a space (codepoint $32$) is always produced with category code $10$, whatever \meta{catcode} is given. To allow the result of this function to be used inside an +% expansion context, the result is protected by \cs{exp_not:n}. +% \end{function} +% +% \begin{function}[EXP, added = 2022-10-09] +% {\codepoint_str_generate:n} +% \begin{syntax} +% \cs{codepoint_str_generate:n} \Arg{codepoint} +% \end{syntax} +% Generates one or more character tokens representing the \meta{codepoint}. +% With Unicode engines, exactly one character token will be generated. 
+% For $8$-bit engines, between one and four character tokens will be +% produced: these will be the bytes of the UTF-8 representation of the +% \meta{codepoint}. All of the generated character tokens will be of +% category code $12$, except any spaces (codepoint $32$), which will be +% category code $10$. +% \end{function} % % \end{documentation} % @@ -101,8 +147,11 @@ % they are Unicode or $8$-bit internally. Parsing is therefore done by common % functions, with some data storage using engine-specific auxiliaries. % -% \begin{macro}[EXP]{\@@_generate_str:n} -% \begin{macro}[EXP]{\@@_generate_str:nnnn} +% \begin{macro}[EXP]{\codepoint_str_generate:n} +% \begin{macro}[EXP]{\@@_str_generate:nnnn} +% \begin{macro}[EXP]{\codepoint_generate:nn} +% \begin{macro}[EXP]{\@@_generate:nnnn} +% \begin{macro}[EXP]{\@@_generate:n} % Conversion of a codepoint to a character (Unicode engines) or to one % or more bytes ($8$-bit engines) is required. For loading the data, % all that is needed is the form which creates strings: these are outside @@ -115,23 +164,36 @@ { \sys_if_engine_luatex_p: } { \sys_if_engine_xetex_p: } { - \cs_new:Npn \@@_generate_str:n #1 + \cs_new:Npn \codepoint_str_generate:n #1 { \int_compare:nNnTF {#1} = { `\ } { ~ } { \char_generate:nn {#1} { 12 } } } + \cs_new:Npn \codepoint_generate:nn #1#2 + { + \int_compare:nNnTF {#1} = { `\ } + { ~ } + { + \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN + { \char_generate:nn {#1} {#2} } + } + } } { - \cs_new:Npn \@@_generate_str:n #1 + \cs_new:Npn \codepoint_str_generate:n #1 { - \use:e + \int_compare:nNnTF {#1} = { `\ } + { ~ } { - \exp_not:N \@@_generate_str:nnnn - \char_to_utfviii_bytes:n {#1} + \use:e + { + \exp_not:N \@@_str_generate:nnnn + \char_to_utfviii_bytes:n {#1} + } } } - \cs_new:Npn \@@_generate_str:nnnn #1#2#3#4 + \cs_new:Npn \@@_str_generate:nnnn #1#2#3#4 { \char_generate:nn {#1} { 12 } \tl_if_blank:nF {#2} @@ -145,10 +207,54 @@ } } } + \cs_new:Npn \codepoint_generate:nn #1#2 + { + 
\int_compare:nNnTF {#1} = { `\ } + { ~ } + { + \int_compare:nNnTF {#1} < { "80 } + { + \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN + { \char_generate:nn {#1} {#2} } + } + { + \use:e + { + \exp_not:N \@@_generate:nnnn + \char_to_utfviii_bytes:n {#1} + } + } + } + } + \cs_new:Npn \@@_generate:nnnn #1#2#3#4 + { + \__kernel_exp_not:w \exp_after:wN + { + \tex_expanded:D + { + \@@_generate:n {#1} + \@@_generate:n {#2} + \tl_if_blank:nF {#3} + { + \@@_generate:n {#3} + \tl_if_blank:nF {#4} + { \@@_generate:n {#4} } + } + } + } + } + \cs_new:Npn \@@_generate:n #1 + { + \__kernel_exp_not:w \exp_after:wN \exp_after:wN \exp_after:wN + { \char_generate:nn {#1} { 13 } } + } } % \end{macrocode} % \end{macro} % \end{macro} +% \end{macro} +% \end{macro} +% \end{macro} % % As only the data needs to remain at the end of this process, everything % is set up inside a group. The only thing that is outside is creating a @@ -218,7 +324,7 @@ \cs_set_protected:Npn \@@_data_auxii:w #1 ; #2 ~ #3 \q_stop { \tl_const:cx - { c_@@_nfd_ \@@_generate_str:n {"#1} _tl } + { c_@@_nfd_ \codepoint_str_generate:n {"#1} _tl } { {"#2} { \tl_if_blank:nF {#3} {"#3} } @@ -270,7 +376,7 @@ \int_compare:nNnF {#3} = { \@@_data_offset:nn {#1} {#5} } { \tl_const:cx - { c_@@_titlecase_ \@@_generate_str:n {"#1} _tl } + { c_@@_titlecase_ \codepoint_str_generate:n {"#1} _tl } { {"#5} { } { } } } \tl_set:Nx \l_@@_next_codepoint_fint_tl @@ -504,7 +610,7 @@ \int_eval:n { \__kernel_codepoint_data:nn { lowercase } {"#1} + "#1 } = "#3 ~ \tl_const:cx - { c_@@_casefold_ \@@_generate_str:n {"#1} _tl } + { c_@@_casefold_ \codepoint_str_generate:n {"#1} _tl } { {"#3} { } { } } \fi: \else: @@ -519,7 +625,7 @@ % \begin{macrocode} \cs_set_protected:Npn \@@_data_auxii:w #1 ~ #2 ~ #3 ~ #4 \q_stop { - \tl_const:cx { c_@@_casefold_ \@@_generate_str:n {"#1} _tl } + \tl_const:cx { c_@@_casefold_ \codepoint_str_generate:n {"#1} _tl } { {"#2} {"#3} @@ -552,7 +658,7 @@ { \tl_if_empty:nF {#4} { - \tl_const:cx { c_@@_ #2 
case_ \@@_generate_str:n {"#1} _tl } + \tl_const:cx { c_@@_ #2 case_ \codepoint_str_generate:n {"#1} _tl } { {"#3} {"#4} @@ -588,7 +694,7 @@ \cs_new:Npn \__kernel_codepoint_case:nn #1#2 { \exp_args:Ne \@@_case:nnn - { \@@_generate_str:n {#2} } {#1} {#2} + { \codepoint_str_generate:n {#2} } {#1} {#2} } \cs_new:Npn \@@_case:nnn #1#2#3 { @@ -621,7 +727,7 @@ % A simple interface. % \begin{macrocode} \cs_new:Npn \__kernel_codepoint_nfd:n #1 - { \exp_args:Ne \@@_nfd:nn { \@@_generate_str:n {#1} } {#1} } + { \exp_args:Ne \@@_nfd:nn { \codepoint_str_generate:n {#1} } {#1} } \cs_new:Npn \@@_nfd:nn #1#2 { \tl_if_exist:cTF { c_@@_nfd_ #1 _tl } diff --git a/l3kernel/testfiles/m3unicode001.luatex.tlg b/l3kernel/testfiles/m3unicode001.luatex.tlg new file mode 100644 index 0000000000..a812f43452 --- /dev/null +++ b/l3kernel/testfiles/m3unicode001.luatex.tlg @@ -0,0 +1,23 @@ +This is a generated file for the LaTeX (2e + expl3) validation system. +Don't change this file in any respect. +Author: Joseph Wright +============================================================ +TEST 1: Codepoint to chars +============================================================ +AA +AA +^^ad^^ad +įį +ͰͰ +ԠԠ +પપ +ᄤᄤ +𐀉𐀉 +============================================================ +============================================================ +TEST 2: Spaces +============================================================ +X X +X X +XaX +============================================================ diff --git a/l3kernel/testfiles/m3unicode001.lvt b/l3kernel/testfiles/m3unicode001.lvt new file mode 100644 index 0000000000..ed724c47e6 --- /dev/null +++ b/l3kernel/testfiles/m3unicode001.lvt @@ -0,0 +1,45 @@ +% +% Copyright (C) 2022 The LaTeX Project +% +\documentclass{minimal} +\input{regression-test} +\RequirePackage[enable-debug]{expl3} +\ExplSyntaxOn +\debug_on:n { check-declarations , deprecation , log-functions } +\ExplSyntaxOff + +\begin{document} + +\START +\AUTHOR{Joseph Wright} +\ExplSyntaxOn + +\OMIT 
+\cs_set:Npn \test:nn #1#2 + { + \codepoint_generate:nn {#1} {#2} + \codepoint_str_generate:n {#1} + } +\TIMO + +\TESTEXP { Codepoint~to~chars } + { + \test:nn { "0041 } { 11 } \NEWLINE + \test:nn { "0041 } { 12 } \NEWLINE + \test:nn { "00AD } { 12 } \NEWLINE + \test:nn { "012F } { 11 } \NEWLINE + \test:nn { "0370 } { 11 } \NEWLINE + \test:nn { "0520 } { 11 } \NEWLINE + \test:nn { "0AAA } { 11 } \NEWLINE + \test:nn { "1124 } { 11 } \NEWLINE + \test:nn { "10009 } { 11 } \NEWLINE + } + +\TESTEXP { Spaces } + { + X \codepoint_generate:nn { 32 } { 11 } X \NEWLINE + X \codepoint_generate:nn { 32 } { 12 } X \NEWLINE + X \codepoint_generate:nn { 97 } { 10 } X \NEWLINE + } + +\END \ No newline at end of file diff --git a/l3kernel/testfiles/m3unicode001.tlg b/l3kernel/testfiles/m3unicode001.tlg new file mode 100644 index 0000000000..2924a5588c --- /dev/null +++ b/l3kernel/testfiles/m3unicode001.tlg @@ -0,0 +1,23 @@ +This is a generated file for the LaTeX (2e + expl3) validation system. +Don't change this file in any respect. +Author: Joseph Wright +============================================================ +TEST 1: Codepoint to chars +============================================================ +AA +AA +^^c2^^ad^^c2^^ad +^^c4^^af^^c4^^af +^^cd^^b0^^cd^^b0 +^^d4^^a0^^d4^^a0 +^^e0^^aa^^aa^^e0^^aa^^aa +^^e1^^84^^a4^^e1^^84^^a4 +^^f0^^90^^80^^89^^f0^^90^^80^^89 +============================================================ +============================================================ +TEST 2: Spaces +============================================================ +X X +X X +XaX +============================================================ diff --git a/l3kernel/testfiles/m3unicode001.xetex.tlg b/l3kernel/testfiles/m3unicode001.xetex.tlg new file mode 100644 index 0000000000..a812f43452 --- /dev/null +++ b/l3kernel/testfiles/m3unicode001.xetex.tlg @@ -0,0 +1,23 @@ +This is a generated file for the LaTeX (2e + expl3) validation system. +Don't change this file in any respect. 
+Author: Joseph Wright +============================================================ +TEST 1: Codepoint to chars +============================================================ +AA +AA +^^ad^^ad +įį +ͰͰ +ԠԠ +પપ +ᄤᄤ +𐀉𐀉 +============================================================ +============================================================ +TEST 2: Spaces +============================================================ +X X +X X +XaX +============================================================