Skip to content

Commit

Permalink
Enable CJK token handling for (u)pTeX (fixes #1171)
Browse files Browse the repository at this point in the history
  • Loading branch information
josephwright committed Feb 13, 2023
1 parent 428132c commit a027168
Show file tree
Hide file tree
Showing 13 changed files with 822 additions and 98 deletions.
4 changes: 4 additions & 0 deletions l3kernel/CHANGELOG.md
Expand Up @@ -7,6 +7,10 @@ this project uses date-based 'snapshot' version identifiers.

## [Unreleased]

### Fixed
- CJK character handling for (u)pTeX (issue
[\#1171](https://github.com/latex3/latex3/issues/1171))

## [2023-02-07]

### Changed
Expand Down
32 changes: 22 additions & 10 deletions l3kernel/l3str.dtx
Expand Up @@ -1917,7 +1917,8 @@
% \begin{macro}[EXP]{\@@_change_case_end:nw}
% \begin{macro}[EXP]{\@@_change_case_loop:nw}
% \begin{macro}[EXP]{\@@_change_case_space:n}
% \begin{macro}[EXP]{\@@_change_case_char:nN, \@@_change_case_char_aux:nN}
% \begin{macro}[EXP]
% {\@@_change_case_char:nN, \@@_change_case_char_auxi:nN, \@@_change_case_char_auxii:nN}
% \begin{macro}[EXP]{\@@_change_case_codepoint:nN}
% \begin{macro}[EXP]{\@@_change_case_codepoint:nNN}
% \begin{macro}[EXP]{\@@_change_case_codepoint:nNNN}
Expand Down Expand Up @@ -1977,22 +1978,33 @@
% Single-token variant: f-expand \int_eval:n {`#2} to get the character code
% of the token #2, then call \@@_change_case_char:fnn with that number,
% followed by #1 (passed through unchanged) and the original token #2.
% NOTE(review): this sits in the first branch of a conditional whose test is
% outside the visible hunk -- presumably the Unicode-engine case; confirm.
\cs_new:Npn \@@_change_case_codepoint:nN #1#2
{ \@@_change_case_char:fnn { \int_eval:n {`#2} } {#1} {#2} }
\else:
\cs_new:Npn \@@_change_case_codepoint:nN #1#2
\cs_new:Npx \@@_change_case_codepoint:nN #1#2
{
\int_compare:nNnTF {`#2} > { "80 }
\exp_not:N \int_compare:nNnTF {`#2} > { "80 }
{
\int_compare:nNnTF {`#2} < { "E0 }
{ \@@_change_case_codepoint:nNN }
\cs_if_exist:NTF \tex_pdftexversion:D
{ \exp_not:N \@@_change_case_char_auxi:nN }
{
\int_compare:nNnTF {`#2} < { "F0 }
{ \@@_change_case_codepoint:nNNN }
{ \@@_change_case_codepoint:nNNNNN }
\exp_not:N \int_compare:nNnTF {`#2} > { "FF }
{ \exp_not:N \@@_change_case_char_auxii:nN }
{ \exp_not:N \@@_change_case_char_auxi:nN }
}
}
{ \@@_change_case_char_aux:nN }
{ \exp_not:N \@@_change_case_char_auxii:nN }
{#1} #2
}
% Select a multi-token handler from the character code of the token #2:
%   code < "E0   -> \@@_change_case_codepoint:nNN
%   code < "F0   -> \@@_change_case_codepoint:nNNN
%   otherwise    -> \@@_change_case_codepoint:nNNNNN
% The selected function is left in front of "{#1} #2", so it receives #1
% and #2 again and can absorb further tokens from the input that follows.
% No lower bound is tested here.
% NOTE(review): presumably #2 is the lead byte of a UTF-8 sequence and the
% three ranges correspond to 2-, 3- and 4-byte sequences -- confirm.
% NOTE(review): the last target is spelled :nNNNNN (five N), whereas one
% braced argument plus four byte tokens would be :nNNNN; check that the
% function named here is actually defined.
\cs_new:Npn \@@_change_case_char_auxi:nN #1#2
{
\int_compare:nNnTF {`#2} < { "E0 }
{ \@@_change_case_codepoint:nNN }
{
\int_compare:nNnTF {`#2} < { "F0 }
{ \@@_change_case_codepoint:nNNN }
{ \@@_change_case_codepoint:nNNNNN }
}
{#1} #2
}
\cs_new:Npn \@@_change_case_char_aux:nN #1#2
\cs_new:Npn \@@_change_case_char_auxii:nN #1#2
{ \@@_change_case_char:fnn { \int_eval:n {`#2} } {#1} {#2} }
\cs_new:Npn \@@_change_case_codepoint:nNN #1#2#3
{
Expand Down
138 changes: 64 additions & 74 deletions l3kernel/l3text.dtx
Expand Up @@ -633,7 +633,7 @@
%
% For working with codepoints in an engine-neutral way.
%
% \begin{macro}[EXP]{\@@_codepoint_process:nN}
% \begin{macro}[EXP]{\@@_codepoint_process:nN, \@@_codepoint_process_aux:nN}
% \begin{macro}[EXP]{\@@_codepoint_process:nNN}
% \begin{macro}[EXP]{\@@_codepoint_process:nNNN}
% \begin{macro}[EXP]{\@@_codepoint_process:nNNNN}
Expand All @@ -647,43 +647,38 @@
% Pass-through variant: apply #1 to the single token #2, supplied as one
% braced argument.  No inspection of the character code is done here.
\cs_new:Npn \@@_codepoint_process:nN #1#2 { #1 {#2} }
}
{
\cs_new:Npn \@@_codepoint_process:nN #1#2
\cs_new:Npx \@@_codepoint_process:nN #1#2
{
\int_compare:nNnTF { `#2 } > { "80 }
\exp_not:N \int_compare:nNnTF {`#2} > { "80 }
{
\int_compare:nNnTF { `#2 } < { "E0 }
{ \@@_codepoint_process:nNN }
\sys_if_engine_pdftex:TF
{ \exp_not:N \@@_codepoint_process_aux:nN }
{
\int_compare:nNnTF { `#2 } < { "F0 }
{ \@@_codepoint_process:nNNN }
{ \@@_codepoint_process:nNNNN }
\exp_not:N \int_compare:nNnTF {`#2} > { "FF }
{ \exp_not:N \use:n }
{ \exp_not:N \@@_codepoint_process_aux:nN }
}
}
{ \use:n }
{ \exp_not:N \use:n }
{#1} #2
}
\cs_new:Npn \@@_codepoint_process:nNN #1#2#3
{ #1 {#2#3} }
\sys_if_engine_ptex:TF
\cs_new:Npn \@@_codepoint_process_aux:nN #1#2
{
\cs_gset:Npn \@@_codepoint_process:nN #1#2
\int_compare:nNnTF { `#2 } < { "E0 }
{ \@@_codepoint_process:nNN }
{
\int_compare:nNnTF { `#2 } > { "80 }
{
\int_compare:nNnTF { `#2 } < { "E0 }
{ \@@_codepoint_process:nNN }
{ \use:n }
}
{ \use:n }
\int_compare:nNnTF { `#2 } < { "F0 }
{ \@@_codepoint_process:nNNN }
{ \@@_codepoint_process:nNNNN }
}
{#1} #2
}
}
{
\cs_new:Npn \@@_codepoint_process:nNNN #1#2#3#4
{ #1 {#2#3#4} }
\cs_new:Npn \@@_codepoint_process:nNNNN #1#2#3#4#5
{ #1 {#2#3#4#5} }
}
% Collector helpers: each absorbs #1 followed by two, three or four single
% tokens and hands those tokens to #1 as ONE braced argument.
% NOTE(review): presumably the tokens are the bytes of one UTF-8 encoded
% character on an 8-bit engine -- confirm against the dispatching function.
\cs_new:Npn \@@_codepoint_process:nNN #1#2#3
{ #1 {#2#3} }
\cs_new:Npn \@@_codepoint_process:nNNN #1#2#3#4
{ #1 {#2#3#4} }
\cs_new:Npn \@@_codepoint_process:nNNNN #1#2#3#4#5
{ #1 {#2#3#4#5} }
}
% \end{macrocode}
% \end{macro}
Expand All @@ -692,7 +687,8 @@
% \end{macro}
%
% \begin{macro}[EXP, pTF]{\@@_codepoint_compare:nNn}
% \begin{macro}[EXP]{\@@_codepoint_from_chars:Nw}
% \begin{macro}[EXP]
% {\@@_codepoint_from_chars:Nw, \@@_codepoint_from_chars_aux:Nw}
% \begin{macro}[EXP]{\@@_codepoint_from_chars:N}
% \begin{macro}[EXP]{\@@_codepoint_from_chars:NN}
% \begin{macro}[EXP]{\@@_codepoint_from_chars:NNN}
Expand Down Expand Up @@ -720,62 +716,56 @@
#2 {#3}
\prg_return_true: \prg_return_false:
}
\cs_new:Npn \@@_codepoint_from_chars:Nw #1
\cs_new:Npx \@@_codepoint_from_chars:Nw #1
{
\exp_not:N \if_int_compare:w `#1 > "80 \exp_not:N \exp_stop_f:
\sys_if_engine_pdftex:TF
{
\exp_not:N \exp_after:wN
\exp_not:N \@@_codepoint_from_chars_aux:Nw
}
{
\exp_not:N \if_int_compare:w `#1 > "FF \exp_not:N \exp_stop_f:
\exp_not:N \exp_after:wN \exp_not:N \exp_after:wN
\exp_not:N \exp_after:wN
\exp_not:N \@@_codepoint_from_chars:N
\exp_not:N \else:
\exp_not:N \exp_after:wN \exp_not:N \exp_after:wN
\exp_not:N \exp_after:wN
\exp_not:N \@@_codepoint_from_chars_aux:Nw
\exp_not:N \fi:
}
\exp_not:N \else:
\exp_not:N \exp_after:wN \exp_not:N \@@_codepoint_from_chars:N
\exp_not:N \fi:
#1
}
\cs_new:Npn \@@_codepoint_from_chars_aux:Nw #1
{
\if_int_compare:w `#1 > "80 \exp_stop_f:
\if_int_compare:w `#1 < "E0 \exp_stop_f:
\if_int_compare:w `#1 < "E0 \exp_stop_f:
\exp_after:wN \@@_codepoint_from_chars:NN
\else:
\if_int_compare:w `#1 < "F0 \exp_stop_f:
\exp_after:wN \exp_after:wN \exp_after:wN
\@@_codepoint_from_chars:NN
\@@_codepoint_from_chars:NNN
\else:
\if_int_compare:w `#1 < "F0 \exp_stop_f:
\exp_after:wN \exp_after:wN \exp_after:wN
\exp_after:wN \exp_after:wN \exp_after:wN
\exp_after:wN \@@_codepoint_from_chars:NNN
\else:
\exp_after:wN \exp_after:wN \exp_after:wN
\exp_after:wN \exp_after:wN \exp_after:wN
\exp_after:wN \@@_codepoint_from_chars:NNNN
\fi:
\exp_after:wN \exp_after:wN \exp_after:wN
\@@_codepoint_from_chars:NNNN
\fi:
\else:
\exp_after:wN \@@_codepoint_from_chars:N
\fi:
#1
}
\cs_new:Npn \@@_codepoint_from_chars:N #1 { `#1 }
\cs_new:Npn \@@_codepoint_from_chars:N #1 {`#1}
% Two-token case: expands to the integer-expression fragment
%   (`#1 - "C0) * "40 + `#2 - "80
% The fragment is not evaluated here; it is meant to be used inside an
% integer expression supplied by the caller.
% NOTE(review): the constants match decoding a two-byte UTF-8 sequence
% (#1 lead byte, #2 continuation byte) -- confirm that is the intent.
\cs_new:Npn \@@_codepoint_from_chars:NN #1#2
{ (`#1 - "C0) * "40 + `#2 - "80 }
% \end{macrocode}
% Avoid high chars with p\TeX{}.
% \begin{macrocode}
\sys_if_engine_ptex:TF
% Three-token case: expands to the integer-expression fragment
%   (`#1 - "E0) * "1000 + (`#2 - "80) * "40 + `#3 - "80
% As with the two-token form, nothing is evaluated here; the caller is
% expected to place this inside an integer expression.
% NOTE(review): the constants match decoding a three-byte UTF-8 sequence
% (#1 lead byte, #2 and #3 continuation bytes) -- confirm.
\cs_new:Npn \@@_codepoint_from_chars:NNN #1#2#3
{ (`#1 - "E0) * "1000 + (`#2 - "80) * "40 + `#3 - "80 }
\cs_new:Npn \@@_codepoint_from_chars:NNNN #1#2#3#4
{
\cs_gset:Npn \@@_codepoint_from_chars:Nw #1
{
\if_int_compare:w `#1 > "80 \exp_stop_f:
\if_int_compare:w `#1 < "E0 \exp_stop_f:
\exp_after:wN \exp_after:wN \exp_after:wN
\@@_codepoint_from_chars:NN
\else:
\exp_after:wN \exp_after:wN \exp_after:wN
\@@_codepoint_from_chars:N
\fi:
\else:
\exp_after:wN \@@_codepoint_from_chars:N
\fi:
#1
}
}
{
\cs_new:Npn \@@_codepoint_from_chars:NNN #1#2#3
{ (`#1 - "E0) * "1000 + (`#2 - "80) * "40 + `#3 - "80 }
\cs_new:Npn \@@_codepoint_from_chars:NNNN #1#2#3#4
{
(`#1 - "F0) * "40000
+ (`#2 - "80) * "1000
+ (`#3 - "80) * "40
+ `#4 - "80
}
(`#1 - "F0) * "40000
+ (`#2 - "80) * "1000
+ (`#3 - "80) * "40
+ `#4 - "80
}
}
% \end{macrocode}
Expand Down
2 changes: 1 addition & 1 deletion l3kernel/testfiles/m3str-convert005.lvt
Expand Up @@ -11,7 +11,7 @@
\ExplSyntaxOff

\begin{document}
\ifdefined\disablecjktoken\disablecjktoken\fi

\START
\AUTHOR{Joseph Wright}
\ExplSyntaxOn
Expand Down
16 changes: 16 additions & 0 deletions l3kernel/testfiles/m3str-convert005.uptex.tlg
@@ -0,0 +1,16 @@
This is a generated file for the LaTeX (2e + expl3) validation system.
Don't change this file in any respect.
Author: Joseph Wright
============================================================
TEST 1: PDF names
============================================================
abczz
brackets#28#29#5B#5D#7B#7D#3C#3Exxx
gr#C3#BC#C3#9Fe##
============================================================
============================================================
TEST 2: PDF names with spaces
============================================================
abc#20cde
abc#20cde
============================================================
4 changes: 2 additions & 2 deletions l3kernel/testfiles/m3str002.lvt
Expand Up @@ -6,7 +6,7 @@
\ExplSyntaxOn
\debug_on:n { check-declarations , deprecation , log-functions }
\ExplSyntaxOff
\ifdefined\disablecjktoken\disablecjktoken\fi

\START
\AUTHOR{Joseph Wright}
\ExplSyntaxOn
Expand All @@ -33,7 +33,7 @@
\tl_set:Nx \l_tmpb_tl{ \str_casefold:n { ABC~123 } }
\tl_if_eq:NNTF \l_tmpa_tl \l_tmpb_tl \TRUE \ERROR
}
\sys_if_engine_ptex:T { \END }

\TESTEXP { Accented~characters,~etc. }
{
" \str_uppercase:n { Café } "
Expand Down
15 changes: 15 additions & 0 deletions l3kernel/testfiles/m3str002.ptex.tlg
Expand Up @@ -14,3 +14,18 @@ TEST 2: Checking category codes
FALSE
TRUE
============================================================
============================================================
TEST 3: Accented characters, etc.
============================================================
"CAF^^c3^^89"
"^^c4^^87^^c4^^97^^c9^^97^^e1^^b9^^91^^e1^^b9^^91"
"^^e1^^bd^^a2^^ce^^b9ωΝ"
"^^cf^^85^^cc^^88^^cc^^81^^cf^^85^^cc^^88^^cc^^80st"
"^^ea^^9a^^89^^ea^^9a^^87"
"Z^^ea^^9d^^8f^^e2^^93^^a7"
============================================================
============================================================
TEST 4: Characters with context-sensitive Unicode behaviour
============================================================
FALSE
============================================================
6 changes: 3 additions & 3 deletions l3kernel/testfiles/m3str002.uptex.tlg
Expand Up @@ -19,13 +19,13 @@ TEST 3: Accented characters, etc.
============================================================
"CAF^^c3^^89"
"^^c4^^87^^c4^^97^^c9^^97^^e1^^b9^^91^^e1^^b9^^91"
"^^e1^^bd^^a2^^ce^^b9^^cf^^89^^ce^^bd"
"^^e1^^bd^^a2^^ce^^b9ω^^ce^^bd"
"^^cf^^85^^cc^^88^^cc^^81^^cf^^85^^cc^^88^^cc^^80st"
"^^ea^^9a^^89^^ea^^9a^^87"
"^^ea^^9a^^89ꚇ"
"^^ef^^bd^^9a^^ea^^9d^^8f^^e2^^93^^a7"
============================================================
============================================================
TEST 4: Characters with context-sensitive Unicode behaviour
============================================================
TRUE
FALSE
============================================================
4 changes: 1 addition & 3 deletions l3kernel/testfiles/m3text002.lvt
Expand Up @@ -7,7 +7,7 @@
\ExplSyntaxOn
\debug_on:n { check-declarations , deprecation , log-functions }
\ExplSyntaxOff
\ifdefined\disablecjktoken\disablecjktoken\fi

\START
\AUTHOR{Joseph Wright}
\ExplSyntaxOn
Expand Down Expand Up @@ -125,8 +125,6 @@
\test:n { ABCÈ日本語}
}

\sys_if_engine_ptex:T { \END }

\TESTEXP { Unicode~case~changing }
{
\test:n { åéîøὭдαƐ }
Expand Down

0 comments on commit a027168

Please sign in to comment.