From 41ea83bb293c9b09688f743f32ee67f8ff503390 Mon Sep 17 00:00:00 2001 From: Frank Mittelbach Date: Tue, 12 Feb 2019 14:14:34 +0100 Subject: [PATCH] different concept for utf8 handling, so medium size rewrite. We forgot to handle \protected@def and the like --- base/testfiles-legacy/utf8-test-001.lvt | 101 +++++++++ base/testfiles-legacy/utf8-test-001.tlg | 57 +++++ base/utf8andspace.tex | 276 +++++++++++++++++++++--- 3 files changed, 403 insertions(+), 31 deletions(-) create mode 100644 base/testfiles-legacy/utf8-test-001.lvt create mode 100644 base/testfiles-legacy/utf8-test-001.tlg diff --git a/base/testfiles-legacy/utf8-test-001.lvt b/base/testfiles-legacy/utf8-test-001.lvt new file mode 100644 index 000000000..a68acb203 --- /dev/null +++ b/base/testfiles-legacy/utf8-test-001.lvt @@ -0,0 +1,101 @@ +\documentclass{article} + +%\usepackage{trace} + +\input{test2e} + + +% ---------------------------------------------------------- + +\START + +\begin{filecontents*}{one two three} +1 2 3 +\end{filecontents*} + + +%\traceon +\begin{filecontents*}{füße.tex} +Füße file +\end{filecontents*} + +\begin{filecontents}{füße€€€.tex} +Expensive feet +\end{filecontents} + +% this needs quotes +\begin{filecontents*}{"füße im sand.tex"} +eureka +\end{filecontents*} + + + +\includeonly{foo, füße€€€ , öfoo} + + +\begin{document} + + +\tableofcontents + +\section{A with ref: ``\ref{abß}'' and Füßen} + +\label{öfoo} + + +Some refs: \ref{abß} and \ref{öfoo} and \ref{bar€} + +\fontencoding{T1}\selectfont + +Some refs: \ref{abß} and \ref{öfoo} and \ref{bar€} + +\subsection{include tests} + +\label{bar€} + +\include{füße€€€} % exists +\include{€€€} % doesn't exist = No file €€€.aux. + +\subsection{input tests} + +\input{"füße im sand"} % exists + +%\input{unknownfüße} % doesn't exist (should give file error) + % = File `unknownfüße.tex' not found. 
+ +\subsection{@input tests} +\label{abß} + +\makeatletter + +\@input{füße} % exists +\@input{€€€} % doesn't exist (should give warning) + % = No file €€€. + +\@input{"füße im sand"} % exists + +\makeatother + +\def\foo{"füße im sand"} +\input{\foo} % exists + +% next one fails +%\input füße + + +\input{one two three} + + +% here we test if protected edef works +\makeatletter + +\protected@edef\foo{Füße} + +\show\foo + +\setbox0\hbox{\foo} \foo + +\showbox0 + +\end{document} + diff --git a/base/testfiles-legacy/utf8-test-001.tlg b/base/testfiles-legacy/utf8-test-001.tlg new file mode 100644 index 000000000..030dee5c9 --- /dev/null +++ b/base/testfiles-legacy/utf8-test-001.tlg @@ -0,0 +1,57 @@ +This is a generated file for the LaTeX2e validation system. +Don't change this file in any respect. +LaTeX Warning: File `one two three' already exists on the system. + Not generating it from this source. +LaTeX Warning: File `f^^c3^^bc^^c3^^9fe.tex' already exists on the system. + Not generating it from this source. +LaTeX Warning: File `f^^c3^^bc^^c3^^9fe^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.tex' already exists on the system. + Not generating it from this source. +LaTeX Warning: File `"f^^c3^^bc^^c3^^9fe im sand.tex"' already exists on the system. + Not generating it from this source. +(utf8-test-001.aux (f^^c3^^bc^^c3^^9fe^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.aux) +No file ^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.aux. +) +LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line .... +LaTeX Font Info: ... okay on input line .... +LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line .... +LaTeX Font Info: ... okay on input line .... +LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line .... +LaTeX Font Info: ... okay on input line .... +LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line .... +LaTeX Font Info: ... okay on input line .... +LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line .... +LaTeX Font Info: ... 
okay on input line .... +LaTeX Font Info: Checking defaults for U/cmr/m/n on input line .... +LaTeX Font Info: ... okay on input line .... +(utf8-test-001.toc +LaTeX Warning: Reference `ab^^c3^^9f' on page 1 undefined on input line .... +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <7> on input line .... +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <5> on input line .... +) +\tf@toc=\write... +[1 +] +(f^^c3^^bc^^c3^^9fe^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.tex) [2 +] +(f^^c3^^bc^^c3^^9fe im sand.tex) (f^^c3^^bc^^c3^^9fe.tex) +No file ^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac. +(f^^c3^^bc^^c3^^9fe im sand.tex) (f^^c3^^bc^^c3^^9fe im sand.tex) +(one two three.tex) +> \foo=macro: +->F^^c3^^bc^^c3^^9fe. +l. ...\show\foo +> \box...= +\hbox(6.8872+0.0)x21.52252 +.\T1/cmr/m/n/10 F +.\T1/cmr/m/n/10 ^^fc +.\T1/cmr/m/n/10 ^^ff +.\T1/cmr/m/n/10 e +! OK. +l. ...\showbox0 +[3 +] (utf8-test-001.aux (f^^c3^^bc^^c3^^9fe^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.aux) +No file ^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.aux. +) +LaTeX Warning: There were undefined references. diff --git a/base/utf8andspace.tex b/base/utf8andspace.tex index 7d5772d71..c5d6cf670 100644 --- a/base/utf8andspace.tex +++ b/base/utf8andspace.tex @@ -1,75 +1,278 @@ -\makeatletter +%% This is a patch of the LaTeX kernel to support UTF8 characters in +%% all places where they can be supported by an 8bit engine such as +%% pdfTeX. +%% +%% +%% This should enable UTF8 not only in ordinary text (as +%% already provided by a recent LaTeX release) but in addition +%% supports: +%% +%% - utf8 characters in file names used by \input \includegraphics +%% and the like --- this includes spaces and it is no longer necessary +%% to quote the file name in this case (not possible is the use of +%% the " as part of a file name, this is a restriction of the library +%% the TeX engines use).
+%% +%% - use of all utf8 characters in labels +%% +%% - in contrast to the utf8 characters that are used in typesetting +%% it is not necessary that LaTeX has any knowledge how to render +%% the character, e.g., without loading the textcomp package it is +%% not possible to typeset € but even then you can have a file or a +%% label with that character. +%% +%% +%% The plan is to integrate this patch (or a version of it) into the +%% kernel. Thus the current external version is intended to invite +%% tests with real documents beyond the test suite that we have +%% available at our disposal. +%% +%% +%% If you find any issues, please prepare a short example and submit +%% it as an issue at +%% +%% https://github.com/latex3/latex2e/issues +%% +%% Thanks! +\makeatletter -% quoting spaces -% a b c -> "a b c" -% "a b c" -> "a b c" -% a" "b" "c -> "a b c" -% -> "" -\def\quote@name#1{"\quote@@name#1\@gobble""} -\def\quote@@name#1"{#1\quote@@name} +% utf8 +% +% +% whenever we encounter a UTF8 char in a non-typesetting situation we make sure it +% doesn't expand. +%------------------------------------------------------------------------- -% utf8 +% Approach +% +% The utf8 characters are seen by an 8-bit engine as a sequence of octets. +% +% We make each starting octet an active character. +% + +% - When typesetting we pick up the necessary number of additional +% octets, check if they form a command that LaTeX knows about +% ( \csname u8:\string#1\string#2...\endcsname ) and if so use that +% for typesetting. \string is needed as the octets may (all?) be +% active and we want the literal values in the name. + +% - If the utf8 character is going to be part of a label then it is +% essentially becoming part of some csname and with the +% test \ifincsname we can find this out. If so we render the whole +% sequence of octets harmless by using \string too when the +% starting octet executes.
+% + +% - Another possible case is that \protect has *not* the meaning +% of \@typeset@protect. In that case we may do a write or we may do +% a \protected@edef or ... In all such cases we want to keep the +% sequence of octets unchanged, but we can't use \string since at +% least in the case of \protected@edef the result may later be +% typeset after all (in fact that is quite likely) and so at that +% point the starting octet needs to be an active character again +% (the others could be stringified). So for those cases we use \noexpand. +% + +% So the code for a start octet of a two byte sequence would then +% look like this: + +% +% +% \long\def\UTFviii@two@octets{% +% \ifincsname +% \expandafter +% \UTF@twostring@octets +% \else +% \ifx\protect\@typeset@protect +% \else +% \expandafter\expandafter\expandafter +% \UTF@twoharmless@octets +% \fi +% \fi +% \UTFviii@two@octets@do +% } +% + +% \ifincsname is tested first because that can be true even if we are +% otherwise doing typesetting. If this is the case use \string on the +% whole octet sequence. \UTF@twostring@octets not only does this but +% also gets rid of \UTFviii@two@octets@do in the input stream by +% picking it up as a first argument and dropping it. +% +% If this is not the case and we are doing typesetting (i.e., \protect +% is \@typeset@protect) then execute \UTFviii@two@octets@do which +% picks up all octets and typesets the character (or generates an +% error if it doesn't know how to typeset it). +% +% If we are not doing typesetting then we run \UTF@twoharmless@octets +% which is like \UTF@twostring@octets but uses \noexpand instead +% of \string. This way the sequence is temporarily frozen, e.g., would +% display as is or stays put inside a \protected@edef but if the +% result is later reused the starting octet is still active. +% +% The definitions for the other starting octets are the same except +% that they pick up more octets after them.
+ + + +% In the original all starting octets would be defined as calling such +% a \UTFviii@...@octets command followed by a \string version of the +% octet itself (so that it can be used to form the character). We now +% need to keep that octet active and so we have to do a slightly +% different setup. +% +% +% So here is the new setup loop. Note that for error cases we can and +% should of course use a \string version of the octet since there is +% no point in doing extra work. + +\begingroup +\catcode`\~13 +\catcode`\"12 +\def\UTFviii@loop{% + \uccode`\~\count@ + \uppercase\expandafter{\UTFviii@tmp}% + \advance\count@\@ne + \ifnum\count@<\@tempcnta + \expandafter\UTFviii@loop + \fi} + \def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@undefined@err{:\string~}}} + \count@"1 + \@tempcnta9 +\UTFviii@loop + \count@11 + \@tempcnta12 +\UTFviii@loop + \count@14 + \@tempcnta32 +\UTFviii@loop + \count@"80 + \@tempcnta"C2 + \def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@invalid@err\string~}} +\UTFviii@loop + \count@"C2 + \@tempcnta"E0 + \def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@two@octets\noexpand~}} +\UTFviii@loop + \count@"E0 + \@tempcnta"F0 + \def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@three@octets\noexpand~}} +\UTFviii@loop + \count@"F0 + \@tempcnta"F5 + \def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@four@octets\noexpand~}} +\UTFviii@loop + \count@"F5 + \@tempcnta"100 + \def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@invalid@err\string~}} +\UTFviii@loop +\endgroup + +% These are new work macros for the sequences as discussed above.
\long\def\UTFviii@two@octets{% - \ifx\protect\relax - \ifincsname + \ifincsname + \expandafter + \UTF@twostring@octets + \else + \ifx\protect\@typeset@protect + \else \expandafter\expandafter\expandafter - \UTF@twoharmless@octets + \UTF@twoharmless@octets \fi - \else - \expandafter\UTF@twoharmless@octets \fi \UTFviii@two@octets@do } \long\def\UTFviii@three@octets{% - \ifx\protect\relax - \ifincsname + \ifincsname + \expandafter + \UTF@threestring@octets + \else + \ifx\protect\@typeset@protect + \else \expandafter\expandafter\expandafter - \UTF@threeharmless@octets + \UTF@threeharmless@octets \fi - \else - \expandafter\UTF@threeharmless@octets \fi \UTFviii@three@octets@do } \long\def\UTFviii@four@octets{% - \ifx\protect\relax - \ifincsname + \ifincsname + \expandafter + \UTF@fourstring@octets + \else + \ifx\protect\@typeset@protect + \else \expandafter\expandafter\expandafter - \UTF@fourharmless@octets + \UTF@fourharmless@octets \fi - \else - \expandafter\UTF@fourharmless@octets \fi \UTFviii@four@octets@do } +% The \...@do macros are more or less what the original code was doing as +% part of \UTFviii@...@octets. However #1 is now active (wasn't in +% the original impl) so we better string that inside the cs. This +% is faster than having it figure out by itself that it is in a +% csname. \long\def\UTFviii@two@octets@do#1#2{\expandafter - \UTFviii@defined\csname u8:#1\string#2\endcsname} + \UTFviii@defined\csname u8:\string#1\string#2\endcsname} \long\def\UTFviii@three@octets@do#1#2#3{\expandafter - \UTFviii@defined\csname u8:#1\string#2\string#3\endcsname} + \UTFviii@defined\csname u8:\string#1\string#2\string#3\endcsname} \long\def\UTFviii@four@octets@do#1#2#3#4{\expandafter - \UTFviii@defined\csname u8:#1\string#2\string#3\string#4\endcsname} + \UTFviii@defined\csname u8:\string#1\string#2\string#3\string#4\endcsname} + + +% These temporarily prevent the active chars from expanding. (Maybe +% using \unexpanded would be faster here?)
+ +\long\def\UTF@twoharmless@octets#1#2{\noexpand#2\noexpand} +\long\def\UTF@threeharmless@octets#1#2#3{\noexpand#2\noexpand#3\noexpand} +\long\def\UTF@fourharmless@octets#1#2#3#4{\noexpand#2\noexpand#3\noexpand#4\noexpand} +% And the same with \string for use in \csname constructions. -\long\def\UTF@twoharmless@octets#1#2{\string#2\string} -\long\def\UTF@threeharmless@octets#1#2#3{\string#2\string#3\string} -\long\def\UTF@fourharmless@octets#1#2#3#4{\string#2\string#3\string#4\string} +\long\def\UTF@twostring@octets#1#2{\string#2\string} +\long\def\UTF@threestring@octets#1#2#3{\string#2\string#3\string} +\long\def\UTF@fourstring@octets#1#2#3#4{\string#2\string#3\string#4\string} +% The kernel already has saved away definitions for the starting code so +% we have to refresh that (until the day this is properly integrated): + +% if used in the kernel we also need this: +\let\UTFviii@two@octets@@\UTFviii@two@octets +\let\UTFviii@three@octets@@\UTFviii@three@octets +\let\UTFviii@four@octets@@\UTFviii@four@octets + +% Done :-) + %------------------------------------------------------------------------- +% + +% File name handling is done by generating a csname from the provided +% file name (which means that utf8 octets get turned into strings +% due to the above procedure). By setting \escapechar to -1 we ensure +% that we don't get a \ in front. As a result we end up with all +% characters as catcode 12 (plus spaces). We then sometimes add +% quotes around the construct (removing any existing inner +% quotes). Sometimes we only remove the quotes if they have been supplied +% by the user. There is clearly some room for improvement. +% +% A side effect of the new code is that we will see quotes around file +% name displays where there haven't been any before.
\def\set@curr@file#1{% \begingroup @@ -78,6 +281,18 @@ \endgroup } +% quoting spaces +% a b c -> "a b c" +% "a b c" -> "a b c" +% a" "b" "c -> "a b c" +% -> "" +\def\quote@name#1{"\quote@@name#1\@gobble""} +\def\quote@@name#1"{#1\quote@@name} + +% removing quotes +% "a b c" -> a b c +\def\unquote@name#1{\quote@@name#1\@gobble"} + %------------------------------------------------------------------------- @@ -210,7 +425,6 @@ % graphics -\def\unquote@name#1{\quote@@name#1\@gobble"} \AtBeginDocument{%