-
Notifications
You must be signed in to change notification settings - Fork 177
/
l3regex.dtx
7211 lines (7211 loc) · 263 KB
/
l3regex.dtx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% \iffalse meta-comment
%
%% File: l3regex.dtx
%
% Copyright (C) 2011-2021 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version. The latest version
% of this license is in the file
%
% https://www.latex-project.org/lppl.txt
%
% This file is part of the "l3kernel bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
% https://github.com/latex3/latex3
%
% for those people who are interested.
%
%<*driver>
\documentclass[full,kernel]{l3doc}
\begin{document}
\DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
%
% \title{^^A
% The \textsf{l3regex} package: Regular expressions in \TeX{}^^A
% }
%
% \author{^^A
% The \LaTeX{} Project\thanks
% {^^A
% E-mail:
% \href{mailto:latex-team@latex-project.org}
% {latex-team@latex-project.org}^^A
% }^^A
% }
%
% \date{Released 2021-02-18}
%
% \maketitle
%
% \begin{documentation}
% \newenvironment{l3regex-syntax}
% {\begin{itemize}\def\\{\char`\\}\def\makelabel##1{\hss\llap{\ttfamily##1}}}
% {\end{itemize}}
%
% The \pkg{l3regex} package provides regular expression testing,
% extraction of submatches, splitting, and replacement, all acting
% on token lists. The syntax of regular expressions is mostly a subset
% of the \textsc{pcre} syntax (and very close to \textsc{posix}),
% with some additions
% due to the fact that \TeX{} manipulates tokens rather than characters.
% For performance reasons, only a limited set of features is implemented.
% Notably, back-references are not supported.
%
% Let us give a few examples. After
% \begin{verbatim}
% \tl_set:Nn \l_my_tl { That~cat. }
% \regex_replace_once:nnN { at } { is } \l_my_tl
% \end{verbatim}
% the token list variable \cs{l_my_tl} holds the text
% \enquote{\texttt{This cat.}}, where the first
% occurrence of \enquote{\texttt{at}} was replaced
% by \enquote{\texttt{is}}. A more complicated example is
% a pattern to emphasize each word and add a comma after it:
% \begin{verbatim}
% \regex_replace_all:nnN { \w+ } { \c{emph}\cB\{ \0 \cE\} , } \l_my_tl
% \end{verbatim}
% The |\w| sequence represents any \enquote{word} character, and |+|
% indicates that the |\w| sequence should be repeated as many times as
% possible (at least once), hence matching a word in the input token
% list. In the replacement text, |\0| denotes the full match (here, a
% word). The command |\emph| is inserted using |\c{emph}|, and its
% argument |\0| is put between braces |\cB\{| and |\cE\}|.
%
% If a regular expression is to be used several times,
% it can be compiled once, and stored in a regex
% variable using \cs{regex_const:Nn}. For example,
% \begin{verbatim}
% \regex_const:Nn \c_foo_regex { \c{begin} \cB. (\c[^BE].*) \cE. }
% \end{verbatim}
% stores in \cs{c_foo_regex} a regular expression which matches the
% starting marker for an environment: \cs{begin}, followed by a
% begin-group token (|\cB.|), then any number of tokens which are
% neither begin-group nor end-group character tokens (|\c[^BE].*|),
% ending with an end-group token (|\cE.|). As explained in the next
% section, the parentheses \enquote{capture} the result of |\c[^BE].*|,
% giving us access to the name of the environment when doing
% replacements.
%
% \section{Syntax of regular expressions}
%
% We start with a few examples, and encourage the reader to apply
% \cs{regex_show:n} to these regular expressions.
% \begin{itemize}
% \item |Cat| matches the word \enquote{Cat} capitalized in this way,
% but also matches the beginning of the word \enquote{Cattle}: use
% |\bCat\b| to match a complete word only.
% \item |[abc]| matches one letter among \enquote{a}, \enquote{b},
% \enquote{c}; the pattern \verb"(a|b|c)" matches the same three
% possible letters (but see the discussion of submatches below).
% \item |[A-Za-z]*| matches any number (due to the quantifier
% \verb"*") of Latin letters (not accented).
% \item |\c{[A-Za-z]*}| matches a control sequence made of Latin
% letters.
% \item |\_[^\_]*\_| matches an underscore, any number of characters
% other than underscore, and another underscore; it is equivalent to
% |\_.*?\_| where |.| matches arbitrary characters and the
% lazy quantifier |*?| means to match as few characters as
% possible, thus avoiding matching underscores.
% \item |[\+\-]?\d+| matches an explicit integer with at most one
% sign.
% \item \verb*"[\+\-\ ]*\d+\ *" matches an explicit integer with any
% number of $+$ and $-$ signs, with spaces allowed except within the
% mantissa, and surrounded by spaces.
% \item \verb*"[\+\-\ ]*(\d+|\d*\.\d+)\ *" matches an explicit integer or
% decimal number; using \verb*"[.,]" instead of \verb*"\." would allow
% the comma as a decimal marker.
% \item
% \verb*"[\+\-\ ]*(\d+|\d*\.\d+)\ *((?i)pt|in|[cem]m|ex|[bs]p|[dn]d|[pcn]c)\ *"
% \allowbreak matches an explicit dimension with any unit that \TeX{} knows, where
% \verb*"(?i)" means to treat lowercase and uppercase letters
% identically.
% \item \verb*"[\+\-\ ]*((?i)nan|inf|(\d+|\d*\.\d+)(\ *e[\+\-\ ]*\d+)?)\ *"
% matches an explicit floating point number or the special values
% \verb*"nan" and \verb*"inf" (with signs and spaces allowed).
% \item \verb*"[\+\-\ ]*(\d+|\cC.)\ *" matches an explicit integer or
% control sequence (without checking whether it is an integer
% variable).
% \item |\G.*?\K| at the beginning of a regular expression matches and
% discards (due to |\K|) everything between the end of the previous
% match (|\G|) and what is matched by the rest of the regular
% expression; this is useful in \cs{regex_replace_all:nnN} when the
% goal is to extract matches or submatches in a finer way than with
% \cs{regex_extract_all:nnN}.
% \end{itemize}
% While it is impossible for a regular expression to match only integer
% expressions, \verb*"[\+\-\(]*\d+\)*([\+\-*/][\+\-\(]*\d+\)*)*" matches among
% other things all valid integer expressions (made only with explicit
% integers). One should follow it with further testing.
%
% Most characters match exactly themselves,
% with an arbitrary category code. Some characters are
% special and must be escaped with a backslash (\emph{e.g.}, |\*|
% matches a star character). Some escape sequences of
% the form backslash--letter also have a special meaning
% (for instance |\d| matches any digit). As a rule,
% \begin{itemize}
% \item every alphanumeric character (\texttt{A}--\texttt{Z},
% \texttt{a}--\texttt{z}, \texttt{0}--\texttt{9}) matches
% exactly itself, and should not be escaped, because
% |\A|, |\B|, \ldots{} have special meanings;
% \item non-alphanumeric printable ascii characters can (and should)
% always be escaped: many of them have special meanings (\emph{e.g.},
% use |\(|, |\)|, |\?|, |\.|);
% \item spaces should always be escaped (even in character
% classes);
% \item any other character may be escaped or not, without any
% effect: both versions match exactly that character.
% \end{itemize}
% Note that these rules play nicely with the fact that many
% non-alphanumeric characters are difficult to input into \TeX{}
% under normal category codes. For instance, |\\abc\%|
% matches the characters |\abc%| (with arbitrary category codes),
% but does not match the control sequence |\abc| followed by a
% percent character. Matching control sequences can be done
% using the |\c|\Arg{regex} syntax (see below).
%
% Any special character which appears at a place where its special
% behaviour cannot apply matches itself instead (for instance, a
% quantifier appearing at the beginning of a string), after raising a
% warning.
%
% Characters.
% \begin{l3regex-syntax}
% \item[\\x\{hh\ldots{}\}] Character with hex code \texttt{hh\ldots{}}.
% \item[\\xhh] Character with hex code \texttt{hh}.
% \item[\\a] Alarm (hex 07).
% \item[\\e] Escape (hex 1B).
% \item[\\f] Form-feed (hex 0C).
% \item[\\n] New line (hex 0A).
% \item[\\r] Carriage return (hex 0D).
% \item[\\t] Horizontal tab (hex 09).
% \end{l3regex-syntax}
%
% Character types.
% \begin{l3regex-syntax}
% \item[.] A single period matches any token.
% \item[\\d] Any decimal digit.
% \item[\\h] Any horizontal space character,
% equivalent to |[\ \^^I]|: space and tab.
% \item[\\s] Any space character,
% equivalent to |[\ \^^I\^^J\^^L\^^M]|.
% \item[\\v] Any vertical space character,
% equivalent to |[\^^J\^^K\^^L\^^M]|. Note that |\^^K| is a vertical space,
% but not a space, for compatibility with Perl.
% \item[\\w] Any word character, \emph{i.e.},
% alphanumerics and underscore, equivalent to the explicit
% class |[A-Za-z0-9\_]|.
% \item[\\D] Any token not matched by |\d|.
% \item[\\H] Any token not matched by |\h|.
% \item[\\N] Any token other than the |\n| character (hex 0A).
% \item[\\S] Any token not matched by |\s|.
% \item[\\V] Any token not matched by |\v|.
% \item[\\W] Any token not matched by |\w|.
% \end{l3regex-syntax}
% Of those, |.|, |\D|, |\H|, |\N|, |\S|, |\V|, and |\W| match arbitrary
% control sequences.
%
% Character classes match exactly one token in the subject.
% \begin{l3regex-syntax}
% \item[{[\ldots{}]}] Positive character class.
% Matches any of the specified tokens.
% \item[{[\char`\^\ldots{}]}] Negative character class.
% Matches any token other than the specified characters.
% \item[{x-y}] Within a character class, this denotes a range (can be
% used with escaped characters).
% \item[{[:\meta{name}:]}] Within a character class (one more set of
% brackets), this denotes the \textsc{posix} character class
% \meta{name}, which can be \texttt{alnum}, \texttt{alpha},
% \texttt{ascii}, \texttt{blank}, \texttt{cntrl}, \texttt{digit},
% \texttt{graph}, \texttt{lower}, \texttt{print}, \texttt{punct},
% \texttt{space}, \texttt{upper}, \texttt{word}, or \texttt{xdigit}.
% \item[{[:\char`\^\meta{name}:]}] Negative \textsc{posix} character class.
% \end{l3regex-syntax}
% For instance, |[a-oq-z\cC.]| matches any lowercase Latin letter
% except |p|, as well as control sequences (see below for a description
% of |\c|).
%
% Quantifiers (repetition).
% \begin{l3regex-syntax}
% \item[?] $0$ or $1$, greedy.
% \item[??] $0$ or $1$, lazy.
% \item[*] $0$ or more, greedy.
% \item[*?] $0$ or more, lazy.
% \item[+] $1$ or more, greedy.
% \item[+?] $1$ or more, lazy.
% \item[\{$n$\}] Exactly $n$.
% \item[\{$n,$\}] $n$ or more, greedy.
% \item[\{$n,$\}?] $n$ or more, lazy.
% \item[\{$n,m$\}] At least $n$, no more than $m$, greedy.
% \item[\{$n,m$\}?] At least $n$, no more than $m$, lazy.
% \end{l3regex-syntax}
%
% Anchors and simple assertions.
% \begin{l3regex-syntax}
% \item[\\b] Word boundary: either the previous token is matched by
% |\w| and the next by |\W|, or the opposite. For this purpose,
% the ends of the token list are considered as |\W|.
% \item[\\B] Not a word boundary: between two |\w| tokens
% or two |\W| tokens (including the boundary).
% \item[\char`^ \textrm{or} \\A]
% Start of the subject token list.
% \item[\char`$\textrm{,} \\Z \textrm{or} \\z] ^^A $
% End of the subject token list.
% \item[\\G] Start of the current match. This is only different from |^|
% in the case of multiple matches: for instance
% |\regex_count:nnN { \G a } { aaba } \l_tmpa_int| yields $2$, but
% replacing |\G| by |^| would result in \cs{l_tmpa_int} holding the
% value $1$.
% \end{l3regex-syntax}
%
% Alternation and capturing groups.
% \begin{l3regex-syntax}
% \item[A\char`|B\char`|C] Either one of \texttt{A}, \texttt{B},
% or \texttt{C}.
% \item[(\ldots{})] Capturing group.
% \item[(?:\ldots{})] Non-capturing group.
% \item[(?\char`|\ldots{})] Non-capturing group which resets
% the group number for capturing groups in each alternative.
% The following group is numbered with the first unused
% group number.
% \end{l3regex-syntax}
%
% The |\c| escape sequence allows one to test the category code of tokens,
% and match control sequences. Each character category is represented
% by a single uppercase letter:
% \begin{itemize}
% \item |C| for control sequences;
% \item |B| for begin-group tokens;
% \item |E| for end-group tokens;
% \item |M| for math shift;
% \item |T| for alignment tab tokens;
% \item |P| for macro parameter tokens;
% \item |U| for superscript tokens (up);
% \item |D| for subscript tokens (down);
% \item |S| for spaces;
% \item |L| for letters;
% \item |O| for others; and
% \item |A| for active characters.
% \end{itemize}
% The |\c| escape sequence is used as follows.
% \begin{l3regex-syntax}
% \item[\\c\Arg{regex}] A control sequence whose csname matches the
% \meta{regex}, anchored at the beginning and end, so that |\c{begin}|
% matches exactly \cs{begin}, and nothing else.
% \item[\\cX] Applies to the next object, which can be a character,
% character property, class, or group, and forces this object to
% only match tokens with category |X| (any of |CBEMTPUDSLOA|). For
% instance, |\cL[A-Z\d]| matches uppercase letters and digits of
% category code letter, |\cC.| matches any control sequence, and
% |\cO(abc)| matches |abc| where each character has category other.
% \item[{\\c[XYZ]}] Applies to the next object, and forces it to only
% match tokens with category |X|, |Y|, or |Z| (each being any of
% |CBEMTPUDSLOA|). For instance, |\c[LSO](..)| matches two tokens of
% category letter, space, or other.
% \item[{\\c[\char`\^XYZ]}] Applies to the next object and prevents it
% from matching any token with category |X|, |Y|, or |Z| (each being
% any of |CBEMTPUDSLOA|). For instance, |\c[^O]\d| matches digits
% which have any category different from other.
% \end{l3regex-syntax}
% The category code tests can be used inside classes; for instance,
% |[\cO\d \c[LO][A-F]]| matches what \TeX{} considers as hexadecimal
% digits, namely digits with category other, or uppercase letters from
% |A| to |F| with category either letter or other. Within a group
% affected by a category code test, the outer test can be overridden by
% a nested test: for instance, |\cL(ab\cO\*cd)| matches |ab*cd| where
% all characters are of category letter, except |*| which has category
% other.
%
% The |\u| escape sequence allows one to insert the contents of a token list
% directly into a regular expression or a replacement, avoiding the need
% to escape special characters. Namely, |\u|\Arg{var~name} matches
% the exact contents of the variable \cs[no-index]{\meta{var~name}},
% which are obtained by applying \cs{exp_not:v} \Arg{var~name} at the
% time the regular expression is compiled. Within a |\c{...}|
% control sequence matching, the |\u| escape sequence only expands its
% argument once, in effect performing \cs{tl_to_str:v}. Quantifiers are
% not supported directly: use a group, for instance as in
% |(?:\u|\Arg{var~name}|){2,4}|.
%
% The option |(?i)| makes the match case insensitive (identifying
% \texttt{A}--\texttt{Z} with \texttt{a}--\texttt{z}; no Unicode support
% yet). This applies until the end of the group in which it appears, and
% can be reverted using |(?-i)|. For instance, in
% \verb"(?i)(a(?-i)b|c)d", the letters |a| and |d| are affected by the
% |i| option. Characters within ranges and classes are affected
% individually: |(?i)[Y-\\]| is equivalent to |[YZ\[\\yz]|, and
% |(?i)[^aeiou]| matches any character which is not a vowel. Neither
% character properties, nor |\c{...}| nor |\u{...}| are affected by the
% |i| option.
% ^^A \]
%
% In character classes, only |[|, |^|, |-|, |]|, |\| and spaces are
% special, and should be escaped. Other non-alphanumeric characters can
% still be escaped without harm. Any escape sequence which matches a
% single character (|\d|, |\D|, \emph{etc.}) is supported in character
% classes. If the first character is |^|, then
% the meaning of the character class is inverted; |^| appearing anywhere
% else in the range is not special. If the first character (possibly
% following a leading |^|) is |]| then it does not need to be escaped
% since ending the range there would make it empty.
% Ranges of characters
% can be expressed using |-|, for instance, |[\D 0-5]| and |[^6-9]| are
% equivalent.
%
% Capturing groups are a means of extracting information about the
% match. Parenthesized groups are labelled in the order of their
% opening parenthesis, starting at $1$. The contents of those groups
% corresponding to the \enquote{best} match (leftmost longest)
% can be extracted and stored in a sequence of token lists using for
% instance \cs{regex_extract_once:nnNTF}.
%
% The |\K| escape sequence resets the beginning of the match to the
% current position in the token list. This only affects what is reported
% as the full match. For instance,
% \begin{verbatim}
% \regex_extract_all:nnN { a \K . } { a123aaxyz } \l_foo_seq
% \end{verbatim}
% results in \cs{l_foo_seq} containing the items |{1}| and |{a}|: the
% true matches are |{a1}| and |{aa}|, but they are trimmed by the use of
% |\K|. The |\K| command does not affect capturing groups: for instance,
% \begin{verbatim}
% \regex_extract_once:nnN { (. \K c)+ \d } { acbc3 } \l_foo_seq
% \end{verbatim}
% results in \cs{l_foo_seq} containing the items |{c3}| and |{bc}|: the
% true match is |{acbc3}|, with first submatch |{bc}|, but |\K| resets
% the beginning of the match to the last position where it appears.
%
% \section{Syntax of the replacement text}
%
% Most of the features described in regular expressions do not make
% sense within the replacement text. Backslash introduces various
% special constructions, described further below:
% \begin{itemize}
% \item |\0| is the whole match;
% \item |\1| is the submatch that was matched by the first (capturing)
% group |(...)|; similarly for |\2|, \ldots{}, |\9| and
% |\g{|\meta{number}|}|;
% \item \verb*|\ | inserts a space (spaces are ignored when not
% escaped);
% \item |\a|, |\e|, |\f|, |\n|, |\r|, |\t|, |\xhh|, |\x{hhh}|
% correspond to single characters as in regular expressions;
% \item |\c|\Arg{cs~name} inserts a control sequence;
% \item |\c|\meta{category}\meta{character} (see below);
% \item |\u|\Arg{tl~var~name} inserts the contents of the
% \meta{tl~var} (see below).
% \end{itemize}
% Characters other than backslash and space are simply inserted in the
% result (but since the replacement text is first converted to a string,
% one should also escape characters that are special for \TeX{}, for
% instance use~|\#|). Non-alphanumeric characters can always be safely
% escaped with a backslash.
%
% For instance,
% \begin{verbatim}
% \tl_set:Nn \l_my_tl { Hello,~world! }
% \regex_replace_all:nnN { ([er]?l|o) . } { (\0--\1) } \l_my_tl
% \end{verbatim}
% results in \cs{l_my_tl} holding |H(ell--el)(o,--o) w(or--o)(ld--l)!|.
%
% The submatches are numbered according to the order in which the
% opening parentheses of capturing groups appear in the regular
% expression to match. The $n$-th submatch is empty if there are fewer
% than $n$ capturing groups or for capturing groups that appear in
% alternatives that were not used for the match. In case a capturing
% group matches several times during a match (due to quantifiers) only
% the last match is used in the replacement text. Submatches always keep
% the same category codes as in the original token list.
%
% By default, the category codes of characters inserted by the
% replacement are determined by the prevailing category code regime at
% the time when the replacement is made, with two exceptions:
% \begin{itemize}
% \item space characters (with character code $32$) inserted with
% \verb*|\ | or |\x20| or |\x{20}| have category code~$10$ regardless
% of the prevailing category code regime;
% \item if the category code would be $0$~(escape), $5$~(newline),
% $9$~(ignore), $14$~(comment) or $15$~(invalid), it is replaced by
% $12$~(other) instead.
% \end{itemize}
% The escape sequence |\c| allows one to insert characters
% with arbitrary category codes, as well as control sequences.
% \begin{l3regex-syntax}
% \item[\\cX(\ldots{})] Produces the characters \enquote{\ldots{}} with
% category~|X|, which must be one of |CBEMTPUDSLOA| as in regular
% expressions. Parentheses are optional for a single character (which
% can be an escape sequence). When nested, the innermost category
% code applies, for instance |\cL(Hello\cS\ world)!| gives this text
% with standard category codes.
% \item[\\c\Arg{text}] Produces the control sequence with csname
% \meta{text}. The \meta{text} may contain references to the
% submatches |\0|, |\1|, and so on, as in the example for |\u| below.
% \end{l3regex-syntax}
%
% The escape sequence |\u|\Arg{tl~var~name} allows one to insert the
% contents of the token list with name \meta{tl~var~name} directly into
% the replacement, giving easier control of category codes. When
% nested in |\c{|\ldots{}|}| and |\u{|\ldots{}|}| constructions, the
% |\u| and |\c|~escape sequences perform \cs{tl_to_str:v}, namely
% extract the value of the control sequence and turn it into a string.
% Matches can also be used within the arguments of |\c| and |\u|. For
% instance,
% \begin{verbatim}
% \tl_set:Nn \l_my_one_tl { first }
% \tl_set:Nn \l_my_two_tl { \emph{second} }
% \tl_set:Nn \l_my_tl { one , two , one , one }
% \regex_replace_all:nnN { [^,]+ } { \u{l_my_\0_tl} } \l_my_tl
% \end{verbatim}
% results in \cs{l_my_tl} holding |first,\emph{second},first,first|.
%
% \section{Pre-compiling regular expressions}
%
% If a regular expression is to be used several times,
% it is better to compile it once rather than doing it
% each time the regular expression is used. The compiled
% regular expression is stored in a variable. All
% of the \pkg{l3regex} module's functions can be given their
% regular expression argument either as an explicit string
% or as a compiled regular expression.
%
% \begin{function}[added = 2017-05-26]{\regex_new:N}
% \begin{syntax}
% \cs{regex_new:N} \meta{regex~var}
% \end{syntax}
% Creates a new \meta{regex~var} or raises an error if the
% name is already taken. The declaration is global. The
% \meta{regex~var} is initially such that it never matches.
% \end{function}
%
% \begin{function}[added = 2017-05-26]
% {\regex_set:Nn, \regex_gset:Nn, \regex_const:Nn}
% \begin{syntax}
% \cs{regex_set:Nn} \meta{regex~var} \Arg{regex}
% \end{syntax}
% Stores a compiled version of the \meta{regular expression}
% in the \meta{regex~var}. For instance, this function can be used
% as
% \begin{verbatim}
% \regex_new:N \l_my_regex
% \regex_set:Nn \l_my_regex { my\ (simple\ )? reg(ex|ular\ expression) }
% \end{verbatim}
% The assignment is local for \cs{regex_set:Nn} and global for
% \cs{regex_gset:Nn}. Use \cs{regex_const:Nn} for compiled expressions
% which never change.
% \end{function}
%
% \begin{function}[added = 2017-05-26]{\regex_show:n, \regex_show:N}
% \begin{syntax}
% \cs{regex_show:n} \Arg{regex}
% \end{syntax}
% Shows how \pkg{l3regex} interprets the \meta{regex}. For instance,
% \cs{regex_show:n} \verb+{\A X|Y}+ shows
% \begin{verbatim}
% +-branch
% anchor at start (\A)
% char code 88
% +-branch
% char code 89
% \end{verbatim}
% indicating that the anchor |\A| only applies to the first branch:
% the second branch is not anchored to the beginning of the match.
% \end{function}
%
% \section{Matching}
%
% All regular expression functions are available in both |:n| and |:N|
% variants. The former require a \enquote{standard} regular expression,
% while the latter require a compiled expression as generated by
% \cs{regex_(g)set:Nn}.
%
% \begin{function}[TF, added = 2017-05-26]{\regex_match:nn, \regex_match:Nn}
% \begin{syntax}
% \cs{regex_match:nnTF} \Arg{regex} \Arg{token list} \Arg{true code} \Arg{false code}
% \end{syntax}
% Tests whether the \meta{regular expression} matches any part
% of the \meta{token list}. For instance,
% \begin{verbatim}
% \regex_match:nnTF { b [cde]* } { abecdcx } { TRUE } { FALSE }
% \regex_match:nnTF { [b-dq-w] } { example } { TRUE } { FALSE }
% \end{verbatim}
% leaves \texttt{TRUE} then \texttt{FALSE} in the input stream.
% \end{function}
%
% \begin{function}[added = 2017-05-26]{\regex_count:nnN, \regex_count:NnN}
% \begin{syntax}
% \cs{regex_count:nnN} \Arg{regex} \Arg{token list} \meta{int var}
% \end{syntax}
% Sets \meta{int var} within the current \TeX{} group level
% equal to the number of times
% \meta{regular expression} appears in \meta{token list}.
% The search starts by finding the left-most longest match,
% respecting greedy and lazy (non-greedy) operators. Then the search
% starts again from the character following the last character
% of the previous match, until reaching the end of the token list.
% Infinite loops are prevented in the case where the regular expression
% can match an empty token list: then we count one match between each
% pair of characters.
% For instance,
% \begin{verbatim}
% \int_new:N \l_foo_int
% \regex_count:nnN { (b+|c) } { abbababcbb } \l_foo_int
% \end{verbatim}
% results in \cs{l_foo_int} taking the value $5$.
% \end{function}
%
% \section{Submatch extraction}
%
% \begin{function}[noTF, added = 2017-05-26]
% {\regex_extract_once:nnN, \regex_extract_once:NnN}
% \begin{syntax}
% \cs{regex_extract_once:nnN} \Arg{regex} \Arg{token list} \meta{seq~var}
% \cs{regex_extract_once:nnNTF} \Arg{regex} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code}
% \end{syntax}
% Finds the first match of the \meta{regular expression} in the
% \meta{token list}. If it exists, the match is stored as the first
% item of the \meta{seq~var}, and further items are the contents of
% capturing groups, in the order of their opening parenthesis. The
% \meta{seq~var} is assigned locally. If there is no match, the
% \meta{seq~var} is cleared. The testing versions insert the
% \meta{true code} into the input stream if a match was found, and the
% \meta{false code} otherwise.
%
% For instance, assume that you type
% \begin{verbatim}
% \regex_extract_once:nnNTF { \A(La)?TeX(!*)\Z } { LaTeX!!! } \l_foo_seq
% { true } { false }
% \end{verbatim}
% Then the regular expression (anchored at the start with |\A| and
% at the end with |\Z|) must match the whole token list. The first
% capturing group, |(La)?|, matches |La|, and the second capturing
% group, |(!*)|, matches |!!!|. Thus, |\l_foo_seq| contains as a result
% the items |{LaTeX!!!}|, |{La}|, and |{!!!}|, and the \texttt{true}
% branch is left in the input stream.
% Note that the $n$-th item of |\l_foo_seq|, as obtained using
% \cs{seq_item:Nn}, corresponds to the submatch numbered $(n-1)$ in
% functions such as \cs{regex_replace_once:nnN}.
% \end{function}
%
% \begin{function}[noTF, added = 2017-05-26]
% {\regex_extract_all:nnN, \regex_extract_all:NnN}
% \begin{syntax}
% \cs{regex_extract_all:nnN} \Arg{regex} \Arg{token list} \meta{seq~var}
% \cs{regex_extract_all:nnNTF} \Arg{regex} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code}
% \end{syntax}
% Finds all matches of the \meta{regular expression}
% in the \meta{token list}, and stores all the submatch information
% in a single sequence (concatenating the results of
% multiple \cs{regex_extract_once:nnN} calls).
% The \meta{seq~var} is assigned locally. If there is no match,
% the \meta{seq~var} is cleared.
% The testing versions insert the \meta{true code} into the input
% stream if a match was found, and the \meta{false code} otherwise.
% For instance, assume that you type
% \begin{verbatim}
% \regex_extract_all:nnNTF { \w+ } { Hello,~world! } \l_foo_seq
% { true } { false }
% \end{verbatim}
% Then the regular expression matches twice, the resulting
% sequence contains the two items |{Hello}| and |{world}|,
% and the \texttt{true} branch is left in the input stream.
% \end{function}
%
% \begin{function}[noTF, added = 2017-05-26]{\regex_split:nnN, \regex_split:NnN}
% \begin{syntax}
% \cs{regex_split:nnN} \Arg{regular expression} \Arg{token list} \meta{seq~var}
% \cs{regex_split:nnNTF} \Arg{regular expression} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code}
% \end{syntax}
% Splits the \meta{token list} into a sequence of parts, delimited by
% matches of the \meta{regular expression}. If the \meta{regular expression}
% has capturing groups, then the token lists that they match are stored as
% items of the sequence as well. The assignment to \meta{seq~var} is local.
% If no match is found the resulting \meta{seq~var} has the
% \meta{token list} as its sole item. If the \meta{regular expression}
% matches the empty token list, then the \meta{token list} is split
% into single tokens.
% The testing versions insert the \meta{true code} into the input
% stream if a match was found, and the \meta{false code} otherwise.
% For example, after
% \begin{verbatim}
% \seq_new:N \l_path_seq
% \regex_split:nnNTF { / } { the/path/for/this/file.tex } \l_path_seq
% { true } { false }
% \end{verbatim}
% the sequence |\l_path_seq| contains the items |{the}|, |{path}|,
% |{for}|, |{this}|, and |{file.tex}|, and the \texttt{true} branch
% is left in the input stream.
% \end{function}
%
% \section{Replacement}
%
% \begin{function}[noTF, added = 2017-05-26]
% {\regex_replace_once:nnN,\regex_replace_once:NnN}
% \begin{syntax}
% \cs{regex_replace_once:nnN} \Arg{regular expression} \Arg{replacement} \meta{tl~var}
% \cs{regex_replace_once:nnNTF} \Arg{regular expression} \Arg{replacement} \meta{tl~var} \Arg{true code} \Arg{false code}
% \end{syntax}
% Searches for the \meta{regular expression} in the contents of the
% \meta{tl~var} and replaces the first match with the \meta{replacement}.
% The result is assigned locally to \meta{tl~var}. In the \meta{replacement},
% |\0| represents the full match, |\1| represents the contents of the
% first capturing group, |\2| of the second, \emph{etc.}
% \end{function}
%
% \begin{function}[noTF, added = 2017-05-26]
% {\regex_replace_all:nnN, \regex_replace_all:NnN}
% \begin{syntax}
% \cs{regex_replace_all:nnN} \Arg{regular expression} \Arg{replacement} \meta{tl~var}
% \cs{regex_replace_all:nnNTF} \Arg{regular expression} \Arg{replacement} \meta{tl~var} \Arg{true code} \Arg{false code}
% \end{syntax}
% Replaces all occurrences of the \meta{regular expression} in the
% contents of the \meta{tl~var} by the \meta{replacement}, where |\0|
% represents the full match, |\1| represents the contents of the first
% capturing group, |\2| of the second, \emph{etc.} Every match is treated
% independently, and matches cannot overlap. The result is assigned
% locally to \meta{tl~var}.
% \end{function}
%
% \section{Constants and variables}
%
% \begin{variable}[added = 2017-12-11]{\l_tmpa_regex, \l_tmpb_regex}
% Scratch regex for local assignment. These are never used by
% the kernel code, and so are safe for use with any \LaTeX3-defined
% function. However, they may be overwritten by other non-kernel
% code and so should only be used for short-term storage.
% \end{variable}
%
% \begin{variable}[added = 2017-12-11]{\g_tmpa_regex, \g_tmpb_regex}
% Scratch regex for global assignment. These are never used by
% the kernel code, and so are safe for use with any \LaTeX3-defined
% function. However, they may be overwritten by other non-kernel
% code and so should only be used for short-term storage.
% \end{variable}
%
% \section{Bugs, misfeatures, future work, and other possibilities}
%
% The following need to be done now.
% \begin{itemize}
% \item Rewrite the documentation in a more ordered way, perhaps add a
% \textsc{bnf}?
% \end{itemize}
%
% Additional error-checking to come.
% \begin{itemize}
% \item Clean up the use of messages.
% \item Cleaner error reporting in the replacement phase.
% \item Add tracing information.
% \item Detect attempts to use back-references and other
% non-implemented syntax.
% \item Test for the maximum register \cs{c_max_register_int}.
% \item Find out whether the fact that |\W| and friends match the
% end-marker leads to bugs. Possibly update \cs[no-index]{__regex_item_reverse:n}.
% \item The empty cs should be matched by |\c{}|, not by
% |\c{csname.?endcsname\s?}|.
% \end{itemize}
%
% Code improvements to come.
% \begin{itemize}
% \item Shift arrays so that the useful information starts at
% position~$1$.
% \item Only build |\c{...}| once.
% \item Use arrays for the left and right state stacks when
% compiling a regex.
% \item Should \cs[no-index]{__regex_action_free_group:n} only be used for greedy
% |{n,}| quantifier? (I think not.)
% \item Quantifiers for |\u| and assertions.
% \item When matching, keep track of an explicit stack of
% \texttt{curr_state} and \texttt{curr_submatches}.
% \item If possible, when a state is reused by the same thread, kill
% other subthreads.
% \item Use an array rather than \cs[no-index]{l__regex_balance_tl}
% to build the function \cs[no-index]{__regex_replacement_balance_one_match:n}.
% \item Reduce the number of epsilon-transitions in alternatives.
% \item Optimize simple strings: use fewer states (|abcade| should give
% two states, for |abc| and |ade|). [Does that really make sense?]
% \item Optimize groups with no alternative.
% \item Optimize states with a single \cs[no-index]{__regex_action_free:n}.
% \item Optimize the use of \cs[no-index]{__regex_action_success:} by inserting it
% in state $2$ directly instead of having an extra transition.
% \item Optimize the use of \cs{int_step_...} functions.
% \item Groups don't capture within regexes for csnames; optimize and
% document.
% \item Better \enquote{show} for anchors, properties, and catcode tests.
% \item Does |\K| really need a new state for itself?
% \item When compiling, use a boolean \texttt{in_cs} and fewer magic
% numbers.
% \item Instead of checking whether the character is special or
% alphanumeric using its character code, check if it is special in
% regexes with \cs{cs_if_exist} tests.
% \end{itemize}
%
% The following features are likely to be implemented at some point
% in the future.
% \begin{itemize}
% \item General look-ahead/behind assertions.
% \item Regex matching on external files.
% \item Conditional subpatterns with look ahead/behind: \enquote{if
% what follows is [\ldots{}], then [\ldots{}]}.
% \item |(*..)| and |(?..)| sequences to set some options.
% \item UTF-8 mode for \pdfTeX{}.
% \item Newline conventions are not done.
% In particular, we should have an option for |.| not to match newlines.
% Also, |\A| should differ from |^|, and |\Z|, |\z| and |$| should
% differ.
% \item Unicode properties: |\p{..}| and |\P{..}|;
% |\X| which should match any \enquote{extended} Unicode sequence.
% This requires manipulating a lot of data, probably using tree-boxes.
% \item Provide a syntax such as |\ur{l_my_regex}| to use an
% already-compiled regex in a more complicated regex. This makes
% regexes more easily composable.
% \end{itemize}
%
% The following features of \textsc{pcre} or Perl may or may not be
% implemented.
% \begin{itemize}
% \item Callout with |(?C...)| or other syntax: some internal code
% changes make that possible, and it can be useful for instance in
% the replacement code to stop a regex replacement when some marker
% has been found; this raises the question of a potential
% |\regex_break:| and then of playing well with \cs{tl_map_break:}
% called from within the code in a regex. It also raises the
% question of nested calls to the regex machinery, which is a
% problem since \tn{fontdimen} are global.
% \item Conditional subpatterns (other than with a look-ahead or
% look-behind condition): this is non-regular, isn't it?
% \item Named subpatterns: \TeX{} programmers have lived so far
% without any need for named macro parameters.
% \end{itemize}
%
% The following features of \textsc{pcre} or Perl will definitely not be
% implemented.
% \begin{itemize}
% \item Back-references: non-regular feature, this requires
% backtracking, which is prohibitively slow.
% \item Recursion: this is a non-regular feature.
% \item Atomic grouping, possessive quantifiers: those tools, mostly
% meant to fix catastrophic backtracking, are unnecessary in a
% non-backtracking algorithm, and difficult to implement.
% \item Subroutine calls: this syntactic sugar is difficult to include
% in a non-backtracking algorithm, in particular because the
% corresponding group should be treated as atomic.
% \item Backtracking control verbs: intrinsically tied to
% backtracking.
% \item |\ddd|, matching the character with octal code \texttt{ddd}:
% we already have |\x{...}| and the syntax is confusingly close to
% what we could have used for backreferences (|\1|, |\2|, \ldots{}),
% making it harder to produce useful error messages.
% \item |\cx|, similar to \TeX{}'s own |\^^x|.
% \item Comments: \TeX{} already has its own system for comments.
% \item |\Q...\E| escaping: this would require reading the argument
% verbatim, which is not in the scope of this module.
% \item |\C| single byte in UTF-8 mode: \XeTeX{} and \LuaTeX{} serve
% us characters directly, and splitting those into bytes is tricky,
% encoding dependent, and most likely not useful anyway.
% \end{itemize}
%
% \end{documentation}
%
% \begin{implementation}
%
% \section{\pkg{l3regex} implementation}
%
% \begin{macrocode}
%<*package>
% \end{macrocode}
%
% \begin{macrocode}
%<@@=regex>
% \end{macrocode}
%
% \subsection{Plan of attack}
%
% Most regex engines use backtracking. This allows them to provide very
% powerful features (back-references come to mind first), but it is
% costly, and raises the problem of catastrophic backtracking. Since
% \TeX{} is not first and foremost a programming language, complicated
% code tends to run slowly, and we must use faster, albeit slightly more
% restrictive, techniques, coming from automata theory.
%
% Given a regular expression of $n$ characters, we do the following:
% \begin{itemize}
% \item (Compiling.) Analyse the regex, finding invalid input, and
% convert it to an internal representation.
% \item (Building.) Convert the compiled regex to a non-deterministic
% finite automaton (\textsc{nfa}) with $O(n)$ states which
% accepts precisely token lists matching that regex.
% \item (Matching.) Loop through the query token list one token (one
% \enquote{position}) at a time, exploring in parallel every
% possible path (\enquote{active thread}) through the \textsc{nfa},
% considering active threads in an order determined by the
% quantifiers' greediness.
% \end{itemize}
%
% We use the following vocabulary in the code comments (and in variable
% names).
% \begin{itemize}
% \item \emph{Group}: index of the capturing group, $-1$ for
% non-capturing groups. ^^A start/end index?
% \item \emph{Position}: each token in the query is labelled by an
% integer \meta{position}, with $\texttt{min_pos} - 1 \leq
% \meta{position} \leq \texttt{max_pos}$. The lowest and highest
% positions $\texttt{min_pos} - 1$ and $\texttt{max_pos}$
% correspond to imaginary begin and end markers (with
% non-existent category code and character code).
% $\texttt{max_pos}$ is only set quite late in the processing.
% \item \emph{Query}: the token list to which we apply the regular
% expression.
% \item \emph{State}: each state of the \textsc{nfa} is labelled by an
% integer \meta{state} with $\texttt{min_state} \leq \meta{state} <
% \texttt{max_state}$.
% \item \emph{Active thread}: state of the \textsc{nfa} that is reached
% when reading the query token list for the matching. Those threads
% are ordered according to the greediness of quantifiers.
% \item \emph{Step}: used when matching, starts at $0$, incremented
% every time a character is read, and is not reset when searching
% for repeated matches. The integer \cs{l_@@_step_int} is a
% unique id for all the steps of the matching algorithm.
% \end{itemize}
%
% We use \pkg{l3intarray} to manipulate arrays of integers.
% We also abuse \TeX{}'s
% \tn{toks} registers, by accessing them directly by number rather than
% tying them to control sequences using the \tn{newtoks} allocation
% functions. Specifically, these arrays and \tn{toks} are used as
% follows. When building,
% \tn{toks}\meta{state} holds the tests and actions to perform in the
% \meta{state} of the \textsc{nfa}. When matching,
% \begin{itemize}
% \item \cs{g_@@_state_active_intarray} holds the last \meta{step} in
% which each \meta{state} was active.
% \item \cs{g_@@_thread_info_intarray} consists of blocks for each
% \meta{thread} (with $\texttt{min_thread} \leq \meta{thread} <
% \texttt{max_thread}$). Each block has
% $1+2\cs{l_@@_capturing_group_int}$ entries: the \meta{state} in
% which the \meta{thread} currently is, followed by the beginnings
% of all submatches, and then the ends of all submatches. The
% \meta{threads} are ordered starting from the best to the least
% preferred.
% \item \cs{g_@@_submatch_prev_intarray}, \cs{g_@@_submatch_begin_intarray}
% and \cs{g_@@_submatch_end_intarray} hold, for each submatch (as would
% be extracted by \cs{regex_extract_all:nnN}), the place where the
% submatch started to be looked for and its two end-points. For
% historical reasons, the minimum index is twice \texttt{max_state},
% and the used registers go up to \cs{l_@@_submatch_int}. They are
% organized in blocks of \cs{l_@@_capturing_group_int} entries, each
% block corresponding to one match with all its submatches stored in
% consecutive entries.
% \end{itemize}
% When actually building the result,
% \begin{itemize}
% \item \tn{toks}\meta{position} holds \meta{tokens} which \texttt{o}-
% and \texttt{x}-expand to the \meta{position}-th token in the query.
% \item \cs{g_@@_balance_intarray} holds the balance of begin-group and
% end-group character tokens which appear before that point in the
% token list.
% \end{itemize}
%
% The code is structured as follows. Variables are introduced in the
% relevant section. First we present some generic helper functions. Then
% comes the code for compiling a regular expression, and for showing the
% result of the compilation. The building phase converts a compiled
% regex to \textsc{nfa} states, and the automaton is run by the code in
% the following section. The only remaining brick is parsing the
% replacement text and performing the replacement. We are then ready for
% all the user functions. Finally, messages, and a little bit of tracing
% code.
%
% \subsection{Helpers}
%
% \begin{macro}{\@@_int_eval:w}
% Access the primitive: performance is key here, so we do not use
% the slower route \emph{via} \cs{int_eval:n}. This is a plain copy
% of \tn{numexpr}, hence a \texttt{w}-type function: the integer
% expression is read from the tokens that follow it in the input
% rather than from a braced argument, and it is up to the calling
% code to make sure that the expression is terminated.
% \begin{macrocode}
\cs_new_eq:NN \@@_int_eval:w \tex_numexpr:D
% \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_standard_escapechar:}
% Make the \tn{escapechar} into the standard backslash, by setting
% it to the character code of~|\\|. The assignment is done with
% \cs{int_set:Nn}, hence is local to the current group. The function
% takes no argument and is protected.
% ^^A NOTE(review): presumably called so that control sequences turned
% ^^A into strings have a predictable leading character; confirm
% ^^A against the callers, which are outside this part of the file.
% \begin{macrocode}
\cs_new_protected:Npn \@@_standard_escapechar:
{ \int_set:Nn \tex_escapechar:D { `\\ } }
% \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_toks_use:w}
% Unpack a \tn{toks} given its number. The function expands to
% \tn{the} \tn{toks} and nothing else: the register \meta{number} is
% not grabbed as an argument but must follow in the input, where
% \TeX{} reads it using its normal rules for scanning a number.
% \begin{macrocode}
\cs_new:Npn \@@_toks_use:w { \tex_the:D \tex_toks:D }
% \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_toks_clear:N, \@@_toks_set:Nn, \@@_toks_set:No}
% Empty a \tn{toks} or set it to a value, given its number.
% \begin{itemize}
% \item \cs{@@_toks_set:Nn} is a plain copy of the \tn{toks}
% primitive, so that \cs{@@_toks_set:Nn} \meta{number}
% \Arg{tokens} is the primitive assignment
% \tn{toks}\meta{number}\Arg{tokens}.
% \item \cs{@@_toks_clear:N} calls \cs{@@_toks_set:Nn} with an
% empty brace group.
% \item \cs{@@_toks_set:No} does not grab the value as an argument:
% it leaves \tn{toks}~|#1| followed by \cs{exp_after:wN} in the
% input, so that the first token inside the brace group
% which follows the function is expanded once before the
% assignment is performed.
% \end{itemize}
% \begin{macrocode}
\cs_new_protected:Npn \@@_toks_clear:N #1
{ \@@_toks_set:Nn #1 { } }
\cs_new_eq:NN \@@_toks_set:Nn \tex_toks:D
\cs_new_protected:Npn \@@_toks_set:No #1
{ \tex_toks:D #1 \exp_after:wN }
% \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_toks_memcpy:NNn}
% Copy |#3| \tn{toks} registers from |#2| onwards to |#1| onwards,
% like |C|'s |memcpy|. Both |#1| and |#2| must be integer variables
% holding register numbers, since they are stepped with
% \cs{int_incr:N} after each register is copied: once the function
% has run, each of them has been increased by |#3|. Registers are
% copied one at a time in increasing order, with no special handling
% of overlapping ranges.
% \begin{macrocode}
\cs_new_protected:Npn \@@_toks_memcpy:NNn #1#2#3
{
\prg_replicate:nn {#3}
{
\tex_toks:D #1 = \tex_toks:D #2
\int_incr:N #1
\int_incr:N #2
}
}
% \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_toks_put_left:Nx}
% \begin{macro}{\@@_toks_put_right:Nx, \@@_toks_put_right:Nn}
% During the building phase we wish to add \texttt{x}-expanded
% material to \tn{toks}, either to the left or to the right. The
% expansion is done \enquote{by hand} for optimization (these
% operations are used quite a lot). In the \texttt{Nx} functions,
% the material |#2| is first \texttt{x}-expanded into the scratch
% macro \cs{@@_tmp:w}, which is defined locally. The \tn{toks}
% register whose number is |#1| is then assigned a brace group
% made of one expansion of \cs{@@_tmp:w} and of the previous
% contents of the register, with \cs{@@_tmp:w} placed first for
% \texttt{put_left} and last for \texttt{put_right}. The
% \cs{exp_after:wN} tokens make sure that both pieces are unpacked
% before the assignment takes place: in the \texttt{put_left}
% version the old contents are unpacked first and \cs{@@_tmp:w}
% afterwards, while in the \texttt{put_right} version the
% \cs{exp_after:wN} placed in front of |#1| expands \cs{@@_tmp:w}
% before \TeX{} reads the register number. The \texttt{Nn} version of
% \cs{@@_toks_put_right:Nx} is provided because it is more
% efficient than \texttt{x}-expanding with \cs{exp_not:n}: it
% appends |#2| to the register without expanding it and does not
% touch \cs{@@_tmp:w}.
% ^^A NOTE(review): in the Nn version nothing separates #1 from #2
% ^^A inside the brace group; this assumes #1 is a token (such as an
% ^^A integer register) that ends TeX's number scanning by itself.
% ^^A Confirm against the callers.
% \begin{macrocode}
\cs_new_protected:Npn \@@_toks_put_left:Nx #1#2
{
\cs_set_nopar:Npx \@@_tmp:w { #2 }
\tex_toks:D #1 \exp_after:wN \exp_after:wN \exp_after:wN
{ \exp_after:wN \@@_tmp:w \tex_the:D \tex_toks:D #1 }
}
\cs_new_protected:Npn \@@_toks_put_right:Nx #1#2
{
\cs_set_nopar:Npx \@@_tmp:w {#2}
\tex_toks:D #1 \exp_after:wN
{ \tex_the:D \tex_toks:D \exp_after:wN #1 \@@_tmp:w }
}
\cs_new_protected:Npn \@@_toks_put_right:Nn #1#2
{ \tex_toks:D #1 \exp_after:wN { \tex_the:D \tex_toks:D #1 #2 } }