forked from schacon/perl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utf8.c
4206 lines (3612 loc) · 132 KB
/
utf8.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* utf8.c
*
* Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
* by Larry Wall and others
*
* You may distribute under the terms of either the GNU General Public
* License or the Artistic License, as specified in the README file.
*
*/
/*
* 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
* heard of that we don't want to see any closer; and that's the one place
* we're trying to get to! And that's just where we can't get, nohow.'
*
* [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
*
* 'Well do I understand your speech,' he answered in the same language;
* 'yet few strangers do so. Why then do you not speak in the Common Tongue,
* as is the custom in the West, if you wish to be answered?'
* --Gandalf, addressing Théoden's door wardens
*
* [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
*
* ...the travellers perceived that the floor was paved with stones of many
* hues; branching runes and strange devices intertwined beneath their feet.
*
* [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
*/
#include "EXTERN.h"
#define PERL_IN_UTF8_C
#include "perl.h"
#include "inline_invlist.c"
#include "charclass_invlists.h"
static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
/*
=head1 Unicode Support
These are various utility functions for manipulating UTF8-encoded
strings. For the uninitiated, this is a method of representing arbitrary
Unicode characters as a variable number of bytes, in such a way that
characters in the ASCII range are unmodified, and a zero byte never appears
within non-zero characters.
=cut
*/
/*
=for apidoc is_invariant_string
Returns true iff the first C<len> bytes of the string C<s> are the same
regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
EBCDIC machines). That is, if they are UTF-8 invariant. On ASCII-ish
machines, all the ASCII characters and only the ASCII characters fit this
definition. On EBCDIC machines, the ASCII-range characters are invariant, but
so also are the C1 controls and C<\c?> (which isn't in the ASCII range on
EBCDIC).
If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
use this option, that C<s> can't have embedded C<NUL> characters and has to
have a terminating C<NUL> byte).
See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
=cut
*/
bool
Perl_is_invariant_string(const U8 *s, STRLEN len)
{
const U8* const send = s + (len ? len : strlen((const char *)s));
const U8* x = s;
PERL_ARGS_ASSERT_IS_INVARIANT_STRING;
for (; x < send; ++x) {
if (!UTF8_IS_INVARIANT(*x))
break;
}
return x == send;
}
/*
=for apidoc uvoffuni_to_utf8_flags
THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
Instead, B<Almost all code should use L</uvchr_to_utf8> or
L</uvchr_to_utf8_flags>>.
This function is like them, but the input is a strict Unicode
(as opposed to native) code point. Only in very rare circumstances should code
not be using the native code point.
For details, see the description for L</uvchr_to_utf8_flags>>.
=cut
*/
U8 *
Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
{
PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
if (UNI_IS_INVARIANT(uv)) {
*d++ = (U8) LATIN1_TO_NATIVE(uv);
return d;
}
#ifdef EBCDIC
/* Not representable in UTF-EBCDIC */
flags |= UNICODE_DISALLOW_FE_FF;
#endif
/* The first problematic code point is the first surrogate */
if (uv >= UNICODE_SURROGATE_FIRST
&& ckWARN3_d(WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
{
if (UNICODE_IS_SURROGATE(uv)) {
if (flags & UNICODE_WARN_SURROGATE) {
Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
"UTF-16 surrogate U+%04"UVXf, uv);
}
if (flags & UNICODE_DISALLOW_SURROGATE) {
return NULL;
}
}
else if (UNICODE_IS_SUPER(uv)) {
if (flags & UNICODE_WARN_SUPER
|| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
{
Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
}
if (flags & UNICODE_DISALLOW_SUPER
|| (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
{
#ifdef EBCDIC
Perl_die(aTHX_ "Can't represent character for Ox%"UVXf" on this platform", uv);
NOT_REACHED;
#endif
return NULL;
}
}
else if (UNICODE_IS_NONCHAR(uv)) {
if (flags & UNICODE_WARN_NONCHAR) {
Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
"Unicode non-character U+%04"UVXf" is illegal for open interchange",
uv);
}
if (flags & UNICODE_DISALLOW_NONCHAR) {
return NULL;
}
}
}
#if defined(EBCDIC)
{
STRLEN len = OFFUNISKIP(uv);
U8 *p = d+len-1;
while (p > d) {
*p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
uv >>= UTF_ACCUMULATION_SHIFT;
}
*p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
return d+len;
}
#else /* Non loop style */
if (uv < 0x800) {
*d++ = (U8)(( uv >> 6) | 0xc0);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
if (uv < 0x10000) {
*d++ = (U8)(( uv >> 12) | 0xe0);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
if (uv < 0x200000) {
*d++ = (U8)(( uv >> 18) | 0xf0);
*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
if (uv < 0x4000000) {
*d++ = (U8)(( uv >> 24) | 0xf8);
*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
if (uv < 0x80000000) {
*d++ = (U8)(( uv >> 30) | 0xfc);
*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
#ifdef UTF8_QUAD_MAX
if (uv < UTF8_QUAD_MAX)
#endif
{
*d++ = 0xfe; /* Can't match U+FEFF! */
*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
#ifdef UTF8_QUAD_MAX
{
*d++ = 0xff; /* Can't match U+FFFE! */
*d++ = 0x80; /* 6 Reserved bits */
*d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
*d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
*d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
*d++ = (U8)(( uv & 0x3f) | 0x80);
return d;
}
#endif
#endif /* Non loop style */
}
/*
=for apidoc uvchr_to_utf8
Adds the UTF-8 representation of the native code point C<uv> to the end
of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
the byte after the end of the new character. In other words,
d = uvchr_to_utf8(d, uv);
is the recommended wide native character-aware way of saying
*(d++) = uv;
This function accepts any UV as input. To forbid or warn on non-Unicode code
points, or those that may be problematic, see L</uvchr_to_utf8_flags>.
=cut
*/
/* This is also a macro */
PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
U8 *
Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
{
return uvchr_to_utf8(d, uv);
}
/*
=for apidoc uvchr_to_utf8_flags
Adds the UTF-8 representation of the native code point C<uv> to the end
of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
the byte after the end of the new character. In other words,
d = uvchr_to_utf8_flags(d, uv, flags);
or, in most cases,
d = uvchr_to_utf8_flags(d, uv, 0);
This is the Unicode-aware way of saying
*(d++) = uv;
This function will convert to UTF-8 (and not warn) even code points that aren't
legal Unicode or are problematic, unless C<flags> contains one or more of the
following flags:
If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
the function will raise a warning, provided UTF8 warnings are enabled. If instead
UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
If both flags are set, the function will both warn and return NULL.
The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags
affect how the function handles a Unicode non-character. And likewise, the
UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of
code points that are
above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
even less portable) can be warned and/or disallowed even if other above-Unicode
code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
flags.
And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
DISALLOW flags.
=cut
*/
/* This is also a macro */
PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
U8 *
Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
{
return uvchr_to_utf8_flags(d, uv, flags);
}
/*
=for apidoc is_utf8_string
Returns true if the first C<len> bytes of string C<s> form a valid
UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
using C<strlen(s)> (which means if you use this option, that C<s> can't have
embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
that all characters being ASCII constitute 'a valid UTF-8 string'.
See also L</is_invariant_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
=cut
*/
bool
Perl_is_utf8_string(const U8 *s, STRLEN len)
{
const U8* const send = s + (len ? len : strlen((const char *)s));
const U8* x = s;
PERL_ARGS_ASSERT_IS_UTF8_STRING;
while (x < send) {
STRLEN len = isUTF8_CHAR(x, send);
if (UNLIKELY(! len)) {
return FALSE;
}
x += len;
}
return TRUE;
}
/*
Implemented as a macro in utf8.h
=for apidoc is_utf8_string_loc
Like L</is_utf8_string> but stores the location of the failure (in the
case of "utf8ness failure") or the location C<s>+C<len> (in the case of
"utf8ness success") in the C<ep>.
See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
=for apidoc is_utf8_string_loclen
Like L</is_utf8_string>() but stores the location of the failure (in the
case of "utf8ness failure") or the location C<s>+C<len> (in the case of
"utf8ness success") in the C<ep>, and the number of UTF-8
encoded characters in the C<el>.
See also L</is_utf8_string_loc>() and L</is_utf8_string>().
=cut
*/
bool
Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
{
const U8* const send = s + (len ? len : strlen((const char *)s));
const U8* x = s;
STRLEN outlen = 0;
PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
while (x < send) {
STRLEN len = isUTF8_CHAR(x, send);
if (UNLIKELY(! len)) {
goto out;
}
x += len;
outlen++;
}
out:
if (el)
*el = outlen;
if (ep)
*ep = x;
return (x == send);
}
/*
=for apidoc utf8n_to_uvchr
THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
Bottom level UTF-8 decode routine.
Returns the native code point value of the first character in the string C<s>,
which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
the length, in bytes, of that character.
The value of C<flags> determines the behavior when C<s> does not point to a
well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
next possible position in C<s> that could begin a non-malformed character.
Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
Various ALLOW flags can be set in C<flags> to allow (and not warn on)
individual types of malformations, such as the sequence being overlong (that
is, when there is a shorter sequence that can express the same code point;
overlong sequences are expressly forbidden in the UTF-8 standard due to
potential security issues). Another malformation example is the first byte of
a character not being a legal first byte. See F<utf8.h> for the list of such
flags. For allowed 0 length strings, this function returns 0; for allowed
overlong sequences, the computed code point is returned; for all other allowed
malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
determinable reasonable value.
The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
flags) malformation is found. If this flag is set, the routine assumes that
the caller will raise a warning, and this function will silently just set
C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
Note that this API requires disambiguation between successful decoding a C<NUL>
character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as
in both cases, 0 is returned. To disambiguate, upon a zero return, see if the
first byte of C<s> is 0 as well. If so, the input was a C<NUL>; if not, the
input had an error.
Certain code points are considered problematic. These are Unicode surrogates,
Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
By default these are considered regular code points, but certain situations
warrant special handling for them. If C<flags> contains
UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
maximum) can be set to disallow these categories individually.
The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
for their respective categories, but otherwise the code points are considered
valid (not malformations). To get a category to both be treated as a
malformation and raise a warning, specify both the WARN and DISALLOW flags.
(But note that warnings are not raised if lexically disabled nor if
UTF8_CHECK_ONLY is also specified.)
Very large code points (above 0x7FFF_FFFF) are considered more problematic than
the others that are above the Unicode legal maximum. There are several
reasons: they requre at least 32 bits to represent them on ASCII platforms, are
not representable at all on EBCDIC platforms, and the original UTF-8
specification never went above this number (the current 0x10FFFF limit was
imposed later). (The smaller ones, those that fit into 32 bits, are
representable by a UV on ASCII platforms, but not by an IV, which means that
the number of operations that can be performed on them is quite restricted.)
The UTF-8 encoding on ASCII platforms for these large code points begins with a
byte containing 0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to
be treated as malformations, while allowing smaller above-Unicode code points.
(Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
including these, as malformations.)
Similarly, UTF8_WARN_FE_FF acts just like
the other WARN flags, but applies just to these code points.
All other code points corresponding to Unicode characters, including private
use and those yet to be assigned, are never considered malformed and never
warn.
=cut
*/
UV
Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
const U8 * const s0 = s;
U8 overflow_byte = '\0'; /* Save byte in case of overflow */
U8 * send;
UV uv = *s;
STRLEN expectlen;
SV* sv = NULL;
UV outlier_ret = 0; /* return value when input is in error or problematic
*/
UV pack_warn = 0; /* Save result of packWARN() for later */
bool unexpected_non_continuation = FALSE;
bool overflowed = FALSE;
bool do_overlong_test = TRUE; /* May have to skip this test */
const char* const malformed_text = "Malformed UTF-8 character";
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
/* The order of malformation tests here is important. We should consume as
* few bytes as possible in order to not skip any valid character. This is
* required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
* http://unicode.org/reports/tr36 for more discussion as to why. For
* example, once we've done a UTF8SKIP, we can tell the expected number of
* bytes, and could fail right off the bat if the input parameters indicate
* that there are too few available. But it could be that just that first
* byte is garbled, and the intended character occupies fewer bytes. If we
* blindly assumed that the first byte is correct, and skipped based on
* that number, we could skip over a valid input character. So instead, we
* always examine the sequence byte-by-byte.
*
* We also should not consume too few bytes, otherwise someone could inject
* things. For example, an input could be deliberately designed to
* overflow, and if this code bailed out immediately upon discovering that,
* returning to the caller C<*retlen> pointing to the very next byte (one
* which is actually part of of the overflowing sequence), that could look
* legitimate to the caller, which could discard the initial partial
* sequence and process the rest, inappropriately */
/* Zero length strings, if allowed, of necessity are zero */
if (UNLIKELY(curlen == 0)) {
if (retlen) {
*retlen = 0;
}
if (flags & UTF8_ALLOW_EMPTY) {
return 0;
}
if (! (flags & UTF8_CHECK_ONLY)) {
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
}
goto malformed;
}
expectlen = UTF8SKIP(s);
/* A well-formed UTF-8 character, as the vast majority of calls to this
* function will be for, has this expected length. For efficiency, set
* things up here to return it. It will be overriden only in those rare
* cases where a malformation is found */
if (retlen) {
*retlen = expectlen;
}
/* An invariant is trivially well-formed */
if (UTF8_IS_INVARIANT(uv)) {
return uv;
}
/* A continuation character can't start a valid sequence */
if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
if (flags & UTF8_ALLOW_CONTINUATION) {
if (retlen) {
*retlen = 1;
}
return UNICODE_REPLACEMENT;
}
if (! (flags & UTF8_CHECK_ONLY)) {
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
}
curlen = 1;
goto malformed;
}
/* Here is not a continuation byte, nor an invariant. The only thing left
* is a start byte (possibly for an overlong) */
#ifdef EBCDIC
uv = NATIVE_UTF8_TO_I8(uv);
#endif
/* Remove the leading bits that indicate the number of bytes in the
* character's whole UTF-8 sequence, leaving just the bits that are part of
* the value */
uv &= UTF_START_MASK(expectlen);
/* Now, loop through the remaining bytes in the character's sequence,
* accumulating each into the working value as we go. Be sure to not look
* past the end of the input string */
send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
for (s = s0 + 1; s < send; s++) {
if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
#ifndef EBCDIC /* Can't overflow in EBCDIC */
if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
/* The original implementors viewed this malformation as more
* serious than the others (though I, khw, don't understand
* why, since other malformations also give very very wrong
* results), so there is no way to turn off checking for it.
* Set a flag, but keep going in the loop, so that we absorb
* the rest of the bytes that comprise the character. */
overflowed = TRUE;
overflow_byte = *s; /* Save for warning message's use */
}
#endif
uv = UTF8_ACCUMULATE(uv, *s);
}
else {
/* Here, found a non-continuation before processing all expected
* bytes. This byte begins a new character, so quit, even if
* allowing this malformation. */
unexpected_non_continuation = TRUE;
break;
}
} /* End of loop through the character's bytes */
/* Save how many bytes were actually in the character */
curlen = s - s0;
/* The loop above finds two types of malformations: non-continuation and/or
* overflow. The non-continuation malformation is really a too-short
* malformation, as it means that the current character ended before it was
* expected to (being terminated prematurely by the beginning of the next
* character, whereas in the too-short malformation there just are too few
* bytes available to hold the character. In both cases, the check below
* that we have found the expected number of bytes would fail if executed.)
* Thus the non-continuation malformation is really unnecessary, being a
* subset of the too-short malformation. But there may be existing
* applications that are expecting the non-continuation type, so we retain
* it, and return it in preference to the too-short malformation. (If this
* code were being written from scratch, the two types might be collapsed
* into one.) I, khw, am also giving priority to returning the
* non-continuation and too-short malformations over overflow when multiple
* ones are present. I don't know of any real reason to prefer one over
* the other, except that it seems to me that multiple-byte errors trumps
* errors from a single byte */
if (UNLIKELY(unexpected_non_continuation)) {
if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
if (! (flags & UTF8_CHECK_ONLY)) {
if (curlen == 1) {
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
}
else {
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
}
}
goto malformed;
}
uv = UNICODE_REPLACEMENT;
/* Skip testing for overlongs, as the REPLACEMENT may not be the same
* as what the original expectations were. */
do_overlong_test = FALSE;
if (retlen) {
*retlen = curlen;
}
}
else if (UNLIKELY(curlen < expectlen)) {
if (! (flags & UTF8_ALLOW_SHORT)) {
if (! (flags & UTF8_CHECK_ONLY)) {
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
}
goto malformed;
}
uv = UNICODE_REPLACEMENT;
do_overlong_test = FALSE;
if (retlen) {
*retlen = curlen;
}
}
#ifndef EBCDIC /* EBCDIC can't overflow */
if (UNLIKELY(overflowed)) {
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
goto malformed;
}
#endif
if (do_overlong_test
&& expectlen > (STRLEN) OFFUNISKIP(uv)
&& ! (flags & UTF8_ALLOW_LONG))
{
/* The overlong malformation has lower precedence than the others.
* Note that if this malformation is allowed, we return the actual
* value, instead of the replacement character. This is because this
* value is actually well-defined. */
if (! (flags & UTF8_CHECK_ONLY)) {
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
}
goto malformed;
}
/* Here, the input is considered to be well-formed, but it still could be a
* problematic code point that is not allowed by the input parameters. */
if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
&& (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_WARN_ILLEGAL_INTERCHANGE)))
{
if (UNICODE_IS_SURROGATE(uv)) {
/* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
* generation of the sv, since no warnings are raised under CHECK */
if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
&& ckWARN_d(WARN_SURROGATE))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
pack_warn = packWARN(WARN_SURROGATE);
}
if (flags & UTF8_DISALLOW_SURROGATE) {
goto disallowed;
}
}
else if ((uv > PERL_UNICODE_MAX)) {
if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
&& ckWARN_d(WARN_NON_UNICODE))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
pack_warn = packWARN(WARN_NON_UNICODE);
}
#ifndef EBCDIC /* EBCDIC always allows FE, FF */
/* The first byte being 0xFE or 0xFF is a subset of the SUPER code
* points. We test for these after the regular SUPER ones, and
* before possibly bailing out, so that the more dire warning
* overrides the regular one, if applicable */
if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
&& (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
{
if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY))
== UTF8_WARN_FE_FF
&& ckWARN_d(WARN_UTF8))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv));
pack_warn = packWARN(WARN_UTF8);
}
if (flags & UTF8_DISALLOW_FE_FF) {
goto disallowed;
}
}
#endif
if (flags & UTF8_DISALLOW_SUPER) {
goto disallowed;
}
}
else if (UNICODE_IS_NONCHAR(uv)) {
if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
&& ckWARN_d(WARN_NONCHAR))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
pack_warn = packWARN(WARN_NONCHAR);
}
if (flags & UTF8_DISALLOW_NONCHAR) {
goto disallowed;
}
}
if (sv) {
outlier_ret = uv; /* Note we don't bother to convert to native,
as all the outlier code points are the same
in both ASCII and EBCDIC */
goto do_warn;
}
/* Here, this is not considered a malformed character, so drop through
* to return it */
}
return UNI_TO_NATIVE(uv);
/* There are three cases which get to beyond this point. In all 3 cases:
* <sv> if not null points to a string to print as a warning.
* <curlen> is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
* set.
* <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
* This is done by initializing it to 0, and changing it only
* for case 1).
* The 3 cases are:
* 1) The input is valid but problematic, and to be warned about. The
* return value is the resultant code point; <*retlen> is set to
* <curlen>, the number of bytes that comprise the code point.
* <pack_warn> contains the result of packWARN() for the warning
* types. The entry point for this case is the label <do_warn>;
* 2) The input is a valid code point but disallowed by the parameters to
* this function. The return value is 0. If UTF8_CHECK_ONLY is set,
* <*relen> is -1; otherwise it is <curlen>, the number of bytes that
* comprise the code point. <pack_warn> contains the result of
* packWARN() for the warning types. The entry point for this case is
* the label <disallowed>.
* 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY
* is set, <*relen> is -1; otherwise it is <curlen>, the number of
* bytes that comprise the malformation. All such malformations are
* assumed to be warning type <utf8>. The entry point for this case
* is the label <malformed>.
*/
malformed:
if (sv && ckWARN_d(WARN_UTF8)) {
pack_warn = packWARN(WARN_UTF8);
}
disallowed:
if (flags & UTF8_CHECK_ONLY) {
if (retlen)
*retlen = ((STRLEN) -1);
return 0;
}
do_warn:
if (pack_warn) { /* <pack_warn> was initialized to 0, and changed only
if warnings are to be raised. */
const char * const string = SvPVX_const(sv);
if (PL_op)
Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op));
else
Perl_warner(aTHX_ pack_warn, "%s", string);
}
if (retlen) {
*retlen = curlen;
}
return outlier_ret;
}
/*
=for apidoc utf8_to_uvchr_buf
Returns the native code point of the first character in the string C<s> which
is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
C<*retlen> will be set to the length, in bytes, of that character.
If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
NULL) to -1. If those warnings are off, the computed value, if well-defined
(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
C<*retlen> is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is
the next possible position in C<s> that could begin a non-malformed character.
See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
returned.
=cut
*/
UV
Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
{
assert(s < send);
return utf8n_to_uvchr(s, send - s, retlen,
ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
* there are no malformations in the input UTF-8 string C<s>. surrogates,
* non-character code points, and non-Unicode code points are allowed. */
UV
Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
{
UV expectlen = UTF8SKIP(s);
const U8* send = s + expectlen;
UV uv = *s;
PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
PERL_UNUSED_CONTEXT;
if (retlen) {
*retlen = expectlen;
}
/* An invariant is trivially returned */
if (expectlen == 1) {
return uv;
}
#ifdef EBCDIC
uv = NATIVE_UTF8_TO_I8(uv);
#endif
/* Remove the leading bits that indicate the number of bytes, leaving just
* the bits that are part of the value */
uv &= UTF_START_MASK(expectlen);
/* Now, loop through the remaining bytes, accumulating each into the
* working total as we go. (I khw tried unrolling the loop for up to 4
* bytes, but there was no performance improvement) */
for (++s; s < send; s++) {
uv = UTF8_ACCUMULATE(uv, *s);
}
return UNI_TO_NATIVE(uv);
}
/*
=for apidoc utf8_to_uvuni_buf
Only in very rare circumstances should code need to be dealing in Unicode
(as opposed to native) code points. In those few cases, use
C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
Returns the Unicode (not-native) code point of the first character in the
string C<s> which
is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
C<retlen> will be set to the length, in bytes, of that character.
If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
NULL) to -1. If those warnings are off, the computed value if well-defined (or
the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
next possible position in C<s> that could begin a non-malformed character.
See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
=cut
*/
UV
Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
{
PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
assert(send > s);
/* Call the low level routine asking for checks */
return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
}
/*
=for apidoc utf8_length
Return the length of the UTF-8 char encoded string C<s> in characters.
Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
up past C<e>, croaks.
=cut
*/
STRLEN
Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
{
STRLEN len = 0;
PERL_ARGS_ASSERT_UTF8_LENGTH;
/* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
* the bitops (especially ~) can create illegal UTF-8.
* In other words: in Perl UTF-8 is not just for Unicode. */
if (e < s)
goto warn_and_return;
while (s < e) {
s += UTF8SKIP(s);
len++;
}
if (e != s) {
len--;
warn_and_return:
if (PL_op)
Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
"%s in %s", unees, OP_DESC(PL_op));
else
Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
}
return len;
}
/*
=for apidoc utf8_distance
Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
and C<b>.
WARNING: use only if you *know* that the pointers point inside the
same UTF-8 buffer.
=cut
*/
IV
Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
{
PERL_ARGS_ASSERT_UTF8_DISTANCE;
return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
}
/*
=for apidoc utf8_hop
Return the UTF-8 pointer C<s> displaced by C<off> characters, either
forward or backward.
WARNING: do not use the following unless you *know* C<off> is within
the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned