Permalink
Browse files

If strptime() is called with a UTF-8 string but legacy format, then dow…

…ngrade the string to match; taking care to handle pos() counts both sides
  • Loading branch information...
leonerd committed Feb 2, 2012
1 parent 579e314 commit 58d6481ddd067a51821da2acd2186161a1f759d5
Showing with 38 additions and 9 deletions.
  1. +30 −8 ext/POSIX/POSIX.xs
  2. +8 −1 ext/POSIX/t/time.t
View
@@ -1860,6 +1860,7 @@ strptime(str, fmt, sec=-1, min=-1, hour=-1, mday=-1, mon=-1, year=-1, wday=-1, y
PPCODE:
{
const char *str_c;
+ const U8 *orig_bytes;
SV *strref = NULL;
MAGIC *posmg = NULL;
int str_offset = 0;
@@ -1895,19 +1896,37 @@ strptime(str, fmt, sec=-1, min=-1, hour=-1, mday=-1, mon=-1, year=-1, wday=-1, y
croak("str is not a reference to a mutable scalar");
}
+ /* If fmt and str differ in UTF-8ness then take a temporary copy
+ * of str and regrade it to match fmt, taking care to update the
+ * offset in both cases. */
if(!SvUTF8(str) && SvUTF8(fmt)) {
- /* fmt is UTF-8, str is not. Upgrade a local copy of it, and
- * take care to update str_offset to match. */
str = sv_mortalcopy(str);
sv_utf8_upgrade_nomg(str);
+ str_c = SvPV_nolen(str);
+
if(str_offset) {
- U8 *bytes = SvPV_nolen(str);
- str_offset = utf8_hop(bytes, str_offset) - bytes;
+ str_offset = utf8_hop(str_c, str_offset) - (U8*)str_c;
}
}
+ else if(SvUTF8(str) && !SvUTF8(fmt)) {
+ str = sv_mortalcopy(str);
+ /* If downgrade fails then str must have contained characters
+ * that could not possibly be matched by fmt */
+ if(!sv_utf8_downgrade(str, 1))
+ XSRETURN(0);
- str_c = SvPV_nolen(str);
+ str_c = SvPV_nolen(str);
+
+ if(str_offset) {
+ orig_bytes = SvPV_nolen(strref);
+ str_offset = utf8_distance(orig_bytes + str_offset, orig_bytes);
+ }
+ }
+ else {
+ /* else it doesn't matter whether both or neither are UTF-8, because they'll match */
+ str_c = SvPV_nolen(str);
+ }
remains = strptime(str_c + str_offset, SvPV_nolen(fmt), &tm);
@@ -1920,9 +1939,12 @@ strptime(str, fmt, sec=-1, min=-1, hour=-1, mday=-1, mon=-1, year=-1, wday=-1, y
if(strref) {
if(str != strref) {
- /* str is a UTF-8 upgraded copy of the original non-UTF-8
- * string the caller referred us to in strref */
- str_offset = utf8_distance(remains, str_c);
+ if(SvUTF8(str))
+ /* str is a UTF-8 upgraded copy of the original non-UTF-8
+ * string the caller referred us to in strref */
+ str_offset = utf8_distance(remains, str_c);
+ else
+ str_offset = utf8_hop(orig_bytes, remains - str_c) - orig_bytes;
}
else {
str_offset = remains - str_c;
View
@@ -4,7 +4,7 @@ use strict;
use Config;
use POSIX;
-use Test::More tests => 38;
+use Test::More tests => 41;
# go to UTC to avoid DST issues around the world when testing. SUS3 says that
# null should get you UTC, but some environments want the explicit names.
@@ -107,6 +107,7 @@ is(pos($str), 20, 'strptime() updates pos() magic on SCALAR ref');
my @want = (undef, undef, undef, 1, 2-1, 2012-1900, 3, 31, 0);
+ is_deeply([POSIX::strptime($date_U, $fmt )], \@want, 'strptime() UTF-8 date, legacy fmt');
is_deeply([POSIX::strptime($date, $fmt_U)], \@want, 'strptime() legacy date, UTF-8 fmt');
is_deeply([POSIX::strptime($date_U, $fmt_U)], \@want, 'strptime() UTF-8 date, UTF-8 fmt');
@@ -116,6 +117,12 @@ is(pos($str), 20, 'strptime() updates pos() magic on SCALAR ref');
is_deeply([POSIX::strptime(\$str, $fmt_U)], \@want, 'strptime() legacy data SCALAR ref, UTF-8 fmt');
is(pos($str), 12, 'pos() of legacy data SCALAR after strptime() UTF-8 fmt');
+ utf8::upgrade my $str_U = $str;
+ pos($str_U) = 2;
+
+ is_deeply([POSIX::strptime(\$str_U, $fmt)], \@want, 'strptime() UTF-8 data SCALAR ref, legacy fmt');
+ is(pos($str_U), 12, 'pos() of UTF-8 data SCALAR after strptime() legacy fmt');
+
# High (>U+FF) strings
my $date_UU = "2012\x{1234}02\x{1234}01";
my $fmt_UU = "%Y\x{1234}%m\x{1234}%d";

0 comments on commit 58d6481

Please sign in to comment.