Permalink
Browse files

simplify uri_unescape

  • Loading branch information...
1 parent 926cdec commit 012e8258ff31cbaea69557264c7caa3c81ee09c0 @moritz moritz committed Aug 26, 2012
Showing with 6 additions and 44 deletions.
  1. +6 −44 lib/URI/Escape.pm
View
@@ -30,58 +30,20 @@ package URI::Escape {
# Decode percent-escaped strings. As implemented by the added ("+") lines below:
#   1. every literal '+' is translated to a space,
#   2. every '%' followed by two <.xdigit> characters is replaced by the
#      character whose code is that base-16 value,
#   3. unless :$no_utf8 is set, each result is encoded as latin-1 and the
#      resulting bytes are decoded as UTF-8.
# Returns Nil when no strings were given, the single string when one was
# given, and the whole list otherwise.
# NOTE(review): step 3 presumably assumes every character of the intermediate
# string fits in latin-1 -- confirm behaviour for input that already contains
# characters above U+00FF.
sub uri_unescape(*@to_unesc, Bool :$no_utf8 = False) is export {
- my @rc;
- for @to_unesc -> $s is copy {
- my $rc = '';
- my $last_pos = 0;
- while $s ~~ m:c/[ '%' (<.xdigit><.xdigit>)]+/ {
- $rc ~= $s.substr($last_pos, $/.from - $last_pos);
-
- # should be a better way with list context
- my @encoded_octets = map { :16( ~.value ) }, $/.caps;
- # common case optimization
- while @encoded_octets and ($no_utf8 or @encoded_octets[0] < 0x80) {
- $rc ~= chr(shift @encoded_octets);
- }
- # if any utf8 ...
- while @encoded_octets {
- my ($code_point, $utf8_len) = utf8_octets_2_codepoint(
- @encoded_octets
- );
- @encoded_octets.splice(0, $utf8_len);
- $rc ~= chr($code_point);
- }
- $last_pos = $/.to;
- }
- $rc ~= $s.substr($last_pos);
- $rc .= trans('+' => ' ');
- @rc.push($rc);
+ my @rc = @to_unesc.map: {
+ .trans('+' => ' ')\
+ .subst(:g, / '%' (<.xdigit> ** 2 ) /, -> $/ {
+ :16(~$0).chr;
+ })
}
+ @rc.=map(*.encode('latin-1').decode('UTF-8')) unless $no_utf8;
return do given @rc.elems { # this might be simplified some day
when 0 { Nil }
when 1 { @rc[0] }
default { @rc }
}
}
-
- # Stole parts from Masak November::CGI and parts from Parrot's UTF-8 decode
- sub utf8_octets_2_codepoint(@octets) {
- if @octets[ 0 ] < 0x80 { # completeness
- return @octets[0], 1
- }
-
- my $len = 1;
-
- while 0x80 +> ++$len +& @octets[0] and $len < 6 {}
-
- my $max_shift = 6 * ($len -1);
- my $code_point = reduce {
- $^a + @octets[ $^b ] +& 0x3F +< ($max_shift - 6 * $^b)
- }, 0x7F +> $len +& @octets[0] +< $max_shift, 1 ..^ $len;
-
- return $code_point, $len;
- }
}
=begin pod

0 comments on commit 012e825

Please sign in to comment.