Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 32 additions & 9 deletions lib/HTML/Entities.pm
Original file line number Diff line number Diff line change
Expand Up @@ -443,23 +443,46 @@ sub encode_entities
} else {
$ref = \$_[0]; # modify in-place
}
my $regex;
if (defined $_[1] and length $_[1]) {
unless (exists $subst{$_[1]}) {
# Because we can't compile regex we fake it with a cached sub
$regex = $subst{$_[1]};
unless (defined $regex) {
my $chars = $_[1];
$chars =~ s,(?<!\\)([]/]),\\$1,g;
$chars =~ s,(?<!\\)\\\z,\\\\,;
my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
$subst{$_[1]} = eval $code;

# keep existing escapes, but also escape any unescaped special character:
# [ (technically unnecessary but included for symmetry with ])
# ] (end of character class)
# \ (escape character)
$chars =~ s{
# capture group 1: things to skip and keep
(
# any escaped character
\\.
|
# either an actual POSIX character class or anything
# similar enough to trigger a regex syntax error
\[: \^? [[:lower:][:digit:]]{3,} :\]
)
|
# capture group 2: things to be escaped
(
[\[\]\\]
)
}{
defined $1 ? $1 : '\\' . $2
}xseg;

$regex = eval { qr/([$chars])/ };
die( $@ . " while trying to turn range: \"$_[1]\"\n "
. "into code: $code\n "
. "into code: /([$chars])/\n "
) if $@;
$subst{$_[1]} = $regex;
}
&{$subst{$_[1]}}($$ref);
} else {
# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
$regex = qr/([^\n\r\t !\#\$%\(-;=?-~])/;
}
$$ref =~ s/$regex/$char2entity{$1} || num_entity($1)/eg;
$$ref;
}

Expand Down
17 changes: 16 additions & 1 deletion t/entities.t
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use warnings;
use utf8;

use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric);
use Test::More tests => 20;
use Test::More tests => 29;

my $x = "V&aring;re norske tegn b&oslash;r &#230res";

Expand Down Expand Up @@ -33,6 +33,21 @@ is(encode_entities($x, '\\/'), "[24&#47;7]\\");
is(encode_entities($x, '\\'), "[24/7]&#92;");
is(encode_entities($x, ']\\'), "[24/7&#93;&#92;");

# https://github.com/libwww-perl/HTML-Parser/issues/44
$x = '<]$a/b\c/d$';
is(encode_entities($x, '$'), '<]&#36;a/b\\c/d&#36;');
is(encode_entities($x, '\\\\/'), '<]$a&#47;b&#92;c&#47;d$');
is(encode_entities($x, '\\\\/$'), '<]&#36;a&#47;b&#92;c&#47;d&#36;');
is(encode_entities($x, '<\\\\]'), '&lt;&#93;$a/b&#92;c/d$');

# POSIX character classes
$x = "<Våre123[=]";
is(encode_entities($x, '[:punct:]'), "&lt;Våre123&#91;&#61;&#93;");
is(encode_entities($x, '^[:^digit:]'), "<Våre&#49;&#50;&#51;[=]");
is(encode_entities($x, '[:lower:][:digit:]'), "<V&aring;&#114;&#101;&#49;&#50;&#51;[=]");
is(encode_entities($x, '=[:^ascii:]<>'), "&lt;V&aring;re123[&#61;]");
is(encode_entities($x, '[:Vlower:]'), "<&#86;å&#114;&#101;123&#91;=&#93;");

# See how well it does against rfc1866...
my $ent = '';
my $plain = '';
Expand Down