libwww-perl · haarg · Jul 24, 2024 · Jul 9, 2024 · Jul 9, 2024 · Jul 9, 2024
diff --git a/lib/HTML/Entities.pm b/lib/HTML/Entities.pm
@@ -443,23 +443,46 @@ sub encode_entities
     } else {
 	$ref = \$_[0];  # modify in-place
     }
+    my $regex;
     if (defined $_[1] and length $_[1]) {
-	unless (exists $subst{$_[1]}) {
-	    # Because we can't compile regex we fake it with a cached sub
+        $regex = $subst{$_[1]};
+        unless (defined $regex) {
 	    my $chars = $_[1];
-	    $chars =~ s,(?<!\\)([]/]),\\$1,g;
-	    $chars =~ s,(?<!\\)\\\z,\\\\,;
-	    my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
-	    $subst{$_[1]} = eval $code;
+
+            # keep existing escapes, but also escape any unescaped special character:
+            #  [ (technically unnecessary but included for symmetry with ])
+            #  ] (end of character class)
+            #  \ (escape character)
+            $chars =~ s{
+                # capture group 1: things to skip and keep
+                (
+                    # any escaped character
+                    \\.
+                |
+                    # either an actual POSIX character class or anything
+                    # similar enough to trigger a regex syntax error
+                    \[: \^? [[:lower:][:digit:]]{3,} :\]
+                )
+                |
+                # capture group 2: things to be escaped
+                (
+                    [\[\]\\]
+                )
+            }{
+                defined $1 ? $1 : '\\' . $2
+            }xseg;
+
+            $regex = eval { qr/([$chars])/ };
 	    die( $@ . " while trying to turn range: \"$_[1]\"\n "
-	      . "into code: $code\n "
+              . "into code: /([$chars])/\n "
 	    ) if $@;
+            $subst{$_[1]} = $regex;
 	}
-	&{$subst{$_[1]}}($$ref);
     } else {
 	# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
-	$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
+        $regex = qr/([^\n\r\t !\#\$%\(-;=?-~])/;
     }
+    $$ref =~ s/$regex/$char2entity{$1} || num_entity($1)/eg;
     $$ref;
 }
 

diff --git a/t/entities.t b/t/entities.t
@@ -3,7 +3,7 @@ use warnings;
 use utf8;
 
 use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric);
-use Test::More tests => 20;
+use Test::More tests => 29;
 
 my $x = "V&aring;re norske tegn b&oslash;r &#230res";
 
@@ -33,6 +33,21 @@ is(encode_entities($x, '\\/'), "[24&#47;7]\\");
 is(encode_entities($x, '\\'),  "[24/7]&#92;");
 is(encode_entities($x, ']\\'), "[24/7&#93;&#92;");
 
+# https://github.com/libwww-perl/HTML-Parser/issues/44
+$x = '<]$a/b\c/d$';
+is(encode_entities($x, '$'), '<]&#36;a/b\\c/d&#36;');
+is(encode_entities($x, '\\\\/'), '<]$a&#47;b&#92;c&#47;d$');
+is(encode_entities($x, '\\\\/$'), '<]&#36;a&#47;b&#92;c&#47;d&#36;');
+is(encode_entities($x, '<\\\\]'), '&lt;&#93;$a/b&#92;c/d$');
+
+# POSIX character classes
+$x = "<Våre123[=]";
+is(encode_entities($x, '[:punct:]'),          "&lt;Våre123&#91;&#61;&#93;");
+is(encode_entities($x, '^[:^digit:]'),        "<Våre&#49;&#50;&#51;[=]");
+is(encode_entities($x, '[:lower:][:digit:]'), "<V&aring;&#114;&#101;&#49;&#50;&#51;[=]");
+is(encode_entities($x, '=[:^ascii:]<>'),      "&lt;V&aring;re123[&#61;]");
+is(encode_entities($x, '[:Vlower:]'),         "<&#86;å&#114;&#101;123&#91;=&#93;");
+
 # See how well it does against rfc1866...
 my $ent   = '';
 my $plain = '';