Respect CDATA[[ sections when parsing HTML
This changes the HTML parser behaviour to properly respect
CDATA[[ sections and to ignore link tags in JavaScript code.

The old behaviour can be restored by passing undef as the "marked_sections"
option when creating the WWW::Mechanize object:

    my $mech = WWW::Mechanize->new(
        marked_sections => undef,
    );
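
As a rough usage sketch (the URL below is a placeholder, not part of this commit), the difference can be observed by fetching the same page with both settings and counting the extracted links:

    use strict;
    use warnings;
    use WWW::Mechanize;

    my $url = 'http://localhost/page-with-cdata.html';    # placeholder test page

    my $new_mech = WWW::Mechanize->new();                            # CDATA[[ sections honoured (new default)
    my $old_mech = WWW::Mechanize->new( marked_sections => undef );  # legacy parsing

    for my $mech ( $new_mech, $old_mech ) {
        $mech->get( $url );
        my @links = $mech->links;
        printf "%d links found\n", scalar @links;
    }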

The patch also includes a (nasty) test file to check the old and
the new behaviour.
Max Maischein committed May 13, 2020
1 parent 44cfea7 commit d334732
Showing 3 changed files with 149 additions and 11 deletions.
42 changes: 31 additions & 11 deletions lib/WWW/Mechanize.pm
@@ -260,6 +260,14 @@ later.
Default is off.
=item * C<< marked_sections => [0|1] >>
Globally sets the HTML::Parser marked sections flag which causes HTML
C<< CDATA[[ >> sections to be honoured. This cannot be disabled
later.
Default is on.
=back
To support forms, WWW::Mechanize's constructor pushes POST
@@ -277,15 +285,16 @@ sub new {
);

my %mech_params = (
-   autocheck => ($class eq 'WWW::Mechanize' ? 1 : 0),
-   onwarn => \&WWW::Mechanize::_warn,
-   onerror => \&WWW::Mechanize::_die,
-   quiet => 0,
-   stack_depth => 8675309, # Arbitrarily humongous stack
-   headers => {},
-   noproxy => 0,
-   strict_forms => 0, # pass-through to HTML::Form
-   verbose_forms => 0, # pass-through to HTML::Form
+   autocheck => ($class eq 'WWW::Mechanize' ? 1 : 0),
+   onwarn => \&WWW::Mechanize::_warn,
+   onerror => \&WWW::Mechanize::_die,
+   quiet => 0,
+   stack_depth => 8675309, # Arbitrarily humongous stack
+   headers => {},
+   noproxy => 0,
+   strict_forms => 0, # pass-through to HTML::Form
+   verbose_forms => 0, # pass-through to HTML::Form
+   marked_sections => 1,
);

my %passed_params = @_;
@@ -3003,13 +3012,24 @@ my %link_tags = (
meta => 'content',
);

sub _new_parser {
my $self = shift;
my $content_ref = shift;

my $parser = HTML::TokeParser->new($content_ref);
$parser->marked_sections( $self->{marked_sections});
$parser->xml_mode( $$content_ref=~/^\s*<\?xml/ ); # NOT GENERALLY RELIABLE

return $parser;
}

sub _extract_links {
my $self = shift;


$self->{links} = [];
if ( defined $self->{content} ) {
-   my $parser = HTML::TokeParser->new(\$self->{content});
+   my $parser = $self->_new_parser(\$self->{content});
while ( my $token = $parser->get_tag( keys %link_tags ) ) {
my $link = $self->_link_from_token( $token, $parser );
push( @{$self->{links}}, $link ) if $link;
@@ -3035,7 +3055,7 @@ sub _extract_images {
push( @{$self->{images}}, $self->_images_from_css($self->{content}) );
}
else {
-   my $parser = HTML::TokeParser->new(\$self->{content});
+   my $parser = $self->_new_parser(\$self->{content});
while ( my $token = $parser->get_tag() ) {
my ($tag_name, $attrs) = @{$token};
next if $tag_name =~ m{^/};
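
For orientation, here is a minimal standalone sketch of the two HTML::Parser switches that the new _new_parser helper sets; the sample document string and the printed output are illustrative assumptions, not part of the commit:

    use strict;
    use warnings;
    use HTML::TokeParser;

    my $html = '<p><![CDATA[ <b>not markup</b> ]]></p>';

    for my $marked ( 0, 1 ) {
        my $parser = HTML::TokeParser->new( \$html );
        $parser->marked_sections( $marked );  # honour <![CDATA[ ... ]]> sections when true
        $parser->xml_mode( 0 );               # this sample is plain HTML, not XHTML

        my @types;
        while ( my $token = $parser->get_token ) {
            push @types, $token->[0];         # S(tart), E(nd), T(ext), D(eclaration), ...
        }
        print "marked_sections=$marked: @types\n";
    }

Both methods are inherited from HTML::Parser; setting them right after construction, before any tokens are pulled, is exactly what _new_parser does for the page content.
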
49 changes: 49 additions & 0 deletions t/find_link_xhtml.html
@@ -0,0 +1,49 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[
<!ATTLIST html
xmlns:xsi CDATA #FIXED "http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation CDATA #IMPLIED > ]>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.w3.org/1999/xhtml
http://www.w3.org/2002/08/xhtml/xhtml1-strict.xsd">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>Hello, World!</title>
</head>
<body>
<p id="thelinks">

<a
href
=
"http://www.example.com/1"
>
One
</a
>
<a id="Two" title="href="></a>
<!--
<a href="http://www.example.com/3">Three</a>
-->
<a title=' href="http://www.example.com/4">Four'
href="http://www.example.com/5">Five</a>
<!--BEGIN-->
<script type="text/javascript">/*<![CDATA[
</script>
*/ console.log(' <a href="http://www.example.com/6">Six</a> '); /*
<!--
]]>*/</script>
<a href="http://www.example.com/7"><![CDATA[Se]]><span
>v&#101;</span>n</a>
<script type="text/javascript">/*<![CDATA[
-->
]]>*/</script>
<![CDATA[
<a href="http://www.example.com/8">Eight</a>
]]>

<!--END--></p>
</body>
</html>
69 changes: 69 additions & 0 deletions t/find_link_xhtml.t
@@ -0,0 +1,69 @@
#!perl -T

use warnings;
use strict;

use Test::More;
use URI::file;

BEGIN {
delete @ENV{qw(PATH IFS CDPATH ENV BASH_ENV)}; # Placates taint-unsafe Cwd.pm in 5.6.1
use_ok( 'WWW::Mechanize' );
}

my $mech = WWW::Mechanize->new( cookie_jar => undef );
isa_ok( $mech, 'WWW::Mechanize' );

my $uri = URI::file->new_abs( 't/find_link_xhtml.html' )->as_string;

$mech->get( $uri );
ok( $mech->success, "Fetched $uri" ) or die q{Can't get test page};

my @links = map {[ $_->text, $_->url ]} $mech->links();
my @expected = (
['One','http://www.example.com/1'],
['Five','http://www.example.com/5'],
['Seven','http://www.example.com/7'],
);

is_deeply \@links, \@expected, "We find exactly the valid links";

# now, test with explicit marked_sections => 1

$mech = WWW::Mechanize->new( cookie_jar => undef, marked_sections => 1 );
isa_ok( $mech, 'WWW::Mechanize' );

$uri = URI::file->new_abs( 't/find_link_xhtml.html' )->as_string;

$mech->get( $uri );
ok( $mech->success, "Fetched $uri" ) or die q{Can't get test page};

@links = map {[ $_->text, $_->url ]} $mech->links();
@expected = (
['One','http://www.example.com/1'],
['Five','http://www.example.com/5'],
['Seven','http://www.example.com/7'],
);

is_deeply \@links, \@expected, "We find exactly the valid links, explicitly";

# now, test with marked_sections => 0, giving us legacy results

$mech = WWW::Mechanize->new( cookie_jar => undef, marked_sections => undef );
isa_ok( $mech, 'WWW::Mechanize' );

$uri = URI::file->new_abs( 't/find_link_xhtml.html' )->as_string;

$mech->get( $uri );
ok( $mech->success, "Fetched $uri" ) or die q{Can't get test page};

@links = map {[ $_->text, $_->url ]} $mech->links();
@expected = (
['One','http://www.example.com/1'],
['Five','http://www.example.com/5'],
['Six','http://www.example.com/6'], # yeah...
);

is_deeply \@links, \@expected, "We can enable the legacy behaviour";

done_testing();
