-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Respect CDATA[[ sections when parsing HTML
This changes the HTML parser behaviour to properly respect CDATA[[ sections and to ignore link tags in Javascript code. The old behaviour can be restored by passing undef as the "marked_sections" option when creating the WWW::Mechanize object: my $mech = WWW::Mechanize->new( marked_sections => undef, ); The patch also includes a (nasty) test file to check the old and new behaviour
- Loading branch information
Max Maischein
committed
May 13, 2020
1 parent
44cfea7
commit d334732
Showing
3 changed files
with
149 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | ||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[ | ||
<!ATTLIST html | ||
xmlns:xsi CDATA #FIXED "http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation CDATA #IMPLIED > ]> | ||
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://www.w3.org/1999/xhtml | ||
http://www.w3.org/2002/08/xhtml/xhtml1-strict.xsd"> | ||
<head> | ||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> | ||
<title>Hello, World!</title> | ||
</head> | ||
<body> | ||
<p id="thelinks"> | ||
|
||
<a | ||
href | ||
= | ||
"http://www.example.com/1" | ||
> | ||
One | ||
</a | ||
> | ||
<a id="Two" title="href="></a> | ||
<!-- | ||
<a href="http://www.example.com/3">Three</a> | ||
--> | ||
<a title=' href="http://www.example.com/4">Four' | ||
href="http://www.example.com/5">Five</a> | ||
<!--BEGIN--> | ||
<script type="text/javascript">/*<![CDATA[ | ||
</script> | ||
*/ console.log(' <a href="http://www.example.com/6">Six</a> '); /* | ||
<!-- | ||
]]>*/</script> | ||
<a href="http://www.example.com/7"><![CDATA[Se]]><span | ||
>ve</span>n</a> | ||
<script type="text/javascript">/*<![CDATA[ | ||
--> | ||
]]>*/</script> | ||
<![CDATA[ | ||
<a href="http://www.example.com/8">Eight</a> | ||
]]> | ||
|
||
<!--END--></p> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
#!perl -T | ||
|
||
use warnings; | ||
use strict; | ||
|
||
use Test::More; | ||
use URI::file; | ||
|
||
BEGIN { | ||
delete @ENV{qw(PATH IFS CDPATH ENV BASH_ENV)}; # Placates taint-unsafe Cwd.pm in 5.6.1 | ||
use_ok( 'WWW::Mechanize' ); | ||
} | ||
|
||
my $mech = WWW::Mechanize->new( cookie_jar => undef ); | ||
isa_ok( $mech, 'WWW::Mechanize' ); | ||
|
||
my $uri = URI::file->new_abs( 't/find_link_xhtml.html' )->as_string; | ||
|
||
$mech->get( $uri ); | ||
ok( $mech->success, "Fetched $uri" ) or die q{Can't get test page}; | ||
|
||
my @links = map {[ $_->text, $_->url ]} $mech->links(); | ||
my @expected = ( | ||
['One','http://www.example.com/1'], | ||
['Five','http://www.example.com/5'], | ||
['Seven','http://www.example.com/7'], | ||
); | ||
|
||
is_deeply \@links, \@expected, "We find exactly the valid links"; | ||
|
||
# now, test with explicit marked_sections => 1 | ||
|
||
$mech = WWW::Mechanize->new( cookie_jar => undef, marked_sections => 1 ); | ||
isa_ok( $mech, 'WWW::Mechanize' ); | ||
|
||
$uri = URI::file->new_abs( 't/find_link_xhtml.html' )->as_string; | ||
|
||
$mech->get( $uri ); | ||
ok( $mech->success, "Fetched $uri" ) or die q{Can't get test page}; | ||
|
||
@links = map {[ $_->text, $_->url ]} $mech->links(); | ||
@expected = ( | ||
['One','http://www.example.com/1'], | ||
['Five','http://www.example.com/5'], | ||
['Seven','http://www.example.com/7'], | ||
); | ||
|
||
is_deeply \@links, \@expected, "We find exactly the valid links, explicitly"; | ||
|
||
# now, test with marked_sections => 0, giving us legacy results | ||
|
||
$mech = WWW::Mechanize->new( cookie_jar => undef, marked_sections => undef ); | ||
isa_ok( $mech, 'WWW::Mechanize' ); | ||
|
||
$uri = URI::file->new_abs( 't/find_link_xhtml.html' )->as_string; | ||
|
||
$mech->get( $uri ); | ||
ok( $mech->success, "Fetched $uri" ) or die q{Can't get test page}; | ||
|
||
@links = map {[ $_->text, $_->url ]} $mech->links(); | ||
@expected = ( | ||
['One','http://www.example.com/1'], | ||
['Five','http://www.example.com/5'], | ||
['Six','http://www.example.com/6'], # yeah... | ||
); | ||
|
||
is_deeply \@links, \@expected, "We can enable the legacy behaviour"; | ||
|
||
done_testing(); |