Skip to content

Commit

Permalink
Merge e6f63d6 into e466e74
Browse files Browse the repository at this point in the history
  • Loading branch information
simbabque committed May 6, 2020
2 parents e466e74 + e6f63d6 commit 9554e6f
Show file tree
Hide file tree
Showing 6 changed files with 185 additions and 42 deletions.
71 changes: 66 additions & 5 deletions lib/WWW/Mechanize.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3031,11 +3031,28 @@ sub _extract_images {
$self->{images} = [];

if ( defined $self->{content} ) {
my $parser = HTML::TokeParser->new(\$self->{content});
while ( my $token = $parser->get_tag( keys %image_tags ) ) {
my $image = $self->_image_from_token( $token, $parser );
push( @{$self->{images}}, $image ) if $image;
} # while
if ($self->content_type eq 'text/css') {
push( @{$self->{images}}, $self->_images_from_css($self->{content}) );
}
else {
my $parser = HTML::TokeParser->new(\$self->{content});
while ( my $token = $parser->get_tag() ) {
my ($tag_name, $attrs) = @{$token};
next if $tag_name =~ m{^/};

if ($image_tags{$tag_name}) {
my $image = $self->_image_from_token( $token, $parser );
push( @{$self->{images}}, $image ) if $image;
}
elsif ($tag_name eq 'style') {
push( @{$self->{images}}, $self->_images_from_css($parser->get_text) );
}

if ($attrs->{style}) {
push( @{$self->{images}}, $self->_images_from_css($attrs->{style}) );
}
} # while
}
}

return;
Expand Down Expand Up @@ -3068,6 +3085,50 @@ sub _image_from_token {
});
}

# Precompiled pattern that locates CSS url(...) references inside a string.
#
# Capture groups:
#   $1 - the "url(" prefix, including optional whitespace and the opening quote
#   $2 - the opening quote character itself (' or "), or empty when unquoted
#   $3 - the URL text (non-greedy, so it stops at the first valid closer)
#   $4 - the matching closing quote (backreference to $2), optional
#        whitespace, and the closing parenthesis
#
# Modifiers: /x allows the inline layout and comments, /i makes "url"
# case-insensitive, /s lets "." match newlines, /m is set but the pattern
# uses no ^ or $ anchors.
#
# _images_from_css reads $3, so the group numbering must not change.
my $STYLE_URL_REGEXP = qr{
# ex. "url('/site.css')"
( # capture non url path of the string
url # url
\s* #
\( # (
\s* #
(['"]?) # opening ' or "
)
( # the rest is url
.+? # non greedy "everything"
)
(
\2 # closing ' or "
\s* #
\) # )
)
}xmsi;

# Build image objects for every url(...) reference found in a piece of CSS.
#
# Arguments:
#   $css - CSS text to scan (a whole stylesheet, the text of a <style>
#          element, or the value of a style="" attribute).
#
# Returns a list of WWW::Mechanize::Image objects in the order the URLs
# appear in $css. Each one has tag 'css', the current base, the captured
# URL, and undef for name, height, width and alt. The list is empty when
# $css is undefined or contains no url(...) reference.
sub _images_from_css {
    my $self = shift;
    my $css  = shift;

    my @images;

    # Nothing to scan; bailing out here also avoids an "uninitialized value"
    # warning from the pattern match below.
    return @images if !defined $css;

    # Load the class once up front instead of on every match.
    require WWW::Mechanize::Image;

    while ( $css =~ m/$STYLE_URL_REGEXP/g ) {
        # The third capture group of $STYLE_URL_REGEXP holds the bare URL;
        # copy it right away, before anything else can reset the match vars.
        my $url = $3;

        push(
            @images,
            WWW::Mechanize::Image->new({
                tag    => 'css',
                base   => $self->base,
                url    => $url,
                name   => undef,
                height => undef,
                width  => undef,
                alt    => undef,
            })
        );
    }

    return @images;
}

sub _link_from_token {
my $self = shift;
my $token = shift;
Expand Down
6 changes: 5 additions & 1 deletion t/dump.t
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ EXPECTED

subtest "dump_images test", sub {
dump_tests('dump_images', 't/image-parse.html', <<'EXPECTED');
/Images/bg-gradient.png
wango.jpg
bongo.gif
linked.gif
Expand All @@ -66,6 +67,9 @@ hacktober.jpg
hacktober.jpg
http://example.org/abs.tif
images/logo.png
inner.jpg
outer.jpg
EXPECTED
};

Expand Down Expand Up @@ -109,7 +113,7 @@ EXPECTED

subtest "dump_text test", sub {
dump_tests('dump_text', 't/image-parse.html', <<'EXPECTED');
Testing image extractionblargle And now, the dreaded wango CNN BBC Blongo!
Testing image extractionblargle And now, the dreaded wango CNN BBC Blongo!Logo
EXPECTED
};

Expand Down
60 changes: 47 additions & 13 deletions t/find_image.t
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,25 @@ ok( $mech->success, "Fetched $uri" ) or die q{Can't get test page};
);

cmp_deeply(
[ map { $_->url } @images ],
[ qw(
wango.jpg
bongo.gif
linked.gif
hacktober.jpg
hacktober.jpg
hacktober.jpg
http://example.org/abs.tif
), undef ],
'... and all seven are in the right order'
[map { $_->url } @images],
[ qw(
/Images/bg-gradient.png
wango.jpg
bongo.gif
linked.gif
hacktober.jpg
hacktober.jpg
hacktober.jpg
http://example.org/abs.tif
),
undef,
qw(
images/logo.png
inner.jpg
outer.jpg
),
],
'... and all ten are in the right order'
);

cmp_deeply(
Expand Down Expand Up @@ -102,6 +110,7 @@ ok( $mech->success, "Fetched $uri" ) or die q{Can't get test page};
# shortcuts for all twelve images in the website. They can be used instead
# of each array reference.

my $image0 = [ url => '/Images/bg-gradient.png', tag => 'css' ]; # this is the body background from the style tag
my $image1 = [ url => 'wango.jpg', alt => re('world of') ];
my $image2 = [ url => 'bongo.gif', tag => 'input', height => 142 ];
my $image3 = [ url => 'linked.gif', tag => 'img' ];
Expand All @@ -110,8 +119,23 @@ my $image5 = [ url => 'hacktober.jpg', attrs => superhashof( { class => re('my-c
my $image6 = [ url => 'hacktober.jpg', attrs => superhashof( { class => re('my-class-3') } ) ];
my $image7 = [ url => 'http://example.org/abs.tif', attrs => superhashof( { id => 'absolute' } ) ];
my $image8 = [ url => undef, tag => 'img', attrs => superhashof( { 'data-image' => "hacktober.jpg", id => "no-src-regression-269" } ) ];
my $image9 = [ url => 'images/logo.png', tag => 'css' ];
my $image10 = [ url => 'inner.jpg', tag => 'img' ];
my $image11 = [ url => 'outer.jpg', tag => 'css' ];

my $tests = [
{
name => 'CSS',
args => [
tag => 'css',
],
expected_single => $image0,
expected_all => [
$image0,
$image9,
$image11,
],
},
{
name => 'alt',
args => [
Expand Down Expand Up @@ -190,6 +214,7 @@ my $tests = [
$image6,
$image7,
$image8,
$image10,
],
},
{
Expand Down Expand Up @@ -217,6 +242,7 @@ my $tests = [
$image6,
$image7,
$image8,
$image10,
],
},
{
Expand Down Expand Up @@ -281,6 +307,13 @@ my $tests = [
],
expected_single => $image5,
},
{
name => 'inline style background image',
args => [
url_regex => qr/logo/,
],
expected_single => $image9,
},
];

foreach my $test ( @{ $tests } ) {
Expand Down Expand Up @@ -326,6 +359,7 @@ foreach my $arg (qw/alt url url_abs tag id class/) {
);
}

# all of these will find the "wrong" image
{
my $image;
like(
Expand Down Expand Up @@ -357,7 +391,7 @@ foreach my $arg (qw/alt url url_abs tag id class/) {
qr/space-padded and cannot succeed/,
'find_image warns about space-padding'
);
isnt $image->attrs->{id}, 'absolute', '... and ignores this argument';
is $image->attrs, undef, '... and ignores this argument';
}

done_testing;
done_testing;
9 changes: 9 additions & 0 deletions t/image-parse.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/* Test fixture for stylesheet image extraction: it declares exactly two
   background images, the first written without quotes and the second in
   double quotes. */
body {
background-color:white;
background-image:url(/Images/bg-gradient.png);
}

.logo {
background: url("images/logo.png") no-repeat;
background-size: 275px 95px;
}
6 changes: 6 additions & 0 deletions t/image-parse.html
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
<html>
<head>
<TITLE>Testing image extraction</TITLE>
<style>
background-color:white;
background-image:url(/Images/bg-gradient.png);
</style>
</head>
<body>
<A HREF="http://blargle.com/">blargle</A>
Expand All @@ -14,6 +18,8 @@
</FORM>
<A HREF="http://www.bbc.co.uk/" NAME="Wilma">BBC</A>
<A HREF="blongo.html">Blongo!</A><img src="hacktober.jpg" id="first-hacktober-image" class="my-class-1"><img src="hacktober.jpg" class="my-class-2 foo"><img src="hacktober.jpg" class="my-class-3 foo bar"><img src="http://example.org/abs.tif" id="absolute"><img data-image="hacktober.jpg" id="no-src-regression-269">
<div style="background:url(images/logo.png) no-repeat;background-size:275px 95px;" id="logo">Logo</div>
<img src="inner.jpg" style="background-image: url(outer.jpg); padding: 5em" id="weird_background_style_edge_case">
</body>
</html>

75 changes: 52 additions & 23 deletions t/image-parse.t
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use warnings;
use strict;

use Test::More tests => 33;
use Test::More tests => 47;
use URI::file;

BEGIN {
Expand All @@ -20,48 +20,77 @@ $mech->get( $uri );
ok( $mech->success, "Fetched $uri" ) or die 'Can\'t get test page';

my @images = $mech->images;
is( scalar @images, 8, 'Only eight images' );
is( scalar @images, 12, 'Exactly twelve images' );

my $first = $images[0];
is( $first->tag, 'img', 'img tag' );
is( $first->url, 'wango.jpg', 'URL matches' );
is( $first->alt, 'The world of the wango', 'alt matches' );
is( $first->url, '/Images/bg-gradient.png', 'Got the background style image' );
is( $first->tag, 'css', 'css tag' );
is( $first->alt, undef, 'alt' );

my $second = $images[1];
is( $second->tag, 'input', 'input tag' );
is( $second->url, 'bongo.gif', 'URL matches' );
is( $second->alt, undef, 'alt matches' );
is( $second->height, 142, 'height' );
is( $second->width, 43, 'width' );
is( $second->tag, 'img', 'img tag' );
is( $second->url, 'wango.jpg', 'URL matches' );
is( $second->alt, 'The world of the wango', 'alt matches' );

my $third = $images[2];
is( $third->url, 'linked.gif', 'Got the third image' );
is( $third->tag, 'img', 'input tag' );
is( $third->alt, undef, 'alt' );
is( $third->tag, 'input', 'input tag' );
is( $third->url, 'bongo.gif', 'URL matches' );
is( $third->alt, undef, 'alt matches' );
is( $third->height, 142, 'height' );
is( $third->width, 43, 'width' );

my $fourth = $images[3];
is( $fourth->url, 'hacktober.jpg', 'Got the fourth image' );
is( $fourth->url, 'linked.gif', 'Got the fourth image' );
is( $fourth->tag, 'img', 'input tag' );
is( $fourth->alt, undef, 'alt' );
is( $fourth->attrs->{id}, 'first-hacktober-image', 'id' );
is( $fourth->attrs->{class}, 'my-class-1', 'class' );

my $fifth = $images[4];
is( $fifth->url, 'hacktober.jpg', 'Got the fifth image' );
is( $fifth->tag, 'img', 'input tag' );
is( $fifth->alt, undef, 'alt' );
is( $fifth->attrs->{id}, undef, 'id' );
is( $fifth->attrs->{class}, 'my-class-2 foo', 'class' );
is( $fifth->attrs->{id}, 'first-hacktober-image', 'id' );
is( $fifth->attrs->{class}, 'my-class-1', 'class' );

my $sixth = $images[5];
is( $sixth->url, 'hacktober.jpg', 'Got the sixth image' );
is( $sixth->tag, 'img', 'input tag' );
is( $sixth->alt, undef, 'alt' );
is( $sixth->attrs->{id}, undef, 'id' );
is( $sixth->attrs->{class}, 'my-class-3 foo bar', 'class' );
is( $sixth->attrs->{class}, 'my-class-2 foo', 'class' );

my $seventh = $images[6];
is( $seventh->url, 'hacktober.jpg', 'Got the seventh image' );
is( $seventh->tag, 'img', 'input tag' );
is( $seventh->alt, undef, 'alt' );
is( $seventh->attrs->{id}, undef, 'id' );
is( $seventh->attrs->{class}, 'my-class-3 foo bar', 'class' );

# regression github #269
my $seventh = $images[7];
is( $seventh->attrs->{id}, 'no-src-regression-269', 'Got the sevenths image');
is( $seventh->url, undef, 'it has no URL');
is( $seventh->attrs->{'data-image'}, 'hacktober.jpg', 'it has an extra attribute');
my $eighth = $images[8];
is( $eighth->attrs->{id}, 'no-src-regression-269', 'Got the eighth image');
is( $eighth->url, undef, 'it has no URL');
is( $eighth->attrs->{'data-image'}, 'hacktober.jpg', 'it has an extra attribute');

my $ninth = $images[9];
is( $ninth->url, 'images/logo.png', 'Got the fifth image' );
is( $ninth->tag, 'css', 'css tag' );
is( $ninth->alt, undef, 'alt' );

# find image in css
$uri = URI::file->new_abs( 't/image-parse.css' )->as_string;

$mech->get( $uri );
ok( $mech->success, "Fetched $uri" ) or die q{Can't get test page};

eval { @images = $mech->find_all_images(); };
is($@,'','survived eval');
is( scalar @images, 2, 'Exactly two images' );

my $css_first = $images[0];
is( $css_first->url, '/Images/bg-gradient.png', 'Got the first image' );
is( $css_first->tag, 'css', 'css tag' );
is( $css_first->alt, undef, 'alt' );

my $css_second = $images[1];
is( $css_second->url, 'images/logo.png', 'Got the second image' );
is( $css_second->tag, 'css', 'css tag' );

0 comments on commit 9554e6f

Please sign in to comment.