Skip to content

Commit

Permalink
#23326 - also extract links from framesets and imagemaps
Browse files Browse the repository at this point in the history
  • Loading branch information
jkraemer committed Dec 24, 2008
1 parent 82a7405 commit 44bc51e
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 7 deletions.
12 changes: 7 additions & 5 deletions lib/rdig/content_extractors/hpricot.rb
Expand Up @@ -53,12 +53,14 @@ def extract_content(doc)
# Extracts the link targets found in the document: the href attributes
# of all a and area (image map) tags and the src attributes of all
# frame tags. Internal links like <a href="#top"> and tags lacking the
# attribute are skipped.
#
# doc - the parsed document; it is searched with doc/tag_name, and each
#       element found is read with element[attribute_name].
#
# Returns a flat array of HTML-unescaped link strings, grouped by tag
# type in the order a, area, frame.
def extract_links(doc)
  # A fixed list of pairs (rather than a Hash) keeps the order of the
  # extracted links independent of Hash iteration order.
  [['a', 'href'], ['area', 'href'], ['frame', 'src']].map do |tag_name, attr|
    (doc/tag_name).map do |element|
      value = element[attr]
      # nil for missing attributes and internal anchors; removed by compact
      CGI.unescapeHTML(value) if value && value !~ /^#/
    end
  end.flatten.compact
end

# Extracts the title from the given html tree
def extract_title(doc)
the_title_tag = title_tag(doc)
Expand Down
13 changes: 13 additions & 0 deletions test/fixtures/html/frameset.html
@@ -0,0 +1,13 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<!-- Test fixture: a frameset with two frames (one absolute src, one
     site-relative src); used by test_extracts_links_from_frameset. -->
<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
<title>Sample &amp; Title</title>
</head>
<body>
<frameset>
<frame src="http://test.host/first.html" />
<frame src="/second.html" />
</frameset>
</body>
</html>
13 changes: 13 additions & 0 deletions test/fixtures/html/imagemap.html
@@ -0,0 +1,13 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<!-- Test fixture: an image map with two areas (one absolute href, one
     site-relative href); used by test_extracts_links_from_imagemap. -->
<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
<title>Sample &amp; Title</title>
</head>
<body>
<map>
<area href="http://test.host/first.html" />
<area href="/second.html" />
</map>
</body>
</html>
12 changes: 12 additions & 0 deletions test/unit/hpricot_content_extractor_test.rb
Expand Up @@ -52,6 +52,18 @@ def test_custom_content_element
assert_equal '/inside.html', result[:links][1]
assert_equal '/footer.html', result[:links][2]
end

# Processing the frameset fixture must report the frame targets as
# links: the absolute URL first and the site-relative path last.
def test_extracts_links_from_frameset
  links = @extractor.process(html_doc('frameset'))[:links]
  assert_equal 'http://test.host/first.html', links.first
  assert_equal '/second.html', links.last
end

# Processing the imagemap fixture must report the area targets as
# links: the absolute URL first and the site-relative path last.
def test_extracts_links_from_imagemap
  links = @extractor.process(html_doc('imagemap'))[:links]
  assert_equal 'http://test.host/first.html', links.first
  assert_equal '/second.html', links.last
end


def test_title_from_dcmeta
Expand Down
4 changes: 2 additions & 2 deletions test/unit/searcher_test.rb
Expand Up @@ -28,8 +28,8 @@ def teardown

# Searches the test index and checks the number of hits reported in
# both the :hitcount entry and the :list of results.
def test_search
  result = RDig.searcher.search 'some sample text'
  # NOTE(review): the expected count was raised from 3 to 5 in the same
  # change that added the frameset and imagemap fixtures, presumably
  # because both new pages match the query. Confirm against the fixture
  # set. The stale assertions expecting 3 are removed because they
  # contradict these.
  assert_equal 5, result[:hitcount]
  assert_equal 5, result[:list].size
end

end
Expand Down

0 comments on commit 44bc51e

Please sign in to comment.