Skip to content

Commit

Permalink
Added rescue exception when links have hrefs that are not valid URIs
Browse files Browse the repository at this point in the history
  • Loading branch information
gabceb committed Nov 17, 2012
1 parent 92b29d5 commit c351ec6
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 1 deletion.
4 changes: 3 additions & 1 deletion lib/meta_inspector/scraper.rb
Expand Up @@ -41,7 +41,7 @@ def description

# Links found on the page, as absolute URLs
def links
@data.links ||= parsed_links.map { |l| absolutify_url(unrelativize_url(l)) }
@data.links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.reject{|l| l.nil? }
end

# Internal links found on the page, as absolute URLs
Expand Down Expand Up @@ -190,6 +190,8 @@ def absolutify_url(url)
else
URI.parse(@root_url).merge(encode_url(url)).to_s
end
rescue URI::InvalidURIError => e
add_fatal_error "Link parsing exception: #{e.message}" and nil
end

# Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
Expand Down
21 changes: 21 additions & 0 deletions spec/fixtures/invalid_href.response
@@ -0,0 +1,21 @@
HTTP/1.1 200 OK
Server: nginx/0.7.67
Date: Fri, 18 Nov 2011 21:46:46 GMT
Content-Type: text/html
Connection: keep-alive
Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
Content-Length: 4987
X-Varnish: 2000423390
Age: 0
Via: 1.1 varnish

<html>
<head>
<title>Sample file non-http links</title>
</head>
<body>
<a href="<p>ftp://ftp.cdrom.com">an FTP link</a>
<a href="skype:joeuser?call">a skype link</a>
<a href="telnet://telnet.cdrom.com">a telnet link</a>
</body>
</html>
17 changes: 17 additions & 0 deletions spec/metainspector_spec.rb
Expand Up @@ -15,6 +15,7 @@
FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
Expand Down Expand Up @@ -212,6 +213,22 @@
"http://example.com/search?q=espa%C3%B1a#top"]
end
end

it "should avoid links that contain invalid links as href value" do
m = MetaInspector.new('http://example.com/invalid_href')
m.links.should == [ "skype:joeuser?call",
"telnet://telnet.cdrom.com"]
end

it "should throw errors when links contain invalid href values" do
m = MetaInspector.new('http://example.com/invalid_href')

expect {
links = m.links
}.to change { m.errors.size }

m.errors.first.should == "Link parsing exception: bad URI(is not URI?): %3Cp%3Eftp://ftp.cdrom.com"
end
end

describe 'Non-HTTP links' do
Expand Down

0 comments on commit c351ec6

Please sign in to comment.