Permalink
Browse files

Merge pull request #29 from gabceb/master

Fix for issue #28
  • Loading branch information...
2 parents 92b29d5 + c351ec6 commit 326995be57daef13b40d7f791f51f527f6243684 @jaimeiniesta committed Nov 17, 2012
Showing with 41 additions and 1 deletion.
  1. +3 −1 lib/meta_inspector/scraper.rb
  2. +21 −0 spec/fixtures/invalid_href.response
  3. +17 −0 spec/metainspector_spec.rb
@@ -41,7 +41,7 @@ def description
# Links found on the page, as absolute URLs
def links
- @data.links ||= parsed_links.map { |l| absolutify_url(unrelativize_url(l)) }
+ @data.links ||= parsed_links.map{ |l| absolutify_url(unrelativize_url(l)) }.reject{|l| l.nil? }
end
# Internal links found on the page, as absolute URLs
@@ -190,6 +190,8 @@ def absolutify_url(url)
else
URI.parse(@root_url).merge(encode_url(url)).to_s
end
+ rescue URI::InvalidURIError => e
+ add_fatal_error "Link parsing exception: #{e.message}" and nil
end
# Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
@@ -0,0 +1,21 @@
+HTTP/1.1 200 OK
+Server: nginx/0.7.67
+Date: Fri, 18 Nov 2011 21:46:46 GMT
+Content-Type: text/html
+Connection: keep-alive
+Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
+Content-Length: 4987
+X-Varnish: 2000423390
+Age: 0
+Via: 1.1 varnish
+
+<html>
+ <head>
+ <title>Sample file non-http links</title>
+ </head>
+ <body>
+ <a href="<p>ftp://ftp.cdrom.com">an FTP link</a>
+ <a href="skype:joeuser?call">a skype link</a>
+ <a href="telnet://telnet.cdrom.com">a telnet link</a>
+ </body>
+</html>
View
@@ -15,6 +15,7 @@
FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
+ FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
@@ -212,6 +213,22 @@
"http://example.com/search?q=espa%C3%B1a#top"]
end
end
+
+ it "should avoid links that contain invalid links as href value" do
+ m = MetaInspector.new('http://example.com/invalid_href')
+ m.links.should == [ "skype:joeuser?call",
+ "telnet://telnet.cdrom.com"]
+ end
+
+ it "should throw errors when links contain invalid href values" do
+ m = MetaInspector.new('http://example.com/invalid_href')
+
+ expect {
+ links = m.links
+ }.to change { m.errors.size }
+
+ m.errors.first.should == "Link parsing exception: bad URI(is not URI?): %3Cp%3Eftp://ftp.cdrom.com"
+ end
end
describe 'Non-HTTP links' do

0 comments on commit 326995b

Please sign in to comment.