Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Recover from malformed href attribute when processing internal and external links. Fixes #34 and #35
  • Loading branch information...
commit 0da77c1735e13e34d783391b4a6304446050e803 1 parent 47eb87d
@jaimeiniesta authored
View
11 lib/meta_inspector/scraper.rb
@@ -53,12 +53,12 @@ def links
# Internal links found on the page, as absolute URLs
def internal_links
- @internal_links ||= links.select {|link| URI.parse(link).host == host }
+ @internal_links ||= links.select {|link| host_from_url(link) == host }
end
# External links found on the page, as absolute URLs
def external_links
- @external_links ||= links.select {|link| URI.parse(link).host != host }
+ @external_links ||= links.select {|link| host_from_url(link) != host }
end
# Images found on the page, as absolute URLs
@@ -245,6 +245,13 @@ def unrelativize_url(url)
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
end
+ # Extracts the host from a given URL
+ def host_from_url(url)
+ URI.parse(url).host
+ rescue URI::InvalidURIError, URI::InvalidComponentError => e
+ add_fatal_error "Link parsing exception: #{e.message}" and nil
+ end
+
# Look for the first <p> block with 120 characters or more
def secondary_description
first_long_paragraph = parsed_document.search('//p[string-length() >= 120]').first
View
27 spec/fixtures/malformed_href.response
@@ -0,0 +1,27 @@
+HTTP/1.1 200 OK
+Server: nginx/0.7.67
+Date: Fri, 18 Nov 2011 21:46:46 GMT
+Content-Type: text/html
+Connection: keep-alive
+Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
+Content-Length: 4987
+X-Varnish: 2000423390
+Age: 0
+Via: 1.1 varnish
+
+<html>
+ <head>
+ <title>Malformed hrefs</title>
+ </head>
+ <body>
+ <h1>Good links</h1>
+ <a href="/faqs">FAQs</a>
+ <a href="skype:joeuser?call">a skype link</a>
+ <a href="telnet://telnet.cdrom.com">a telnet link</a>
+ <a href="javascript:alert('ok');">ok</a>
+
+ <h1>Bad links due to malformed href</h1>
+ <a href="javascript://">oops</a>
+ <a href="mailto:email(at)example.com">
+ </body>
+</html>
View
50 spec/metainspector_spec.rb
@@ -177,22 +177,44 @@
"http://example.com/search?q=espa%C3%B1a#top"]
end
- it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
- m = MetaInspector.new('http://international.com')
- m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
- "http://international.com/roman%C3%A9e",
- "http://international.com/faqs#cami%C3%B3n",
- "http://international.com/search?q=cami%C3%B3n",
- "http://international.com/search?q=espa%C3%B1a#top"]
+ describe "internal links" do
+ it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
+ m = MetaInspector.new('http://international.com')
+ m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
+ "http://international.com/roman%C3%A9e",
+ "http://international.com/faqs#cami%C3%B3n",
+ "http://international.com/search?q=cami%C3%B3n",
+ "http://international.com/search?q=espa%C3%B1a#top"]
+ end
+
+ it "should not crash when processing malformed hrefs" do
+ m = MetaInspector.new('http://example.com/malformed_href')
+ expect {
+ m.internal_links.should == [ "http://example.com/faqs" ]
+ m.should_not be_ok
+ }.to_not raise_error
+ end
end
- it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
- m = MetaInspector.new('http://international.com')
- m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
- "http://example.com/roman%C3%A9e",
- "http://example.com/faqs#cami%C3%B3n",
- "http://example.com/search?q=cami%C3%B3n",
- "http://example.com/search?q=espa%C3%B1a#top"]
+ describe "external links" do
+ it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
+ m = MetaInspector.new('http://international.com')
+ m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
+ "http://example.com/roman%C3%A9e",
+ "http://example.com/faqs#cami%C3%B3n",
+ "http://example.com/search?q=cami%C3%B3n",
+ "http://example.com/search?q=espa%C3%B1a#top"]
+ end
+
+ it "should not crash when processing malformed hrefs" do
+ m = MetaInspector.new('http://example.com/malformed_href')
+ expect {
+ m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com",
+ "javascript:alert('ok');", "javascript://",
+ "mailto:email(at)example.com"]
+ m.should_not be_ok
+ }.to_not raise_error
+ end
end
end
View
1  spec/spec_helper.rb
@@ -29,6 +29,7 @@ def fixture_file(filename)
FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
+FakeWeb.register_uri(:get, "http://example.com/malformed_href", :response => fixture_file("malformed_href.response"))
FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))
Please sign in to comment.
Something went wrong with that request. Please try again.