Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Recover from malformed href attribute when processing internal and external links. Fixes #34 and #35
  • Loading branch information...
commit 0da77c1735e13e34d783391b4a6304446050e803 1 parent 47eb87d
Jaime Iniesta authored December 13, 2012
11  lib/meta_inspector/scraper.rb
@@ -53,12 +53,12 @@ def links
53 53
 
54 54
     # Internal links found on the page, as absolute URLs
55 55
     def internal_links
56  
-      @internal_links ||= links.select {|link| URI.parse(link).host == host }
  56
+      @internal_links ||= links.select {|link| host_from_url(link) == host }
57 57
     end
58 58
 
59 59
     # External links found on the page, as absolute URLs
60 60
     def external_links
61  
-      @external_links ||= links.select {|link| URI.parse(link).host != host }
  61
+      @external_links ||= links.select {|link| host_from_url(link) != host }
62 62
     end
63 63
 
64 64
     # Images found on the page, as absolute URLs
@@ -245,6 +245,13 @@ def unrelativize_url(url)
245 245
       url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
246 246
     end
247 247
 
  248
+    # Extracts the host from a given URL
  249
+    def host_from_url(url)
  250
+      URI.parse(url).host
  251
+    rescue URI::InvalidURIError, URI::InvalidComponentError => e
  252
+      add_fatal_error "Link parsing exception: #{e.message}" and nil
  253
+    end
  254
+
248 255
     # Look for the first <p> block with 120 characters or more
249 256
     def secondary_description
250 257
       first_long_paragraph = parsed_document.search('//p[string-length() >= 120]').first
27  spec/fixtures/malformed_href.response
... ...
@@ -0,0 +1,27 @@
  1
+HTTP/1.1 200 OK
  2
+Server: nginx/0.7.67
  3
+Date: Fri, 18 Nov 2011 21:46:46 GMT
  4
+Content-Type: text/html
  5
+Connection: keep-alive
  6
+Last-Modified: Mon, 14 Nov 2011 16:53:18 GMT
  7
+Content-Length: 4987
  8
+X-Varnish: 2000423390
  9
+Age: 0
  10
+Via: 1.1 varnish
  11
+
  12
+<html>
  13
+  <head>
  14
+    <title>Malformed hrefs</title>
  15
+  </head>
  16
+  <body>
  17
+    <h1>Good links</h1>
  18
+    <a href="/faqs">FAQs</a>
  19
+    <a href="skype:joeuser?call">a skype link</a>
  20
+    <a href="telnet://telnet.cdrom.com">a telnet link</a>
  21
+    <a href="javascript:alert('ok');">ok</a>
  22
+
  23
+    <h1>Bad links due to malformed href</h1>
  24
+    <a href="javascript://">oops</a>
  25
+    <a href="mailto:email(at)example.com">
  26
+  </body>
  27
+</html>
50  spec/metainspector_spec.rb
@@ -177,22 +177,44 @@
177 177
                             "http://example.com/search?q=espa%C3%B1a#top"]
178 178
       end
179 179
 
180  
-      it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
181  
-        m = MetaInspector.new('http://international.com')
182  
-        m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
183  
-                                     "http://international.com/roman%C3%A9e",
184  
-                                     "http://international.com/faqs#cami%C3%B3n",
185  
-                                     "http://international.com/search?q=cami%C3%B3n",
186  
-                                     "http://international.com/search?q=espa%C3%B1a#top"]
  180
+      describe "internal links" do
  181
+        it "should get correct internal links, encoding the URLs as needed but respecting # and ?" do
  182
+          m = MetaInspector.new('http://international.com')
  183
+          m.internal_links.should == [ "http://international.com/espa%C3%B1a.asp",
  184
+                                       "http://international.com/roman%C3%A9e",
  185
+                                       "http://international.com/faqs#cami%C3%B3n",
  186
+                                       "http://international.com/search?q=cami%C3%B3n",
  187
+                                       "http://international.com/search?q=espa%C3%B1a#top"]
  188
+        end
  189
+
  190
+        it "should not crash when processing malformed hrefs" do
  191
+          m = MetaInspector.new('http://example.com/malformed_href')
  192
+          expect {
  193
+            m.internal_links.should == [ "http://example.com/faqs" ]
  194
+            m.should_not be_ok
  195
+          }.to_not raise_error
  196
+        end
187 197
       end
188 198
 
189  
-      it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
190  
-        m = MetaInspector.new('http://international.com')
191  
-        m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
192  
-                                     "http://example.com/roman%C3%A9e",
193  
-                                     "http://example.com/faqs#cami%C3%B3n",
194  
-                                     "http://example.com/search?q=cami%C3%B3n",
195  
-                                     "http://example.com/search?q=espa%C3%B1a#top"]
  199
+      describe "external links" do
  200
+        it "should get correct external links, encoding the URLs as needed but respecting # and ?" do
  201
+          m = MetaInspector.new('http://international.com')
  202
+          m.external_links.should == [ "http://example.com/espa%C3%B1a.asp",
  203
+                                       "http://example.com/roman%C3%A9e",
  204
+                                       "http://example.com/faqs#cami%C3%B3n",
  205
+                                       "http://example.com/search?q=cami%C3%B3n",
  206
+                                       "http://example.com/search?q=espa%C3%B1a#top"]
  207
+        end
  208
+
  209
+        it "should not crash when processing malformed hrefs" do
  210
+          m = MetaInspector.new('http://example.com/malformed_href')
  211
+          expect {
  212
+            m.external_links.should == ["skype:joeuser?call", "telnet://telnet.cdrom.com",
  213
+                                        "javascript:alert('ok');", "javascript://",
  214
+                                        "mailto:email(at)example.com"]
  215
+            m.should_not be_ok
  216
+          }.to_not raise_error
  217
+        end
196 218
       end
197 219
     end
198 220
 
1  spec/spec_helper.rb
@@ -29,6 +29,7 @@ def fixture_file(filename)
29 29
 FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
30 30
 FakeWeb.register_uri(:get, "http://example.com/nonhttp", :response => fixture_file("nonhttp.response"))
31 31
 FakeWeb.register_uri(:get, "http://example.com/invalid_href", :response => fixture_file("invalid_href.response"))
  32
+FakeWeb.register_uri(:get, "http://example.com/malformed_href", :response => fixture_file("malformed_href.response"))
32 33
 FakeWeb.register_uri(:get, "http://www.youtube.com/watch?v=iaGSSrp49uc", :response => fixture_file("youtube.response"))
33 34
 FakeWeb.register_uri(:get, "http://markupvalidator.com/faqs", :response => fixture_file("markupvalidator_faqs.response"))
34 35
 FakeWeb.register_uri(:get, "https://twitter.com/markupvalidator", :response => fixture_file("twitter_markupvalidator.response"))

0 notes on commit 0da77c1

Please sign in to comment.
Something went wrong with that request. Please try again.