Skip to content

Commit

Permalink
Fixes a bug that caused relative links not to be properly absolutifie…
Browse files Browse the repository at this point in the history
…d in the presence of a <base> tag. Fixes #46
  • Loading branch information
jaimeiniesta committed Oct 1, 2013
1 parent c0ef38c commit 571a13b
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 1 deletion.
13 changes: 12 additions & 1 deletion lib/meta_inspector/scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -234,12 +234,23 @@ def absolutify_url(uri)
if uri =~ /^\w*\:/i
normalize_url(uri)
else
Addressable::URI.join(@url, uri).normalize.to_s
Addressable::URI.join(base_url, uri).normalize.to_s
end
rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
add_fatal_error "Link parsing exception: #{e.message}" and nil
end

# Returns the base url used to absolutify relative links. This is the href set on the <base> tag
# when the document has one with a non-blank value, or the url of the document otherwise.
#
# A <base> tag whose href is empty or whitespace-only (e.g. <base href="" />) is treated as if
# no <base> tag was found. A blank string is truthy in Ruby, so without this check it would win
# over @url and relative links would be joined against a blank base.
def base_url
  href = base_href
  return @url if href.nil? || href.strip.empty?

  href
end

# Returns the value of the href attribute of the first <base /> tag found in the parsed
# document, or nil when it cannot be read.
#
# Any StandardError raised along the way (for instance calling a method on nil because the
# document has no <base /> tag, or the tag has no href attribute) results in nil.
def base_href
  first_base_tag = parsed_document.search('base').first
  href_attribute = first_base_tag.attributes['href']
  href_attribute.value
rescue StandardError
  nil
end

# Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
def unrelativize_url(url)
url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
Expand Down
22 changes: 22 additions & 0 deletions spec/fixtures/relative_links_with_base.response
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
HTTP/1.1 200 OK
Server: nginx/1.0.5
Date: Thu, 29 Dec 2011 23:10:13 GMT
Content-Type: text/html
Content-Length: 15013
Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
Connection: keep-alive
Accept-Ranges: bytes

<!DOCTYPE html>
<html>
<head>
<base href="http://relativewithbase.com/" />
<meta charset="utf-8" />
<title>Relative links</title>
</head>
<body>
<p>Relative links</p>
<a href="about">About</a>
<a href="../sitemap">Sitemap</a>
</body>
</html>
12 changes: 12 additions & 0 deletions spec/metainspector_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,18 @@
end
end

# The fixture served for relativewithbase.com declares <base href="http://relativewithbase.com/" />,
# so its relative links ("about" and "../sitemap") are expected to be resolved against that base
# instead of against the url of the requested page.
describe 'Relative links with base' do
  links_resolved_against_base = [
    'http://relativewithbase.com/about',
    'http://relativewithbase.com/sitemap'
  ]

  it 'should get the relative links from a document' do
    page = MetaInspector.new('http://relativewithbase.com/company/page2')
    page.internal_links.should == links_resolved_against_base
  end

  it 'should get the relative links from a directory' do
    page = MetaInspector.new('http://relativewithbase.com/company/page2/')
    page.internal_links.should == links_resolved_against_base
  end
end

describe 'Non-HTTP links' do
before(:each) do
@m = MetaInspector.new('http://example.com/nonhttp')
Expand Down
4 changes: 4 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ def fixture_file(filename)
FakeWeb.register_uri(:get, "http://relative.com/company", :response => fixture_file("relative_links.response"))
FakeWeb.register_uri(:get, "http://relative.com/company/", :response => fixture_file("relative_links.response"))

# These examples are used to test relative links on a page that declares a <base> tag;
# all of them serve the same fixture
[
  "http://relativewithbase.com/",
  "http://relativewithbase.com/company/page2",
  "http://relativewithbase.com/company/page2/"
].each do |url|
  FakeWeb.register_uri(:get, url, :response => fixture_file("relative_links_with_base.response"))
end

# These examples are used to test the redirections from HTTP to HTTPS and vice versa
# http://facebook.com => https://facebook.com
FakeWeb.register_uri(:get, "http://facebook.com/", :response => fixture_file("facebook.com.response"))
Expand Down

0 comments on commit 571a13b

Please sign in to comment.