Skip to content

Commit

Permalink
Convert protocol-relative URLs and store scheme
Browse files Browse the repository at this point in the history
  • Loading branch information
jaimeiniesta committed Dec 30, 2011
1 parent da4cb57 commit 0fe7010
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 10 deletions.
1 change: 1 addition & 0 deletions README.rdoc
Expand Up @@ -28,6 +28,7 @@ by default:
Then you can see the scraped data like this: Then you can see the scraped data like this:


page.url # URL of the page page.url # URL of the page
page.scheme # Scheme of the page (http, https)
page.title # title of the page, as string page.title # title of the page, as string
page.links # array of strings, with every link found on the page page.links # array of strings, with every link found on the page
page.absolute_links # array of all the links converted to absolute urls page.absolute_links # array of all the links converted to absolute urls
Expand Down
14 changes: 10 additions & 4 deletions lib/meta_inspector/scraper.rb
Expand Up @@ -9,12 +9,13 @@
# MetaInspector provides an easy way to scrape web pages and get its elements # MetaInspector provides an easy way to scrape web pages and get its elements
module MetaInspector module MetaInspector
class Scraper class Scraper
attr_reader :url attr_reader :url, :scheme
# Initializes a new instance of MetaInspector, setting the URL to the one given # Initializes a new instance of MetaInspector, setting the URL to the one given
# If no scheme given, set it to http:// by default # If no scheme given, set it to http:// by default
def initialize(url) def initialize(url)
@url = URI.parse(url).scheme.nil? ? 'http://' + url : url @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
@data = Hashie::Rash.new('url' => @url) @scheme = URI.parse(url).scheme || 'http'
@data = Hashie::Rash.new('url' => @url)
end end


# Returns the parsed document title, from the content of the <title> tag. # Returns the parsed document title, from the content of the <title> tag.
Expand Down Expand Up @@ -44,7 +45,7 @@ def images


# Returns the links converted to absolute urls # Returns the links converted to absolute urls
def absolute_links def absolute_links
@data.absolute_links ||= links.map { |l| absolutify_url(l) } @data.absolute_links ||= links.map { |l| absolutify_url(unrelativize_url(l)) }
end end


def absolute_images def absolute_images
Expand Down Expand Up @@ -137,6 +138,11 @@ def absolutify_url(url)
url =~ /^http.*/ ? url : File.join(@url,url) url =~ /^http.*/ ? url : File.join(@url,url)
end end


# Convert a protocol-relative URL (one that starts with "//") to its full form
# by prepending the scheme of the page that contains it.
# Any other URL is returned unchanged.
#
#   unrelativize_url('//yahoo.com') # => "http://yahoo.com" when scheme is "http"
#   unrelativize_url('/faqs')       # => "/faqs"
def unrelativize_url(url)
  # \A anchors at the very start of the string; ^ would also match right after
  # an embedded newline and mangle URLs that are not protocol-relative.
  url =~ %r{\A//} ? "#{scheme}:#{url}" : url
end

# Remove mailto links # Remove mailto links
# TODO: convert this to a more generic filter to remove all non-http[s] like ftp, telnet, etc. # TODO: convert this to a more generic filter to remove all non-http[s] like ftp, telnet, etc.
def remove_mailto(links) def remove_mailto(links)
Expand Down
26 changes: 26 additions & 0 deletions spec/fixtures/protocol_relative.response
@@ -0,0 +1,26 @@
HTTP/1.1 200 OK
Server: nginx/1.0.5
Date: Thu, 29 Dec 2011 23:10:13 GMT
Content-Type: text/html
Content-Length: 15013
Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
Connection: keep-alive
Accept-Ranges: bytes

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<title>Protocol-relative URLs</title>
</head>
<body>
<p>Internal links</p>
<a href="/">Internal: home page</a>
<a href="/faqs">Internal: FAQs</a>
<a href="//protocol-relative.com/contact">Internal: protocol-relative</a>

<p>External links</p>
<a href="http://google.com">External: normal link</a>
<a href="//yahoo.com">External: protocol-relative link</a>
</body>
</html>
39 changes: 33 additions & 6 deletions spec/metainspector_spec.rb
Expand Up @@ -17,6 +17,11 @@
@m = MetaInspector.new('pagerankalert.com') @m = MetaInspector.new('pagerankalert.com')
@m.url.should == 'http://pagerankalert.com' @m.url.should == 'http://pagerankalert.com'
end end

it "should store the scheme" do
  MetaInspector.new('http://pagerankalert.com').scheme.should == 'http'
  MetaInspector.new('https://pagerankalert.com').scheme.should == 'https'
  # When the given URL has no scheme, the scraper falls back to http
  MetaInspector.new('pagerankalert.com').scheme.should == 'http'
end
end end


context 'Doing a basic scrape' do context 'Doing a basic scrape' do
Expand Down Expand Up @@ -76,19 +81,19 @@
@m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/' @m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/'
end end
end end

context 'Page with missing meta description' do context 'Page with missing meta description' do
FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response")) FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))

it "should find secondary description" do it "should find secondary description" do
@m = MetaInspector.new('http://theonion-no-description.com') @m = MetaInspector.new('http://theonion-no-description.com')
@m.description == "SAN FRANCISCO&#8212;In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+ @m.description == "SAN FRANCISCO&#8212;In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
" an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description." " an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
end end

end end


context 'Links' do context 'Links' do
before(:each) do before(:each) do
@m = MetaInspector.new('http://pagerankalert.com') @m = MetaInspector.new('http://pagerankalert.com')
Expand Down Expand Up @@ -119,6 +124,28 @@
end end
end end



context 'Protocol-relative URLs' do
  # The same fixture is served for both schemes, so only the scheme of the
  # requested page differs between the two scrapers below.
  ['http://protocol-relative.com', 'https://protocol-relative.com'].each do |page_url|
    FakeWeb.register_uri(:get, page_url, :response => fixture_file("protocol_relative.response"))
  end

  before(:each) do
    @plain_page  = MetaInspector.new('http://protocol-relative.com')
    @secure_page = MetaInspector.new('https://protocol-relative.com')
  end

  it "should convert protocol-relative links to http" do
    resolved = @plain_page.absolute_links
    resolved.should include('http://protocol-relative.com/contact')
    resolved.should include('http://yahoo.com')
  end

  it "should convert protocol-relative links to https" do
    resolved = @secure_page.absolute_links
    resolved.should include('https://protocol-relative.com/contact')
    resolved.should include('https://yahoo.com')
  end
end


context 'Getting meta tags by ghost methods' do context 'Getting meta tags by ghost methods' do
before(:each) do before(:each) do
@m = MetaInspector.new('http://pagerankalert.com') @m = MetaInspector.new('http://pagerankalert.com')
Expand Down

0 comments on commit 0fe7010

Please sign in to comment.