Convert protocol-relative URLs and store scheme

1 parent da4cb57 commit 0fe70103e9ee4d87fa02f52a37402f88a98aebb2 @jaimeiniesta committed Dec 30, 2011
Showing with 70 additions and 10 deletions.
  1. +1 −0 README.rdoc
  2. +10 −4 lib/meta_inspector/scraper.rb
  3. +26 −0 spec/fixtures/protocol_relative.response
  4. +33 −6 spec/metainspector_spec.rb
@@ -28,6 +28,7 @@ by default:
Then you can see the scraped data like this:
page.url # URL of the page
+ page.scheme # Scheme of the page (http, https)
page.title # title of the page, as string
page.links # array of strings, with every link found on the page
page.absolute_links # array of all the links converted to absolute urls
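For reference, a minimal usage sketch of the new scheme reader (URLs taken from the specs in this commit):

  page = MetaInspector.new('https://pagerankalert.com')
  page.scheme          # => "https"
  page.absolute_links  # now uses the scheme above to expand protocol-relative links
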
@@ -9,12 +9,13 @@
# MetaInspector provides an easy way to scrape web pages and get their elements
module MetaInspector
class Scraper
- attr_reader :url
+ attr_reader :url, :scheme
# Initializes a new instance of MetaInspector, setting the URL to the one given
# If no scheme given, set it to http:// by default
def initialize(url)
- @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
- @data = Hashie::Rash.new('url' => @url)
+ @url = URI.parse(url).scheme.nil? ? 'http://' + url : url
+ @scheme = URI.parse(url).scheme || 'http'
+ @data = Hashie::Rash.new('url' => @url)
end
# Returns the parsed document title, from the content of the <title> tag.
@@ -44,7 +45,7 @@ def images
# Returns the links converted to absolute urls
def absolute_links
- @data.absolute_links ||= links.map { |l| absolutify_url(l) }
+ @data.absolute_links ||= links.map { |l| absolutify_url(unrelativize_url(l)) }
end
def absolute_images
@@ -137,6 +138,11 @@ def absolutify_url(url)
url =~ /^http.*/ ? url : File.join(@url,url)
end
+ # Convert a protocol-relative url to its full form, depending on the scheme of the page that contains it
+ def unrelativize_url(url)
+ url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
+ end
+
# Remove mailto links
# TODO: convert this to a more generic filter to remove all non-http[s] like ftp, telnet, etc.
def remove_mailto(links)
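
To illustrate the new unrelativize_url helper in lib/meta_inspector/scraper.rb in isolation, here is a sketch of the same conversion outside the class (sample URLs taken from the fixture added below):

  scheme = 'https'
  ['//yahoo.com', '/faqs', 'http://google.com'].map do |url|
    url =~ /^\/\// ? "#{scheme}://#{url[2..-1]}" : url
  end
  # => ["https://yahoo.com", "/faqs", "http://google.com"]

Only the protocol-relative URL is rewritten; absolute and root-relative URLs pass through untouched and are later handled by absolutify_url.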
@@ -0,0 +1,26 @@
+HTTP/1.1 200 OK
+Server: nginx/1.0.5
+Date: Thu, 29 Dec 2011 23:10:13 GMT
+Content-Type: text/html
+Content-Length: 15013
+Last-Modified: Fri, 02 Dec 2011 21:00:49 GMT
+Connection: keep-alive
+Accept-Ranges: bytes
+
+<!DOCTYPE html>
+<html>
+<head>
+ <meta charset="utf-8" />
+ <title>Protocol-relative URLs</title>
+</head>
+<body>
+ <p>Internal links</p>
+ <a href="/">Internal: home page</a>
+ <a href="/faqs">Internal: FAQs</a>
+ <a href="//protocol-relative.com/contact">Internal: protocol-relative</a>
+
+ <p>External links</p>
+ <a href="http://google.com">External: normal link</a>
+ <a href="//yahoo.com">External: protocol-relative link</a>
+</body>
+</html>
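
As a sketch of the end-to-end behaviour (mirroring the specs further down), scraping this fixture over https should expand both protocol-relative links with the https scheme:

  page = MetaInspector.new('https://protocol-relative.com')
  page.absolute_links  # should include "https://protocol-relative.com/contact"
                       # and "https://yahoo.com"
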
@@ -17,6 +17,11 @@
@m = MetaInspector.new('pagerankalert.com')
@m.url.should == 'http://pagerankalert.com'
end
+
+ it "should store the scheme" do
+ MetaInspector.new('http://pagerankalert.com').scheme.should == 'http'
+ MetaInspector.new('https://pagerankalert.com').scheme.should == 'https'
+ end
end
context 'Doing a basic scrape' do
@@ -76,19 +81,19 @@
@m.feed.should == 'http://www.tea-tron.com/jbravo/blog/feed/'
end
end
-
+
context 'Page with missing meta description' do
FakeWeb.register_uri(:get, "http://theonion-no-description.com", :response => fixture_file("theonion-no-description.com.response"))
-
- it "should find secondary description" do
+
+ it "should find secondary description" do
@m = MetaInspector.new('http://theonion-no-description.com')
@m.description.should == "SAN FRANCISCO&#8212;In a move expected to revolutionize the mobile device industry, Apple launched its fastest and most powerful iPhone to date Tuesday,"+
" an innovative new model that can only be seen by the company's hippest and most dedicated customers. This is secondary text picked up because of a missing meta description."
end
-
+
end
-
-
+
+
context 'Links' do
before(:each) do
@m = MetaInspector.new('http://pagerankalert.com')
@@ -119,6 +124,28 @@
end
end
+
+ context 'Protocol-relative URLs' do
+ FakeWeb.register_uri(:get, "http://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
+ FakeWeb.register_uri(:get, "https://protocol-relative.com", :response => fixture_file("protocol_relative.response"))
+
+ before(:each) do
+ @m_http = MetaInspector.new('http://protocol-relative.com')
+ @m_https = MetaInspector.new('https://protocol-relative.com')
+ end
+
+ it "should convert protocol-relative links to http" do
+ @m_http.absolute_links.should include('http://protocol-relative.com/contact')
+ @m_http.absolute_links.should include('http://yahoo.com')
+ end
+
+ it "should convert protocol-relative links to https" do
+ @m_https.absolute_links.should include('https://protocol-relative.com/contact')
+ @m_https.absolute_links.should include('https://yahoo.com')
+ end
+ end
+
+
context 'Getting meta tags by ghost methods' do
before(:each) do
@m = MetaInspector.new('http://pagerankalert.com')
