Permalink
Browse files

Deprecate absolute_links and absolute_images in favor of links and images
  • Loading branch information...
1 parent 8f1f61e commit c8f26a8621606dc57dbc4b788646796490bf4739 @jaimeiniesta committed Jul 23, 2012
Showing with 62 additions and 74 deletions.
  1. +2 −4 README.rdoc
  2. +23 −13 lib/meta_inspector/scraper.rb
  3. +37 −57 spec/metainspector_spec.rb
View
@@ -35,14 +35,12 @@ Then you can see the scraped data like this:
page.host # Hostname of the page (like, w3clove.com, without the scheme)
page.root_url # Root url (scheme + host, like http://w3clove.com/)
page.title # title of the page, as string
- page.links # array of strings, with every link found on the page
- page.absolute_links # array of all the links converted to absolute urls
+ page.links # array of strings, with every link found on the page as an absolute URL
page.meta_description # meta description, as string
page.description # returns the meta description, or the first long paragraph if no meta description is found
page.meta_keywords # meta keywords, as string
page.image # Most relevant image, if defined with og:image
- page.images # array of strings, with every img found on the page
- page.absolute_images # array of all the images converted to absolute urls
+ page.images # array of strings, with every img found on the page as an absolute URL
page.feed # Get rss or atom links in meta data fields as array
page.meta_og_title # opengraph title
page.meta_og_image # opengraph image
@@ -35,26 +35,24 @@ def description
meta_description.nil? ? secondary_description : meta_description
end
- # Returns the parsed document links
+ # Links found on the page, as absolute URLs
def links
# After this change: each entry returned by parsed_links is passed through
# unrelativize_url and then absolutify_url (both defined outside this hunk), and
# the resulting array is memoized in @data.links.
# NOTE(review): parsed_links ends in `rescue nil`, so it can return nil. The `.map`
# on the added line would then raise NoMethodError, whereas the removed lines made
# this method evaluate to nil in that situation. Confirm which behavior is intended.
- @data.links ||= parsed_document.search("//a") \
- .map {|link| link.attributes["href"] \
- .to_s.strip}.uniq rescue nil
+ @data.links ||= parsed_links.map { |l| absolutify_url(unrelativize_url(l)) }
end
- def images
- @data.images ||= parsed_document.search('//img') \
- .reject{|i| i.attributes['src'].blank? } \
- .map{ |i| i.attributes['src'].value }.uniq
# Deprecated entry point: emits the deprecation message through `warn` on every
# call (no once-only guard), then returns whatever `links` returns.
+ def absolute_links
+ warn "absolute_links is deprecated since 1.9.4 and will be removed, use links instead"
+ links
end
- # Returns the links converted to absolute urls
- def absolute_links
- @data.absolute_links ||= links.map { |l| absolutify_url(unrelativize_url(l)) }
+ # Images found on the page, as absolute URLs
+ def images
# Passes each src returned by parsed_images through absolutify_url (defined outside
# this hunk) and memoizes the resulting array in @data.images.
# NOTE(review): unlike links, no unrelativize_url step is applied here, so confirm
# that protocol-relative image sources ("//host/img.png") are handled as intended.
+ @data.images ||= parsed_images.map{ |i| absolutify_url(i) }
end
def absolute_images
# Deprecated entry point. The removed line computed and memoized its own
# @data.absolute_images; the added lines instead emit a deprecation message through
# `warn` on every call and return whatever `images` returns.
- @data.absolute_images ||= images.map{ |i| absolutify_url(i) }
+ warn "absolute_images is deprecated since 1.9.4 and will be removed, use images instead"
+ images
end
# Returns the parsed document meta rss links
@@ -83,7 +81,7 @@ def charset
# Returns all parsed data as a nested Hash
def to_hash
# TODO: find a better option to populate the data to the Hash
# The bare method calls below are made only for their side effects; their return
# values are discarded. links and images memoize their results into @data (visible
# in this diff). The other accessors are defined outside this hunk and presumably
# do the same — confirm. This change adds `images` to the list and drops
# `absolute_links`, so the hash gains an "images" entry and loses "absolute_links".
- image;feed;links;charset;absolute_links;title;meta_keywords
+ image;images;feed;links;charset;title;meta_keywords
@data.to_hash
end
@@ -148,6 +146,18 @@ def method_missing(method_name)
private
# Raw link targets as written in the document: takes every node matched by the
# "//a" search on parsed_document, reads its "href" attribute, converts it to a
# String, strips surrounding whitespace and removes duplicates. The array is
# memoized in @parsed_links.
# NOTE(review): the trailing `rescue nil` turns any StandardError raised while
# searching or mapping into a nil result. nil is not retained by `||=`, so the next
# call retries the work, and callers must be prepared for a nil return value.
# NOTE(review): an anchor without href presumably yields "" via nil.to_s; confirm
# against the parser's `attributes` behavior.
+ def parsed_links
+ @parsed_links ||= parsed_document.search("//a") \
+ .map {|link| link.attributes["href"] \
+ .to_s.strip}.uniq rescue nil
+ end
+
# Raw image sources as written in the document: takes every node matched by the
# '//img' search on parsed_document, drops those whose 'src' attribute is blank,
# reads the attribute's value and removes duplicates. The array is memoized in
# @parsed_images. Unlike parsed_links there is no rescue here, so errors propagate.
# NOTE(review): `blank?` is not core Ruby; presumably supplied by a dependency
# loaded elsewhere — confirm.
+ def parsed_images
+ @parsed_images ||= parsed_document.search('//img') \
+ .reject{|i| i.attributes['src'].blank? } \
+ .map{ |i| i.attributes['src'].value }.uniq
+ end
+
# Stores the error for later inspection
def add_fatal_error(error)
@errors << error
View
@@ -65,8 +65,7 @@
end
it "should find all page images" do
# NOTE(review): neither the removed nor the added lines call `.should`. A bare `==`
# only computes a boolean that is discarded, so this example passes regardless of
# what images returns. It needs `.should ==` to assert anything.
# NOTE(review): the to_hash example in this same diff expects the query string
# ?1305794559 for the pagerankalert.com image, which differs from the literal
# below. Re-check the expected value against the fixture when enabling the assertion.
- @m.absolute_images == ["http://pagerankalert.com/images/pagerank_alert.png?1309512337"]
- @m.images == ["/images/pagerank_alert.png?1309512337"]
+ @m.images == ["http://pagerankalert.com/images/pagerank_alert.png?1309512337"]
end
it "should ignore malformed image tags" do
@@ -108,56 +107,41 @@
end
it "should get the links" do
# After this change `links` is expected to return the absolute URLs that were
# previously asserted for `absolute_links`. The removed lines contain the old
# raw-href expectation and the separate "should convert links to absolute urls"
# example, which is merged into this one. @m is set up outside this hunk.
- @m.links.should == [
- "/",
- "/es?language=es",
- "/users/sign_up",
- "/users/sign_in",
- "mailto:pagerankalert@gmail.com",
- "http://pagerankalert.posterous.com",
- "http://twitter.com/pagerankalert",
- "http://twitter.com/share"
- ]
- end
-
- it "should convert links to absolute urls" do
- @m.absolute_links.should == [
- "http://pagerankalert.com/",
- "http://pagerankalert.com/es?language=es",
- "http://pagerankalert.com/users/sign_up",
- "http://pagerankalert.com/users/sign_in",
- "mailto:pagerankalert@gmail.com",
- "http://pagerankalert.posterous.com",
- "http://twitter.com/pagerankalert",
- "http://twitter.com/share"
- ]
+ @m.links.should == [ "http://pagerankalert.com/",
+ "http://pagerankalert.com/es?language=es",
+ "http://pagerankalert.com/users/sign_up",
+ "http://pagerankalert.com/users/sign_in",
+ "mailto:pagerankalert@gmail.com",
+ "http://pagerankalert.posterous.com",
+ "http://twitter.com/pagerankalert",
+ "http://twitter.com/share" ]
end
it "should get correct absolute links for internal pages" do
# Same expected URL list before and after; only the receiver changes from
# absolute_links to links. The first expected entry shows a "#" href on the
# /faqs page resolving to "http://w3clove.com/faqs/#".
m = MetaInspector.new('http://w3clove.com/faqs')
- m.absolute_links.should == ["http://w3clove.com/faqs/#",
- "http://w3clove.com/",
- "http://w3clove.com/faqs",
- "http://w3clove.com/plans-and-pricing",
- "http://w3clove.com/contact",
- "http://w3clove.com/charts/errors",
- "http://w3clove.com/credits",
- "http://w3clove.com/signin",
- "http://validator.w3.org",
- "http://www.sitemaps.org/",
- "http://jaimeiniesta.com/",
- "http://mendicantuniversity.org/",
- "http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby",
- "http://majesticseacreature.com/",
- "http://school.mendicantuniversity.org/alumni/2011",
- "https://github.com/jaimeiniesta/w3clove",
- "http://w3clove.com",
- "http://w3clove.com/api_v1_reference",
- "https://twitter.com/w3clove",
- "http://twitter.com/share",
- "http://w3clove.com/terms_of_service",
- "http://twitter.com/W3CLove",
- "http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609"]
+ m.links.should == [ "http://w3clove.com/faqs/#",
+ "http://w3clove.com/",
+ "http://w3clove.com/faqs",
+ "http://w3clove.com/plans-and-pricing",
+ "http://w3clove.com/contact",
+ "http://w3clove.com/charts/errors",
+ "http://w3clove.com/credits",
+ "http://w3clove.com/signin",
+ "http://validator.w3.org",
+ "http://www.sitemaps.org/",
+ "http://jaimeiniesta.com/",
+ "http://mendicantuniversity.org/",
+ "http://jaimeiniesta.posterous.com/rbmu-a-better-way-to-learn-ruby",
+ "http://majesticseacreature.com/",
+ "http://school.mendicantuniversity.org/alumni/2011",
+ "https://github.com/jaimeiniesta/w3clove",
+ "http://w3clove.com",
+ "http://w3clove.com/api_v1_reference",
+ "https://twitter.com/w3clove",
+ "http://twitter.com/share",
+ "http://w3clove.com/terms_of_service",
+ "http://twitter.com/W3CLove",
+ "http://us4.campaign-archive1.com/home/?u=6af3ab69c286561d0f0f25671&id=04a0dab609" ]
end
end
@@ -176,10 +160,6 @@
"telnet://telnet.cdrom.com"
]
end
-
- it "should return the same links as absolute links do" do
- @m.absolute_links.should == @m.links
- end
end
describe 'Protocol-relative URLs' do
@@ -189,13 +169,13 @@
end
it "should convert protocol-relative links to http" do
# Expectation is unchanged apart from the receiver (absolute_links -> links):
# protocol-relative hrefs should come back with the http scheme.
# @m_http is set up outside this hunk; presumably a page requested over http — confirm.
- @m_http.absolute_links.should include('http://protocol-relative.com/contact')
- @m_http.absolute_links.should include('http://yahoo.com')
+ @m_http.links.should include('http://protocol-relative.com/contact')
+ @m_http.links.should include('http://yahoo.com')
end
it "should convert protocol-relative links to https" do
# Expectation is unchanged apart from the receiver (absolute_links -> links):
# protocol-relative hrefs should come back with the https scheme.
# @m_https is set up outside this hunk; presumably a page requested over https — confirm.
- @m_https.absolute_links.should include('https://protocol-relative.com/contact')
- @m_https.absolute_links.should include('https://yahoo.com')
+ @m_https.links.should include('https://protocol-relative.com/contact')
+ @m_https.links.should include('https://yahoo.com')
end
end
@@ -265,7 +245,7 @@
describe 'to_hash' do
it "should return a hash with all the values set" do
# The expected hash changes with this commit: "links" now holds absolute URLs,
# the "absolute_links" key is gone, and an "images" key is added.
# NOTE(review): the expected image URL here ends in ?1305794559, while the
# "should find all page images" example in this diff uses ?1309512337 for the
# same host — confirm which value matches the fixture.
@m = MetaInspector.new('http://pagerankalert.com')
- @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "links"=>["/", "/es?language=es", "/users/sign_up", "/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "absolute_links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
+ @m.to_hash.should == {"title"=>"PageRankAlert.com :: Track your PageRank changes", "url"=>"http://pagerankalert.com", "meta"=>{"name"=>{"robots"=>"all,follow", "csrf_param"=>"authenticity_token", "description"=>"Track your PageRank(TM) changes and receive alerts by email", "keywords"=>"pagerank, seo, optimization, google", "csrf_token"=>"iW1/w+R8zrtDkhOlivkLZ793BN04Kr3X/pS+ixObHsE="}, "property"=>{}}, "images"=>["http://pagerankalert.com/images/pagerank_alert.png?1305794559"], "charset"=>"utf-8", "feed"=>"http://feeds.feedburner.com/PageRankAlert", "links"=>["http://pagerankalert.com/", "http://pagerankalert.com/es?language=es", "http://pagerankalert.com/users/sign_up", "http://pagerankalert.com/users/sign_in", "mailto:pagerankalert@gmail.com", "http://pagerankalert.posterous.com", "http://twitter.com/pagerankalert", "http://twitter.com/share"]}
end
end

0 comments on commit c8f26a8

Please sign in to comment.