Using RSpec

jaimeiniesta · May 4, 2011 · e454325 · e454325
1 parent 80aa723
commit e454325
Show file tree

Hide file tree

Showing 6 changed files with 169 additions and 137 deletions.
diff --git a/CHANGELOG.rdoc b/CHANGELOG.rdoc
diff --git a/README.rdoc b/README.rdoc
@@ -18,6 +18,14 @@ If you're on Ubuntu, you might need to install these packages before installing
 
   sudo aptitude install libxslt-dev libxml2 libxml2-dev
 
+Please note that you should use libxml2 version 2.7.4 or later, as there is a bug in earlier versions:
+
+* http://groups.google.com/group/nokogiri-talk/browse_thread/thread/3274c25e394fde68
+
+It also uses the charguess Ruby gem, which you can install with:
+
+  sudo gem install charguess
+
 = Installation
 
 Run the following if you haven't already:
@@ -38,10 +46,24 @@ Once scraped, you can see the scraped data like this:
 
   page.address       # URL of the page
   page.title         # title of the page, as string
-  page.description   # meta description, as string
-  page.keywords      # meta keywords, as string
   page.links         # array of strings, with every link found on the page
-
+  page.meta_description   # meta description, as string
+  page.meta_keywords      # meta keywords, as string
+
+MetaInspector uses dynamic methods for meta tag discovery, so all of the following will work: each call is converted into a search for the meta tag with the corresponding name and returns its content attribute.
+
+  page.meta_description       # <meta name="description" content="..." />
+  page.meta_keywords          # <meta name="keywords" content="..." />
+  page.meta_robots            # <meta name="robots" content="..." />
+  page.meta_generator         # <meta name="generator" content="..." />
+
+It will also work for the meta tags of the form <meta http-equiv="name" ... />, like the following:
+
+  page.meta_content_language  # <meta http-equiv="content-language" content="..." />
+  page.meta_Content_Type      # <meta http-equiv="Content-Type" content="..." />
+
+Please note that MetaInspector is case sensitive, so page.meta_Content_Type is not the same as page.meta_content_type.
+
 The full scraped document if accessible from:
 
   page.document # Nokogiri doc that you can use it to get any element from the page
@@ -53,34 +75,33 @@ You can find some sample scripts on the samples folder, including a basic scrapi
   $ irb
   >> require 'metainspector'
   => true
-  
+
   >> page = MetaInspector.new('http://pagerankalert.com')
   => #<MetaInspector:0x11330c0 @document=nil, @links=nil, @address="http://pagerankalert.com", @description=nil, @keywords=nil, @title=nil>
-  
+
   >> page.title
   => "PageRankAlert.com :: Track your pagerank changes"
-  
-  >> page.description
+
+  >> page.meta_description
   => "Track your PageRank(TM) changes and receive alert by email"
-  
-  >> page.keywords
+
+  >> page.meta_keywords
   => "pagerank, seo, optimization, google"
-  
+
   >> page.links.size
   => 31
-  
+
   >> page.links[30]
   => "http://www.nuvio.cz/"
-  
+
   >> page.document.class
   => Nokogiri::HTML::Document
-    
+
 = To Do
 
 * Get page.base_dir from the address
 * Distinguish between external and internal links, returning page.links for all of them as found, page.external_links and page.internal_links converted to absolute URLs
 * Return array of images in page as absolute URLs
-* Return contents of meta robots tag
 * Be able to set a timeout in seconds
 * Detect charset
 * If keywords seem to be separated by blank spaces, replace them with commas

diff --git a/lib/metainspector.rb b/lib/metainspector.rb
@@ -1,67 +1,96 @@
 require 'open-uri'
 require 'rubygems'
 require 'nokogiri'
-require 'UniversalDetector'
+require 'charguess'
 require 'iconv'
 
 # MetaInspector provides an easy way to scrape web pages and get its elements
 class MetaInspector
-  VERSION = '1.1.5'
-  
+  VERSION = '1.1.6'
+
   attr_reader :address
-  
+
   # Initializes a new instance of MetaInspector, setting the URL address to the one given
-  # TODO: validate address as http URL, dont initialize it if wrong format 
+  # TODO: validate address as http URL, don't initialize it if wrong format
   def initialize(address)
     @address = address
-    
+
     @document = @title = @description = @keywords = @links = nil
   end
-
-  # Returns the parsed document title
+
+  # Returns the parsed document title, from the content of the <title> tag.
+  # This is not the same as the meta_title tag
   def title
     @title ||= parsed_document.css('title').inner_html rescue nil
   end
-
-  # Returns the parsed document meta description
-  def description
-    @description ||= parsed_document.css("meta[@name='description']").first['content'] rescue nil
-  end
-
-  # Returns the parsed document meta keywords
-  def keywords
-    @keywords ||= parsed_document.css("meta[@name='keywords']").first['content'] rescue nil
-  end
-
+
   # Returns the parsed document links
   def links
     @links ||= parsed_document.search("//a").map {|link| link.attributes["href"].to_s.strip} rescue nil
   end
-
-  # Returns the specified charset, or tries to guess it
+
+  # Returns the charset
+  # TODO: We should trust the charset expressed on the Content-Type meta tag
+  # and only guess it if none given
   def charset
-    @charset ||= UniversalDetector::chardet(document)['encoding'].downcase
+    @charset ||= CharGuess.guess(document).downcase
   end
-  
+
   # Returns the whole parsed document
   def parsed_document
     @parsed_document ||= Nokogiri::HTML(document)
-    
+
     rescue
-      puts 'An exception occurred while trying to scrape the page!'
+      warn 'An exception occurred while trying to scrape the page!'
   end
-  
+
   # Returns the original, unparsed document
   def document
     @document ||= open(@address).read
-    
+
     rescue SocketError
-      puts 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
+      warn 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)'
       @scraped = false
     rescue TimeoutError
-      puts 'Timeout!!!'
+      warn 'Timeout!!!'
     rescue
-      puts 'An exception occurred while trying to fetch the page!'
+      warn 'An exception occurred while trying to fetch the page!'
+  end
+
+  # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
+  # meta name: keywords, description, robots, generator
+  # meta http-equiv: content-language, Content-Type
+  #
+  # It will first try with meta name="..." and if nothing found,
+  # with meta http-equiv="...", substituting "_" by "-"
+  # TODO: this should be case insensitive, so meta_robots gets the results from the HTML for robots, Robots, ROBOTS...
+  # TODO: cache results on instance variables, using ||=
+  # TODO: define respond_to? to return true on the meta_name methods
+  def method_missing(method_name)
+    if method_name.to_s =~ /^meta_(.*)/
+      content = parsed_document.css("meta[@name='#{$1}']").first['content'] rescue nil
+      content = parsed_document.css("meta[@http-equiv='#{$1.gsub("_", "-")}']").first['content'] rescue nil if content.nil?
+
+      content
+    else
+      super
+    end
+  end
+
+  #########################################################################################################
+  # DEPRECATED METHODS
+  # These methods are deprecated and will disappear soonish.
+
+  # DEPRECATED: Returns the parsed document meta description
+  def description
+    warn "DEPRECATION WARNING: description method is deprecated since 1.1.6 and will be removed on 1.2.0, use meta_description instead"
+    @description ||= meta_description rescue nil
+  end
+
+  # DEPRECATED: Returns the parsed document meta keywords
+  def keywords
+    warn "DEPRECATION WARNING: keywords method is deprecated since 1.1.6 and will be removed on 1.2.0, use meta_keywords instead"
+    @keywords ||= meta_keywords rescue nil
   end
 
 end
diff --git a/metainspector.gemspec b/metainspector.gemspec
@@ -1,10 +1,10 @@
 Gem::Specification.new do |s|
   s.name = "metainspector"
-  s.version = "1.1.5"
-  s.date = "2009-06-21"
+  s.version = "1.1.6"
+  s.date = "2009-09-20"
   s.summary = "Ruby gem for web scraping"
   s.email = "jaimeiniesta@gmail.com"
-  s.homepage = "http://github.com/jaimeiniesta/metainspector/tree/master"
+  s.homepage = "http://github.com/jaimeiniesta/metainspector"
   s.description = "MetaInspector is a ruby gem for web scraping purposes, that returns a hash with metadata from a given URL"
   s.has_rdoc = false
   s.authors = ["Jaime Iniesta"]
@@ -16,9 +16,9 @@ Gem::Specification.new do |s|
     "lib/metainspector.rb",
     "samples/basic_scraping.rb",
     "samples/spider.rb"]
-  s.test_files = ["test/test_metainspector.rb"]
+  s.test_files = ["spec/metainspector_spec.rb", "spec/spec_helper.rb"]
   s.rdoc_options = []
   s.extra_rdoc_files = []
-  s.add_dependency("nokogiri", ["> 1.2"])
+  s.add_dependency("nokogiri", ["> 1.3.3"])
   s.add_dependency("chardet", [">= 0.9"])
 end
diff --git a/spec/metainspector_spec.rb b/spec/metainspector_spec.rb
@@ -1,34 +1,89 @@
 require File.join(File.dirname(__FILE__), "/spec_helper")
 
 describe MetaInspector do
+
   context 'Doing a basic scrape' do
     before(:each) do
       @m = MetaInspector.new('http://pagerankalert.com')
     end
-    
+
     it "should get the title" do
-      @m.title.should == 'PageRankAlert.com :: Track your pagerank changes'
-    end
-
-    it "should get the description" do
-      @m.description.should == 'Track your PageRank(TM) changes and receive alert by email'
+      @m.title.should == 'PageRankAlert.com :: Track your PageRank changes'
     end
-
-    it "should get the keywords" do
-      @m.keywords.should == "pagerank, seo, optimization, google"
-    end
-
+
     it "should get the links" do
-      @m.links.size.should == 7
+      @m.links.size.should == 8
     end
-    
+
     it "should have a Nokogiri::HTML::Document as parsed_document" do
       @m.parsed_document.class.should == Nokogiri::HTML::Document
     end
-    
+
     it "should have a String as document" do
       @m.document.class.should == String
     end
   end
-end
 
+  context 'Getting meta tags by ghost methods' do
+    before(:each) do
+      @m = MetaInspector.new('http://pagerankalert.com')
+    end
+
+    it "should get the robots meta tag" do
+      @m.meta_robots.should == 'all,follow'
+    end
+
+    it "should get the description meta tag" do
+      @m.meta_description.should == 'Track your PageRank(TM) changes and receive alerts by email'
+    end
+
+    it "should get the keywords meta tag" do
+      @m.meta_keywords.should == "pagerank, seo, optimization, google"
+    end
+
+    it "should get the content-language meta tag" do
+      pending "mocks"
+      @m.meta_content_language.should == "en"
+    end
+
+    it "should get the Content-Type meta tag" do
+      pending "mocks"
+      @m.meta_Content_Type.should == "text/html; charset=utf-8"
+    end
+
+    it "should get the generator meta tag" do
+      pending "mocks"
+      @m.meta_generator.should == 'WordPress 2.8.4'
+    end
+
+    it "should return nil for nonfound meta_tags" do
+      @m.meta_lollypop.should == nil
+    end
+  end
+
+  context 'Charset detection' do
+    it "should detect windows-1252 charset" do
+      @m = MetaInspector.new('http://www.alazan.com')
+      @m.charset.should == "windows-1252"
+    end
+
+    it "should detect utf-8 charset" do
+      @m = MetaInspector.new('http://www.pagerankalert.com')
+      @m.charset.should == "utf-8"
+    end
+  end
+
+  context 'Deprecated methods still work' do
+    before(:each) do
+      @m = MetaInspector.new('http://pagerankalert.com')
+    end
+
+    it "should get the description as the meta_description" do
+      @m.description.should == @m.meta_description
+    end
+
+    it "should get the keywords as the meta_keywords" do
+      @m.keywords.should == @m.meta_keywords
+    end
+  end
+end