Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
80aa723
commit e454325
Showing
6 changed files
with
169 additions
and
137 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,67 +1,96 @@ | ||
require 'open-uri' | ||
require 'rubygems' | ||
require 'nokogiri' | ||
require 'UniversalDetector' | ||
require 'charguess' | ||
require 'iconv' | ||
|
||
# MetaInspector provides an easy way to scrape web pages and get their elements | ||
class MetaInspector | ||
VERSION = '1.1.5' | ||
VERSION = '1.1.6' | ||
|
||
attr_reader :address | ||
|
||
# Initializes a new instance of MetaInspector, setting the URL address to the one given | ||
# TODO: validate address as an HTTP URL; don't initialize it if the format is wrong | ||
# TODO: validate address as an HTTP URL; don't initialize it if the format is wrong | ||
def initialize(address) | ||
@address = address | ||
|
||
@document = @title = @description = @keywords = @links = nil | ||
end | ||
|
||
# Returns the parsed document title | ||
|
||
# Returns the parsed document title, from the content of the <title> tag. | ||
# This is not the same as the meta_title tag | ||
def title | ||
@title ||= parsed_document.css('title').inner_html rescue nil | ||
end | ||
|
||
# Returns the parsed document meta description | ||
def description | ||
@description ||= parsed_document.css("meta[@name='description']").first['content'] rescue nil | ||
end | ||
|
||
# Returns the parsed document meta keywords | ||
def keywords | ||
@keywords ||= parsed_document.css("meta[@name='keywords']").first['content'] rescue nil | ||
end | ||
|
||
|
||
# Returns the parsed document links | ||
def links | ||
@links ||= parsed_document.search("//a").map {|link| link.attributes["href"].to_s.strip} rescue nil | ||
end | ||
|
||
# Returns the specified charset, or tries to guess it | ||
|
||
# Returns the charset | ||
# TODO: We should trust the charset expressed on the Content-Type meta tag | ||
# and only guess it if none given | ||
def charset | ||
@charset ||= UniversalDetector::chardet(document)['encoding'].downcase | ||
@charset ||= CharGuess.guess(document).downcase | ||
end | ||
|
||
# Returns the whole parsed document | ||
def parsed_document | ||
@parsed_document ||= Nokogiri::HTML(document) | ||
|
||
rescue | ||
puts 'An exception occurred while trying to scrape the page!' | ||
warn 'An exception occurred while trying to scrape the page!' | ||
end | ||
|
||
# Returns the original, unparsed document | ||
def document | ||
@document ||= open(@address).read | ||
|
||
rescue SocketError | ||
puts 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)' | ||
warn 'MetaInspector exception: The url provided does not exist or is temporarily unavailable (socket error)' | ||
@scraped = false | ||
rescue TimeoutError | ||
puts 'Timeout!!!' | ||
warn 'Timeout!!!' | ||
rescue | ||
puts 'An exception occurred while trying to fetch the page!' | ||
warn 'An exception occurred while trying to fetch the page!' | ||
end | ||
|
||
# Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for | ||
# meta name: keywords, description, robots, generator | ||
# meta http-equiv: content-language, Content-Type | ||
# | ||
# It will first try with meta name="..." and if nothing found, | ||
# with meta http-equiv="...", substituting "_" by "-" | ||
# TODO: this should be case-insensitive, so meta_robots gets the results from the HTML for robots, Robots, ROBOTS... | ||
# TODO: cache results on instance variables, using ||= | ||
# TODO: define respond_to? to return true on the meta_name methods | ||
def method_missing(method_name) | ||
if method_name.to_s =~ /^meta_(.*)/ | ||
content = parsed_document.css("meta[@name='#{$1}']").first['content'] rescue nil | ||
content = parsed_document.css("meta[@http-equiv='#{$1.gsub("_", "-")}']").first['content'] rescue nil if content.nil? | ||
|
||
content | ||
else | ||
super | ||
end | ||
end | ||
|
||
######################################################################################################### | ||
# DEPRECATED METHODS | ||
# These methods are deprecated and will disappear soonish. | ||
|
||
# DEPRECATED: Returns the parsed document meta description | ||
def description | ||
warn "DEPRECATION WARNING: description method is deprecated since 1.1.6 and will be removed on 1.2.0, use meta_description instead" | ||
@description ||= meta_description rescue nil | ||
end | ||
|
||
# DEPRECATED: Returns the parsed document meta keywords | ||
def keywords | ||
warn "DEPRECATION WARNING: keywords method is deprecated since 1.1.6 and will be removed on 1.2.0, use meta_keywords instead" | ||
@keywords ||= meta_keywords rescue nil | ||
end | ||
|
||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,34 +1,89 @@ | ||
require File.join(File.dirname(__FILE__), "/spec_helper") | ||
|
||
describe MetaInspector do | ||
|
||
context 'Doing a basic scrape' do | ||
before(:each) do | ||
@m = MetaInspector.new('http://pagerankalert.com') | ||
end | ||
|
||
it "should get the title" do | ||
@m.title.should == 'PageRankAlert.com :: Track your pagerank changes' | ||
end | ||
|
||
it "should get the description" do | ||
@m.description.should == 'Track your PageRank(TM) changes and receive alert by email' | ||
@m.title.should == 'PageRankAlert.com :: Track your PageRank changes' | ||
end | ||
|
||
it "should get the keywords" do | ||
@m.keywords.should == "pagerank, seo, optimization, google" | ||
end | ||
|
||
|
||
it "should get the links" do | ||
@m.links.size.should == 7 | ||
@m.links.size.should == 8 | ||
end | ||
|
||
it "should have a Nokogiri::HTML::Document as parsed_document" do | ||
@m.parsed_document.class.should == Nokogiri::HTML::Document | ||
end | ||
|
||
it "should have a String as document" do | ||
@m.document.class.should == String | ||
end | ||
end | ||
end | ||
|
||
context 'Getting meta tags by ghost methods' do | ||
before(:each) do | ||
@m = MetaInspector.new('http://pagerankalert.com') | ||
end | ||
|
||
it "should get the robots meta tag" do | ||
@m.meta_robots.should == 'all,follow' | ||
end | ||
|
||
it "should get the description meta tag" do | ||
@m.meta_description.should == 'Track your PageRank(TM) changes and receive alerts by email' | ||
end | ||
|
||
it "should get the keywords meta tag" do | ||
@m.meta_keywords.should == "pagerank, seo, optimization, google" | ||
end | ||
|
||
it "should get the content-language meta tag" do | ||
pending "mocks" | ||
@m.meta_content_language.should == "en" | ||
end | ||
|
||
it "should get the Content-Type meta tag" do | ||
pending "mocks" | ||
@m.meta_Content_Type.should == "text/html; charset=utf-8" | ||
end | ||
|
||
it "should get the generator meta tag" do | ||
pending "mocks" | ||
@m.meta_generator.should == 'WordPress 2.8.4' | ||
end | ||
|
||
it "should return nil for nonfound meta_tags" do | ||
@m.meta_lollypop.should == nil | ||
end | ||
end | ||
|
||
context 'Charset detection' do | ||
it "should detect windows-1252 charset" do | ||
@m = MetaInspector.new('http://www.alazan.com') | ||
@m.charset.should == "windows-1252" | ||
end | ||
|
||
it "should detect utf-8 charset" do | ||
@m = MetaInspector.new('http://www.pagerankalert.com') | ||
@m.charset.should == "utf-8" | ||
end | ||
end | ||
|
||
context 'Deprecated methods still work' do | ||
before(:each) do | ||
@m = MetaInspector.new('http://pagerankalert.com') | ||
end | ||
|
||
it "should get the description as the meta_description" do | ||
@m.description.should == @m.meta_description | ||
end | ||
|
||
it "should get the keywords as the meta_keywords" do | ||
@m.keywords.should == @m.meta_keywords | ||
end | ||
end | ||
end |
Oops, something went wrong.