-
Notifications
You must be signed in to change notification settings - Fork 9
/
spider.rb
43 lines (38 loc) · 1.29 KB
/
spider.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
require File.dirname(__FILE__)+'/spider/spider_instance'
# A spidering library for Ruby. Handles robots.txt, scraping, finding more
# links, and doing it all over again.
class Spider
  # Version components as an array of integers. Only assigned when the
  # constant is not already defined on this class.
  VERSION_INFO = [0, 5, 4].freeze unless defined?(self::VERSION_INFO)

  # Dotted version string derived from VERSION_INFO (e.g. "0.5.4"). Only
  # assigned when the constant is not already defined on this class.
  VERSION = VERSION_INFO.map(&:to_s).join('.').freeze unless defined?(self::VERSION)

  # Returns the library version as a dotted string (the VERSION constant).
  def self.version
    VERSION
  end

  # Runs the spider starting at the given URL. Also takes a block that is given
  # the SpiderInstance. Use the block to define the rules and handlers for
  # the discovered Web pages. See SpiderInstance for the possible rules and
  # handlers.
  #
  #   Spider.start_at('http://cashcats.biz/') do |s|
  #     s.add_url_check do |a_url|
  #       a_url =~ %r{^http://cashcats.biz.*}
  #     end
  #
  #     s.on 404 do |a_url, resp, prior_url|
  #       puts "URL not found: #{a_url}"
  #     end
  #
  #     s.on :success do |a_url, resp, prior_url|
  #       puts "body: #{resp.body}"
  #     end
  #
  #     s.on :every do |a_url, resp, prior_url|
  #       puts "URL returned anything: #{a_url} with this code #{resp.code}"
  #     end
  #   end
  #
  # a_url:: the URL at which spidering starts.
  # block:: optional; called once with the new SpiderInstance before the
  #         spider is started. When omitted, the spider is started without
  #         any additional configuration.
  #
  # Returns whatever SpiderInstance#start! returns.
  def self.start_at(a_url, &block)
    # The user agent handed to RobotRules embeds the library version.
    rules = RobotRules.new("Ruby Spider #{Spider::VERSION}")
    # Seed the instance with the start URL under a nil key.
    # NOTE(review): the nil key presumably means "no prior URL" -- confirm
    # against SpiderInstance.
    a_spider = SpiderInstance.new({ nil => [a_url] }, [], rules, [])
    # Guard against a missing block instead of failing with NoMethodError
    # on nil.
    block.call(a_spider) if block
    a_spider.start!
  end
end