Permalink
Browse files

Small bug fixes to allow crawl to be called multiple times

  • Loading branch information...
1 parent b83193b commit 82a7405c731c0af067b52872f0ed68a58e2a4d4c @jkraemer committed May 21, 2008
Showing with 5 additions and 4 deletions.
  1. +1 −0 .gitignore
  2. +3 −3 lib/rdig/crawler.rb
  3. +1 −1 lib/rdig/url_filters.rb
View
@@ -1,3 +1,4 @@
+*.tmproj
*.swp
tmp
pkg
View
@@ -5,7 +5,6 @@ class Crawler
def initialize(config = RDig.config, logger = RDig.logger)
@documents = Queue.new
- @etag_filter = ETagFilter.new
@logger = logger
@config = config
end
@@ -22,7 +21,8 @@ def crawl
# check whether we are indexing on-disk or via http
url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
chain_config = RDig.filter_chain[url_type]
-
+
+ @etag_filter = ETagFilter.new
filterchain = UrlFilters::FilterChain.new(chain_config)
@config.crawler.start_urls.each { |url| add_url(url, filterchain) }
@@ -110,7 +110,7 @@ def initialize
end
def apply(document)
- return document unless (document.respond_to?(:etag) && document.etag)
+ return document unless (document.respond_to?(:etag) && !document.etag.blank?)
synchronize do
@etags.add?(document.etag) ? document : nil
end
View
@@ -65,7 +65,7 @@ def apply(document)
# implemented as a thread safe singleton as it has to be shared
# between all crawler threads
class VisitedUrlFilter
- include MonitorMixin, Singleton
+ include MonitorMixin
def initialize
@visited_urls = Set.new
super

0 comments on commit 82a7405

Please sign in to comment.