Skip to content
Browse files

etag filtering

  • Loading branch information...
1 parent 0b74cdf commit b32f192d896271207e9acd2db7cac4f66e81217c @jkraemer committed Mar 24, 2006
Showing with 57 additions and 10 deletions.
  1. +3 −5 lib/rdig/content_extractors.rb
  2. +31 −3 lib/rdig/crawler.rb
  3. +0 −2 lib/rdig/http_client.rb
  4. +23 −0 test/unit/etag_filter_test.rb
View
8 lib/rdig/content_extractors.rb
@@ -78,11 +78,9 @@ def self.process(content)
# extracts the href attributes of all a tags, except
# internal links like <a href="#top">
def self.extract_links(tagsoup)
- links = []
- tagsoup.find_all('a').each { |link|
- links << CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
- }
- links
+ tagsoup.find_all('a').map { |link|
+ CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
+ }.compact
end
def self.title_tag(tagsoup)
View
34 lib/rdig/crawler.rb
@@ -1,12 +1,13 @@
module RDig
-
+
+
class Crawler
def initialize
@documents = Queue.new
+ @etag_filter = ETagFilter.new
end
-
def run
@indexer = Index::Indexer.new(RDig.config.ferret)
filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
@@ -47,6 +48,9 @@ def run
def process_document(doc, filterchain)
doc.fetch
+ # add links from this document to the queue
+ doc.content[:links].each { |url| add_url(url, filterchain, doc) }
+ return unless @etag_filter.apply(doc)
case doc.status
when :success
if doc.content
@@ -102,6 +106,7 @@ class Document
attr_reader :uri
attr_reader :referring_uri
attr_reader :status
+ attr_reader :etag
attr_accessor :redirections
# url: url of this document, may be relative to the referring doc or host.
@@ -131,18 +136,41 @@ def fetch
when Net::HTTPSuccess
@content_type = response['content-type']
@raw_body = response.body
+ @etag = response['etag']
# todo externalize this (another chain ?)
@content = ContentExtractors.process(@raw_body, @content_type)
@status = :success
when Net::HTTPRedirection
@status = :redirect
@content = { :links => [ response['location'] ] }
else
- puts "dunno what to do with response: #{response}"
+ puts "don't know what to do with response: #{response}"
end
end
end
+
+ # Filters documents by their ETag header: the first document carrying a
+ # given ETag passes, later documents with the same ETag are rejected.
+ # This is supposed to help against double-indexing documents which can
+ # be reached via different URLs (think http://host.com/ and
+ # http://host.com/index.html ).
+ # Documents without an ETag are always allowed to pass through.
+ class ETagFilter
+ include MonitorMixin # supplies #synchronize, used in #apply
+
+ def initialize
+ @etags = Set.new # ETag values of the documents that have passed so far
+ super # lets MonitorMixin initialize its own state
+ end
+
+ def apply(document) # returns the document if it passes, nil if its ETag was seen before
+ return document unless document.etag # no ETag to compare: let it through
+ synchronize do
+ @etags.add?(document.etag) ? document : nil # Set#add? yields nil when the value is already present
+ end
+ end
+ end
end
View
2 lib/rdig/http_client.rb
@@ -1,5 +1,3 @@
-require 'net/http'
-
module RDig
module HttpClient
View
23 test/unit/etag_filter_test.rb
@@ -0,0 +1,23 @@
+require 'test_helper'
+class UrlFilterTest < Test::Unit::TestCase # NOTE(review): tests ETagFilter but is named UrlFilterTest; looks like a copy-paste leftover, confirm
+ include TestHelper, RDig
+
+ def setup
+ @filter = ETagFilter.new # each test starts with a filter that has seen no ETags
+ end
+
+ def test_add
+ d0 = OpenStruct.new(:etag => nil) # stand-in for a document without an ETag
+ assert @filter.apply(d0) # documents without an ETag pass
+
+ d1 = OpenStruct.new(:etag => 'abc1234')
+ assert @filter.apply(d1) # first occurrence of an ETag passes
+ assert !@filter.apply(d1) # the same ETag is rejected the second time
+
+ d2 = OpenStruct.new(:etag => 'abc1235') # a different ETag is tracked independently
+ assert @filter.apply(d2)
+ assert !@filter.apply(d2)
+ assert !@filter.apply(d1) # d1's ETag is still remembered after d2 was added
+ end
+
+end

0 comments on commit b32f192

Please sign in to comment.
Something went wrong with that request. Please try again.