Permalink
Browse files

doc enhancements, better uri-normalization, re-add result uri of redi…

…rection into the queue instead of directly indexing the resulting page
  • Loading branch information...
jkraemer committed Apr 26, 2009
1 parent 963a535 commit e1f0b32c0be1d28fee03a63a657e636c0541d377
View
@@ -86,6 +86,13 @@
# Set to 0 to only index the start_urls.
# cfg.crawler.max_depth = nil
+ # default index document to be appended to URIs ending with a trailing '/'
+ # cfg.crawler.normalize_uri.index_document = nil
# strip trailing '/' from URIs to avoid double indexing of pages referred to by both 'path' and 'path/'.
+ # Ignored if index_document is set.
# Not necessary when the server issues proper etags since the default etag filter will kill these duplicates.
+ # cfg.crawler.normalize_uri.remove_trailing_slash = nil
+
# http proxy configuration
# proxy url
# cfg.crawler.http_proxy = nil
View
@@ -61,17 +61,16 @@ module RDig
class << self
- # the filter chains are for limiting the set of indexed documents.
- # there are two chain types - one for http, and one for file system
- # crawling.
- # a document has to survive all filters in the chain to get indexed.
+ # Filter chains are used by the crawler to limit the set of documents being indexed.
+ # There are two chains - one for http, and one for file system crawling.
+ # Each document has to survive all filters in the relevant chain to get indexed.
def filter_chain
@filter_chain ||= {
# filter chain for http crawling
:http => [
:scheme_filter_http,
:fix_relative_uri,
- :normalize_uri,
+ { :normalize_uri => :normalize_uri },
{ RDig::UrlFilters::DepthFilter => :max_depth },
{ :hostname_filter => :include_hosts },
{ RDig::UrlFilters::UrlInclusionFilter => :include_documents },
@@ -121,7 +120,11 @@ def configuration
:wait_before_leave => 10,
:http_proxy => nil,
:http_proxy_user => nil,
- :http_proxy_pass => nil
+ :http_proxy_pass => nil,
+ :normalize_uri => OpenStruct.new(
+ :index_document => nil,
+ :remove_trailing_slash => nil
+ )
),
:content_extraction => OpenStruct.new(
# settings for html content extraction (hpricot)
@@ -23,7 +23,13 @@ def self.extractors; @@extractors ||= [] end
def self.extractor_instances
@@extractor_instances ||= extractors.map { |ex_class|
RDig.logger.info "initializing content extractor: #{ex_class}"
- ex_class.new(RDig.configuration.content_extraction) rescue nil
+ ex = nil
+ begin
+ ex = ex_class.new(RDig.configuration.content_extraction)
+ rescue Exception
+ RDig.logger.error "error: #{$!.message}\n#{$!.backtrace.join("\n")}"
+ end
+ ex
}.compact
end
@@ -13,25 +13,11 @@ def initialize(config)
@wvhtml = 'wvHtml'
@pattern = /^application\/msword/
# html extractor for parsing wvHtml output
- if defined?(HpricotContentExtractor)
- @html_extractor = HpricotContentExtractor.new(OpenStruct.new(
- :hpricot => OpenStruct.new(
- :content_tag_selector => 'body',
- :title_tag_selector => 'title'
- )))
- elsif defined?(RubyfulSoupContentExtractor)
- @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
- :rubyful_soup => OpenStruct.new(
- :content_tag_selector => lambda { |tagsoup|
- tagsoup.html.body
- },
- :title_tag_selector => lambda { |tagsoup|
- tagsoup.html.head.title
- }
- )))
- else
- raise "need at least one html content extractor - please install hpricot or rubyful_soup"
- end
+ @html_extractor = HpricotContentExtractor.new(OpenStruct.new(
+ :hpricot => OpenStruct.new(
+ :content_tag_selector => 'body',
+ :title_tag_selector => 'title'
+ )))
# TODO: better: if $?.exitstatus == 127 (not found)
@available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
end
@@ -7,7 +7,6 @@
require 'htmlentities'
end
-if defined?(Hpricot)
module RDig
module ContentExtractors
@@ -90,6 +89,7 @@ def tag_from_config(doc, config_key)
# Removes all HTML comments ('<!-- ... -->') from +string+, including
# comments that span multiple lines. The pattern is non-greedy, so
# separate comments are stripped individually.
# Returns a new string; the argument is not modified.
def strip_comments(string)
  # /m makes '.' match newlines. The previous
  # Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u') form relied on the
  # third positional kcode/encoding argument, which has been removed in
  # modern Ruby (raises ArgumentError) and is unnecessary anyway.
  string.gsub(/<!--.*?-->/m, '')
end
+
def strip_tags(string)
string.gsub! Regexp.new('<(script|style).*?>.*?<\/(script|style).*?>',
Regexp::MULTILINE, 'u'), ''
@@ -103,4 +103,3 @@ def strip_tags(string)
end
end
-end
View
@@ -52,13 +52,21 @@ def crawl
def process_document(doc, filterchain)
@logger.debug "processing document #{doc}"
doc.fetch
- # add links from this document to the queue
- doc.content[:links].each { |url|
- add_url(url, filterchain, doc)
- } unless doc.content[:links].nil?
-
- return unless @etag_filter.apply(doc)
- add_to_index doc
+ case doc.status
+ when :success
+ if @etag_filter.apply(doc)
+ # add links from this document to the queue
+ doc.content[:links].each { |url|
+ add_url(url, filterchain, doc)
+ } unless doc.content[:links].nil?
+ add_to_index doc
+ end
+ when :redirect
+ @logger.debug "redirect to #{doc.content}"
+ add_url(doc.content, filterchain, doc)
+ else
+ @logger.error "unknown doc status #{doc.status}: #{doc}"
+ end
rescue
@logger.error "error processing document #{doc.uri.to_s}: #{$!}"
@logger.debug "Trace: #{$!.backtrace.join("\n")}"
View
@@ -118,16 +118,20 @@ def initialize(args={})
def fetch
RDig.logger.debug "fetching #{@uri.to_s}"
open(@uri.to_s, RDig::open_uri_http_options) do |doc|
- case doc.status.first.to_i
- when 200
- @etag = doc.meta['etag']
- # puts "etag: #{@etag}"
- @content = ContentExtractors.process(doc.read, doc.content_type)
- @status = :success
- when 404
- RDig.logger.info "got 404 for #{@uri}"
+ if @uri.to_s != doc.base_uri.to_s
+ @status = :redirect
+ @content = doc.base_uri
else
- RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
+ case doc.status.first.to_i
+ when 200
+ @etag = doc.meta['etag']
+ @content = ContentExtractors.process(doc.read, doc.content_type)
+ @status = :success
+ when 404
+ RDig.logger.info "got 404 for #{@uri}"
+ else
+ RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
+ end
end
end
rescue
View
@@ -22,7 +22,7 @@ def initialize(chain_config)
end
# add a filter and its args to the chain
- # when args is a symbol, it is treated as a configuration key
+ # if args is a symbol, it is treated as a configuration key
def add(filter, args=nil)
args = RDig.config.crawler.send(args) if args.is_a? Symbol
case filter
@@ -195,12 +195,17 @@ def UrlFilters.hostname_filter(document, include_hosts)
return document if include_hosts.nil? || include_hosts.empty? || include_hosts.include?(document.uri.host)
end
# Normalizes the document's URI in place:
# - drops the fragment part ('#...'), which never changes server content
# - when the path ends with '/', either appends the configured index
#   document (cfg.index_document), or, if that is unset and
#   cfg.remove_trailing_slash is enabled, strips the trailing slash so
#   'path' and 'path/' are not indexed twice.
#
# cfg is the crawler's normalize_uri configuration
# (RDig.config.crawler.normalize_uri).
# Returns the document with its normalized URI.
def UrlFilters.normalize_uri(document, cfg)
  document.uri.fragment = nil
  # document.uri.query = nil
  # trailing slash handling
  if document.uri.path =~ /\/$/
    if cfg.index_document
      # append index document if configured
      # (was RDig.config.index_document - that setting moved to
      # cfg.index_document, so the old reference appended a stale value)
      document.uri.path << cfg.index_document
    elsif cfg.remove_trailing_slash
      document.uri.path.gsub! /\/$/, ''
    end
  end
  return document
end

0 comments on commit e1f0b32

Please sign in to comment.