Permalink
Browse files

bugfix, rename readme

  • Loading branch information...
1 parent f7011dc commit 600ffe341756e24112137c7bc6856e11291dae15 @jkraemer committed Apr 11, 2012
View
0 README → README.rdoc
File renamed without changes.
View
0 bin/rdig 100644 → 100755
File mode changed.
View
11 lib/rdig.rb
@@ -84,7 +84,7 @@ def filter_chain
{ RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
]
}
-
+
end
def application
@@ -150,7 +150,7 @@ def configuration
end
end
alias config configuration
-
+
def logger
@logger ||= create_logger
end
@@ -200,12 +200,12 @@ class Application
def options
@options ||= OpenStruct.new
end
-
+
# Display the program usage line.
def usage
puts "rdig -c configfile {options}"
end
-
+
# Display the rdig command line help.
def help
usage
@@ -266,8 +266,7 @@ def run
rescue
puts $!.backtrace
fail "No Configfile found!\n#{$!}"
-
- end
+ end
puts "using Ferret #{Ferret::VERSION}"
View
8 lib/rdig/content_extractors.rb
@@ -1,5 +1,5 @@
module RDig
-
+
# Contains classes which are used for extracting content and meta data from
# various content types.
module ContentExtractors
@@ -13,7 +13,7 @@ def self.process(content, content_type)
# Extractors inheriting from this class will be auto-discovered and used
# when can_do returns true
class ContentExtractor
-
+
def self.inherited(extractor)
super(extractor)
self.extractors << extractor
@@ -32,7 +32,7 @@ def self.extractor_instances
ex
}.compact
end
-
+
def self.process(content, content_type)
self.extractor_instances.each { |extractor|
return extractor.process(content) if extractor.can_do(content_type)
@@ -61,7 +61,7 @@ def process(content)
end
result
end
-
+
def as_file(content)
file = Tempfile.new('rdig')
file << content
View
6 lib/rdig/content_extractors/pdf.rb
@@ -8,7 +8,7 @@ module ContentExtractors
#
class PdfContentExtractor < ContentExtractor
include ExternalAppHelper
-
+
def initialize(config)
super(config)
@pattern = /^application\/pdf/
@@ -22,7 +22,7 @@ def initialize(config)
end
}
end
-
+
def process(content)
result = {}
as_file(content) do |file|
@@ -35,7 +35,7 @@ def process(content)
def get_content(path_to_tempfile)
%x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
end
-
+
# extracts the title from pdf meta data
# needs pdfinfo
# returns the title or nil if no title was found
View
31 lib/rdig/crawler.rb
@@ -1,12 +1,11 @@
module RDig
-
-
class Crawler
-
+
def initialize(config = RDig.config, logger = RDig.logger)
@documents = Queue.new
@logger = logger
@config = config
+ @indexed_documents = 0
end
def run
@@ -22,6 +21,7 @@ def crawl
url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
chain_config = RDig.filter_chain[url_type]
+ # the etag filter operates on the fetched document, that's why we cannot put it into the filter chain right now.
@etag_filter = ETagFilter.new
filterchain = UrlFilters::FilterChain.new(chain_config)
@config.crawler.start_urls.each { |url| add_url(url, filterchain) }
@@ -31,9 +31,11 @@ def crawl
num_threads.times { |i|
group.join_nowait Thread.new("fetcher #{i}") {
filterchain = UrlFilters::FilterChain.new(chain_config)
+ @logger.info "thread #{i} running..."
while (doc = @documents.pop) != :exit
process_document doc, filterchain
end
+ @logger.info "thread #{i} is done."
}
}
@@ -47,20 +49,21 @@ def crawl
@logger.info "waiting for threads to finish..."
group.all_waits
+ @logger.info "indexed #{@indexer.indexed_documents} documents"
end
def process_document(doc, filterchain)
- @logger.debug "processing document #{doc}"
+ @logger.info "processing document #{doc.uri}"
doc.fetch
case doc.status
when :success
- if @etag_filter.apply(doc)
+ if @etag_filter.apply(doc)
# add links from this document to the queue
doc.content[:links].each { |url|
add_url(url, filterchain, doc)
} unless doc.content[:links].nil?
add_to_index doc
- end
+ end
when :redirect
@logger.debug "redirect to #{doc.content}"
add_url(doc.content, filterchain, doc)
@@ -69,14 +72,16 @@ def process_document(doc, filterchain)
end
rescue
@logger.error "error processing document #{doc.uri.to_s}: #{$!}"
- @logger.debug "Trace: #{$!.backtrace.join("\n")}"
+ @logger.info "Trace: #{$!.backtrace.join("\n")}"
end
def add_to_index(doc)
- @indexer << doc if doc.needs_indexing?
+ if doc.needs_indexing?
+ @indexer << doc
+ end
end
-
+
# pipes a new document pointing to url through the filter chain,
# if it survives that, it gets added to the documents queue for further
# processing
@@ -90,19 +95,17 @@ def add_url(url, filterchain, referring_document = nil)
Document.create(url)
end
- doc = filterchain.apply(doc)
-
- if doc
+ if doc = filterchain.apply(doc)
@documents << doc
@logger.debug "url #{url} survived filterchain"
end
rescue
nil
end
-
+
end
-
+
# checks fetched documents' E-Tag headers against the list of E-Tags
# of the documents already indexed.
# This is supposed to help against double-indexing documents which can
View
16 lib/rdig/documents.rb
@@ -1,14 +1,14 @@
module RDig
-
+
#
# Document base class
#
class Document
-
+
attr_reader :uri
attr_reader :content
attr_reader :content_type
-
+
def self.create(url)
return case url
when /^https?:\/\//i
@@ -32,7 +32,7 @@ def initialize(args)
def title; @content[:title] end
def body; @content[:content] end
def links; @content[:links] end
-
+
def needs_indexing?
has_content? && (title || body)
end
@@ -47,7 +47,7 @@ def to_s
end
-
+
#
# Document in a File system
#
@@ -90,8 +90,8 @@ def fetch
end
end
-
-
+
+
#
# Remote Document to be retrieved by HTTP
#
@@ -106,7 +106,7 @@ class HttpDocument < Document
def create_child(uri)
HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => self.depth+1) unless uri =~ /^file:\/\//i
end
-
+
# url: url of this document, may be relative to the referring doc or host.
# referrer: uri of the document we retrieved this link from
def initialize(args={})
View
14 lib/rdig/index.rb
@@ -1,19 +1,22 @@
module RDig
module Index
-
+
# used by the crawler to build the ferret index
class Indexer
include MonitorMixin
-
+
+ attr_reader :indexed_documents
+
def initialize(settings)
+ @indexed_documents = 0
@config = settings
@index_writer = Ferret::Index::IndexWriter.new(
:path => settings.path,
:create => settings.create,
:analyzer => settings.analyzer)
super() # scary, MonitorMixin won't initialize if we don't call super() here (parens matter)
end
-
+
def add_to_index(document)
RDig.logger.debug "add to index: #{document.uri.to_s}"
@config.rewrite_uri.call(document.uri) if @config.rewrite_uri
@@ -25,16 +28,17 @@ def add_to_index(document)
}
synchronize do
@index_writer << doc
+ @indexed_documents += 1
end
end
alias :<< :add_to_index
-
+
def close
@index_writer.optimize
@index_writer.close
@index_writer = nil
end
end
-
+
end
end
View
33 lib/rdig/url_filters.rb
@@ -1,9 +1,10 @@
module RDig
-
+
module UrlFilters
class FilterChain
def initialize(chain_config)
+ @logger = RDig.logger
@filters = []
chain_config.each { |filter|
case filter
@@ -29,11 +30,23 @@ def add(filter, args=nil)
when Symbol
if args.nil?
@filters << lambda { |document|
- UrlFilters.send(filter, document)
+ begin
+ UrlFilters.send(filter, document)
+ rescue Exception
+ @logger.error "error in URL filter #{filter}: #{$!}"
+ @logger.error $!.backtrace.join("\n")
+ nil
+ end
}
else
@filters << lambda { |document|
- UrlFilters.send(filter, document, args)
+ begin
+ UrlFilters.send(filter, document, args)
+ rescue Exception
+ @logger.error "error in URL filter #{filter}: #{$!}"
+ @logger.error $!.backtrace.join("\n")
+ nil
+ end
}
end
when Class
@@ -54,7 +67,13 @@ def add(filter, args=nil)
def apply(document)
@filters.each { |filter|
- return nil unless filter.call(document)
+ @logger.debug "running filter #{filter.inspect} on doc #{document.uri}"
+ unless filter.call(document)
+ @logger.debug "fail"
+ return nil
+ else
+ @logger.debug 'OK'
+ end
}
return document
end
@@ -75,7 +94,7 @@ def initialize
# nil otherwise
def apply(document)
synchronize do
- @visited_urls.add?(document.uri.to_s) ? document : nil
+ @visited_urls.add?(document.uri.to_s) ? document : nil
end
end
end
@@ -174,7 +193,7 @@ def UrlFilters.fix_relative_uri(document)
uri.host = ref.host unless uri.host
uri.port = ref.port unless uri.port || ref.port==ref.default_port
uri.path = ref.path unless uri.path
-
+
old_uri_path = uri.path
if uri.path !~ /^\// || uri.path =~ /^\.\./
ref_path = ref.path || '/'
@@ -202,7 +221,7 @@ def UrlFilters.normalize_uri(document, cfg)
if document.uri.path =~ /\/$/
# append index document if configured
if cfg.index_document
- document.uri.path << RDig.config.index_document
+ document.uri.path << cfg.index_document
elsif cfg.remove_trailing_slash
document.uri.path.gsub! /\/$/, ''
end
View
5 rdig.gemspec
@@ -2,7 +2,7 @@
Gem::Specification.new do |s|
s.name = %q{rdig}
- s.version = "0.3.8"
+ s.version = "0.3.9"
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
s.authors = ["Jens Kraemer"]
@@ -60,6 +60,9 @@ Gem::Specification.new do |s|
test/unit/url_filters_test.rb
test/unit/word_content_extractor_test.rb
)
+ s.executables = %w(
+ rdig
+ )
s.add_dependency(%q<ferret>, [">= 0.11.6"])
s.add_dependency(%q<hpricot>, [">= 0.6"])

0 comments on commit 600ffe3

Please sign in to comment.