Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

initial import

  • Loading branch information...
commit f086307f883bd57869c11c3ae9dec997b937d78a 0 parents
Jens Krämer authored
2  .svnignore
@@ -0,0 +1,2 @@
+tmp
+pkg
2  CHANGES
@@ -0,0 +1,2 @@
+0.1.0
+initial release
20 LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2006 Jens Kraemer
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
3  README
@@ -0,0 +1,3 @@
+gem install ferret
+gem install rubyful_soup
+
0  TODO
No changes.
32 bin/rdig
@@ -0,0 +1,32 @@
+#!/usr/bin/env ruby
+
+# run from RAILS_ROOT with
+# ruby -Ilib vendor/plugins/sitesearch/create_index.rb config
+# where config is the name of your config file
+
+begin
+ require 'rdig'
+rescue LoadError
+ require 'rubygems'
+ require 'rdig'
+end
+RDig.application.run
+
+
+#$LOAD_PATH << File.expand_path(File.dirname(__FILE__) + "/lib")
+#$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
+#require 'init'
+
+#if ARGV[0]
+# require ARGV[0]
+#else
+# require 'config'
+#end
+
+#include SiteSearch
+
+
+#puts "creating new index in #{SiteSearch.settings[:index_dir]}"
+
+#crawler = Crawler.new
+#crawler.run
45 doc/examples/config.rb
@@ -0,0 +1,45 @@
+#
+# sample RDig configuration file, edit to taste
+#
+
+RDig.configuration do |cfg|
+
+ ##################################################################
+ # options you should really set
+
+ # provide one or more URLs for the crawler to start from
+ cfg.crawler.start_urls = [ 'http://www.example.com/' ]
+
+ # limit the crawl to these hosts. The crawler will never
+ # follow any links pointing to hosts other than those given here.
+ cfg.crawler.include_hosts = [ 'www.example.com' ]
+
+ # this is the path where the index will be stored
+ # caution, existing contents of this directory will be deleted!
+ cfg.ferret.path = '/path/to/index'
+
+ ##################################################################
+ # options you might want to set, the given values are the defaults
+
+ # nil (index all documents) or a list of Regexps
+ # matching URLs you want to index.
+ # cfg.crawler.include_documents = nil
+
+ # nil (no documents excluded) or a list of Regexps
+ # matching URLs not to index.
+ # this filter is used after the one above, so you only need
+ # to exclude documents here that aren't wanted but would be
+ # included by the inclusion patterns.
+ # cfg.crawler.exclude_documents = nil
+
+ # number of http fetching threads to use
+ # cfg.crawler.num_threads = 2
+
+ # maximum number of http redirections to follow
+ # cfg.crawler.max_redirects = 5
+
+ # number of seconds to wait with an empty url queue before
+ # finishing the crawl. Set to a higher number for slow sites
+ # cfg.crawler.wait_before_leave = 10
+
+end
88 install.rb
@@ -0,0 +1,88 @@
+require 'rbconfig'
+require 'find'
+require 'ftools'
+
+include Config
+
+$ruby = CONFIG['ruby_install_name']
+
+##
+# Install a binary file. We patch in on the way through to
+# insert a #! line. If this is a Unix install, we name
+# the command (for example) 'rdig' and let the shebang line
+# handle running it. Under windows, we add a '.rb' extension
+# and let file associations do their stuff
+#
+
+def installBIN(from, opfile)
+
+ tmp_dir = nil
+ for t in [".", "/tmp", "c:/temp", $bindir]
+ stat = File.stat(t) rescue next
+ if stat.directory? and stat.writable?
+ tmp_dir = t
+ break
+ end
+ end
+
+ fail "Cannot find a temporary directory" unless tmp_dir
+ tmp_file = File.join(tmp_dir, "_tmp")
+
+ File.open(from) do |ip|
+ File.open(tmp_file, "w") do |op|
+ ruby = File.join($realbindir, $ruby)
+ op.puts "#!#{ruby} -w"
+ op.write ip.read
+ end
+ end
+
+ opfile += ".rb" if CONFIG["target_os"] =~ /mswin/i
+ File::install(tmp_file, File.join($bindir, opfile), 0755, true)
+ File::unlink(tmp_file)
+end
+
+$sitedir = CONFIG["sitelibdir"]
+unless $sitedir
+ version = CONFIG["MAJOR"]+"."+CONFIG["MINOR"]
+ $libdir = File.join(CONFIG["libdir"], "ruby", version)
+ $sitedir = $:.find {|x| x =~ /site_ruby/}
+ if !$sitedir
+ $sitedir = File.join($libdir, "site_ruby")
+ elsif $sitedir !~ Regexp.quote(version)
+ $sitedir = File.join($sitedir, version)
+ end
+end
+
+$bindir = CONFIG["bindir"]
+
+$realbindir = $bindir
+
+bindir = CONFIG["bindir"]
+if (destdir = ENV['DESTDIR'])
+ $bindir = destdir + $bindir
+ $sitedir = destdir + $sitedir
+
+ File::makedirs($bindir)
+ File::makedirs($sitedir)
+end
+
+rdig_dest = File.join($sitedir, "rdig")
+File::makedirs(rdig_dest, true)
+File::chmod(0755, rdig_dest)
+
+# The library files
+
+files = Dir.chdir('lib') { Dir['**/*.rb'] }
+
+for fn in files
+ fn_dir = File.dirname(fn)
+ target_dir = File.join($sitedir, fn_dir)
+ if ! File.exist?(target_dir)
+ File.makedirs(target_dir)
+ end
+ File::install(File.join('lib', fn), File.join($sitedir, fn), 0644, true)
+end
+
+# and the executable
+
+installBIN("bin/rdig", "rdig")
20 lib/htmlentities/.config
@@ -0,0 +1,20 @@
+prefix=/usr
+bindir=$prefix/bin
+libdir=$prefix/lib
+datadir=$prefix/share
+mandir=$prefix/share/man
+sysconfdir=/etc
+localstatedir=/var
+libruby=/usr/lib/ruby
+librubyver=/usr/lib/ruby/1.8
+librubyverarch=/usr/lib/ruby/1.8/i486-linux
+siteruby=/usr/local/lib/site_ruby
+siterubyver=/usr/local/lib/site_ruby/1.8
+siterubyverarch=/usr/local/lib/site_ruby/1.8/i486-linux
+rbdir=$siterubyver
+sodir=$siterubyverarch
+rubypath=/usr/bin/ruby1.8
+rubyprog=/usr/bin/ruby1.8
+makeprog=make
+shebang=ruby
+without-ext=no
21 lib/htmlentities/CHANGES
@@ -0,0 +1,21 @@
+== 2.2 (2005-11-07)
+* Important bug fixes -- thanks to Moonwolf
+* Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
+* Decimal decoding edge cases addressed.
+* Test cases added.
+
+== 2.1 (2005-10-31)
+* Removed some unnecessary code in basic entity encoding.
+* Improved handling of encoding: commands are now automatically sorted, so the
+ user doesn't have to worry about their order.
+* Now using setup.rb.
+* Tests moved to separate file.
+
+== 2.0 (2005-08-23)
+* Added encoding to entities.
+* Decoding interface unchanged.
+* Fixed a bug with handling high codepoints.
+
+== 1.0 (2005-08-03)
+* Initial release.
+* Decoding only.
7 lib/htmlentities/COPYING
@@ -0,0 +1,7 @@
+Copyright (c) 2005 Paul Battley
+
+Usage of the works is permitted provided that this instrument is retained
+with the works, so that any entity that uses the works is notified of this
+instrument.
+
+DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
15 lib/htmlentities/README
@@ -0,0 +1,15 @@
+HTML entity encoding and decoding for Ruby
+
+This library extends the String class to allow encoding and decoding of
+HTML/XML entities from/to their corresponding UTF-8 codepoints.
+
+To install (requires root/admin privileges):
+
+# ruby setup.rb
+
+To test:
+
+$ ruby setup.rb test
+
+Comments are welcome. Send an email to pbattley @ gmail.com.
+
281 lib/htmlentities/htmlentities.rb
@@ -0,0 +1,281 @@
+#
+# HTML entity encoding and decoding for Ruby
+#
+# Author:: Paul BATTLEY (pbattley @ gmail.com)
+# Version:: 2.2
+# Date:: 2005-11-07
+#
+# == About
+#
+# This library extends the String class to allow encoding and decoding of
+# HTML/XML entities from/to their corresponding UTF-8 codepoints.
+#
+# == Licence
+#
+# Copyright (c) 2005 Paul Battley
+#
+# Usage of the works is permitted provided that this instrument is retained
+# with the works, so that any entity that uses the works is notified of this
+# instrument.
+#
+# DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
+#
+
+module HTMLEntities
+
+ VERSION = '2.2'
+
+ #
+ # MAP is a hash of all the HTML entities I could discover, as taken
+ # from the w3schools page on the subject:
+ # http://www.w3schools.com/html/html_entitiesref.asp
+ # The format is 'entity name' => codepoint where entity name is given
+ # without the surrounding ampersand and semicolon.
+ #
+ MAP = {
+ 'quot' => 34,
+ 'apos' => 39,
+ 'amp' => 38,
+ 'lt' => 60,
+ 'gt' => 62,
+ 'nbsp' => 160,
+ 'iexcl' => 161,
+ 'curren' => 164,
+ 'cent' => 162,
+ 'pound' => 163,
+ 'yen' => 165,
+ 'brvbar' => 166,
+ 'sect' => 167,
+ 'uml' => 168,
+ 'copy' => 169,
+ 'ordf' => 170,
+ 'laquo' => 171,
+ 'not' => 172,
+ 'shy' => 173,
+ 'reg' => 174,
+ 'trade' => 8482,
+ 'macr' => 175,
+ 'deg' => 176,
+ 'plusmn' => 177,
+ 'sup2' => 178,
+ 'sup3' => 179,
+ 'acute' => 180,
+ 'micro' => 181,
+ 'para' => 182,
+ 'middot' => 183,
+ 'cedil' => 184,
+ 'sup1' => 185,
+ 'ordm' => 186,
+ 'raquo' => 187,
+ 'frac14' => 188,
+ 'frac12' => 189,
+ 'frac34' => 190,
+ 'iquest' => 191,
+ 'times' => 215,
+ 'divide' => 247,
+ 'Agrave' => 192,
+ 'Aacute' => 193,
+ 'Acirc' => 194,
+ 'Atilde' => 195,
+ 'Auml' => 196,
+ 'Aring' => 197,
+ 'AElig' => 198,
+ 'Ccedil' => 199,
+ 'Egrave' => 200,
+ 'Eacute' => 201,
+ 'Ecirc' => 202,
+ 'Euml' => 203,
+ 'Igrave' => 204,
+ 'Iacute' => 205,
+ 'Icirc' => 206,
+ 'Iuml' => 207,
+ 'ETH' => 208,
+ 'Ntilde' => 209,
+ 'Ograve' => 210,
+ 'Oacute' => 211,
+ 'Ocirc' => 212,
+ 'Otilde' => 213,
+ 'Ouml' => 214,
+ 'Oslash' => 216,
+ 'Ugrave' => 217,
+ 'Uacute' => 218,
+ 'Ucirc' => 219,
+ 'Uuml' => 220,
+ 'Yacute' => 221,
+ 'THORN' => 222,
+ 'szlig' => 223,
+ 'agrave' => 224,
+ 'aacute' => 225,
+ 'acirc' => 226,
+ 'atilde' => 227,
+ 'auml' => 228,
+ 'aring' => 229,
+ 'aelig' => 230,
+ 'ccedil' => 231,
+ 'egrave' => 232,
+ 'eacute' => 233,
+ 'ecirc' => 234,
+ 'euml' => 235,
+ 'igrave' => 236,
+ 'iacute' => 237,
+ 'icirc' => 238,
+ 'iuml' => 239,
+ 'eth' => 240,
+ 'ntilde' => 241,
+ 'ograve' => 242,
+ 'oacute' => 243,
+ 'ocirc' => 244,
+ 'otilde' => 245,
+ 'ouml' => 246,
+ 'oslash' => 248,
+ 'ugrave' => 249,
+ 'uacute' => 250,
+ 'ucirc' => 251,
+ 'uuml' => 252,
+ 'yacute' => 253,
+ 'thorn' => 254,
+ 'yuml' => 255,
+ 'OElig' => 338,
+ 'oelig' => 339,
+ 'Scaron' => 352,
+ 'scaron' => 353,
+ 'Yuml' => 376,
+ 'circ' => 710,
+ 'tilde' => 732,
+ 'ensp' => 8194,
+ 'emsp' => 8195,
+ 'thinsp' => 8201,
+ 'zwnj' => 8204,
+ 'zwj' => 8205,
+ 'lrm' => 8206,
+ 'rlm' => 8207,
+ 'ndash' => 8211,
+ 'mdash' => 8212,
+ 'lsquo' => 8216,
+ 'rsquo' => 8217,
+ 'sbquo' => 8218,
+ 'ldquo' => 8220,
+ 'rdquo' => 8221,
+ 'bdquo' => 8222,
+ 'dagger' => 8224,
+ 'Dagger' => 8225,
+ 'hellip' => 8230,
+ 'permil' => 8240,
+ 'lsaquo' => 8249,
+ 'rsaquo' => 8250,
+ 'euro' => 8364
+ }
+
+ MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
+ MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
+
+ # Precompile the regexp
+ NAMED_ENTITY_REGEXP =
+ /&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i
+
+ # Reverse map for converting characters to named entities
+ REVERSE_MAP = MAP.invert
+
+ BASIC_ENTITY_REGEXP = /[<>'"&]/
+
+ UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
+
+end
+
+class String
+
+ # Because there's no need to make the user worry about the order here,
+ # let's handle it.
+ ENCODE_ENTITIES_COMMAND_ORDER = {
+ :basic => 0,
+ :named => 1,
+ :decimal => 2,
+ :hexadecimal => 3
+ }
+
+ #
+ # Decode XML and HTML 4.01 entities in a string into their UTF-8
+ # equivalents. Obviously, if your string is not already in UTF-8, you'd
+ # better convert it before using this method, or the output will be mixed
+ # up.
+ # Unknown named entities are not converted
+ #
+ def decode_entities
+ return gsub(HTMLEntities::NAMED_ENTITY_REGEXP) {
+ HTMLEntities::MAP.has_key?($1) ? [HTMLEntities::MAP[$1]].pack('U') : $&
+ }.gsub(/&#([0-9]{1,7});/) {
+ [$1.to_i].pack('U')
+ }.gsub(/&#x([0-9a-f]{1,6});/i) {
+ [$1.to_i(16)].pack('U')
+ }
+ end
+
+ #
+ # Encode codepoints into their corresponding entities. Various operations
+ # are possible, and may be specified in order:
+ #
+ # :basic :: Convert the five XML entities ('"<>&)
+ # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
+ # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
+ # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. # &#x12ab;)
+ #
+ # You can specify the commands in any order, but they will be executed in
+ # the order listed above to ensure that entity ampersands are not
+ # clobbered and that named entities are replaced before numeric ones.
+ #
+ # If no instructions are specified, :basic will be used.
+ #
+ # Examples:
+ # str.encode_entities - XML-safe
+ # str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
+ # str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all
+ # non-ASCII characters replaced with their named entity where possible, and
+ # decimal equivalents otherwise.
+ #
+ # Note: It is the program's responsibility to ensure that the string
+ # contains valid UTF-8 before calling this method.
+ #
+ def encode_entities(*instructions)
+ str = nil
+ if (instructions.empty?)
+ instructions = [:basic]
+ else
+ instructions.each do |instr|
+ unless ENCODE_ENTITIES_COMMAND_ORDER[instr]
+ raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
+ end
+ end
+ instructions.sort! { |a,b|
+ ENCODE_ENTITIES_COMMAND_ORDER[a] <=>
+ ENCODE_ENTITIES_COMMAND_ORDER[b]
+ }
+ end
+ instructions.each do |instruction|
+ case instruction
+ when :basic
+ # Handled as basic ASCII
+ str = (str || self).gsub(HTMLEntities::BASIC_ENTITY_REGEXP) {
+ # It's safe to use the simpler [0] here because we know
+ # that the basic entities are ASCII.
+ '&' << HTMLEntities::REVERSE_MAP[$&[0]] << ';'
+ }
+ when :named
+ # Test everything except printable ASCII
+ str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
+ cp = $&.unpack('U')[0]
+ (e = HTMLEntities::REVERSE_MAP[cp]) ? "&#{e};" : $&
+ }
+ when :decimal
+ str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
+ "&##{$&.unpack('U')[0]};"
+ }
+ when :hexadecimal
+ str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
+ "&#x#{$&.unpack('U')[0].to_s(16)};"
+ }
+ end
+ end
+ return str
+ end
+
+end
191 lib/rdig.rb
@@ -0,0 +1,191 @@
+#!/usr/bin/env ruby
+
+#--
+# Copyright (c) 2006 Jens Kraemer
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#++
+#
+
+RDIGVERSION = '0.1.0'
+
+
+require 'thread'
+require 'thwait'
+require 'singleton'
+require 'monitor'
+require 'ostruct'
+require 'uri'
+require 'cgi'
+require 'net/http'
+require 'getoptlong'
+
+begin
+ require 'rubyful_soup'
+ require 'ferret'
+rescue LoadError
+ require 'rubygems'
+ require 'rubyful_soup'
+ require 'ferret'
+end
+
+require 'htmlentities/htmlentities'
+
+require 'rdig/http_client'
+require 'rdig/content_extractors'
+require 'rdig/url_filters'
+require 'rdig/ferret'
+require 'rdig/crawler'
+
+$KCODE = 'u'
+require 'jcode'
+
+module RDig
+
+ class << self
+
+ # the filter chain each URL has to run through before being crawled.
+ def filter_chain
+ @filter_chain ||= [
+ { :maximum_redirect_filter => :max_redirects },
+ :fix_relative_uri,
+ :normalize_uri,
+ { :hostname_filter => :include_hosts },
+ { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
+ { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
+ RDig::UrlFilters::VisitedUrlFilter
+ ]
+ end
+
+ def application
+ @application ||= Application.new
+ end
+
+ def config
+ @config ||= OpenStruct.new(
+ :crawler => OpenStruct.new(
+ :start_urls => [ "http://localhost:3000/" ],
+ :include_hosts => [ "localhost" ],
+ :include_documents => nil,
+ :exclude_documents => nil,
+ :index_document => nil,
+ :num_threads => 2,
+ :max_redirects => 5,
+ :wait_before_leave => 10
+ ),
+ :ferret => OpenStruct.new(
+ :path => "index/",
+ :create => true
+ )
+ )
+ end
+
+ # RDig.configuration do |config| ...
+ def configuration
+ yield config
+ end
+
+ end
+
+ class Application
+
+ OPTIONS = [
+ ['--config', '-c', GetoptLong::REQUIRED_ARGUMENT,
+ "Read application configuration from CONFIG."],
+ ['--help', '-h', GetoptLong::NO_ARGUMENT,
+ "Display this help message."],
+ ['--version', '-v', GetoptLong::NO_ARGUMENT,
+ "Display the program version."],
+ ]
+
+ # Application options from the command line
+ def options
+ @options ||= OpenStruct.new
+ end
+
+ # Display the program usage line.
+ def usage
+ puts "rdig -c configfile {options}"
+ end
+
+ # Display the rake command line help.
+ def help
+ usage
+ puts
+ puts "Options are ..."
+ puts
+ OPTIONS.sort.each do |long, short, mode, desc|
+ if mode == GetoptLong::REQUIRED_ARGUMENT
+ if desc =~ /\b([A-Z]{2,})\b/
+ long = long + "=#{$1}"
+ end
+ end
+ printf " %-20s (%s)\n", long, short
+ printf " %s\n", desc
+ end
+ end
+
+ # Return a list of the command line options supported by the
+ # program.
+ def command_line_options
+ OPTIONS.collect { |lst| lst[0..-2] }
+ end
+
+ # Do the option defined by +opt+ and +value+.
+ def do_option(opt, value)
+ case opt
+ when '--help'
+ help
+ exit
+ when '--config'
+ options.config_file = value
+ when '--version'
+ puts "rdig, version #{RDIGVERSION}"
+ exit
+ else
+ fail "Unknown option: #{opt}"
+ end
+ end
+
+ # Read and handle the command line options.
+ def handle_options
+ opts = GetoptLong.new(*command_line_options)
+ opts.each { |opt, value| do_option(opt, value) }
+ end
+
+ # Load the configuration
+ def load_configfile
+ load File.expand_path(options.config_file)
+ end
+
+ # Run the +rdig+ application.
+ def run
+ handle_options
+ begin
+ load_configfile
+ rescue
+ fail "No Configfile found!"
+ end
+
+ @crawler = Crawler.new
+ @crawler.run
+ end
+ end
+end
86 lib/rdig/content_extractors.rb
@@ -0,0 +1,86 @@
+# override some methods concerned with entity resolving
+# to convert them to strings
+class BeautifulStoneSoup
+ # resolve unknown html entities using the htmlentities lib
+ alias :orig_unknown_entityref :unknown_entityref
+ def unknown_entityref(ref)
+ if HTMLEntities::MAP.has_key?(ref)
+ handle_data [HTMLEntities::MAP[ref]].pack('U')
+ else
+ orig_unknown_entityref ref
+ end
+ end
+
+ # resolve numeric entities to utf8
+ def handle_charref(ref)
+ handle_data( ref.gsub(/([0-9]{1,7})/) {
+ [$1.to_i].pack('U')
+ }.gsub(/x([0-9a-f]{1,6})/i) {
+ [$1.to_i(16)].pack('U')
+ } )
+ end
+end
+
+module RDig
+
+ # todo support at least pdf, too
+ module ContentExtractors
+
+ def ContentExtractors.process(content, content_type)
+ case content_type
+ when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
+ return HtmlContentExtractor.process(content)
+ else
+ puts "unable to handle content type #{content_type}"
+ end
+ return nil
+ end
+
+ class HtmlContentExtractor
+
+ # returns:
+ # { :content => 'extracted clear text',
+ # :meta => { :title => 'Title' },
+ # :links => [array of urls] }
+ def self.process(content)
+ result = { :title => '' }
+ tag_soup = BeautifulSoup.new(content)
+ titleTag = tag_soup.html.head.title
+ result[:title] = titleTag.string.strip if titleTag
+ content = ''
+ result[:links] = links = []
+
+ process_child = lambda { |child|
+ if child.is_a? Tag and child.name == 'a'
+ links << CGI.unescapeHTML(child['href']) if child['href']
+ end
+ if child.is_a? NavigableString
+ value = self.strip_comments(child)
+ value.strip!
+ unless value.empty?
+ content << value
+ content << ' '
+ end
+ elsif child.string # it's a Tag, and it has some content string
+ value = child.string.strip
+ unless value.empty?
+ content << value
+ content << ' '
+ end
+ else
+ child.children(&process_child)
+ end
+ true
+ }
+ tag_soup.html.body.children(&process_child)
+ result[:content] = content.strip #CGI.unescapeHTML(content.strip)
+ return result
+ end
+
+ def self.strip_comments(string)
+ string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
+ end
+ end
+
+ end
+end
147 lib/rdig/crawler.rb
@@ -0,0 +1,147 @@
+module RDig
+
+ class Crawler
+
+ def initialize
+ @documents = Queue.new
+ end
+
+
+ def run
+ @indexer = Indexer.new(RDig.config.ferret)
+ RDig.config.crawler.start_urls.each { |url| add_url(url) }
+
+ num_threads = RDig.config.crawler.num_threads
+ group = ThreadsWait.new
+ num_threads.times { |i|
+ group.join_nowait Thread.new("fetcher #{i}") {
+ filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
+ while (doc = @documents.pop) != :exit
+ process_document doc, filterchain
+ end
+ }
+ }
+
+ # dilemma: suppose we have 1 start url and two threads t1 and t2:
+ # t1 pops the start url from the queue which now is empty
+ # as the queue is empty now, t2 blocks until t1 adds the links
+ # retrieved from his document.
+ #
+ # But we need the 'queue empty' condition as a sign for us to stop
+ # waiting for new entries, too.
+
+ # check every now and then for an empty queue
+ sleep_interval = RDig.config.crawler.wait_before_leave
+ begin
+ sleep sleep_interval
+ end until @documents.empty?
+ # nothing to do any more, tell the threads to exit
+ num_threads.times { @documents << :exit }
+
+ puts "waiting for threads to finish..."
+ group.all_waits
+ ensure
+ @indexer.close if @indexer
+ end
+
+ def process_document(doc, filterchain)
+ doc.fetch
+ case doc.status
+ when :success
+ if doc.content
+ if doc.content[:links]
+ doc.content[:links].each { |url| add_url(url, filterchain, doc) }
+ end
+ @indexer << doc
+ #else
+ #puts "success but no content: #{doc.uri.to_s}"
+ end
+ when :redirect
+ # links contains the url we were redirected to
+ doc.content[:links].each { |url| add_url(url, filterchain, doc) }
+ end
+ rescue
+ puts "error processing document #{doc.uri.to_s}: #{$!}"
+ end
+
+
+ # pipes a new document pointing to url through the filter chain,
+ # if it survives that, it gets added to the documents queue for further
+ # processing
+ def add_url(url, filterchain = nil, referring_document = nil)
+ return if url.nil? || url.empty?
+ if referring_document
+ doc = Document.new(url, referring_document.uri)
+ # keep redirect count
+ if referring_document.status == :redirect
+ doc.redirections = referring_document.redirections + 1
+ end
+ else
+ doc = Document.new(url)
+ end
+
+ doc = filterchain.apply(doc) if filterchain
+
+ if doc
+ puts "added url #{url}"
+ #else
+ #puts "skipping url #{url}"
+ end
+ @documents << doc if doc
+ end
+
+ end
+
+
+ class Document
+ include HttpClient
+
+ attr_reader :content
+ attr_reader :content_type
+ attr_reader :uri
+ attr_reader :referring_uri
+ attr_reader :status
+ attr_accessor :redirections
+
+ # url: url of this document, may be relative to the referring doc or host.
+ # referrer: uri of the document we retrieved this link from
+ def initialize(url, referrer = nil)
+ @redirections = 0
+ begin
+ @uri = URI.parse(url)
+ rescue URI::InvalidURIError
+ raise "Cannot create document using invalid URL: #{url}"
+ end
+ @referring_uri = referrer
+ end
+
+ def has_content?
+ !self.content.nil?
+ end
+
+ def title; @content[:title] end
+ def body; @content[:content] end
+ def url; @uri.to_s end
+
+ def fetch
+ puts "fetching #{@uri.to_s}"
+ response = do_get(@uri)
+ case response
+ when Net::HTTPSuccess
+ @content_type = response['content-type']
+ @raw_body = response.body
+ # todo externalize this (another chain ?)
+ @content = ContentExtractors.process(@raw_body, @content_type)
+ @status = :success
+ when Net::HTTPRedirection
+ @status = :redirect
+ @content = { :links => [ response['location'] ] }
+ else
+ puts "dunno what to do with response: #{response}"
+ end
+
+ end
+
+ end
+
+end
36 lib/rdig/ferret.rb
@@ -0,0 +1,36 @@
+module RDig
+
+ class Indexer
+ include MonitorMixin, Ferret::Index, Ferret::Document
+
+ def initialize(settings)
+ @ferret_config = settings
+ @index_writer = IndexWriter.new(@ferret_config.path,
+ :create => @ferret_config.create)
+ super() # scary, MonitorMixin won't initialize if we don't call super() here (parens matter)
+ end
+
+ def add_to_index(document)
+ puts "add to index: #{document.uri.to_s}"
+ doc = Ferret::Document::Document.new
+ doc << Field.new("url", document.url,
+ Field::Store::YES, Field::Index::UNTOKENIZED)
+ doc << Field.new("title", document.title,
+ Field::Store::YES, Field::Index::TOKENIZED)
+ doc << Field.new("data", document.body,
+ Field::Store::YES, Field::Index::TOKENIZED)
+ synchronize do
+ @index_writer << doc
+ end
+ end
+ alias :<< :add_to_index
+
+ def close
+ @index_writer.optimize
+ @index_writer.close
+ @index_writer = nil
+ end
+
+ end
+
+end
24 lib/rdig/http_client.rb
@@ -0,0 +1,24 @@
+require 'net/http'
+
+module RDig
+
+ module HttpClient
+ # Fetch +uri+ via HTTP GET and return the Net::HTTPResponse,
+ # or nil when the request raised an error.
+ # uri:: a parsed URI instance (host, port, path, query are read)
+ # user_agent:: value sent in the User-Agent request header
+ def do_get(uri, user_agent='RDig crawler')
+ # Set up the appropriate http headers
+ headers = { "User-Agent" => user_agent }
+
+ begin
+ Net::HTTP.start(uri.host, (uri.port or 80)) { |http|
+ final_uri = uri.path
+ final_uri += ('?' + uri.query) if uri.query
+ return http.get(final_uri, headers)
+ }
+ rescue => error
+ puts error
+ # explicit nil so callers see a clear failure value instead of
+ # the implicit nil that puts happened to return
+ return nil
+ end
+ end
+ end
+
+end
+
171 lib/rdig/url_filters.rb
@@ -0,0 +1,171 @@
+module RDig
+
+ module UrlFilters
+
+ class FilterChain
+ def initialize(chain_config)
+ @filters = []
+ chain_config.each { |filter|
+ case filter
+ when Hash
+ filter.each_pair { |f, args|
+ add(f, args)
+ }
+ when Array
+ args = filter
+ filter = args.shift
+ add(filter, args)
+ else
+ add(filter)
+ end
+ }
+ end
+
+ # add a filter and its args to the chain
+ # when args is a symbol, it is treated as a configuration key
+ def add(filter, args=nil)
+ args = RDig.config.crawler.send(args) if args.is_a? Symbol
+ case filter
+ when Symbol
+ if args.nil?
+ @filters << lambda { |document|
+ UrlFilters.send(filter, document)
+ }
+ else
+ @filters << lambda { |document|
+ UrlFilters.send(filter, document, args)
+ }
+ end
+ when Class
+ if args.nil?
+ if filter.respond_to?(:instance)
+ filter_instance = filter.instance
+ else
+ filter_instance = filter.new
+ end
+ else
+ filter_instance = filter.new(args)
+ end
+ @filters << lambda { |document|
+ filter_instance.apply(document)
+ }
+ end
+ end
+
+ def apply(document)
+ @filters.each { |filter|
+ return nil unless filter.call(document)
+ }
+ return document
+ end
+ end
+
+ # takes care of a list of all Urls visited during a crawl, to avoid
+ # indexing pages more than once
+ # implemented as a thread safe singleton as it has to be shared
+ # between all crawler threads
+ class VisitedUrlFilter
+ include MonitorMixin, Singleton
+ def initialize
+ @visited_urls = Set.new
+ super
+ end
+
+ # return document if this document's url has not been visited yet,
+ # nil otherwise
+ def apply(document)
+ synchronize do
+ @visited_urls.add?(document.uri.to_s) ? document : nil
+ end
+ end
+ end
+
+
+ # base class for url inclusion / exclusion filters
+ class UrlPatternFilter
+ # takes an Array of Regexps, or nil to disable the filter
+ def initialize(args=nil)
+ unless args.nil?
+ @patterns = []
+ if args.respond_to? :each
+ args.each { |pattern|
+ # cloning because unsure if regexps are thread safe...
+ @patterns << pattern.clone
+ }
+ else
+ @patterns << args.clone
+ end
+ end
+ end
+ end
+ class UrlExclusionFilter < UrlPatternFilter
+ # returns nil if any of the patterns matches its URL,
+ # the document itself otherwise
+ def apply(document)
+ return document unless @patterns
+ @patterns.each { |p|
+ return nil if document.uri.to_s =~ p
+ }
+ return document
+ end
+ end
+ class UrlInclusionFilter < UrlPatternFilter
+ # returns the document if any of the patterns matches its URL,
+ # nil otherwise (note: the previous comment was copied from
+ # UrlExclusionFilter and described the inverse behaviour)
+ def apply(document)
+ return document unless @patterns
+ @patterns.each { |p|
+ return document if document.uri.to_s =~ p
+ }
+ return nil
+ end
+ end
+
+
+
+
+ # checks redirect count of the given document
+ # takes it out of the chain if number of redirections exceeds the
+ # max_redirects setting
+ def UrlFilters.maximum_redirect_filter(document, max_redirects)
+ return nil if document.redirections > max_redirects
+ return document
+ end
+
+ # expands both href="/path/xyz.html" and href="affe.html"
+ # to full urls
+ def UrlFilters.fix_relative_uri(document)
+ return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^http/i
+ ref = document.referring_uri
+ return document unless ref
+ uri = document.uri
+ uri.scheme = ref.scheme unless uri.scheme
+ uri.host = ref.host unless uri.host
+ uri.port = ref.port unless uri.port || ref.port==ref.default_port
+ uri.path = ref.path unless uri.path
+
+ if uri.path !~ /^\//
+ ref_path = ref.path || '/'
+ ref_path << '/' if ref_path.empty?
+ uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
+ end
+ return document
+ end
+
+ def UrlFilters.hostname_filter(document, include_hosts)
+ return document if include_hosts.include?(document.uri.host)
+ return nil
+ end
+
+ def UrlFilters.normalize_uri(document)
+ document.uri.fragment = nil
+ # document.uri.query = nil
+ # append index document if configured and path ends with a slash
+ if RDig.config.index_document && document.uri.path =~ /\/$/
+ document.uri.path << RDig.config.index_document
+ end
+ return document
+ end
+
+ end
+end
321 rakefile
@@ -0,0 +1,321 @@
+# rakefile for RDig.
+# large parts borrowed from rake's Rakefile
+
+begin
+ require 'rubygems'
+ require 'rake/gempackagetask'
+rescue Exception
+ nil
+end
+require 'rake'
+require 'rake/testtask'
+require 'rake/rdoctask'
+require 'rake/packagetask'
+require 'rake/contrib/rubyforgepublisher'
+
# Writes a status message to standard error (used by the release tasks).
def announce(message = '')
  STDERR.puts(message)
end
+
+
PKG_NAME = 'rdig'

# Determine the current version of the software by asking the rdig
# executable itself ("rdig, version x.y.z").
if `ruby -Ilib ./bin/rdig --version` =~ /rdig, version ([0-9.]+)$/
  CURRENT_VERSION = $1
else
  CURRENT_VERSION = "0.0.0"
end

# REL=x.y.z on the command line overrides the detected version
# (used when cutting a release).
if ENV['REL']
  PKG_VERSION = ENV['REL']
else
  PKG_VERSION = CURRENT_VERSION
end

# All ruby sources of the library (used by the :lines task).
SRC_RB = FileList['lib/**/*.rb']

PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"

RELEASE_NAME = "REL #{PKG_VERSION}"

RUBY_FORGE_PROJECT = "rdig"
RUBY_FORGE_USER = "jkraemer"

# Files included in the distribution packages; Subversion metadata,
# editor backups and vim swap files are excluded.
PKG_FILES = FileList[
  "bin/**/*",
  "lib/**/*",
  "test/**/*",
  "doc/**/*",
  "[A-Z]*",
  "install.rb",
  "rakefile"
].exclude(/\.svn|~$|\.swp$/)
+
+
desc "Default Task"
task :default => [ :test_all ]

# Test Tasks -------------------------------------------------------------

# shorthand aliases for the test tasks below
task :ta => :test_all
task :tf => :test_functional
task :tu => :test_units

# Run all tests
Rake::TestTask.new("test_all") { |t|
  t.test_files = FileList[
    'test/unit/*_test.rb',
    'test/functional/*_test.rb'
  ]
  t.libs << "test"
  #t.warning = true
  t.verbose = true
}

# Run unit tests
Rake::TestTask.new("test_units") { |t|
  t.test_files = FileList[ 'test/unit/*_test.rb' ]
  t.libs << "test"
  #t.warning = true
  t.verbose = true
}

# Run functional tests
Rake::TestTask.new("test_functional") { |t|
  t.test_files = FileList[ 'test/functional/*_test.rb' ]
  t.libs << "test"
  #t.warning = true
  t.verbose = true
}
+
+
+
# Generate the RDoc documentation ----------------------------------------

# The task object is kept in `rd` so the gem spec below can reuse its
# file list for extra_rdoc_files.
rd = Rake::RDocTask.new { |rdoc|
  rdoc.rdoc_dir = 'doc/html'
  rdoc.title = "RDig - Ferret based full text search for web sites"
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.options << '--main' << 'README'
  # allow overriding the rdoc template via `rake template=<name>`
  rdoc.template = "#{ENV['template']}.rb" if ENV['template']
  rdoc.rdoc_files.include('README', 'CHANGES', 'LICENSE', 'TODO')
  rdoc.rdoc_files.include('lib/**/*.rb')
}
+
+
+# packaging --------------------------------------------------------------
+
+# ====================================================================
+# Create a task that will package the Rake software into distributable
+# tar, zip and gem files.
+
+if ! defined?(Gem)
+ puts "Package Target requires RubyGEMs"
+else
+ spec = Gem::Specification.new do |s|
+
+ #### Basic information.
+
+ s.name = 'rdig'
+ s.version = PKG_VERSION
+ s.summary = "Ruby based web site indexing and searching library."
+ s.description = <<-EOF
+ EOF
+
+ #### Dependencies and requirements.
+
+ s.add_dependency('ferret', '>= 0.3.2')
+ s.add_dependency('rubyful_soup', '>= 1.0.4')
+ #s.requirements << ""
+
+ #### Which files are to be included in this gem? Everything! (Except CVS directories.)
+
+ s.files = PKG_FILES.to_a
+
+ #### Load-time details: library and application (you will need one or both).
+
+ s.require_path = 'lib' # Use these for libraries.
+ s.bindir = "bin" # Use these for applications.
+ s.executables = ["rdig"]
+ s.default_executable = "rdig"
+
+ #### Documentation and testing.
+
+ s.has_rdoc = true
+ s.extra_rdoc_files = rd.rdoc_files.reject { |fn| fn =~ /\.rb$/ }.to_a
+ s.rdoc_options <<
+ '--title' << 'Rake -- Ruby Make' <<
+ '--main' << 'README' <<
+ '--line-numbers'
+
+ #### Author and project details.
+
+ s.author = "Jens Kraemer"
+ s.email = "jk@jkraemer.net"
+ s.homepage = "http://rdig.rubyforge.org"
+ s.rubyforge_project = "rdig"
+# if ENV['CERT_DIR']
+# s.signing_key = File.join(ENV['CERT_DIR'], 'gem-private_key.pem')
+# s.cert_chain = [File.join(ENV['CERT_DIR'], 'gem-public_cert.pem')]
+# end
+ end
+
+ package_task = Rake::GemPackageTask.new(spec) do |pkg|
+ pkg.need_zip = true
+ pkg.need_tar = true
+ end
+end
+
+
+
+# misc ----------------------------------------------------------------
+
# Counts the lines in the given file.
# Returns [total_lines, code_lines], where code_lines excludes blank
# lines and lines containing only a comment.
def count_lines(filename)
  total = 0
  code = 0
  File.open(filename) do |file|
    file.each_line do |line|
      total += 1
      code += 1 unless line =~ /^\s*$/ || line =~ /^\s*#/
    end
  end
  [total, code]
end
+
# Prints one aligned row of the line-count report:
# line count, code-line count, then the label/filename.
def show_line(msg, lines, loc)
  row = format("%6s %6s %s\n", lines.to_s, loc.to_s, msg)
  print row
end
+
desc "Count lines in the main rake file"
# Prints a per-file and total line/LOC report over all library sources.
task :lines do
  totals = [0, 0]
  show_line("File Name", "LINES", "LOC")
  SRC_RB.each do |source_file|
    counts = count_lines(source_file)
    show_line(source_file, *counts)
    totals[0] += counts[0]
    totals[1] += counts[1]
  end
  show_line("TOTAL", *totals)
end
+
# Define an optional publish target in an external file. If the
# publish.rf file is not found, the publish targets won't be defined.

# NOTE(review): publish.rf is not part of this commit; presumably kept
# locally by the maintainer.
load "publish.rf" if File.exist? "publish.rf"
+
+
+# Support Tasks ------------------------------------------------------
+
desc "Look for TODO and FIXME tags in the code"
task :todo do
  FileList['**/*.rb'].exclude('pkg').egrep(/#.*(FIXME|TODO|TBD)/)
end
+
desc "Look for Debugging print lines"
task :dbg do
  FileList['**/*.rb'].egrep(/\bDBG|\bbreakpoint\b/)
end
+
desc "List all ruby files"
# Lists library sources (outside pkg/) plus non-ruby executables in bin/.
task :rubyfiles do
  puts(Dir['**/*.rb'].reject { |path| path =~ /^pkg/ })
  puts(Dir['bin/*'].reject { |path| path =~ /CVS|(~$)|(\.rb$)/ })
end
# shorthand alias
task :rf => :rubyfiles
+
+
+# --------------------------------------------------------------------
+# Creating a release
+
desc "Make a new release"
# Runs the whole release pipeline: sanity checks, clean rebuild, full
# test run, version bump, packaging and svn tagging.
task :release => [
  :prerelease,
  :clobber,
  :test_all,
  :update_version,
  :package,
  :tag] do

  announce
  announce "**************************************************************"
  announce "* Release #{PKG_VERSION} Complete."
  announce "* Packages ready to upload."
  announce "**************************************************************"
  announce
end
+
# Validate that everything is ready to go for a release: REL must be
# supplied, the version must actually change (or REUSE be set), and the
# working copy must be clean (unless RELTEST is set).
task :prerelease do
  announce
  announce "**************************************************************"
  announce "* Making RubyGem Release #{PKG_VERSION}"
  announce "* (current version #{CURRENT_VERSION})"
  announce "**************************************************************"
  announce

  # Is a release number supplied?
  unless ENV['REL']
    fail "Usage: rake release REL=x.y.z [REUSE=tag_suffix]"
  end

  # Is the release different than the current release.
  # (or is REUSE set?)
  if PKG_VERSION == CURRENT_VERSION && ! ENV['REUSE']
    fail "Current version is #{PKG_VERSION}, must specify REUSE=tag_suffix to reuse version"
  end

  # Are all source files checked in?
  if ENV['RELTEST']
    announce "Release Task Testing, skipping checked-in file test"
  else
    announce "Checking for unchecked-in files..."
    data = `svn st`
    # FIX: the old check `unless data =~ /^$/` could never fail, because
    # /^$/ also matches the empty line position after a trailing newline
    # (and position 0 of an empty string). A clean `svn st` prints
    # nothing, so any non-whitespace output means uncommitted changes.
    if data =~ /\S/
      fail "SVN status is not clean ... do you have unchecked-in files?"
    end
    announce "No outstanding checkins found ... OK"
  end
end
+
# Rewrites the RDIGVERSION constant in lib/rdig.rb to PKG_VERSION and
# commits the change (unless RELTEST is set).
task :update_version => [:prerelease] do
  if PKG_VERSION == CURRENT_VERSION
    announce "No version change ... skipping version update"
  else
    announce "Updating RDig version to #{PKG_VERSION}"
    open("lib/rdig.rb") do |rakein|
      open("lib/rdig.rb.new", "w") do |rakeout|
        rakein.each do |line|
          if line =~ /^RDIGVERSION\s*=\s*/
            rakeout.puts "RDIGVERSION = '#{PKG_VERSION}'"
          else
            rakeout.puts line
          end
        end
      end
    end
    # FIX: previously moved/committed lib/rake.rb[.new] — filenames left
    # over from rake's Rakefile; the file rewritten above is lib/rdig.rb,
    # and lib/rake.rb.new never exists, so the mv would have failed.
    mv "lib/rdig.rb.new", "lib/rdig.rb"
    if ENV['RELTEST']
      announce "Release Task Testing, skipping commiting of new version"
    else
      sh %{svn commit -m "Updated to version #{PKG_VERSION}" lib/rdig.rb}
    end
  end
end
+
desc "Tag all files with the latest release number (REL=x.y.z)"
task :tag => [:prerelease] do
  reltag = "REL_#{PKG_VERSION.gsub(/\./, '_')}"
  # NOTE(review): the REUSE suffix is appended without a separator
  # (e.g. REL_0_1_0a for REUSE=a) — confirm this tag format is intended.
  reltag << ENV['REUSE'].gsub(/\./, '_') if ENV['REUSE']
  announce "Tagging with [#{reltag}]"
  if ENV['RELTEST']
    # message still says CVS, but tagging is done via `svn copy` below
    announce "Release Task Testing, skipping CVS tagging"
  else
    sh %{cd ..; svn copy trunk tags/#{reltag}}
  end
end
+
15 test/fixtures/html/entities.html
@@ -0,0 +1,15 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+ <head>
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
+ <title>Sample &amp; Title</title>
+ </head>
+ <body>
+ <h1>Some &gt; Links</h1>
+ <p>don't&nbsp;break me!</p>
+ <a href="http://test.host/affe.html?b=a&amp;c=d">Affe</a>
+ <a href="http://test.host/affe2.html?b=a&c=d">Affe</a>
+ <h1>&Uuml;ml&auml;uts</h1>
+ <p>hei&szlig; hier &#223;</p>
+ </body>
+</html>
17 test/fixtures/html/simple.html
@@ -0,0 +1,17 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+ <head>
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
+ <title>Sample Title</title>
+ </head>
+ <body>
+ <h1>A Link</h1>
+ <a href="http://test.host/affe.html">Affe</a>
+ <h1>Some sample <span>text</span></h1>
+ <!-- invalid markup follows -->
+ <p>Lorem<br>
+ <!-- another comment
+ here -->
+ ipsum
+ </body>
+</html>
18 test/test_helper.rb
@@ -0,0 +1,18 @@
+require 'test/unit'
+require 'rdig'
+#File.expand_path(File.dirname(__FILE__) + "/../init.rb")
+# require File.expand_path(File.dirname(__FILE__) + "/../init.rb")
+
# Shared helpers for the RDig test suite: fixture loading shortcuts.
module TestHelper
  include RDig

  # Reads a file below test/fixtures and returns its full content.
  def read_fixture(path)
    fixture_root = File.expand_path(File.dirname(__FILE__))
    File.open("#{fixture_root}/fixtures/#{path}") { |file| file.read }
  end

  # Reads an HTML fixture by basename (without the .html extension).
  def html_doc(name)
    read_fixture("html/#{name}.html")
  end
end
30 test/unit/html_content_extractor_test.rb
@@ -0,0 +1,30 @@
+require 'test_helper'
# Tests for ContentExtractors::HtmlContentExtractor using the fixture
# documents in test/fixtures/html.
class HtmlContentExtractorTest < Test::Unit::TestCase
  include TestHelper

  def setup
    @extractor = ContentExtractors::HtmlContentExtractor
    @nbsp = [160].pack('U') # non breaking space
  end

  # A simple well-formed document yields title, the plain text content
  # (tags and comments stripped) and the list of link targets.
  def test_simple
    result = @extractor.process(html_doc('simple'))
    assert_not_nil result
    assert_equal 'Sample Title', result[:title]
    assert_not_nil result[:content]
    assert_not_nil result[:links]
    assert_equal 1, result[:links].size
    assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
    assert_equal 'http://test.host/affe.html', result[:links].first
  end

  # HTML entities (&amp;, &nbsp;, umlauts, numeric references) must be
  # decoded in title, links and content.
  def test_entities
    result = @extractor.process(html_doc('entities'))
    assert_equal 'Sample & Title', result[:title]
    assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
    assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
    assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
  end

end
+
86 test/unit/url_filters_test.rb
@@ -0,0 +1,86 @@
+require 'test_helper'
# Tests for the UrlFilters module: filter chain construction/lookup,
# URL pattern and hostname filters, and relative URI fixing.
class UrlFilterTest < Test::Unit::TestCase
  include TestHelper, RDig

  def setup
  end

  # test a chain configured with direct parameters
  def test_filterchain
    cfg = [
      { UrlFilters::UrlInclusionFilter => /.+html$/ },
      { :hostname_filter => 'test.host' }
    ]
    chain = UrlFilters::FilterChain.new(cfg)

    assert_nil chain.apply(Document.new("http://test.host/affe.htm"))
    assert_not_nil chain.apply(Document.new("http://test.host/affe.html"))
    assert_nil chain.apply(Document.new("http://test.host.com/affe.html"))
  end

  # test default chain config
  def test_default_filterchain
    chain = UrlFilters::FilterChain.new(RDig.filter_chain)
    assert_nil chain.apply(Document.new("http://www.example.com/affe.htm"))
    assert_not_nil chain.apply(Document.new("http://localhost:3000/affe.html"))
    assert_nil chain.apply(Document.new("http://localhost.com/affe.html"))
  end

  # check lookup of chain parameters from config
  # (symbols in the chain config are resolved against RDig.config.crawler)
  def test_filterchain_config
    RDig.configuration do |conf|
      conf.crawler.include_patterns = /.+html$/
      conf.crawler.include_hosts = 'test.host'
    end
    cfg = [
      { UrlFilters::UrlInclusionFilter => :include_patterns },
      { :hostname_filter => :include_hosts }
    ]
    chain = UrlFilters::FilterChain.new(cfg)

    assert_nil chain.apply(Document.new("http://test.host/affe.htm"))
    assert_not_nil chain.apply(Document.new("http://test.host/affe.html"))
    assert_nil chain.apply(Document.new("http://test.host.com/affe.html"))
  end

  # inclusion keeps only matching URLs, exclusion drops matching URLs;
  # both accept a single pattern or an array of patterns
  def test_urlpattern_filter
    f = UrlFilters::UrlInclusionFilter.new(/.*\.html$/)
    assert_nil f.apply(Document.new("http://test.host/affe.htm"))
    assert_not_nil f.apply(Document.new("http://test.host/affe.html"))
    f = UrlFilters::UrlExclusionFilter.new([ /.*\.html$/, /.*\.aspx/ ])
    assert_not_nil f.apply(Document.new("http://test.host/affe.htm"))
    assert_nil f.apply(Document.new("http://test.host/affe.html"))
    assert_nil f.apply(Document.new("http://test.host/affe.aspx"))
  end

  def test_hostname_filter
    include_hosts = [ 'test.host', 'localhost' ]
    assert_nil UrlFilters.hostname_filter(Document.new('http://google.com/'), include_hosts)
    assert_not_nil UrlFilters.hostname_filter(Document.new('http://test.host/file.html'), include_hosts)
    assert_not_nil UrlFilters.hostname_filter(Document.new('http://localhost/file.html'), include_hosts)
  end

  # relative links are expanded against the referring URI; note that
  # '../' segments are NOT collapsed (no path normalization expected)
  def test_fix_relative_uri
    doc = Document.new('http://test.host/dir/file.html')
    assert_equal('http://test.host/dir/another.html',
      UrlFilters.fix_relative_uri(Document.new('another.html', doc.uri)).uri.to_s)
    assert_equal('http://test.host/dir/../another.html',
      UrlFilters.fix_relative_uri(Document.new('../another.html', doc.uri)).uri.to_s)
    assert_equal('http://test.host/dir/another.html',
      UrlFilters.fix_relative_uri(Document.new('/dir/another.html', doc.uri)).uri.to_s)
    assert_equal('http://test.host/dir/another.html',
      UrlFilters.fix_relative_uri(Document.new('http://test.host/dir/another.html', doc.uri)).uri.to_s)
    # upper-case scheme must be preserved, not rewritten
    assert_equal('HTTP://test.host/dir/another.html',
      UrlFilters.fix_relative_uri(Document.new('HTTP://test.host/dir/another.html', doc.uri)).uri.to_s)
    doc = Document.new('https://test.host/dir/')
    assert_equal('https://test.host/dir/another.html',
      UrlFilters.fix_relative_uri(Document.new('another.html', doc.uri)).uri.to_s)
    doc = Document.new('https://test.host/')
    assert_equal('https://test.host/another.html',
      UrlFilters.fix_relative_uri(Document.new('another.html', doc.uri)).uri.to_s)
    # referrer without any path at all
    doc = Document.new('https://test.host')
    assert_equal('https://test.host/another.html',
      UrlFilters.fix_relative_uri(Document.new('another.html', doc.uri)).uri.to_s)
  end
end
+
Please sign in to comment.
Something went wrong with that request. Please try again.