Browse files

rel 0.1.0

  • Loading branch information...
1 parent b32f192 commit 2ffe8f14908ad91d1d2c78334069bfac8664ccb4 @jkraemer committed Mar 25, 2006
View
72 README
@@ -1,51 +1,61 @@
-dependencies:
-ferret >= 0.3.2
-rubyful_soup
+= RDig
-gem install rdig
+RDig provides an HTTP crawler and content extraction utilities
+to help build a site search for web sites or intranets. Internally,
+Ferret is used for the full text indexing. After creating a config file
+for your site, the index can be built with a single call to rdig.
-basic usage
-===========
+RDig depends on Ferret (>= 0.3.2) and the RubyfulSoup library (>= 1.0.4).
-- create a config file based on the template in doc/examples
+== basic usage
-- to create an index:
-rdig -c CONFIGFILE
+=== Index creation
+- create a config file based on the template in doc/examples
+- to create an index:
+ rdig -c CONFIGFILE
- to run a query against the index (just to try it out)
-rdig -c CONFIGFILE -q 'your query'
+ rdig -c CONFIGFILE -q 'your query'
+ this will dump the first 10 search results to STDOUT
-this will dump the results hash to STDOUT
+=== Handle search in your application:
+ require 'rdig'
+ require 'rdig_config' # load your config file here
+ search_results = RDig.searcher.search(query, options={})
-- in your application:
-require 'rdig'
-require 'rdig_config' # load your config file here
+see RDig::Search::Searcher for more information.
-- use RDig.searcher.search(query, options={})
-
-usage in rails
-==============
+== usage in rails
- add to config/environment.rb :
+ require 'rdig'
+ require 'rdig_config'
+- place rdig_config.rb into config/ directory.
+- build index:
+ rdig -c config/rdig_config.rb
+- in your controller that handles the search form:
+ search_results = RDig.searcher.search(params[:query])
+ @results = search_results[:list]
+ @hitcount = search_results[:hitcount]
+
+=== search result paging
+Use the :first_doc and :num_docs options to implement
+paging through search results.
+(:num_docs is 10 by default, so without using these options only the first 10
+results will be retrieved)
-require 'rdig'
-require 'rdig_config'
-- place rdig_config.rb into config/ directory.
+== sample configuration
-- build index:
-rdig -c config/rdig_config.rb
+from doc/examples/config.rb. The tag_selector properties are called
+with a BeautifulSoup instance as parameter. See the RubyfulSoup Site[http://www.crummy.com/software/RubyfulSoup/documentation.html] for more info about this cool lib.
+You can also have a look at the +html_content_extractor+ unit test.
-- in your controller handling the search form:
+See the RubyfulSoup documentation[http://www.crummy.com/software/RubyfulSoup/documentation.html] for API documentation of the
+Rubyful Soup lib used.
-search_results = RDig.searcher.search(params[:query])
-@results = search_results[:list]
-@hitcount = search_results[:hitcount]
+:include:doc/examples/config.rb
-use the :first_doc and :num_docs options to implement
-paging through search results
-(:num_docs is 10 by default, so without using these options only the first 10
-results will be retrieved)
View
1 doc/.svnignore
@@ -0,0 +1 @@
+html
View
8 doc/examples/config.rb
@@ -1,7 +1,3 @@
-#
-# sample RDig configuration file, edit to taste
-#
-
RDig.configuration do |cfg|
##################################################################
@@ -33,11 +29,11 @@
# crawler options
- # nil (index all documents) or a list of Regexps
+ # nil (index all documents) or an array of Regexps
# matching URLs you want to index.
# cfg.crawler.include_documents = nil
- # nil (no documents excluded) or a list of Regexps
+ # nil (no documents excluded) or an array of Regexps
# matching URLs not to index.
# this filter is used after the one above, so you only need
# to exclude documents here that aren't wanted but would be
View
1 install.rb
@@ -13,6 +13,7 @@
# handle running it. Under windows, we add a '.rb' extension
# and let file associations do their stuff
#
+# based on install.rb from the Rake distribution
def installBIN(from, opfile)
View
96 lib/rdig.rb
@@ -34,6 +34,7 @@
require 'ostruct'
require 'uri'
require 'cgi'
+require 'set'
require 'net/http'
require 'getoptlong'
@@ -58,6 +59,7 @@
$KCODE = 'u'
require 'jcode'
+# See README for basic usage information
module RDig
class << self
@@ -83,46 +85,52 @@ def searcher
@searcher ||= Search::Searcher.new(config.ferret)
end
- def config
- @config ||= OpenStruct.new(
- :crawler => OpenStruct.new(
- :start_urls => [ "http://localhost:3000/" ],
- :include_hosts => [ "localhost" ],
- :include_documents => nil,
- :exclude_documents => nil,
- :index_document => nil,
- :num_threads => 2,
- :max_redirects => 5,
- :wait_before_leave => 10
- ),
- :content_extraction => OpenStruct.new(
- # settings for html content extraction
- :html => OpenStruct.new(
- # select the html element that contains the content to index
- # by default, we index all inside the body tag:
- :content_tag_selector => lambda { |tagsoup|
- tagsoup.html.body
- },
- # select the html element containing the title
- :title_tag_selector => lambda { |tagsoup|
- tagsoup.html.head.title
- }
+ # RDig configuration
+ #
+ # may be used with a block:
+ # RDig.configuration do |config| ...
+ #
+ # see doc/examples/config.rb for a commented example configuration
+ def configuration
+ if block_given?
+ yield configuration
+ else
+ @config ||= OpenStruct.new(
+ :crawler => OpenStruct.new(
+ :start_urls => [ "http://localhost:3000/" ],
+ :include_hosts => [ "localhost" ],
+ :include_documents => nil,
+ :exclude_documents => nil,
+ :index_document => nil,
+ :num_threads => 2,
+ :max_redirects => 5,
+ :wait_before_leave => 10
+ ),
+ :content_extraction => OpenStruct.new(
+ # settings for html content extraction
+ :html => OpenStruct.new(
+ # select the html element that contains the content to index
+ # by default, we index all inside the body tag:
+ :content_tag_selector => lambda { |tagsoup|
+ tagsoup.html.body
+ },
+ # select the html element containing the title
+ :title_tag_selector => lambda { |tagsoup|
+ tagsoup.html.head.title
+ }
+ )
+ ),
+ :ferret => OpenStruct.new(
+ :path => "index/",
+ :create => true,
+ :handle_parse_errors => true,
+ :analyzer => Ferret::Analysis::StandardAnalyzer.new,
+ :occur_default => Ferret::Search::BooleanClause::Occur::MUST
)
- ),
- :ferret => OpenStruct.new(
- :path => "index/",
- :create => true,
- :handle_parse_errors => true,
- :analyzer => Ferret::Analysis::StandardAnalyzer.new,
- :occur_default => Ferret::Search::BooleanClause::Occur::MUST
)
- )
- end
-
- # RDig.configuration do |config| ...
- def configuration
- yield config
+ end
end
+ alias config configuration
end
@@ -214,9 +222,19 @@ def run
if options.query
# query the index
- puts "executing query #{options.query}"
- p RDig.searcher.search(options.query)
+ puts "executing query >#{options.query}<"
+ results = RDig.searcher.search(options.query)
+ puts "total results: #{results[:hitcount]}"
+ results[:list].each { |result|
+ puts <<-EOF
+#{result[:url]}
+ #{result[:title]}
+ #{result[:extract]}
+
+ EOF
+ }
else
+ # rebuild index
@crawler = Crawler.new
@crawler.run
end
View
91 lib/rdig/content_extractors.rb
@@ -23,9 +23,13 @@ def handle_charref(ref)
module RDig
- # todo support at least pdf, too
+ # Contains Classes which are used for extracting content and meta data from
+ # various content types.
+ #
+ # TODO: support at least pdf, too.
module ContentExtractors
+ # process the given +content+ depending on its +content_type+.
def ContentExtractors.process(content, content_type)
case content_type
when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
@@ -36,43 +40,36 @@ def ContentExtractors.process(content, content_type)
return nil
end
+ # extracts title, content and links from html documents
class HtmlContentExtractor
# returns:
# { :content => 'extracted clear text',
# :meta => { :title => 'Title' },
# :links => [array of urls] }
def self.process(content)
- result = { :title => '' }
+ result = { }
tag_soup = BeautifulSoup.new(content)
- the_title_tag = title_tag(tag_soup)
- result[:title] = the_title_tag.string.strip if the_title_tag
+ result[:title] = extract_title(tag_soup)
result[:links] = extract_links(tag_soup)
- content = ''
+ result[:content] = extract_content(tag_soup)
+ return result
+ end
- # links sollten aber von ganzen dokument bezogen werden, nicht bloss vom content
- process_child = lambda { |child|
- if child.is_a? NavigableString
- value = self.strip_comments(child)
- value.strip!
- unless value.empty?
- content << value
- content << ' '
- end
- elsif child.string # it's a Tag, and it has some content string
- value = child.string.strip
- unless value.empty?
- content << value
- content << ' '
- end
- else
- child.children(&process_child)
- end
- true
+ # Extracts textual content from the HTML tree.
+ #
+ # - First, the root element to use is determined using the
+ # +content_element+ method, which itself uses the content_tag_selector
+ # from RDig.configuration.
+ # - Then, this element is processed by +extract_text+, which will give
+ # all textual content contained in the root element and all its
+ # children.
+ def self.extract_content(tag_soup)
+ content = ''
+ content_element(tag_soup).children { |child|
+ extract_text(child, content)
}
- content_element(tag_soup).children(&process_child)
- result[:content] = content.strip #CGI.unescapeHTML(content.strip)
- return result
+ return content.strip
end
# extracts the href attributes of all a tags, except
@@ -83,6 +80,44 @@ def self.extract_links(tagsoup)
}.compact
end
+ # Extracts the title from the given html tree
+ def self.extract_title(tagsoup)
+ title = ''
+ the_title_tag = title_tag(tagsoup)
+ if the_title_tag.is_a? String
+ the_title_tag
+ else
+ extract_text(the_title_tag).strip if the_title_tag
+ end
+ end
+
+ # Recursively extracts all text contained in the given element,
+ # and appends it to content.
+ def self.extract_text(element, content='')
+ if element.is_a? NavigableString
+ value = strip_comments(element)
+ value.strip!
+ unless value.empty?
+ content << value
+ content << ' '
+ end
+ elsif element.string # it's a Tag, and it has some content string
+ value = element.string.strip
+ unless value.empty?
+ content << value
+ content << ' '
+ end
+ else
+ element.children { |child|
+ extract_text(child, content)
+ }
+ end
+ end
+
+ # Returns the element to extract the title from.
+ #
+ # This may return a string, e.g. an attribute value selected from a meta
+ # tag, too.
def self.title_tag(tagsoup)
if RDig.config.content_extraction.html.title_tag_selector
RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
@@ -91,6 +126,7 @@ def self.title_tag(tagsoup)
end
end
+ # Retrieve the root element to extract document content from
def self.content_element(tagsoup)
if RDig.config.content_extraction.html.content_tag_selector
RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
@@ -99,6 +135,7 @@ def self.content_element(tagsoup)
end
end
+ # Return the given string minus all html comments
def self.strip_comments(string)
string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
end
View
20 lib/rdig/search.rb
@@ -1,18 +1,22 @@
module RDig
module Search
- # used to search the index
+ # This class is used to search the index.
+ # Call RDig::searcher to retrieve an instance ready for use.
class Searcher
include Ferret::Search
-
+
+ # the query parser used to parse query strings
attr_reader :query_parser
+ # takes the ferret section of the rdig configuration as a parameter.
def initialize(settings)
@ferret_config = settings
@query_parser = Ferret::QueryParser.new('*', settings.marshal_dump)
ferret_searcher
end
+ # returns the Ferret::Search::IndexSearcher instance used internally.
def ferret_searcher
if @ferret_searcher and !@ferret_searcher.reader.latest?
# reopen searcher
@@ -26,9 +30,15 @@ def ferret_searcher
@ferret_searcher
end
- # options:
- # first_doc: first document in result list to retrieve (0-based)
- # num_docs : number of documents to retrieve
+ # run a search.
+ # +query+ usually will be a user-entered string. See the Ferret query
+ # language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
+ # for more information on queries.
+ # A Ferret::Search::Query instance may be given, too.
+ #
+ # Options are:
+ # first_doc:: first document in result list to retrieve (0-based). The default is 0.
+ # num_docs:: number of documents to retrieve. The default is 10.
def search(query, options={})
result = {}
query = query_parser.parse(query) if query.is_a?(String)
View
8 rakefile
@@ -107,7 +107,7 @@ rd = Rake::RDocTask.new { |rdoc|
# packaging --------------------------------------------------------------
# ====================================================================
-# Create a task that will package the Rake software into distributable
+# Create a task that will package the software into distributable
# tar, zip and gem files.
if ! defined?(Gem)
@@ -121,6 +121,10 @@ else
s.version = PKG_VERSION
s.summary = "Ruby based web site indexing and searching library."
s.description = <<-EOF
+ RDig provides an HTTP crawler and content extraction utilities
+ to help building a site search for web sites or intranets. Internally,
+ Ferret is used for the full text indexing. After creating a config file
+ for your site, the index can be built with a single call to rdig.
EOF
#### Dependencies and requirements.
@@ -153,7 +157,7 @@ else
s.author = "Jens Kraemer"
s.email = "jk@jkraemer.net"
- s.homepage = "http://rdig.rubyforge.org"
+ s.homepage = "http://rdig.rubyforge.org/"
s.rubyforge_project = "rdig"
# if ENV['CERT_DIR']
# s.signing_key = File.join(ENV['CERT_DIR'], 'gem-private_key.pem')
View
1 test/fixtures/html/custom_tag_selectors.html
@@ -2,6 +2,7 @@
<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
+ <meta name="DC.title" content="Title from DC meta data" />
<title>Sample Title</title>
</head>
<body>
View
2 test/unit/etag_filter_test.rb
@@ -1,5 +1,5 @@
require 'test_helper'
-class UrlFilterTest < Test::Unit::TestCase
+class ETagFilterTest < Test::Unit::TestCase
include TestHelper, RDig
def setup
View
10 test/unit/html_content_extractor_test.rb
@@ -49,6 +49,16 @@ def test_custom_content_element
assert_equal '/inside.html', result[:links][1]
assert_equal '/footer.html', result[:links][2]
end
+
+ def test_title_from_dcmeta
+ RDig.configuration do |config|
+ config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
+ tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
+ end
+ end
+ result = @extractor.process(html_doc('custom_tag_selectors'))
+ assert_equal 'Title from DC meta data', result[:title]
+ end
end

0 comments on commit 2ffe8f1

Please sign in to comment.