rel 0.1.0

jkraemer · Mar 25, 2006 · 2ffe8f1 · 2ffe8f1
1 parent b32f192
commit 2ffe8f1
Show file tree

Hide file tree

Showing 11 changed files with 199 additions and 111 deletions.
diff --git a/README b/README
@@ -1,51 +1,61 @@
-dependencies:
-ferret >= 0.3.2
-rubyful_soup 
+= RDig
 
-gem install rdig
+RDig provides an HTTP crawler and content extraction utilities
+to help build a site search for web sites or intranets. Internally,
+Ferret is used for the full text indexing. After creating a config file 
+for your site, the index can be built with a single call to rdig.
 
-basic usage
-===========
+RDig depends on Ferret (>= 0.3.2) and the RubyfulSoup library (>= 1.0.4).
 
-- create a config file based on the template in doc/examples
+== basic usage
 
-- to create an index:
-rdig -c CONFIGFILE
 
+=== Index creation
+- create a config file based on the template in doc/examples
+- to create an index:
+    rdig -c CONFIGFILE
 - to run a query against the index (just to try it out)
-rdig -c CONFIGFILE -q 'your query'
+    rdig -c CONFIGFILE -q 'your query'
+  this will dump the first 10 search results to STDOUT
 
-this will dump the results hash to STDOUT
+=== Handle search in your application:
+  require 'rdig'
+  require 'rdig_config'   # load your config file here
+  search_results = RDig.searcher.search(query, options={})
 
-- in your application:
-require 'rdig'
-require 'rdig_config'   # load your config file here
+see RDig::Search::Searcher for more information.
 
-- use RDig.searcher.search(query, options={}) 
 
-
-usage in rails
-==============
+== usage in rails
 
 - add to config/environment.rb :
+    require 'rdig'
+    require 'rdig_config'
+- place rdig_config.rb into config/ directory.
+- build index:
+    rdig -c config/rdig_config.rb
+- in your controller that handles the search form:
+    search_results = RDig.searcher.search(params[:query])
+    @results = search_results[:list]
+    @hitcount = search_results[:hitcount]
+
+=== search result paging
+Use the :first_doc and :num_docs options to implement 
+paging through search results. 
+(:num_docs is 10 by default, so without using these options only the first 10
+results will be retrieved)
 
-require 'rdig'
-require 'rdig_config'
 
-- place rdig_config.rb into config/ directory.
+== sample configuration
 
-- build index:
-rdig -c config/rdig_config.rb
+from doc/examples/config.rb. The tag_selector properties are called 
+with a BeautifulSoup instance as parameter. See the RubyfulSoup Site[http://www.crummy.com/software/RubyfulSoup/documentation.html] for more info about this cool lib.
+You can also have a look at the +html_content_extractor+ unit test.
 
-- in your controller handling the search form:
+See http://www.crummy.com/software/RubyfulSoup/documentation.html for API
+documentation of the Rubyful Soup lib used.
 
-search_results = RDig.searcher.search(params[:query])
-@results = search_results[:list]
-@hitcount = search_results[:hitcount]
+:include:doc/examples/config.rb
 
-use the :first_doc and :num_docs options to implement 
-paging through search results 
 
-(:num_docs is 10 by default, so without using these options only the first 10
-results will be retrieved)
 
diff --git a/doc/.svnignore b/doc/.svnignore
@@ -0,0 +1 @@
+html
diff --git a/doc/examples/config.rb b/doc/examples/config.rb
@@ -1,7 +1,3 @@
-#
-# sample RDig configuration file, edit to taste
-#
-
 RDig.configuration do |cfg|
 
   ##################################################################
@@ -33,11 +29,11 @@
 
   # crawler options
 
-  # nil (index all documents) or a list of Regexps 
+  # nil (index all documents) or an array of Regexps 
   # matching URLs you want to index.
   # cfg.crawler.include_documents = nil
 
-  # nil (no documents excluded) or a list of Regexps 
+  # nil (no documents excluded) or an array of Regexps 
   # matching URLs not to index.
   # this filter is used after the one above, so you only need
   # to exclude documents here that aren't wanted but would be 

diff --git a/install.rb b/install.rb
@@ -13,6 +13,7 @@
 # handle running it. Under windows, we add a '.rb' extension
 # and let file associations do their stuff
 #
+# based on install.rb from the Rake distribution
 
 def installBIN(from, opfile)
 

diff --git a/lib/rdig.rb b/lib/rdig.rb
@@ -34,6 +34,7 @@
 require 'ostruct'
 require 'uri'
 require 'cgi'
+require 'set'
 require 'net/http'
 require 'getoptlong'
 
@@ -58,6 +59,7 @@
 $KCODE = 'u'
 require 'jcode'
 
+# See README for basic usage information
 module RDig
 
   class << self
@@ -83,46 +85,52 @@ def searcher
       @searcher ||= Search::Searcher.new(config.ferret)
     end
 
-    def config
-      @config ||= OpenStruct.new(
-        :crawler           => OpenStruct.new(
-          :start_urls        => [ "http://localhost:3000/" ],
-          :include_hosts     => [ "localhost" ],
-          :include_documents => nil,
-          :exclude_documents => nil,
-          :index_document    => nil,
-          :num_threads       => 2,
-          :max_redirects     => 5,
-          :wait_before_leave => 10
-        ),
-        :content_extraction  => OpenStruct.new(
-          # settings for html content extraction
-          :html => OpenStruct.new(
-            # select the html element that contains the content to index
-            # by default, we index all inside the body tag:
-            :content_tag_selector => lambda { |tagsoup|
-              tagsoup.html.body
-            },
-            # select the html element containing the title 
-            :title_tag_selector         => lambda { |tagsoup|
-              tagsoup.html.head.title
-            }
+    # RDig configuration
+    #
+    # may be used with a block:
+    #   RDig.configuration do |config| ...
+    #
+    # see doc/examples/config.rb for a commented example configuration
+    def configuration
+      if block_given?
+        yield configuration
+      else
+        @config ||= OpenStruct.new(
+          :crawler           => OpenStruct.new(
+            :start_urls        => [ "http://localhost:3000/" ],
+            :include_hosts     => [ "localhost" ],
+            :include_documents => nil,
+            :exclude_documents => nil,
+            :index_document    => nil,
+            :num_threads       => 2,
+            :max_redirects     => 5,
+            :wait_before_leave => 10
+          ),
+          :content_extraction  => OpenStruct.new(
+            # settings for html content extraction
+            :html => OpenStruct.new(
+              # select the html element that contains the content to index
+              # by default, we index all inside the body tag:
+              :content_tag_selector => lambda { |tagsoup|
+                tagsoup.html.body
+              },
+              # select the html element containing the title 
+              :title_tag_selector         => lambda { |tagsoup|
+                tagsoup.html.head.title
+              }
+            )
+          ),
+          :ferret                => OpenStruct.new( 
+            :path                => "index/", 
+            :create              => true,
+            :handle_parse_errors => true,
+            :analyzer            => Ferret::Analysis::StandardAnalyzer.new,
+            :occur_default       => Ferret::Search::BooleanClause::Occur::MUST
           )
-        ),
-        :ferret                => OpenStruct.new( 
-          :path                => "index/", 
-          :create              => true,
-          :handle_parse_errors => true,
-          :analyzer            => Ferret::Analysis::StandardAnalyzer.new,
-          :occur_default       => Ferret::Search::BooleanClause::Occur::MUST
         )
-      )
-    end
-
-    # RDig.configuration do |config| ...
-    def configuration
-      yield config
+      end
     end
+    alias config configuration
 
   end
 
@@ -214,9 +222,19 @@ def run
 
       if options.query
         # query the index
-        puts "executing query #{options.query}"
-        p RDig.searcher.search(options.query)
+        puts "executing query >#{options.query}<"
+        results = RDig.searcher.search(options.query)
+        puts "total results: #{results[:hitcount]}"
+        results[:list].each { |result|
+          puts <<-EOF
+#{result[:url]}
+  #{result[:title]}
+  #{result[:extract]}
+
+          EOF
+        }
       else
+        # rebuild index
         @crawler = Crawler.new
         @crawler.run
       end

diff --git a/lib/rdig/content_extractors.rb b/lib/rdig/content_extractors.rb
@@ -23,9 +23,13 @@ def handle_charref(ref)
 
 module RDig
 
-  # todo support at least pdf, too
+  # Contains Classes which are used for extracting content and meta data from
+  # various content types.
+  #
+  # TODO: support at least pdf, too.
   module ContentExtractors
 
+    # process the given +content+ depending on its +content_type+.
     def ContentExtractors.process(content, content_type)
       case content_type
       when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
@@ -36,43 +40,36 @@ def ContentExtractors.process(content, content_type)
       return nil
     end
 
+    # extracts title, content and links from html documents
     class HtmlContentExtractor
 
       # returns: 
       # { :content => 'extracted clear text',
       #   :meta => { :title => 'Title' },
       #   :links => [array of urls] }
       def self.process(content)
-        result = { :title => '' }
+        result = { }
         tag_soup = BeautifulSoup.new(content)
-        the_title_tag = title_tag(tag_soup)
-        result[:title] = the_title_tag.string.strip if the_title_tag
+        result[:title] = extract_title(tag_soup)
         result[:links] = extract_links(tag_soup)
-        content = ''
+        result[:content] = extract_content(tag_soup)
+        return result
+      end
 
-        # links should be taken from the whole document, though, not just from the content
-        process_child = lambda { |child|
-          if child.is_a? NavigableString
-            value = self.strip_comments(child)
-            value.strip!
-            unless value.empty?
-              content << value
-              content << ' '
-            end
-          elsif child.string  # it's a Tag, and it has some content string
-            value = child.string.strip 
-            unless value.empty?
-              content << value
-              content << ' '
-            end
-          else
-            child.children(&process_child)
-          end
-          true
+      # Extracts textual content from the HTML tree.
+      #
+      # - First, the root element to use is determined using the 
+      # +content_element+ method, which itself uses the content_tag_selector
+      # from RDig.configuration.
+      # - Then, this element is processed by +extract_text+, which will give
+      # all textual content contained in the root element and all its
+      # children.
+      def self.extract_content(tag_soup)
+        content = ''
+        content_element(tag_soup).children { |child| 
+          extract_text(child, content)
         }
-        content_element(tag_soup).children(&process_child)
-        result[:content] = content.strip #CGI.unescapeHTML(content.strip)
-        return result
+        return content.strip
       end
 
       # extracts the href attributes of all a tags, except 
@@ -83,6 +80,44 @@ def self.extract_links(tagsoup)
         }.compact
       end
 
+      # Extracts the title from the given html tree
+      def self.extract_title(tagsoup)
+        title = ''
+        the_title_tag = title_tag(tagsoup)
+        if the_title_tag.is_a? String
+          the_title_tag
+        else
+          extract_text(the_title_tag).strip if the_title_tag
+        end
+      end
+
+      # Recursively extracts all text contained in the given element, 
+      # and appends it to content.
+      def self.extract_text(element, content='')
+        if element.is_a? NavigableString
+          value = strip_comments(element)
+          value.strip!
+          unless value.empty?
+            content << value
+            content << ' '
+          end
+        elsif element.string  # it's a Tag, and it has some content string
+          value = element.string.strip 
+          unless value.empty?
+            content << value
+            content << ' '
+          end
+        else
+          element.children { |child|
+            extract_text(child, content)
+          }
+        end
+      end
+
+      # Returns the element to extract the title from.
+      #
+      # This may return a string, e.g. an attribute value selected from a meta
+      # tag, too.
       def self.title_tag(tagsoup)
         if RDig.config.content_extraction.html.title_tag_selector
           RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
@@ -91,6 +126,7 @@ def self.title_tag(tagsoup)
         end
       end
 
+      # Retrieve the root element to extract document content from
       def self.content_element(tagsoup)
         if RDig.config.content_extraction.html.content_tag_selector
           RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
@@ -99,6 +135,7 @@ def self.content_element(tagsoup)
         end
       end
 
+      # Return the given string minus all html comments
       def self.strip_comments(string)
         string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
       end