Skip to content

Commit

Permalink
rel 0.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
jkraemer committed Mar 25, 2006
1 parent b32f192 commit 2ffe8f1
Show file tree
Hide file tree
Showing 11 changed files with 199 additions and 111 deletions.
72 changes: 41 additions & 31 deletions README
@@ -1,51 +1,61 @@
dependencies:
ferret >= 0.3.2
rubyful_soup
= RDig

gem install rdig
RDig provides an HTTP crawler and content extraction utilities
to help build a site search for web sites or intranets. Internally,
Ferret is used for the full text indexing. After creating a config file
for your site, the index can be built with a single call to rdig.

basic usage
===========
RDig depends on Ferret (>= 0.3.2) and the RubyfulSoup library (>= 1.0.4).

- create a config file based on the template in doc/examples
== basic usage

- to create an index:
rdig -c CONFIGFILE

=== Index creation
- create a config file based on the template in doc/examples
- to create an index:
rdig -c CONFIGFILE
- to run a query against the index (just to try it out)
rdig -c CONFIGFILE -q 'your query'
rdig -c CONFIGFILE -q 'your query'
this will dump the first 10 search results to STDOUT

this will dump the results hash to STDOUT
=== Handle search in your application:
require 'rdig'
require 'rdig_config' # load your config file here
search_results = RDig.searcher.search(query, options={})

- in your application:
require 'rdig'
require 'rdig_config' # load your config file here
see RDig::Search::Searcher for more information.

- use RDig.searcher.search(query, options={})


usage in rails
==============
== usage in rails

- add to config/environment.rb :
require 'rdig'
require 'rdig_config'
- place rdig_config.rb into config/ directory.
- build index:
rdig -c config/rdig_config.rb
- in your controller that handles the search form:
search_results = RDig.searcher.search(params[:query])
@results = search_results[:list]
@hitcount = search_results[:hitcount]

=== search result paging
Use the :first_doc and :num_docs options to implement
paging through search results.
(:num_docs is 10 by default, so without using these options only the first 10
results will be retrieved)

require 'rdig'
require 'rdig_config'

- place rdig_config.rb into config/ directory.
== sample configuration

- build index:
rdig -c config/rdig_config.rb
The sample configuration below is taken from doc/examples/config.rb. The tag_selector properties are called
with a BeautifulSoup instance as parameter. See the RubyfulSoup Site[http://www.crummy.com/software/RubyfulSoup/documentation.html] for more info about this cool lib.
You can also have a look at the +html_content_extractor+ unit test.

- in your controller handling the search form:
See http://www.crummy.com/software/RubyfulSoup/documentation.html for API
documentation of the Rubyful Soup lib used

search_results = RDig.searcher.search(params[:query])
@results = search_results[:list]
@hitcount = search_results[:hitcount]
:include:doc/examples/config.rb

use the :first_doc and :num_docs options to implement
paging through search results

(:num_docs is 10 by default, so without using these options only the first 10
results will be retrieved)

1 change: 1 addition & 0 deletions doc/.svnignore
@@ -0,0 +1 @@
html
8 changes: 2 additions & 6 deletions doc/examples/config.rb
@@ -1,7 +1,3 @@
#
# sample RDig configuration file, edit to taste
#

RDig.configuration do |cfg|

##################################################################
Expand Down Expand Up @@ -33,11 +29,11 @@

# crawler options

# nil (index all documents) or a list of Regexps
# nil (index all documents) or an array of Regexps
# matching URLs you want to index.
# cfg.crawler.include_documents = nil

# nil (no documents excluded) or a list of Regexps
# nil (no documents excluded) or an array of Regexps
# matching URLs not to index.
# this filter is used after the one above, so you only need
# to exclude documents here that aren't wanted but would be
Expand Down
1 change: 1 addition & 0 deletions install.rb
Expand Up @@ -13,6 +13,7 @@
# handle running it. Under windows, we add a '.rb' extension
# and let file associations do their stuff
#
# based on install.rb from the Rake distribution

def installBIN(from, opfile)

Expand Down
96 changes: 57 additions & 39 deletions lib/rdig.rb
Expand Up @@ -34,6 +34,7 @@
require 'ostruct'
require 'uri'
require 'cgi'
require 'set'
require 'net/http'
require 'getoptlong'

Expand All @@ -58,6 +59,7 @@
$KCODE = 'u'
require 'jcode'

# See README for basic usage information
module RDig

class << self
Expand All @@ -83,46 +85,52 @@ def searcher
@searcher ||= Search::Searcher.new(config.ferret)
end

def config
@config ||= OpenStruct.new(
:crawler => OpenStruct.new(
:start_urls => [ "http://localhost:3000/" ],
:include_hosts => [ "localhost" ],
:include_documents => nil,
:exclude_documents => nil,
:index_document => nil,
:num_threads => 2,
:max_redirects => 5,
:wait_before_leave => 10
),
:content_extraction => OpenStruct.new(
# settings for html content extraction
:html => OpenStruct.new(
# select the html element that contains the content to index
# by default, we index all inside the body tag:
:content_tag_selector => lambda { |tagsoup|
tagsoup.html.body
},
# select the html element containing the title
:title_tag_selector => lambda { |tagsoup|
tagsoup.html.head.title
}
# RDig configuration
#
# may be used with a block:
# RDig.configuration do |config| ...
#
# see doc/examples/config.rb for a commented example configuration
def configuration
if block_given?
yield configuration
else
@config ||= OpenStruct.new(
:crawler => OpenStruct.new(
:start_urls => [ "http://localhost:3000/" ],
:include_hosts => [ "localhost" ],
:include_documents => nil,
:exclude_documents => nil,
:index_document => nil,
:num_threads => 2,
:max_redirects => 5,
:wait_before_leave => 10
),
:content_extraction => OpenStruct.new(
# settings for html content extraction
:html => OpenStruct.new(
# select the html element that contains the content to index
# by default, we index all inside the body tag:
:content_tag_selector => lambda { |tagsoup|
tagsoup.html.body
},
# select the html element containing the title
:title_tag_selector => lambda { |tagsoup|
tagsoup.html.head.title
}
)
),
:ferret => OpenStruct.new(
:path => "index/",
:create => true,
:handle_parse_errors => true,
:analyzer => Ferret::Analysis::StandardAnalyzer.new,
:occur_default => Ferret::Search::BooleanClause::Occur::MUST
)
),
:ferret => OpenStruct.new(
:path => "index/",
:create => true,
:handle_parse_errors => true,
:analyzer => Ferret::Analysis::StandardAnalyzer.new,
:occur_default => Ferret::Search::BooleanClause::Occur::MUST
)
)
end

# RDig.configuration do |config| ...
def configuration
yield config
end
end
alias config configuration

end

Expand Down Expand Up @@ -214,9 +222,19 @@ def run

if options.query
# query the index
puts "executing query #{options.query}"
p RDig.searcher.search(options.query)
puts "executing query >#{options.query}<"
results = RDig.searcher.search(options.query)
puts "total results: #{results[:hitcount]}"
results[:list].each { |result|
puts <<-EOF
#{result[:url]}
#{result[:title]}
#{result[:extract]}
EOF
}
else
# rebuild index
@crawler = Crawler.new
@crawler.run
end
Expand Down
91 changes: 64 additions & 27 deletions lib/rdig/content_extractors.rb
Expand Up @@ -23,9 +23,13 @@ def handle_charref(ref)

module RDig

# todo support at least pdf, too
# Contains Classes which are used for extracting content and meta data from
# various content types.
#
# TODO: support at least pdf, too.
module ContentExtractors

# process the given +content+ depending on its +content_type+.
def ContentExtractors.process(content, content_type)
case content_type
when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
Expand All @@ -36,43 +40,36 @@ def ContentExtractors.process(content, content_type)
return nil
end

# extracts title, content and links from html documents
class HtmlContentExtractor

# returns:
# { :content => 'extracted clear text',
# :meta => { :title => 'Title' },
# :links => [array of urls] }
def self.process(content)
result = { :title => '' }
result = { }
tag_soup = BeautifulSoup.new(content)
the_title_tag = title_tag(tag_soup)
result[:title] = the_title_tag.string.strip if the_title_tag
result[:title] = extract_title(tag_soup)
result[:links] = extract_links(tag_soup)
content = ''
result[:content] = extract_content(tag_soup)
return result
end

# links should be taken from the whole document, though, not just from the content
process_child = lambda { |child|
if child.is_a? NavigableString
value = self.strip_comments(child)
value.strip!
unless value.empty?
content << value
content << ' '
end
elsif child.string # it's a Tag, and it has some content string
value = child.string.strip
unless value.empty?
content << value
content << ' '
end
else
child.children(&process_child)
end
true
# Extracts textual content from the HTML tree.
#
# - First, the root element to use is determined using the
# +content_element+ method, which itself uses the content_tag_selector
# from RDig.configuration.
# - Then, this element is processed by +extract_text+, which will give
# all textual content contained in the root element and all its
# children.
def self.extract_content(tag_soup)
content = ''
content_element(tag_soup).children { |child|
extract_text(child, content)
}
content_element(tag_soup).children(&process_child)
result[:content] = content.strip #CGI.unescapeHTML(content.strip)
return result
return content.strip
end

# extracts the href attributes of all a tags, except
Expand All @@ -83,6 +80,44 @@ def self.extract_links(tagsoup)
}.compact
end

# Extracts the title from the given html tree.
#
# The element to use is looked up with +title_tag+. If that lookup
# yields a String (e.g. an attribute value), it is returned as is.
# Otherwise all text below the element is gathered with +extract_text+
# and returned with surrounding whitespace removed.
#
# Returns nil if no title element was found.
def self.extract_title(tagsoup)
  the_title_tag = title_tag(tagsoup)
  return nil unless the_title_tag
  return the_title_tag if the_title_tag.is_a? String

  # Gather the text in our own buffer instead of relying on the return
  # value of +extract_text+: a blank title must give '' and not raise
  # NoMethodError on nil.
  title = ''
  extract_text(the_title_tag, title)
  title.strip
end

# Recursively extracts all text contained in the given element,
# and appends it to +content+.
#
# element:: a NavigableString (text node), or a tag responding to
#           +string+ and +children+
# content:: string buffer the text is appended to; every non-empty
#           piece of text is followed by a single space
#
# Always returns +content+, so the result can be used directly
# (e.g. <tt>extract_text(tag).strip</tt>), even if nothing was appended.
def self.extract_text(element, content='')
  text =
    if element.is_a? NavigableString
      # text node: remove html comments, then surrounding whitespace
      strip_comments(element).strip
    elsif element.string # it's a Tag, and it has some content string
      element.string.strip
    end

  if text
    content << text << ' ' unless text.empty?
  else
    # tag without a direct content string: descend into its children
    element.children { |child| extract_text(child, content) }
  end
  content
end

# Returns the element to extract the title from.
#
# This may return a string, e.g. an attribute value selected from a meta
# tag, too.
def self.title_tag(tagsoup)
if RDig.config.content_extraction.html.title_tag_selector
RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
Expand All @@ -91,6 +126,7 @@ def self.title_tag(tagsoup)
end
end

# Retrieve the root element to extract document content from
def self.content_element(tagsoup)
if RDig.config.content_extraction.html.content_tag_selector
RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
Expand All @@ -99,6 +135,7 @@ def self.content_element(tagsoup)
end
end

# Return the given string minus all html comments.
#
# Comments may span several lines (the +m+ flag lets <tt>.</tt> match
# newlines); the non-greedy match removes each comment separately.
# A regexp literal is used because the third argument of Regexp.new
# is not accepted by newer Ruby versions.
def self.strip_comments(string)
  string.gsub(/<!--.*?-->/m, '')
end
Expand Down

0 comments on commit 2ffe8f1

Please sign in to comment.