Permalink
Browse files

Update the spider.rb and query.rb examples

 - Updated for new API.
 - store the filesize
 - allow queries on fields
 - --order-by-filesize option on query.rb
 - improve benchmark measurements
 - replace Dir.glob with index queue for faster start
  • Loading branch information...
1 parent e22a909 commit be0b105a93286e22f4595f46ee5d89fae38f71a2 @johnl committed Aug 16, 2009
Showing with 78 additions and 21 deletions.
  1. +34 −6 examples/query.rb
  2. +44 −15 examples/spider.rb
View
@@ -1,16 +1,44 @@
#!/usr/bin/ruby
#
+# Example file spider index searcher using XapianFu. Conducts a search
+# on ./spider.db created with spider.rb.
+#
+# --order-by-filesize sorts the results by the file size, largest
+# first. Default is to sort by relevance.
+#
+# All other command line arguments are used as the search query:
+#
+# query.rb --order-by-filesize mammoth -woolley
+#
+# You can limit queries to particular fields:
+#
+# query.rb filename:LICENSE text:BSD
+#
require 'rubygems'
require 'benchmark'
require 'lib/xapian_fu'
-query_string = ARGV.join(" ")
-db = XapianFu::XapianDb.new(:dir => 'spider.db')
+order = nil
+reverse = false
+if ARGV.delete('--order-by-filesize')
+ order = :filesize
+ reverse = true
+end
+query = ARGV.join(" ")
+db = XapianFu::XapianDb.new(:dir => 'spider.db', :fields => [:text, :filesize, :filename])
+puts "Xapian Database has #{db.size} docs in total"
+puts "Largest filesize recorded is #{db.documents.max(:filesize).values[:filesize].to_i / 1024}k"
+puts "Searching for '#{query}'"
results = nil
-bm = Benchmark.measure { results = db.search(query_string) }
-puts "Weight\tFilename"
+bm = Benchmark.measure do
+ results = db.search(query, :order => order, :reverse => reverse)
+end
+puts "Returned #{results.size} of #{results.total_entries} total hits"
+puts "Weight\tFilename\tFilesize"
results.each do |result|
- puts "%.2f\t%s" % [result.weight, result.fields[:filename]]
+ filename = result.values[:filename]
+ filesize = result.values[:filesize].to_i / 1024
+ puts "%.2f\t%s\t%ik" % [result.weight, filename, filesize]
end
-puts "Search took %.5f seconds" % bm.total
+puts "Search took %.5f seconds" % bm.real
View
@@ -1,28 +1,57 @@
#!/usr/bin/ruby
+#
+# Example file spider using XapianFu. Overwrites the index on each run (./spider.db)
+#
+# spider.rb /path/to/index
require 'rubygems'
require 'benchmark'
require 'lib/xapian_fu'
-db = XapianFu::XapianDb.new(:dir => 'spider.db', :store => :filename,
+db = XapianFu::XapianDb.new(:dir => 'spider.db', :store => [:filename, :filesize],
:overwrite => true)
base_path = ARGV[0] || '.'
-docs = 0
+index_queue = [base_path]
+total_file_count = 0
indexing_time = 0.0
-Dir.glob(File.join(base_path, "/**/*")) do |filename|
- next unless File.file?(filename)
- next unless filename =~ /\.(txt|doc|README|c|h|rb|py|note|xml)$/i
- puts "Indexing #{filename}"
- text = File.open(filename) { |f| f.read(10 * 1024) }
- bm = Benchmark.measure do
- db << XapianFu::XapianDoc.new({:text => text, :filename => filename,
- :filesize => File.size(filename) })
+STDERR.write "Indexing\n"
+while dir = index_queue.shift
+ STDERR.write " - #{dir}: "
+ file_count = 0
+ file_data = 0
+ Dir.foreach(dir) do |filename|
+ # skip . and ..
+ next if filename =~ /^[.]{1,2}$/
+ filename = File.join(dir, filename)
+ # Put any directories we find onto the queue for indexing
+ if File.directory?(filename)
+ index_queue << filename
+ next
+ end
+ next unless File.file?(filename)
+ next unless filename =~ /(txt|doc|README|c|h|pl|sh|rb|py|note|xml)$/i
+ file_count += 1
+
+ # Read the first 10k of data
+ text = File.open(filename) { |f| f.read(10 * 1024) }
+ file_data += text.size
+ # Index the data, filename and filesize
+ bm = Benchmark.measure do
+ db << {
+ :text => text,
+ :filename => filename,
+ :filesize => File.size(filename)
+ }
+ end
+ indexing_time += bm.real
end
- indexing_time += bm.total
- docs += 1
- break if docs == 10000
+ STDERR.write("#{file_data / 1024}k in #{file_count} files\n")
+ total_file_count += file_count
end
-indexing_time += Benchmark.measure { db.flush }.total
-puts "#{docs} docs indexed in #{indexing_time} seconds (#{docs / indexing_time} docs per second)"
+
+files_per_second = (total_file_count / indexing_time).round
+puts "#{total_file_count} files indexed in #{indexing_time.round} seconds (#{files_per_second} per second)"
+flush_time = Benchmark.measure { db.flush }.real
+puts "Flush to disk took #{flush_time.round} seconds"

0 comments on commit be0b105

Please sign in to comment.