Browse files

Added support for the "scan" search in the Ruby API

Given the support for search types (since karmi/retire@d764c0d), we can now
expose the "scan" search type in the Ruby API and DSL.

Example
-------

    # Let's index some data
    documents = (1..100).map { |i| { id: i, type: 'test', title: "Document #{i}" } }

    Tire.index 'scantest' do
      delete
      create :settings => { :number_of_shards => 1, :number_of_replicas => 0 }
      import documents
      refresh
    end

    # Let's initiate a scan search request
    s = Tire.scan 'scantest'

    # The #each method yields the document batches as received from ElasticSearch,
    # where individual hits are wrapped as Results::Item
    s.each do |results|
      puts results.map(&:title)
    end

    # The #each_document method yields each individual document, as received from ElasticSearch,
    # wrapped as a Results::Item
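    # (the scan above has already been consumed, so let's initiate a new one)
    s = Tire.scan 'scantest'

    count = 0
    s.each_document do |document|
      puts "#{count}: #{document.title}"
      count += 1
    end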

Information
-----------

The object being returned is fully Enumerable-compatible.

See the unit and integration tests for details on usage.

Inspired by Michael Grassotti's and Erick Tryzelaar's work at:

* grasscode/tire@63fefb9
* https://gist.github.com/2240721

See the ElasticSearch documentation:

* http://www.elasticsearch.org/guide/reference/api/search/search-type.html
* http://www.elasticsearch.org/guide/reference/api/search/scroll.html

Closes #196.
  • Loading branch information...
1 parent 5fb37e7 commit c1544740f7cd52eecb49274d3ab9d73f8374bc28 @karmi committed Mar 30, 2012
Showing with 310 additions and 0 deletions.
  1. +1 −0 lib/tire.rb
  2. +4 −0 lib/tire/dsl.rb
  3. +114 −0 lib/tire/search/scan.rb
  4. +56 −0 test/integration/scan_test.rb
  5. +113 −0 test/unit/search_scan_test.rb
  6. +22 −0 test/unit/tire_test.rb
View
1 lib/tire.rb
@@ -23,6 +23,7 @@
require 'tire/search/facet'
require 'tire/search/filter'
require 'tire/search/highlight'
+require 'tire/search/scan'
require 'tire/results/pagination'
require 'tire/results/collection'
require 'tire/results/item'
View
4 lib/tire/dsl.rb
@@ -31,5 +31,9 @@ def index(name, &block)
Index.new(name, &block)
end
# Builds a scan/scroll search object for the given index or indices.
#
# All three arguments are handed over to `Search::Scan.new` as they are:
#
# names   - the index name(s) to scan
# options - a Hash of options (an empty Hash when omitted)
# block   - an optional block
#
# Returns the newly created `Search::Scan` instance.
def scan(names, options = {}, &block)
  Search::Scan.new(names, options, &block)
end
+
end
end
View
114 lib/tire/search/scan.rb
@@ -0,0 +1,114 @@
module Tire
  module Search

    # Performs a "scan/scroll" search request, which obtains a `scroll_id`
    # and keeps returning documents matching the passed query (or all documents) in batches.
    #
    # You may want to iterate over the batches being returned:
    #
    #     search = Tire::Search::Scan.new('articles')
    #     search.each do |results|
    #       puts results.map(&:title)
    #     end
    #
    # The scan object includes Enumerable, so you may call methods like
    # `map` or `each_with_index` on it. Note that `size` and `to_a` are
    # overridden and refer to the *current batch* only.
    #
    # To iterate over individual documents, use the `each_document` method:
    #
    #     search.each_document do |document|
    #       puts document.title
    #     end
    #
    # Both `each` and `each_document` return an Enumerator when called without a block.
    #
    # You may limit the result set being returned by a regular Tire DSL query,
    # passed as a block (the second argument is a Hash of options, which is
    # handed over to the underlying search object):
    #
    #     search = Tire::Search::Scan.new('articles') do
    #       query { term 'author.exact', 'John Smith' }
    #     end
    #
    # The feature is also exposed in the Tire top-level DSL:
    #
    #     search = Tire.scan 'articles' do
    #       query { term 'author.exact', 'John Smith' }
    #     end
    #
    # See ElasticSearch documentation for further reference:
    #
    # * http://www.elasticsearch.org/guide/reference/api/search/search-type.html
    # * http://www.elasticsearch.org/guide/reference/api/search/scroll.html
    #
    class Scan
      include Enumerable

      attr_reader :indices, :options, :search

      # indices - an index name or an Array of index names
      # options - a Hash of options; `:search_type` and `:scroll` are always
      #           set to 'scan' and '10m'
      # block   - an optional block, passed to the underlying search object
      #
      def initialize(indices=nil, options={}, &block)
        @indices = Array(indices)
        # Use `merge`, not `update`: the Hash passed in by the caller must not be modified.
        @options = options.merge(:search_type => 'scan', :scroll => '10m')
        @seen    = 0
        @search  = Search.new(@indices, @options, &block)
      end

      def url;      Configuration.url + "/_search/scroll";           end
      def params;   @options.empty? ? '' : '?' + @options.to_param;  end

      # These readers perform the first scroll request lazily, when there's no value yet.
      #
      def results;  @results  || (__perform; @results);              end
      def response; @response || (__perform; @response);             end
      def json;     @json     || (__perform; @json);                 end
      def total;    @total    || (__perform; @total);                end

      # Number of documents received so far. It is 0 before the first batch
      # has been fetched; reading it never triggers a request.
      #
      def seen;     @seen;                                           end

      # The scroll ID for the next request; the initial one is obtained
      # by performing the underlying search.
      #
      def scroll_id
        @scroll_id ||= @search.perform.json['_scroll_id']
      end

      # Yields batches of documents, until an empty batch is received.
      #
      def each
        return enum_for(:each) unless block_given?

        until results.empty?
          yield results.results
          __perform
        end
      end

      # Yields individual documents, until an empty batch is received.
      #
      def each_document
        return enum_for(:each_document) unless block_given?

        until results.empty?
          results.each { |item| yield item }
          __perform
        end
      end

      # Size of the current batch.
      #
      def size
        results.size
      end

      # Requests the next batch and updates the scroll ID and the counters.
      # The request is logged even when it fails.
      #
      def __perform
        @response  = Configuration.client.get [url, params].join, scroll_id
        @json      = MultiJson.decode @response.body
        @results   = Results::Collection.new @json, @options
        @total     = @json['hits']['total'].to_i
        @seen     += @results.size
        @scroll_id = @json['_scroll_id']
        self
      ensure
        __logged
      end

      # The current batch (not all the documents).
      #
      def to_a; results; end
      alias :to_ary :to_a

      def to_curl; %Q|curl -X GET "#{url}?pretty=true" -d '#{@scroll_id}'|; end

      # Logs the request and the progress, when a logger is configured.
      #
      def __logged(error=nil)
        return unless Configuration.logger

        Configuration.logger.log_request 'scroll', nil, to_curl

        # Either value may be missing when the request or the decoding has failed.
        took = @json.is_a?(Hash) ? @json['took'] : nil
        code = @response ? @response.code : nil
        body = "#{@seen}/#{@total} (#{@seen/@total.to_f*100}%)"

        Configuration.logger.log_response code || 'N/A', took || 'N/A', body
      end

    end

  end
end
View
56 test/integration/scan_test.rb
@@ -0,0 +1,56 @@
+require 'test_helper'
+
module Tire

  # Integration tests for the scan search: they index 100 documents
  # into the `scantest` index and iterate over them with `Tire.scan`.
  #
  class ScanIntegrationTest < Test::Unit::TestCase
    include Test::Integration

    context "Scan" do
      setup do
        documents = (1..100).map { |i| { :id => i, :type => 'test', :title => "Document #{i}" } }

        Tire.index 'scantest' do
          delete
          create :settings => { :number_of_shards => 1, :number_of_replicas => 0 }
          import documents
          refresh
        end
      end

      teardown { Index.new('scantest').delete }

      should "iterate over batches of documents" do
        count = 0

        s = Tire.scan 'scantest'
        s.each { |_results| count += 1 }

        assert_equal 10, count
      end

      should "iterate over individual documents" do
        count = 0

        s = Tire.scan 'scantest'
        s.each_document { |_document| count += 1 }

        assert_equal 100, count
      end

      should "limit the returned results by query" do
        count = 0

        s = Tire.scan('scantest') { query { string '10*' } }
        s.each do |results|
          count += 1
          # The scan search type does not sort the hits, so don't depend on their order.
          assert_equal ['Document 10', 'Document 100'], results.map(&:title).sort
        end

        assert_equal 1, count
      end

    end

  end

end
View
113 test/unit/search_scan_test.rb
@@ -0,0 +1,113 @@
+require 'test_helper'
+
module Tire
  module Search

    # Unit tests for Tire::Search::Scan; all HTTP traffic is mocked.
    #
    class ScanTest < Test::Unit::TestCase

      context "Scan" do
        setup do
          Configuration.reset
          @results = {
            "_scroll_id" => "abc123",
            "took" => 3,
            "hits" => {
              "total" => 10,
              "hits" => [
                { "_id" => "1", "_source" => { "title" => "Test" } }
              ]
            }
          }
          @empty_results    = @results.merge('hits' => {'hits' => []})
          @default_response = mock_response @results.to_json, 200
        end

        should "initialize the search object with the indices" do
          s = Scan.new(['index1', 'index2'])
          assert_instance_of Tire::Search::Search, s.search
        end

        should "fetch the initial scroll ID" do
          s = Scan.new('index1')
          s.search.expects(:perform)
                  .returns(stub :json => { '_scroll_id' => 'abc123' })

          assert_equal 'abc123', s.scroll_id
        end

        should "perform the request lazily" do
          s = Scan.new('dummy')

          s.expects(:scroll_id)
           .returns('abc123')
           .at_least_once

          Configuration.client.expects(:get)
                              .with { |url,id| url =~ %r|_search/scroll.*search_type=scan| && id == 'abc123' }
                              .returns(@default_response)
                              .once

          assert_not_nil s.results
          assert_not_nil s.response
          assert_not_nil s.json
        end

        should "set the total and seen variables" do
          s = Scan.new('dummy')
          s.expects(:scroll_id).returns('abc123').at_least_once
          Configuration.client.expects(:get).returns(@default_response).at_least_once

          assert_equal 10, s.total
          assert_equal 1,  s.seen
        end

        should "log the request and response" do
          Tire.configure { logger STDERR }

          s = Scan.new('dummy')
          s.expects(:scroll_id).returns('abc123').at_least_once
          Configuration.client.expects(:get).returns(@default_response).at_least_once

          Configuration.logger.expects(:log_request)
                              .with { |(endpoint, params, curl)| endpoint == 'scroll' }

          Configuration.logger.expects(:log_response)
                              .with { |code, took, body| code == 200 && took == 3 && body == '1/10 (10.0%)' }

          s.__perform
        end

        context "results" do
          setup do
            @search = Scan.new('dummy')
            @search.stubs(:scroll_id).returns('abc123')

            # Mock the HTTP layer instead of `results`: the first request returns
            # one document, every following one returns an empty batch, which
            # ends the iteration. (Stubbing `results` itself made the iteration
            # blocks never run, so the assertions in them were never checked.)
            Configuration.client.stubs(:get)
                                .returns(mock_response(@results.to_json, 200))
                                .then
                                .returns(mock_response(@empty_results.to_json, 200))
          end

          should "be iterable" do
            assert_respond_to @search, :each
            assert_respond_to @search, :size

            batches = []
            assert_nothing_raised do
              @search.each { |batch| batches << batch }
            end

            assert_equal 1, batches.size
            assert_equal 'Test', batches.first.first.title
          end

          should "be iterable by individual documents" do
            assert_respond_to @search, :each_document

            documents = []
            assert_nothing_raised do
              @search.each_document { |item| documents << item }
            end

            assert_equal 1, documents.size
            assert_equal 'Test', documents.first.title
          end

        end

      end

    end
  end
end
View
22 test/unit/tire_test.rb
@@ -64,6 +64,28 @@ class TireTest < Test::Unit::TestCase
end
# Tire.scan is expected to hand its arguments over to Search::Scan.new.
context "when scanning an index" do

  should "initiate the scan" do
    # Only the first argument (the index name) is checked here.
    Search::Scan.expects(:new).with { |name| name == 'dummy' }

    Tire.scan('dummy')
  end

  should "allow to pass the query as a block to scan" do
    Search::Scan.expects(:new).with { |name| name == 'dummy' }

    Tire.scan('dummy') do
      query { string 'foo' }
    end
  end

  should "allow to pass the query as a hash to scan" do
    payload = { :query => { :query_string => { :query => 'foo' } } }

    # The hash must arrive as the second argument, unchanged.
    Search::Scan.expects(:new).with('dummy', payload)

    Tire.scan 'dummy', payload
  end

end
+
end
context "utils" do

0 comments on commit c154474

Please sign in to comment.