Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Added support for the "scan" search in the Ruby API

Given the support for search types (since karmi/retire@d764c0d), we can now
expose the "scan" search type in the Ruby API and DSL.

Example
-------

    # Let's index some data
    documents = (1..100).map { |i| { id: i, type: 'test', title: "Document #{i}" } }

    Tire.index 'scantest' do
      delete
      create :settings => { :number_of_shards => 1, :number_of_replicas => 0 }
      import documents
      refresh
    end

    # Let's initiate a scan search request
    s = Tire.scan 'scantest'

    # The #each method returns the document batches as received from ElasticSearch,
    # where individual hits are wrapped as Results::Item
    s.each do |results|
      puts results.map(&:title)
    end

    # The #each_document method yields each individual document, as received from ElasticSearch,
    # wrapped as a Results::Item
    count = 1
    s.each_document do |document|
      puts "#{count}: #{document.title}"
      count += 1
    end

Information
-----------

The object being returned is fully Enumerable-compatible.

See the unit and integration tests for details on usage.

Inspired by Michael Grassotti's and Erick Tryzelaar's work at:

* grasscode/tire@63fefb9
* https://gist.github.com/2240721

See the ElasticSearch documentation:

* http://www.elasticsearch.org/guide/reference/api/search/search-type.html
* http://www.elasticsearch.org/guide/reference/api/search/scroll.html

Closes #196.
  • Loading branch information...
commit c1544740f7cd52eecb49274d3ab9d73f8374bc28 1 parent 5fb37e7
@karmi authored
View
1  lib/tire.rb
@@ -23,6 +23,7 @@
require 'tire/search/facet'
require 'tire/search/filter'
require 'tire/search/highlight'
+require 'tire/search/scan'
require 'tire/results/pagination'
require 'tire/results/collection'
require 'tire/results/item'
View
4 lib/tire/dsl.rb
@@ -31,5 +31,9 @@ def index(name, &block)
Index.new(name, &block)
end
+ # Entry point for the "scan" search in the top-level DSL.
+ #
+ # Builds and returns a Search::Scan for the given index name(s), forwarding
+ # the options hash and the optional block unchanged.
+ def scan(names, options={}, &block)
+ Search::Scan.new(names, options, &block)
+ end
+
end
end
View
114 lib/tire/search/scan.rb
@@ -0,0 +1,114 @@
+module Tire
+ module Search
+
+
+ # Performs a "scan/scroll" search request, which obtains a `scroll_id`
+ # and keeps returning documents matching the passed query (or all documents) in batches.
+ #
+ # You may want to iterate over the batches being returned:
+ #
+ # search = Tire::Search::Scan.new('articles')
+ # search.each do |results|
+ # puts results.map(&:title)
+ # end
+ #
+ # The scan object has a fully Enumerable-compatible interface, so you may
+ # call methods like `map` or `each_with_index` on it.
+ #
+ # To iterate over individual documents, use the `each_document` method:
+ #
+ # search.each_document do |document|
+ # puts document.title
+ # end
+ #
+ # You may limit the result set being returned by a regular Tire DSL query
+ # (or a hash, if you prefer), passed as a second argument:
+ #
+ # search = Tire::Search::Scan.new('articles') do
+ # query { term 'author.exact', 'John Smith' }
+ # end
+ #
+ # The feature is also exposed in the Tire top-level DSL:
+ #
+ # search = Tire.scan 'articles' do
+ # query { term 'author.exact', 'John Smith' }
+ # end
+ #
+ # See ElasticSearch documentation for further reference:
+ #
+ # * http://www.elasticsearch.org/guide/reference/api/search/search-type.html
+ # * http://www.elasticsearch.org/guide/reference/api/search/scroll.html
+ #
+ class Scan
+   include Enumerable
+
+   attr_reader :indices, :options, :search
+
+   # Prepares a scan over the given indices.
+   #
+   # indices - index name or Array of index names (wrapped with `Array()`)
+   # options - Hash of options passed on to the underlying Search; the
+   #           `:search_type` and `:scroll` keys are always set here
+   # block   - optional block handed to the underlying Search
+   #
+   def initialize(indices=nil, options={}, &block)
+     @indices = Array(indices)
+     # `merge` instead of `update`: do not modify the Hash owned by the caller
+     @options = options.merge(:search_type => 'scan', :scroll => '10m')
+     @seen    = 0
+     @search  = Search.new(@indices, @options, &block)
+   end
+
+   # URL of the scroll endpoint and the query string built from the options
+   def url;      Configuration.url + "/_search/scroll";          end
+   def params;   @options.empty? ? '' : '?' + @options.to_param; end
+
+   # Lazy readers: the first call performs the scroll request
+   def results;  @results  || (__perform; @results);  end
+   def response; @response || (__perform; @response); end
+   def json;     @json     || (__perform; @json);     end
+   def total;    @total    || (__perform; @total);    end
+
+   # Number of documents received so far. `@seen` starts at 0, which is
+   # truthy in Ruby, so this reader never triggers a request by itself.
+   def seen;     @seen; end
+
+   # Returns the current scroll ID; the initial one is obtained by
+   # performing the underlying search.
+   def scroll_id
+     @scroll_id ||= @search.perform.json['_scroll_id']
+   end
+
+   # Yields each batch of results, requesting the next batch after every
+   # yield, until an empty batch is received.
+   #
+   # Returns an Enumerator when called without a block.
+   #
+   def each
+     return enum_for(:each) unless block_given?
+
+     until results.empty?
+       yield results.results
+       __perform
+     end
+   end
+
+   # Yields every item of every batch, requesting the next batch when the
+   # current one is exhausted, until an empty batch is received.
+   #
+   # Returns an Enumerator when called without a block.
+   #
+   def each_document
+     return enum_for(:each_document) unless block_given?
+
+     until results.empty?
+       results.each { |item| yield item }
+       __perform
+     end
+   end
+
+   # Size of the current batch (not of the whole result set; see `total`)
+   def size
+     results.size
+   end
+
+   # Sends the scroll request with the current scroll ID as the body,
+   # stores the decoded response, updates the `total`/`seen` counters and
+   # remembers the scroll ID returned for the next request.
+   #
+   # The request is logged even when it fails (see the `ensure` clause).
+   #
+   # Returns self.
+   #
+   def __perform
+     @response  = Configuration.client.get [url, params].join, scroll_id
+     @json      = MultiJson.decode @response.body
+     @results   = Results::Collection.new @json, @options
+     @total     = @json['hits']['total'].to_i
+     @seen     += @results.size
+     @scroll_id = @json['_scroll_id']
+     self
+   ensure
+     __logged
+   end
+
+   # Returns the current batch (overrides Enumerable#to_a)
+   def to_a; results; end; alias :to_ary :to_a
+
+   # The request as a `curl` command, used for logging
+   def to_curl; %Q|curl -X GET "#{url}?pretty=true" -d '#{@scroll_id}'|; end
+
+   # Logs the request and a short progress summary ("seen/total (percent%)")
+   # when a logger is configured.
+   #
+   # This runs from the `ensure` clause of `__perform`, so it must not raise
+   # and mask the original error: every value which may be missing after
+   # a failed request is guarded explicitly. The `error` argument is unused.
+   #
+   def __logged(error=nil)
+     logger = Configuration.logger
+     return unless logger
+
+     logger.log_request 'scroll', nil, to_curl
+
+     took = @json.is_a?(Hash) ? @json['took'] : nil
+     code = @response.respond_to?(:code) ? @response.code : nil
+
+     # Report 0.0% instead of NaN when the total is zero or not known yet
+     percent = @total.to_i > 0 ? @seen / @total.to_f * 100 : 0.0
+     body    = "#{@seen}/#{@total} (#{percent}%)"
+
+     logger.log_response code || 'N/A', took || 'N/A', body
+   end
+
+ end
+
+ end
+end
View
56 test/integration/scan_test.rb
@@ -0,0 +1,56 @@
+require 'test_helper'
+
+module Tire
+
+ # Integration tests for `Tire.scan`, run against a freshly created
+ # 'scantest' index holding 100 documents.
+ class ScanIntegrationTest < Test::Unit::TestCase
+ include Test::Integration
+
+ context "Scan" do
+ setup do
+ # 100 documents titled "Document 1" to "Document 100"
+ documents = (1..100).map { |i| { id: i, type: 'test', title: "Document #{i}" } }
+
+ # Recreate the index on a single shard, import the documents and
+ # refresh the index before the tests run
+ Tire.index 'scantest' do
+ delete
+ create :settings => { :number_of_shards => 1, :number_of_replicas => 0 }
+ import documents
+ refresh
+ end
+ end
+
+ teardown { Index.new('scantest').delete }
+
+ should "iterate over batches of documents" do
+ count = 0
+
+ s = Tire.scan 'scantest'
+ s.each { |results| count += 1 }
+
+ # The 100 documents are expected in 10 batches
+ # NOTE(review): presumably the default batch size of 10 per shard -- confirm
+ assert_equal 10, count
+ end
+
+ should "iterate over individual documents" do
+ count = 0
+
+ s = Tire.scan 'scantest'
+ s.each_document { |results| count += 1 }
+
+ # Every one of the imported documents is yielded once
+ assert_equal 100, count
+ end
+
+ should "limit the returned results by query" do
+ count = 0
+
+ # Only 'Document 10' and 'Document 100' are expected to match,
+ # and to be returned in a single batch
+ s = Tire.scan('scantest') { query { string '10*' } }
+ s.each do |results|
+ count += 1
+ assert_equal ['Document 10', 'Document 100'], results.map(&:title)
+ end
+
+ assert_equal 1, count
+ end
+
+ end
+
+ end
+
+end
View
113 test/unit/search_scan_test.rb
@@ -0,0 +1,113 @@
+require 'test_helper'
+
+module Tire
+ module Search
+ class ScanTest < Test::Unit::TestCase
+
+ context "Scan" do
+ setup do
+ Configuration.reset
+ @results = {
+ "_scroll_id" => "abc123",
+ "took" => 3,
+ "hits" => {
+ "total" => 10,
+ "hits" => [
+ { "_id" => "1", "_source" => { "title" => "Test" } }
+ ]
+ }
+ }
+ @empty_results = @results.merge('hits' => {'hits' => []})
+ @default_response = mock_response @results.to_json, 200
+ end
+
+ should "initialize the search object with the indices" do
+ s = Scan.new(['index1', 'index2'])
+ assert_instance_of Tire::Search::Search, s.search
+ end
+
+ should "fetch the initial scroll ID" do
+ s = Scan.new('index1')
+ s.search.expects(:perform)
+ .returns(stub :json => { '_scroll_id' => 'abc123' })
+
+ assert_equal 'abc123', s.scroll_id
+ end
+
+ should "perform the request lazily" do
+ s = Scan.new('dummy')
+
+ s.expects(:scroll_id)
+ .returns('abc123')
+ .at_least_once
+
+ Configuration.client.expects(:get)
+ .with { |url,id| url =~ %r|_search/scroll.*search_type=scan| && id == 'abc123' }
+ .returns(@default_response)
+ .once
+
+ assert_not_nil s.results
+ assert_not_nil s.response
+ assert_not_nil s.json
+ end
+
+ should "set the total and seen variables" do
+ s = Scan.new('dummy')
+ s.expects(:scroll_id).returns('abc123').at_least_once
+ Configuration.client.expects(:get).returns(@default_response).at_least_once
+
+ assert_equal 10, s.total
+ assert_equal 1, s.seen
+ end
+
+ should "log the request and response" do
+ Tire.configure { logger STDERR }
+
+ s = Scan.new('dummy')
+ s.expects(:scroll_id).returns('abc123').at_least_once
+ Configuration.client.expects(:get).returns(@default_response).at_least_once
+
+ Configuration.logger.expects(:log_request)
+ .with { |(endpoint, params, curl)| endpoint == 'scroll' }
+
+ Configuration.logger.expects(:log_response)
+ .with { |code, took, body| code == 200 && took == 3 && body == '1/10 (10.0%)' }
+
+ s.__perform
+ end
+
+ context "results" do
+ setup do
+ @search = Scan.new('dummy')
+ @search.expects(:results)
+ .returns(Results::Collection.new @results)
+ .then
+ .returns(Results::Collection.new @empty_results)
+ .at_least_once
+ @search.results
+ end
+
+ should "be iterable" do
+ assert_respond_to @search, :each
+ assert_respond_to @search, :size
+
+ assert_nothing_raised do
+ @search.each { |batch| p batch; assert_equal 'Test', batch.first.title }
+ end
+ end
+
+ should "be iterable by individual documents" do
+ assert_respond_to @search, :each_document
+
+ assert_nothing_raised do
+ @search.each_document { |item| assert_equal 'Test', item.title }
+ end
+ end
+
+ end
+
+ end
+
+ end
+ end
+end
View
22 test/unit/tire_test.rb
@@ -64,6 +64,28 @@ class TireTest < Test::Unit::TestCase
end
+ context "when scanning an index" do
+ should "initiate the scan" do
+ Search::Scan.expects(:new).with { |index| index == 'dummy' }
+
+ Tire.scan('dummy')
+ end
+
+ should "allow to pass the query as a block to scan" do
+ Search::Scan.expects(:new).with { |index| index == 'dummy' }
+
+ Tire.scan('dummy') { query { string 'foo' } }
+ end
+
+ should "allow to pass the query as a hash to scan" do
+ payload = { :query => { :query_string => { :query => 'foo' } } }
+ Search::Scan.expects(:new).with('dummy', payload)
+
+ Tire.scan 'dummy', payload
+ end
+
+ end
+
end
context "utils" do
Please sign in to comment.
Something went wrong with that request. Please try again.