Permalink
Browse files

Added Ruby gem layer

  • Loading branch information...
Dhruv Bansal
Dhruv Bansal committed Nov 15, 2012
1 parent 5410c6e commit 8288f5e2e4e48e667e04a8f6f03a37980b4b133e
Showing with 262 additions and 0 deletions.
  1. +2 −0 .rspec
  2. +61 −0 bin/wu-hadoop-es
  3. +3 −0 lib/wonderdog.rb
  4. +16 −0 lib/wonderdog/configuration.rb
  5. +145 −0 lib/wonderdog/driver.rb
  6. +3 −0 lib/wonderdog/version.rb
  7. +32 −0 wonderdog.gemspec
View
2 .rspec
@@ -0,0 +1,2 @@
--format=documentation
--color
View
@@ -0,0 +1,61 @@
#!/usr/bin/env ruby
# wu-hadoop-es: command-line wrapper around wu-hadoop that adds
# ElasticSearch-aware handling of --input/--output (es:// URIs).
# The heavy lifting is delegated to Wukong::Elasticsearch::Driver.

require 'wonderdog'

# Shared Configliere settings object declared in
# wonderdog/configuration.rb; enable commandline parsing on it.
settings = Wukong::Elasticsearch::Configuration
settings.use(:commandline)

# Override the default usage banner for this executable.
def settings.usage()
  "usage: #{File.basename($0)} [ --param=value | -p value | --param | -p] PROCESSOR|FLOW [PROCESSOR|FLOW]"
end

settings.description = <<EOF
wu-hadoop-es is a tool that works in combination with wu-hadoop to run
Hadoop jobs powered by Wukong using data read from or written to
ElasticSearch.
The details of how wu-hadoop-es interacts with Hadoop are the same as
for wu-hadoop, so see that tool's reference for more details.
wu-hadoop-es is just a layer over wu-hadoop that makes it easier to
connect to ElasticSearch for reading or writing data. It does this by
providing a simple way to specify ElasticSearch as an output in
Hadoop:
$ wu-hadoop-es mapper.rb reducer.rb --input=/hdfs/input/path --output=es://my_index/my_type
with the es:// scheme-prefix implying that the --output path should be
interpreted as an ElasticSearch index ("my_index") and type
("my_type"). You can have more fine-grained control over how data is
written, see the various "field" options.
You can also easily read data:
$ wu-hadoop-es mapper.rb reducer.rb --input=es://my_index/my_type --output=/hdfs/output/path
and even specify a query
$ wu-hadoop-es mapper.rb reducer.rb --input=es://my_index/my_type --output=/hdfs/output/path --query='{"query_string": {"query": "hi +there"}}'
There are several tuning options available. ElasticSearch
configuration (and how to specify which cluster to find and join) are
handled using ElasticSearch's usual configuration file based approach,
specified by the --es_config option.
EOF

# These options we want to pass on to wu-hadoop but sometimes we want
# to mess with one first so we'll have to reconstruct them later
# ourselves.  The :es => true flag marks them as ElasticSearch-owned so
# Driver#non_es_params won't blindly forward them.
settings.define(:input,  :description => "Comma-separated list of input paths: local, HDFS, or ElasticSearch", :es => true)
settings.define(:output, :description => "Output path: local, HDFS, or ElasticSearch",                         :es => true)
settings.define(:query,  :description => "Query to use when defining input splits for ElasticSearch input",    :es => true)

# Boot Wukong so plugins load and commandline settings are resolved.
require 'wukong/boot' ; Wukong.boot!(settings)

# With no processor/flow arguments there is nothing to run; show help.
if settings.rest.empty?
  settings.dump_help
  exit(1)
end

Wukong::Elasticsearch::Driver.run(settings, settings.rest)
View
@@ -0,0 +1,3 @@
require 'wukong'
require 'wonderdog/configuration'
require 'wonderdog/driver'
@@ -0,0 +1,16 @@
module Wukong
  module Elasticsearch

    # Shared Configliere settings object for all wonderdog tools.  The
    # +unless defined?+ guard makes this file safe to require twice.
    Configuration = Configliere::Param.new unless defined?(Configuration)

    # Each option is flagged :es => true so Driver#non_es_params can
    # distinguish ElasticSearch-specific settings from ones that should
    # be passed through to wu-hadoop untouched.
    Configuration.define(:tmp_dir,        :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :es => true)
    Configuration.define(:config,         :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :es => true)
    Configuration.define(:input_splits,   :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :es => true)
    Configuration.define(:request_size,   :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :es => true)
    Configuration.define(:scroll_timeout, :description => "Amount of time to wait on a scroll", :es => true)
    Configuration.define(:index_field,    :description => "Field to use from each record to override the default index", :es => true)
    Configuration.define(:type_field,     :description => "Field to use from each record to override the default type", :es => true)
    Configuration.define(:id_field,       :description => "If this field is present in a record, make an update request, otherwise make a create request", :es => true)
    Configuration.define(:bulk_size,      :description => "Number of requests to batch locally before making a request to ElasticSearch", :type => Integer, :es => true)
  end
end
View
@@ -0,0 +1,145 @@
require 'shellwords'
require 'uri'
module Wukong
  module Elasticsearch

    # Wraps the +wu-hadoop+ executable, rewriting any <tt>es://</tt>
    # input or output URI into the Hadoop streaming options needed to
    # read from or write to ElasticSearch.
    class Driver

      # Hadoop InputFormat class used when reading from ElasticSearch.
      STREAMING_INPUT_FORMAT = "com.infochimps.elasticsearch.ElasticSearchStreamingInputFormat"

      # Hadoop OutputFormat class used when writing to ElasticSearch.
      STREAMING_OUTPUT_FORMAT = "com.infochimps.elasticsearch.ElasticSearchStreamingOutputFormat"

      # Parses an <tt>es://index/type</tt> URI into its index and type
      # components.
      class IndexAndType

        # The ElasticSearch index and (optional) type named by the URI.
        attr_reader :index, :type

        # @param uri [String] an es://index/type URI
        def initialize uri
          self.uri = uri
        end

        # Parse +u+ and populate +@index+ and +@type+.
        #
        # FIX: this writer was previously declared as a *class* method
        # (+def self.uri=+), so every +IndexAndType.new+ raised
        # NoMethodError and the readers above were never populated.
        # It must be an instance-level writer.
        #
        # @raise [Wukong::Error] if +u+ is not a parseable URI
        def uri= u
          begin
            @uri   = URI.parse(u)
            # For es://my_index/my_type, URI's host part is the index
            # and its path holds the type; re-join them into one path
            # before splitting on '/'.
            path   = File.join(@uri.host, @uri.path)
            @index = path.split('/')[0]
            @type  = path.split('/')[1]
          rescue URI::InvalidURIError => e
            raise Wukong::Error.new("Could not parse '#{u}' as an ElasticSearch /index/type specification")
          end
        end
      end

      # The resolved Configliere settings for this invocation.
      attr_accessor :settings

      # Build a Driver and exec wu-hadoop, printing any Wukong::Error
      # message to $stderr and exiting 127 instead of dumping a
      # backtrace at the user.
      def self.run(settings, *extra_args)
        begin
          new(settings, *extra_args).run!
        rescue Wukong::Error => e
          $stderr.puts e.message
          exit(127)
        end
      end

      # @param settings [Configliere::Param] resolved settings
      # @param args currently unused; wu-hadoop's positional args are
      #   taken from settings.rest instead
      def initialize(settings, *args)
        @settings = settings
      end

      # True when --input is an es:// URI.
      def read?
        settings[:input] && settings[:input] =~ %r!^es://!
      end

      # True when --output is an es:// URI.
      def write?
        settings[:output] && settings[:output] =~ %r!^es://!
      end

      # Parsed index/type of the ElasticSearch input URI (memoized).
      def input
        @input ||= IndexAndType.new(settings[:input])
      end

      # Parsed index/type of the ElasticSearch output URI (memoized).
      def output
        @output ||= IndexAndType.new(settings[:output])
      end

      # Replace the current process with the assembled wu-hadoop
      # command.
      def run!
        # puts wu_hadoop_command
        exec wu_hadoop_command
      end

      # The full wu-hadoop command line to exec.
      def wu_hadoop_command
        [
          'wu-hadoop',
          wu_hadoop_input,
          wu_hadoop_output,
          java_opts,
          non_es_params,
          args
        ].flatten.compact.join(' ')
      end

      # Positional arguments (processors/flows) left after option
      # parsing.
      def args
        settings.rest
      end

      # The --input (and, for ElasticSearch reads, --input_format)
      # options to hand to wu-hadoop.  An es:// input is swapped for a
      # placeholder HDFS path plus the streaming InputFormat.
      def wu_hadoop_input
        case
        when settings[:input] && read?
          "--input=#{tmp_filename(input)} --input_format=#{STREAMING_INPUT_FORMAT}"
        when settings[:input]
          "--input=#{settings[:input]}"
        end
      end

      # The --output (and, for ElasticSearch writes, --output_format)
      # options to hand to wu-hadoop.
      def wu_hadoop_output
        case
        when settings[:output] && write?
          "--output=#{tmp_filename(output)} --output_format=#{STREAMING_OUTPUT_FORMAT}"
        when settings[:output]
          "--output=#{settings[:output]}"
        end
      end

      # Re-serialize every setting NOT flagged :es => true so it passes
      # through to wu-hadoop unchanged.
      def non_es_params
        settings.reject{ |param, val| settings.definition_of(param, :es) }.map{ |param,val| "--#{param}=#{val}" }
      end

      # A unique, human-readable placeholder path under --tmp_dir built
      # from the index/type and the job's script names plus a
      # timestamp.
      def tmp_filename io
        cleaner  = %r{[^\w/\.\-\+]+}
        io_part  = [io.index, io.type].compact.map { |s| s.gsub(cleaner, '') }.join('/')
        job_part = settings.rest.map { |s| File.basename(s, '.rb').gsub(cleaner,'') }.join('---')
        File.join(settings[:tmp_dir], io_part, job_part, Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
      end

      # The --java_opts option carrying -D property definitions for the
      # streaming Input/OutputFormats, or nil when neither reading from
      # nor writing to ElasticSearch.
      def java_opts
        return unless read? || write?
        opts = [].tap do |o|
          o << java_opt('es.config', settings[:config]) if settings[:config]
          if read?
            o << java_opt('elasticsearch.input.index',          input.index)
            o << java_opt('elasticsearch.input.type',           input.type)
            o << java_opt('elasticsearch.input.splits',         settings[:input_splits])   if settings[:input_splits]
            o << java_opt('elasticsearch.input.query',          settings[:query])          if settings[:query]
            o << java_opt('elasticsearch.input.request_size',   settings[:request_size])   if settings[:request_size]
            o << java_opt('elasticsearch.input.scroll_timeout', settings[:scroll_timeout]) if settings[:scroll_timeout]
          end
          if write?
            o << java_opt('elasticsearch.output.index',       output.index)
            o << java_opt('elasticsearch.output.type',        output.type)
            o << java_opt('elasticsearch.output.index.field', settings[:index_field]) if settings[:index_field]
            o << java_opt('elasticsearch.output.type.field',  settings[:type_field])  if settings[:type_field]
            o << java_opt('elasticsearch.output.id.field',    settings[:id_field])    if settings[:id_field]
            o << java_opt('elasticsearch.output.bulk_size',   settings[:bulk_size])   if settings[:bulk_size]
          end
        end
        return if opts.empty?
        joined = opts.join(' ')
        # NOTE(review): the -D values are Shellwords-escaped but then
        # wrapped in single quotes; a value containing a single quote
        # could still break the final command line — confirm upstream.
        "--java_opts='#{joined}'"
      end

      # A single "-D name=value" Java property definition, or nil when
      # value is nil.
      def java_opt name, value
        return if value.nil?
        "-D #{name}=#{Shellwords.escape(value.to_s)}"
      end
    end
  end
end
View
@@ -0,0 +1,3 @@
module Wonderdog
  # Gem version string.  Frozen so the shared constant cannot be
  # mutated in place by callers.
  VERSION = '0.0.1'.freeze
end
View
@@ -0,0 +1,32 @@
# -*- encoding: utf-8 -*-
# Gem specification for wonderdog.  Pulls the version constant from
# lib/wonderdog/version.rb so it is defined in exactly one place.
require File.expand_path('../lib/wonderdog/version', __FILE__)

Gem::Specification.new do |gem|
  gem.name          = 'wonderdog'
  gem.homepage      = 'https://github.com/infochimps-labs/wonderdog'
  gem.licenses      = ["Apache 2.0"]
  gem.email         = 'coders@infochimps.com'
  gem.authors       = ['Infochimps', 'Philip (flip) Kromer', 'Jacob Perkins', 'Travis Dempsey', 'Dhruv Bansal']
  gem.version       = Wonderdog::VERSION

  gem.summary       = 'Make Hadoop and ElasticSearch play together nicely.'
  gem.description   = <<-EOF
  Wonderdog provides code in both Ruby and Java to make ElasticSearch
  a more fully-fledged member of both the Hadoop and Wukong
  ecosystems.
  For the Java side, Wonderdog provides InputFormat and OutputFormat
  classes for use with Hadoop (esp. Hadoop Streaming) and Pig.
  For the Ruby side, Wonderdog provides a simple wrapper for wu-hadoop
  to make running Hadoop Streaming jobs written in Wukong against
  ElasticSearch easier.
EOF

  # Package every git-tracked file; requires building from a git
  # checkout with `git` on the PATH.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = ['wu-hadoop-es']
  gem.test_files    = gem.files.grep(/^spec/)
  gem.require_paths = ['lib']

  gem.add_dependency('configliere', '~> 0.4')
end

0 comments on commit 8288f5e

Please sign in to comment.