# spec/wukong-hadoop/hadoop_mode_spec.rb

require 'spec_helper'

# Specs for the Hadoop command line assembled by
# Wukong::Hadoop::HadoopInvocation.
#
# Each example builds a runner with the `hadoop_runner` helper (defined
# outside this file -- presumably in spec_helper) and inspects the string
# returned by `#hadoop_commandline`, or sets a message expectation on the
# runner via the block passed to the helper.
describe Wukong::Hadoop::HadoopInvocation do

  # Fixtures. All of them share the same two comma-separated input paths and
  # the same output path; they differ only in processors and extra options.

  # A single processor ('regexp') and no reducer.
  let(:map_only) do
    hadoop_runner('regexp', input: '/tmp/input1,/tmp/input2', output: '/tmp/output')
  end

  # A mapper ('regexp') followed by a reducer ('count').
  let(:map_reduce) do
    hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output')
  end

  # Map/reduce job with friendly JobConf names, raw --java_opts and an
  # explicit reducer count.
  let(:complex) do
    hadoop_runner('regexp', 'count',
                  input: '/tmp/input1,/tmp/input2',
                  output: '/tmp/output',
                  map_tasks: '100',
                  job_name: 'testy',
                  java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye'],
                  reduce_tasks: 20)
  end

  # Map/reduce job with custom input and output format classes.
  let(:custom_io) do
    hadoop_runner('regexp', 'count',
                  input: '/tmp/input1,/tmp/input2',
                  output: '/tmp/output',
                  input_format: 'com.example.InputFormat',
                  output_format: 'com.example.OutputFormat')
  end

  # Map/reduce job that ships extra files, archives and jars.
  let(:many_files) do
    hadoop_runner('regexp', 'count',
                  input: '/tmp/input1,/tmp/input2',
                  output: '/tmp/output',
                  files: %w[/file/1 /file/2],
                  archives: %w[/archive/1 /archive/2],
                  jars: %w[/jar/1 /jar/2])
  end

  context "defining input paths" do
    it "raises an error unless given an --input option" do
      lambda { hadoop_runner('regexp', output: '/tmp/output') }.should raise_error(Wukong::Error, /--input.*required/)
    end
    it "sets its input paths correctly" do
      map_reduce.hadoop_commandline.should match(%r{-input\s+'/tmp/input1,/tmp/input2'})
    end
    it "sets its input format given the --input_format option" do
      # Dots are escaped so the class name is matched literally.
      custom_io.hadoop_commandline.should match(%r{-inputformat\s+'com\.example\.InputFormat'})
    end
  end

  context "defining its output path" do
    it "raises an error unless given an --output option" do
      lambda { hadoop_runner('regexp', input: '/tmp/output') }.should raise_error(Wukong::Error, /--output.*required/)
    end
    it "sets its output path correctly" do
      map_reduce.hadoop_commandline.should match(%r{-output\s+'/tmp/output'})
    end
    it "sets its output format given the --output_format option" do
      # Dots are escaped so the class name is matched literally.
      custom_io.hadoop_commandline.should match(%r{-outputformat\s+'com\.example\.OutputFormat'})
    end
  end

  context "defining its mapper and reducer" do
    it "sets its mapper correctly" do
      map_reduce.hadoop_commandline.should match(%r{-mapper\s+'wu-local regexp'})
    end
    it "sets its reducer correctly" do
      map_reduce.hadoop_commandline.should match(%r{-reducer\s+'wu-local count'})
    end
    it "uses a blank reducer for a map-only job" do
      map_only.hadoop_commandline.should match(%r{-reducer\s+''})
    end
  end

  context "setting the number of reduce tasks" do
    # The property name contains dots; they are escaped so that only the
    # literal `mapred.reduce.tasks` key satisfies (or violates) the assertion.
    it "does nothing on a map/reduce job" do
      map_reduce.hadoop_commandline.should_not match(%r{-D mapred\.reduce\.tasks})
    end
    it "respects the option when given" do
      complex.hadoop_commandline.should match(%r{-D mapred\.reduce\.tasks=20})
    end
    it "sets reduce tasks to 0 for a map-only job" do
      map_only.hadoop_commandline.should match(%r{-D mapred\.reduce\.tasks=0})
    end
  end

  context "defining Hadoop JobConf options" do
    it "translates friendly names into native ones" do
      complex.hadoop_commandline.should include("-D mapred.job.name='testy'")
      complex.hadoop_commandline.should include("-D mapred.map.tasks=100")
    end
    it "passes options in the given --java_opts option" do
      # The first --java_opts entry carries two -D flags in one string; each
      # flag is expected to appear in the command line.
      complex.hadoop_commandline.should include('-D foo.bar=3', '-D baz.booz=hello', '-D hi.there=bye')
    end
  end

  context "removing existing output paths" do
    # The block given to hadoop_runner sets a message expectation with an
    # implicit receiver -- presumably the helper evaluates it against the
    # runner before running it; confirm in the spec helper.

    it "will not remove the output path by default" do
      hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path) }
    end
    it "will remove the output path when given the --rm option" do
      hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path) }
    end
    it "will not remove the output path when given the --rm option AND the --dry_run option" do
      # NOTE(review): the description says "will not remove", yet the
      # expectation is should_receive. Presumably remove_output_path is still
      # invoked under --dry_run and skips the actual removal internally --
      # verify against the implementation before changing this expectation.
      hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path) }
    end
  end

  context "handle files, jars, and archives" do
    it "does not include any files, jars, or archives when no files were passed" do
      map_reduce.hadoop_commandline.should_not match(%r{-(files|archives|libjars)})
    end
    it "should include files when asked" do
      many_files.hadoop_commandline.should match(%r{-files\s+'/file/1,/file/2'})
    end
    it "should include jars when asked" do
      many_files.hadoop_commandline.should match(%r{-libjars\s+'/jar/1,/jar/2'})
    end
    it "should include archives when asked" do
      many_files.hadoop_commandline.should match(%r{-archives\s+'/archive/1,/archive/2'})
    end
    it "should include files when passed files as arguments" do
      # Processors given as script paths are expected to show up in -files.
      hadoop_runner(examples_dir('tokenizer.rb'), examples_dir('counter.rb'), input: '/tmp/input1,/tmp/input2', output: '/tmp/output').hadoop_commandline.should match(%r{-files.+tokenizer\.rb,.*counter\.rb})
    end
  end

end