fix incompatibilities with hadoop 0.20 (fixes GH-20)
klbostee committed Dec 13, 2010
1 parent 550528b commit d143163
Showing 4 changed files with 47 additions and 11 deletions.
10 changes: 10 additions & 0 deletions dumbo/backends/common.py
@@ -103,6 +103,12 @@ def __repr__(self):
         return repr(self.dump())
 
 
+class RunInfo(object):
+
+    def get_input_path(self):
+        return 'unknown'
+
+
 class Iteration(object):
 
     def __init__(self, prog, opts):
@@ -254,3 +260,7 @@ def get_mapredbase_class(self, opts):
     def get_joinkey_class(self, opts):
         """ Returns a suitable JoinKey class """
         return JoinKey
+
+    def get_runinfo_class(self, opts):
+        """ Returns a suitable RunInfo class """
+        return RunInfo
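
Note: the new RunInfo base class is a do-nothing default — get_input_path() answers 'unknown' unless a backend substitutes something smarter through the get_runinfo_class() hook above. A minimal sketch of how a caller might use the hook (the backend and opts names stand in for whatever the caller has in scope; an illustration, not code from this commit):

    # Ask the backend for its RunInfo implementation and instantiate it;
    # the base class simply reports 'unknown' as the input path.
    runinfo = backend.get_runinfo_class(opts)()
    print runinfo.get_input_path()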
37 changes: 28 additions & 9 deletions dumbo/backends/streaming.py
@@ -18,7 +18,7 @@
 import sys
 import re
 
-from dumbo.backends.common import Backend, Iteration, FileSystem
+from dumbo.backends.common import Backend, Iteration, FileSystem, RunInfo
 from dumbo.util import getopt, getopts, configopts, envdef, execute
 from dumbo.util import findhadoop, findjar, dumpcode, dumptext
 
@@ -36,6 +36,9 @@ def create_filesystem(self, opts):
         hadoopopt = getopt(opts, 'hadoop', delete=False)
         return StreamingFileSystem(findhadoop(hadoopopt[0]))
 
+    def get_runinfo_class(self, opts):
+        return StreamingRunInfo
+
 
 class StreamingIteration(Iteration):
 
@@ -201,14 +204,22 @@ def run(self):
         if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
             for (key, value) in self.opts:
                 if key == 'input':
-                    execute("%s/bin/hadoop dfs -rmr '%s'" % (hadoop, value))
+                    if os.path.exists(hadoop + "/bin/hdfs"):
+                        hdfs = hadoop + "/bin/hdfs"
+                    else:
+                        hdfs = hadoop + "/bin/hadoop"
+                    execute("%s dfs -rmr '%s'" % (hdfs, value))
         return retval
 
 
 class StreamingFileSystem(FileSystem):
 
     def __init__(self, hadoop):
         self.hadoop = hadoop
+        if os.path.exists(hadoop + "/bin/hdfs"):
+            self.hdfs = hadoop + "/bin/hdfs"
+        else:
+            self.hdfs = hadoop + "/bin/hadoop"
 
     def cat(self, path, opts):
         addedopts = getopts(opts, ['libjar'], delete=False)
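
Hadoop 0.20 started shipping a separate bin/hdfs launcher for filesystem commands, while older releases only provide bin/hadoop, which is why both spots above probe for bin/hdfs and fall back. The duplicated probe could be factored into a small helper along these lines (a hypothetical refactoring, not part of the commit):

    import os

    def find_dfs_launcher(hadoop):
        # Prefer the dedicated HDFS launcher from Hadoop 0.20+,
        # fall back to the all-in-one bin/hadoop script.
        hdfs = os.path.join(hadoop, 'bin', 'hdfs')
        if os.path.exists(hdfs):
            return hdfs
        return os.path.join(hadoop, 'bin', 'hadoop')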
@@ -220,7 +231,7 @@ def cat(self, path, opts):
                         shortcuts=dict(configopts('jars')))
         try:
             import typedbytes
-            ls = os.popen('%s %s/bin/hadoop dfs -ls %s' % (hadenv, self.hadoop, path))
+            ls = os.popen('%s %s dfs -ls %s' % (hadenv, self.hdfs, path))
             if sum(c in path for c in ("*", "?", "{")) > 0:
                 # cat each file separately when the path contains special chars
                 lineparts = (line.split()[-1] for line in ls)
@@ -247,21 +258,29 @@ def cat(self, path, opts):
         return 0
 
     def ls(self, path, opts):
-        return execute("%s/bin/hadoop dfs -ls '%s'" % (self.hadoop, path),
+        return execute("%s dfs -ls '%s'" % (self.hdfs, path),
                        printcmd=False)
 
     def exists(self, path, opts):
-        shellcmd = "%s/bin/hadoop dfs -stat '%s' >/dev/null 2>&1"
-        return 1 - int(execute(shellcmd % (self.hadoop, path), printcmd=False) == 0)
+        shellcmd = "%s dfs -stat '%s' >/dev/null 2>&1"
+        return 1 - int(execute(shellcmd % (self.hdfs, path), printcmd=False) == 0)
 
     def rm(self, path, opts):
-        return execute("%s/bin/hadoop dfs -rmr '%s'" % (self.hadoop, path),
+        return execute("%s dfs -rmr '%s'" % (self.hdfs, path),
                        printcmd=False)
 
     def put(self, path1, path2, opts):
-        return execute("%s/bin/hadoop dfs -put '%s' '%s'" % (self.hadoop, path1,
+        return execute("%s dfs -put '%s' '%s'" % (self.hdfs, path1,
                                                   path2), printcmd=False)
 
     def get(self, path1, path2, opts):
-        return execute("%s/bin/hadoop dfs -get '%s' '%s'" % (self.hadoop, path1,
+        return execute("%s dfs -get '%s' '%s'" % (self.hdfs, path1,
                                                   path2), printcmd=False)
 
 
+class StreamingRunInfo(RunInfo):
+
+    def get_input_path(self):
+        if os.environ.has_key('mapreduce_map_input_file'):
+            return os.environ['mapreduce_map_input_file']
+        return os.environ['map_input_file']
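
Hadoop 0.20 renamed the environment variable that streaming exports to each task from map_input_file to mapreduce_map_input_file, so StreamingRunInfo tries the new name first and falls back to the old one. A slightly more defensive variant (an illustration, not the committed code) would also survive when neither variable is set, e.g. outside a streaming task:

    import os

    def get_input_path():
        # Hadoop 0.20+ exports mapreduce_map_input_file; earlier
        # streaming releases export map_input_file.
        for var in ('mapreduce_map_input_file', 'map_input_file'):
            if var in os.environ:
                return os.environ[var]
        return 'unknown'  # same fallback as the RunInfo base class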
7 changes: 5 additions & 2 deletions dumbo/core.py
@@ -236,6 +236,7 @@ def run(mapper,
 
         mrbase_class = loadclassname(os.environ['dumbo_mrbase_class'])
         jk_class = loadclassname(os.environ['dumbo_jk_class'])
+        runinfo = loadclassname(os.environ['dumbo_runinfo_class'])()
 
         if iterarg == iter:
             if sys.argv[1].startswith('map'):
@@ -275,7 +276,7 @@ def run(mapper,
             if combconf:
                 combconf()
             if os.environ.has_key('dumbo_addpath'):
-                path = os.environ['map_input_file']
+                path = runinfo.get_input_path()
                 inputs = (((path, k), v) for (k, v) in inputs)
             if os.environ.has_key('dumbo_joinkeys'):
                 inputs = ((jk_class(k), v) for (k, v) in inputs)
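
With -addpath yes, each key is wrapped in a tuple with the path of the input file it came from, so mappers can tell records from different files apart. A toy run of the wrapping expression (the path and record values are made up for illustration):

    path = '/data/part-00000'
    inputs = iter([('word', 1)])
    inputs = (((path, k), v) for (k, v) in inputs)
    print list(inputs)  # [(('/data/part-00000', 'word'), 1)]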
@@ -420,7 +421,9 @@ def run(mapper,
         opts.append(('cmdenv', 'dumbo_mrbase_class=' + \
                      getclassname(backend.get_mapredbase_class(opts))))
         opts.append(('cmdenv', 'dumbo_jk_class=' + \
-                    getclassname(backend.get_joinkey_class(opts))))
+                     getclassname(backend.get_joinkey_class(opts))))
+        opts.append(('cmdenv', 'dumbo_runinfo_class=' + \
+                     getclassname(backend.get_runinfo_class(opts))))
         retval = backend.create_iteration(opts).run()
         if retval == 127:
             print >> sys.stderr, 'ERROR: Are you sure that "python" is on your path?'
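
The RunInfo class travels to the tasks by name: the submitter records its dotted name in a -cmdenv variable, and each task re-imports it (the hunk at -236 above does this via loadclassname). loadclassname presumably does something along these lines (a sketch, not the actual dumbo helper):

    import os

    name = os.environ['dumbo_runinfo_class']
    # e.g. 'dumbo.backends.streaming.StreamingRunInfo'
    module_name, _, class_name = name.rpartition('.')
    module = __import__(module_name, fromlist=[class_name])
    runinfo = getattr(module, class_name)()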
4 changes: 4 additions & 0 deletions dumbo/util.py
@@ -177,9 +177,13 @@ def findjar(hadoop, name):
     hadoop home directory and component base name (e.g. 'streaming')"""
 
     jardir_candidates = filter(os.path.exists, [
+        os.path.join(hadoop, 'mapred', 'build', 'contrib', name),
         os.path.join(hadoop, 'build', 'contrib', name),
+        os.path.join(hadoop, 'mapred', 'contrib', name, 'lib'),
         os.path.join(hadoop, 'contrib', name, 'lib'),
+        os.path.join(hadoop, 'mapred', 'contrib', name),
         os.path.join(hadoop, 'contrib', name),
+        os.path.join(hadoop, 'mapred', 'contrib'),
         os.path.join(hadoop, 'contrib')
     ])
     regex = re.compile(r'hadoop.*%s.*\.jar' % name)
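
The interleaved mapred/ entries cover the split-project layouts that appeared around Hadoop 0.20, where the MapReduce contrib jars live under a mapred/ subtree of the install; the existing candidate directories are then searched for a jar whose name matches the component. A toy illustration of the name matching (file names are made up):

    import re

    regex = re.compile(r'hadoop.*%s.*\.jar' % 'streaming')
    names = ['hadoop-0.20.2-streaming.jar', 'hadoop-examples-0.20.2.jar']
    print [n for n in names if regex.match(n)]  # ['hadoop-0.20.2-streaming.jar']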