Skip to content

cdh4, centos 6.3, cannot get simple dumbo job to run. #78

Open
bgeary opened this Issue Oct 21, 2013 · 1 comment

2 participants

@bgeary
bgeary commented Oct 21, 2013

// my python job
def mapper(key, value):
yield value.split(" ")[0], 1

def reducer(key, values):
yield key, sum(values)

if name == "main":
import dumbo
dumbo.run(mapper, reducer, combiner=reducer)

// my command (version 1)
[root@medusa tests]# dumbo start ipcount.py -hadoop /usr/lib/hadoop-mapreduce -input ips.txt -output ipcounts
EXEC: HADOOP_CLASSPATH=":$HADOOP_CLASSPATH" /usr/lib/hadoop-mapreduce/bin/hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar -mapper 'python -m ipcount map 0 262144000' -outputformat 'org.apache.hadoop.mapred.SequenceFileOutputFormat' -inputformat 'org.apache.hadoop.streaming.AutoInputFormat' -reducer 'python -m ipcount red 0 262144000' -file '/CP/dumbo/dumbo-master/tests/ipcount.py' -file '/usr/lib/python2.6/site-packages/dumbo-0.21.37-py2.6.egg' -file '/usr/lib/python2.6/site-packages/typedbytes-0.3.8-py2.6.egg' -output 'ipcounts' -jobconf 'stream.map.input=typedbytes' -jobconf 'stream.reduce.input=typedbytes' -jobconf 'stream.map.output=typedbytes' -jobconf 'stream.reduce.output=typedbytes' -jobconf 'mapred.job.name=ipcount.py (1/1)' -input 'ips.txt' -cmdenv 'dumbo_mrbase_class=dumbo.backends.common.MapRedBase' -cmdenv 'dumbo_jk_class=dumbo.backends.common.JoinKey' -cmdenv 'dumbo_runinfo_class=dumbo.backends.streaming.StreamingRunInfo' -cmdenv 'PYTHONPATH=dumbo-0.21.37-py2.6.egg:typedbytes-0.3.8-py2.6.egg'
/bin/sh: /usr/lib/hadoop-mapreduce/bin/hadoop: No such file or directory
ERROR: Are you sure that "python" is on your path?

// my second attempt changing the -hadoop path (version 2)
[root@medusa tests]# dumbo start ipcount.py -hadoop /usr/lib/hadoop-0.20-mapreduce/ -input ips.txt -output ipcounts
EXEC: HADOOP_CLASSPATH=":$HADOOP_CLASSPATH" /usr/lib/hadoop-0.20-mapreduce//bin/hadoop jar /usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.4.0.jar -mapper 'python -m ipcount map 0 262144000' -outputformat 'org.apache.hadoop.mapred.SequenceFileOutputFormat' -inputformat 'org.apache.hadoop.streaming.AutoInputFormat' -reducer 'python -m ipcount red 0 262144000' -file '/CP/dumbo/dumbo-master/tests/ipcount.py' -file '/usr/lib/python2.6/site-packages/dumbo-0.21.37-py2.6.egg' -file '/usr/lib/python2.6/site-packages/typedbytes-0.3.8-py2.6.egg' -output 'ipcounts' -jobconf 'stream.map.input=typedbytes' -jobconf 'stream.reduce.input=typedbytes' -jobconf 'stream.map.output=typedbytes' -jobconf 'stream.reduce.output=typedbytes' -jobconf 'mapred.job.name=ipcount.py (1/1)' -input 'ips.txt' -cmdenv 'dumbo_mrbase_class=dumbo.backends.common.MapRedBase' -cmdenv 'dumbo_jk_class=dumbo.backends.common.JoinKey' -cmdenv 'dumbo_runinfo_class=dumbo.backends.streaming.StreamingRunInfo' -cmdenv 'PYTHONPATH=dumbo-0.21.37-py2.6.egg:typedbytes-0.3.8-py2.6.egg'
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/util/PlatformName
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.util.PlatformName
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:306)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:247)
Could not find the main class: org.apache.hadoop.util.PlatformName. Program will exit.
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/util/RunJar
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.util.RunJar
at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
at java.lang.ClassLoader.loadClass(ClassLoader.java:306)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
at java.lang.ClassLoader.loadClass(ClassLoader.java:247)
Could not find the main class: org.apache.hadoop.util.RunJar. Program will exit.

@lucidfrontier45

add this to the top of your script
#!/usr/bin/python

you also need to fix
if name == "main": to if __name__ == "__main__":

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.