In [1]:
import luigi
import luigi.contrib.hadoop
import luigi.contrib.hdfs
import json
import numpy as np
import time

Loading GCP module without the python packages httplib2, google-auth.         This *could* crash at runtime if no other credentials are provided.
Loading GCS module without the python packages googleapiclient & google-auth.         This will crash at runtime if GCS functionality is used.
Loading S3 module without the python package boto3. Will crash at runtime if S3 functionality is used.


In [2]:
class Split_text(luigi.Task):
    """Split a local text file into ``split_size`` chunks of lines written to HDFS.

    Parameters:
        filename: path of the local text file to read.
        split_size: number of chunk files to produce (defaults to 5).
    """
    filename = luigi.Parameter()
    split_size = luigi.IntParameter(default = 5)

    def output(self):
        """Return one HdfsTarget per chunk, named /user/input_text/text-<i>.txt."""
        return [luigi.contrib.hdfs.target.HdfsTarget('/user/input_text/text-%d.txt' % (i)) for i in range(self.split_size)]

    def run(self):
        """Read ``filename``, drop empty lines, and write the chunks to the targets.

        The non-empty lines are divided into ``split_size`` contiguous chunks
        whose sizes differ by at most one: the first ``len(lines) % split_size``
        chunks hold one extra line. Chunks are sliced from the Python list
        directly rather than through a NumPy string array, which would pad
        every line to the length of the longest one.

        Raises:
            ValueError: if ``split_size`` is smaller than 1.
        """
        if self.split_size < 1:
            raise ValueError('number sections must be larger than 0.')
        with open(self.filename) as fopen:
            texts = [line for line in fopen.read().split('\n') if line]
        # base = lines in every chunk; the first `extra` chunks take one more.
        base, extra = divmod(len(texts), self.split_size)
        start = 0
        for no, file in enumerate(self.output()):
            stop = start + base + (1 if no < extra else 0)
            with file.open('w') as fopen:
                fopen.write('\n'.join(texts[start:stop]))
            start = stop

class WordCount_Hadoop(luigi.contrib.hadoop.JobTask):
    """Hadoop job that counts word occurrences in the chunks produced by Split_text.

    Parameters:
        filename: forwarded unchanged to the Split_text dependency.
        split_size: forwarded unchanged to the Split_text dependency (defaults to 5).
    """
    filename = luigi.Parameter()
    split_size = luigi.IntParameter(default = 5)

    def requires(self):
        """Depend on the Split_text task built from this task's own parameters."""
        return Split_text(
            filename = self.filename,
            split_size = self.split_size,
        )

    def output(self):
        """Return the HDFS target /user/input_text/count.txt, using the PlainDir format."""
        plain_dir = luigi.contrib.hdfs.PlainDir
        return luigi.contrib.hdfs.target.HdfsTarget(
            '/user/input_text/count.txt',
            format=plain_dir,
        )

    def mapper(self, line):
        """Yield a ``(word, 1)`` pair for every whitespace-separated word in ``line``."""
        for piece in line.split('\n'):
            # Empty pieces carry no words; skip them.
            if not piece:
                continue
            for token in piece.split():
                yield token, 1

    def reducer(self, key, values):
        """Yield ``key`` paired with the sum of all of its ``values``."""
        total = sum(values)
        yield key, total

In [3]:
if __name__ == '__main__':
    # Run the word count over big-text.txt split into 10 chunks, using the
    # scheduler at localhost:8082.
    luigi.build(
        [WordCount_Hadoop(split_size=10, filename='big-text.txt')],
        scheduler_port = 8082,
        scheduler_host = 'localhost',
    )

DEBUG: Checking if WordCount_Hadoop(filename=big-text.txt, split_size=10) is complete
DEBUG: Running file existence check: /opt/hadoop/bin/hadoop fs -stat /user/input_text/count.txt
DEBUG: Checking if Split_text(filename=big-text.txt, split_size=10) is complete
DEBUG: Running file existence check: /opt/hadoop/bin/hadoop fs -stat /user/input_text/text-0.txt
DEBUG: Running file existence check: /opt/hadoop/bin/hadoop fs -stat /user/input_text/text-1.txt
DEBUG: Running file existence check: /opt/hadoop/bin/hadoop fs -stat /user/input_text/text-2.txt
DEBUG: Running file existence check: /opt/hadoop/bin/hadoop fs -stat /user/input_text/text-3.txt
DEBUG: Running file existence check: /opt/hadoop/bin/hadoop fs -stat /user/input_text/text-4.txt
DEBUG: Running file existence check: /opt/hadoop/bin/hadoop fs -stat /user/input_text/text-5.txt
DEBUG: Running file existence check: /opt/hadoop/bin/hadoop fs -stat /user/input_text/text-6.txt
DEBUG: Running file existence check: /opt/hadoop/bin/hadoop

DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/contrib/mrrunner.py -> luigi/contrib/mrrunner.py
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/contrib/hadoop_jar.py -> luigi/contrib/hadoop_jar.py
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/contrib/mongodb.py -> luigi/contrib/mongodb.py
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/contrib/lsf.py -> luigi/contrib/lsf.py
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/contrib/gcs.py -> luigi/contrib/gcs.py
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/contrib/ftp.py -> luigi/contrib/ftp.py
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/contrib/sge.py -> luigi/contrib/sge.py
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/contrib/bigquery_avro.py -> luigi/contrib/bigquery_avro.py
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/contrib/gcp.py -> luigi/contrib/g

DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/static/visualiser/lib/mustache.js -> luigi/static/visualiser/lib/mustache.js
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/static/visualiser/lib/AdminLTE/css/skin-green-light.min.css -> luigi/static/visualiser/lib/AdminLTE/css/skin-green-light.min.css
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/static/visualiser/lib/AdminLTE/css/skin-green.min.css -> luigi/static/visualiser/lib/AdminLTE/css/skin-green.min.css
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/static/visualiser/lib/AdminLTE/css/AdminLTE.min.css -> luigi/static/visualiser/lib/AdminLTE/css/AdminLTE.min.css
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/static/visualiser/lib/AdminLTE/js/app.min.js -> luigi/static/visualiser/lib/AdminLTE/js/app.min.js
DEBUG: adding to tar: /usr/local/lib/python3.5/dist-packages/luigi/static/visualiser/lib/bootstrap-toggle/css/bootstrap-toggle.min

DEBUG: Removing directory /tmp/tmpessanv1d
DEBUG: 1 running tasks, waiting for next task to finish
INFO: Informed scheduler that task   WordCount_Hadoop_big_text_txt_10_e4ec2d5150   has status   FAILED
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
DEBUG: There are 1 pending tasks possibly being run by other workers
DEBUG: There are 1 pending tasks unique to this worker
DEBUG: There are 1 pending tasks last scheduled by this worker
INFO: Worker Worker(salt=666215452, workers=1, host=90693de3b7f1, username=root, pid=6971) was stopped. Shutting down Keep-Alive thread
INFO: 
===== Luigi Execution Summary =====

Scheduled 2 tasks of which:
* 1 complete ones were encountered:
    - 1 Split_text(filename=big-text.txt, split_size=10)
* 1 failed:
    - 1 WordCount_Hadoop(filename=big-text.txt, split_size=10)

This progress looks :( because there were failed tasks

===== Luigi Execution Summary =====

