Command Line
===

* Última modificación: Mayo 14, 2022

In [1]:
# Start from a clean state so the notebook can be re-run from the top:
# work in /tmp, delete the results of any previous run, and make sure the
# input/ folder exists before the sample files are written into it.
%cd /tmp
!rm -rf output/ all_sales.csv
!mkdir -p input/ 

/tmp


In [2]:
%%writefile input/sales1.csv
Fance,May,100
NA
Germany,May,200

Overwriting input/sales1.csv


In [3]:
%%writefile input/sales2.csv
Fance,June,140
Germany,June,180
UK,June,180

Overwriting input/sales2.csv


In [4]:
%%writefile input/sales3.csv
France,June,240
None
Germany,June,180

Overwriting input/sales3.csv


In [5]:
%%writefile __init__.py
#

Overwriting __init__.py


In [6]:
%%writefile pipeline.py

import os

import luigi
from luigi import IntParameter, LocalTarget, Parameter, Task

# Root folder for the per-file outputs written by DownloadFile. The path is
# relative, so it is resolved against the directory the pipeline is run from.
OUTPUT_FOLDER = "output"


class DownloadFile(Task):
    """Copy one input file into its own numbered folder under OUTPUT_FOLDER.

    Only the lines that contain a comma are copied; every other line of the
    input file is dropped.
    """

    # DownloadSalesData passes these three values positionally in this order,
    # so the declaration order below must be kept.
    input_folder = Parameter()
    file_name = Parameter()
    index = IntParameter()

    def output(self):
        """Return the target located at OUTPUT_FOLDER/<index>/<file_name>."""
        return LocalTarget(
            os.path.join(OUTPUT_FOLDER, str(self.index), self.file_name)
        )

    def run(self):
        """Write the comma-containing lines of the input file to the output."""
        source_path = os.path.join(self.input_folder, self.file_name)
        with open(source_path) as source, self.output().open("w") as sink:
            for row in source:
                # Lines without a comma are not copied.
                if "," not in row:
                    continue
                sink.write(row)


class DownloadSalesData(Task):
    """Merge the files found in ``input_folder`` into a single ``all_sales.csv``.

    Every regular file in the folder is first processed by a DownloadFile
    task, requested from inside run(); the outputs of those tasks are then
    concatenated in file-name order.
    """

    input_folder = Parameter()

    def output(self):
        """Return the target for the merged file, ``all_sales.csv``."""
        return LocalTarget("all_sales.csv")

    def run(self):
        """Request one DownloadFile per input file, then concatenate the results."""
        # Keep regular files only: os.listdir() also returns sub-directories
        # (for example a Jupyter ``.ipynb_checkpoints`` folder), and
        # DownloadFile.run() would fail when trying to open() one of those.
        file_names = sorted(
            name
            for name in os.listdir(self.input_folder)
            if os.path.isfile(os.path.join(self.input_folder, name))
        )

        processed_files = []
        # Indices start at 1 and follow the sorted file-name order.
        for counter, file_name in enumerate(file_names, start=1):
            # The value received back from the yield is stored and later
            # opened for reading, i.e. it is used as the task's output target.
            target = yield DownloadFile(self.input_folder, file_name, counter)
            processed_files.append(target)

        with self.output().open("w") as out_file:
            for processed in processed_files:
                with processed.open() as in_file:
                    for line in in_file:
                        out_file.write(line)



Overwriting pipeline.py


In [7]:
# Run the DownloadSalesData task defined in pipeline.py, reading from input/
# and using luigi's local scheduler (--local-scheduler).
!python3 -m luigi --module pipeline  DownloadSalesData --input-folder input --local-scheduler

DEBUG: Checking if DownloadSalesData(input_folder=input) is complete
INFO: Informed scheduler that task   DownloadSalesData_input_3d041dae39   has status   PENDING
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 1
INFO: [pid 731] Worker Worker(salt=628782563, workers=1, host=b148b7d2dd75, username=root, pid=731) running   DownloadSalesData(input_folder=input)
INFO: [pid 731] Worker Worker(salt=628782563, workers=1, host=b148b7d2dd75, username=root, pid=731) new requirements      DownloadSalesData(input_folder=input)
DEBUG: 1 running tasks, waiting for next task to finish
DEBUG: Checking if DownloadFile(input_folder=input, file_name=sales1.csv, index=1) is complete
INFO: Informed scheduler that task   DownloadFile_sales1_csv_1_input_c3a4c863ff   has status   PENDING
INFO: Informed scheduler that task   DownloadSalesData_input_3d041dae39   has status   PENDING
DEBUG: Asking scheduler for work...
DEBUG: Pending ta

In [8]:
!cat all_sales.csv

Fance,May,100
Germany,May,200
Fance,June,140
Germany,June,180
UK,June,180
France,June,240
Germany,June,180


In [9]:
!ls -1 /tmp/output

1
2
3


In [10]:
!ls -1 /tmp/output/1/

sales1.csv


In [11]:
!ls -1 /tmp/output/2/

sales2.csv


In [12]:
!ls -1 /tmp/output/3/

sales3.csv


In [13]:
!cat /tmp/output/1/sales1.csv

Fance,May,100
Germany,May,200
