## Side inputs and additional outputs

In [4]:
import apache_beam as beam
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

In [7]:
# Record IDs to exclude; passed to FilterByLength as its `side_list` argument.
side_list = ["149633CM", "212539MU", "231555ZZ", "704275DC"]

class FilterByLength(beam.DoFn):
    """Keep comma-separated records by name length and ID exclusion list.

    A record is kept when the length of its second field (the name) lies
    within ``[lower_bound, upper_bound]`` and its first field (the ID) is
    not contained in ``side_list``.
    """

    def process(self, element, side_list, lower_bound, upper_bound=float("inf")):
        """Return ``[fields]`` for a record that passes, otherwise ``None``.

        Args:
            element: One comma-separated line of text.
            side_list: Container of IDs whose records must be dropped.
            lower_bound: Minimum allowed name length (inclusive).
            upper_bound: Maximum allowed name length (inclusive).

        Returns:
            A one-element list holding the split fields, or ``None`` when
            the record is filtered out.
        """
        fields = element.split(",")
        record_id = fields[0]
        name = fields[1]

        # The length check runs first, then the exclusion-list lookup.
        if not (lower_bound <= len(name) <= upper_bound):
            return None
        if record_id in side_list:
            return None
        return [fields]

# Build the pipeline on the interactive runner; results are displayed with
# the interactive_beam helpers at the end of this cell.
p = beam.Pipeline(InteractiveRunner())

# FilterByLength splits each line on commas, so the stages after it index
# into a list of fields.
names = (
    p
    | "Read from file" >> beam.io.ReadFromText("dept_data.txt")
    # The extra ParDo arguments are forwarded to FilterByLength.process as
    # side_list, lower_bound=3 and upper_bound=10.
    | "ParDo with side inputs" >> beam.ParDo(FilterByLength(), side_list, 3, 10)
    # Keep only rows whose fourth field equals "Accounts".
    | "Filter by department" >> beam.Filter(lambda e: e[3] == "Accounts")
    # Key each row by its second field (the name) with a count of 1 ...
    | "Convert by keyed tuple" >> beam.Map(lambda e: (e[1], 1))
    # ... then sum the counts for each key.
    | beam.CombinePerKey(sum)
)

# Display the resulting PCollection and the pipeline graph in the notebook.
ib.show(names)
ib.show_graph(p)

/usr/bin/dot


In [6]:
class ProcessWords(beam.DoFn):
    """Route each record's name to tagged outputs and, optionally, the main output.

    The name is the second comma-separated field of the input line. Every
    name goes to exactly one length-based tagged output, and names that start
    with ``marker`` are additionally emitted untagged (the main output).
    """

    def process(self, element, cutoff_length, marker):
        """Yield the outputs for a single input line.

        Args:
            element: One comma-separated line of text.
            cutoff_length: Names of this length or shorter are "short".
            marker: Prefix that selects names for the main output.

        Yields:
            A ``TaggedOutput`` tagged ``"short_names"`` or ``"long_names"``,
            followed by the bare name when it starts with ``marker``.
        """
        name = element.split(",")[1]

        # The two length cases are exhaustive, so exactly one is emitted.
        if len(name) <= cutoff_length:
            yield beam.pvalue.TaggedOutput("short_names", name)
        else:
            yield beam.pvalue.TaggedOutput("long_names", name)

        # Yielding (not returning) above keeps this check reachable, so a
        # name can land in both a tagged output and the main output.
        if name.startswith(marker):
            yield name

# Separate pipeline for the multiple-output example.
p1 = beam.Pipeline(InteractiveRunner())

# NOTE: this rebinds `names`, which an earlier cell also assigns.
names = (
    p1
    | "Read from file" >> beam.io.ReadFromText("dept_data.txt")
    # cutoff_length and marker are forwarded to ProcessWords.process;
    # with_outputs declares the tagged outputs "short_names" and "long_names"
    # and names the main output "names_a".
    | "Multiple outputs" >> beam.ParDo(ProcessWords(), cutoff_length=4, marker="A").with_outputs("short_names", "long_names", main="names_a")
)

# Each declared output is read back as an attribute of the ParDo result.
short_names = names.short_names
long_names = names.long_names
main_names = names.names_a

# Render the pipeline graph in the notebook.
ib.show_graph(p1)

/usr/bin/dot
