## ParDo Transform

In [3]:
import apache_beam as beam
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

In [18]:
def split_row(e: str):
    """Break one comma-separated text line into a list of its fields."""
    fields = e.split(",")
    return fields

def account_filter(e: list):
    """Return True when the fourth field (index 3) of the row is "Accounts".

    Args:
        e: One row, already split into a list of string fields.

    Returns:
        True if the row has at least four fields and the fourth equals
        "Accounts"; False otherwise.
    """
    # Guard the length first: a blank or truncated line splits into fewer
    # than four fields, and indexing it would raise IndexError and abort
    # the whole pipeline instead of simply dropping the row.
    return len(e) > 3 and e[3] == "Accounts"

# Build the pipeline on the interactive runner so results can be inspected
# from the notebook with the `ib` helpers.
p = beam.Pipeline(InteractiveRunner())

# Count, per name, how many rows of dept_data.txt have "Accounts" in field 3.
attendance_count = (
    p
    # Each element is one line of text from the input file.
    | "Read from file" >> beam.io.ReadFromText("dept_data.txt")
    # str -> list of fields.
    | "Split by comma" >> beam.Map(split_row)
    # Keep only the rows for which account_filter returns True.
    | "Filter by account" >> beam.Filter(account_filter)
    # Emit (field 1, 1) so the ones can be summed per key; field 1 is the
    # name shown in the first column of the output below.
    | "Convert by keyed tuple" >> beam.Map(lambda e: (e[1], 1))
    # Sum the 1s for each key.
    | "Count by name" >> beam.CombinePerKey(sum)
)

# Alternative: display only a few elements instead of collecting everything.
# ib.show(attendance_count, n=5)
# Run the pipeline and fetch the (name, count) pairs; rendered below as a table.
ib.collect(attendance_count)

Unnamed: 0,0,1
0,Marco,31
1,Rebekah,31
2,Itoe,31
3,Edouard,31
4,Kyle,62
5,Kumiko,31
6,Gaston,31
7,Ayumi,30


In [20]:
class SplitRow(beam.DoFn):
    """DoFn that turns one comma-separated line into a list of fields."""

    def process(self, element):
        """Return a one-item list holding the fields of `element`.

        The outer list is the collection of outputs for this input; its
        single item (the list of fields) is the one element emitted.
        """
        fields = element.split(",")
        return [fields]

class Count(beam.DoFn):
    """DoFn that collapses a (key, iterable of numbers) pair to (key, total)."""

    def process(self, element):
        """Return a one-item list holding (key, sum of the grouped values).

        `element` is a two-item pair such as ("Marco", [1, 1, ..., 1]).
        """
        name, ones = element
        total = sum(ones)
        return [(name, total)]

# A separate pipeline object for the ParDo-based version of the same count.
p1 = beam.Pipeline(InteractiveRunner())

# Same computation as the Map/Filter/CombinePerKey pipeline, expressed with
# ParDo steps plus an explicit GroupByKey.
attendance_count = (
    p1
    # Each element is one line of text from the input file.
    | "Read from file" >> beam.io.ReadFromText("dept_data.txt")
    # SplitRow emits the list of fields for each line.
    | "Split by comma" >> beam.ParDo(SplitRow())
    # Filtering with ParDo: return [e] to emit the row, None to emit nothing.
    | "Filter by account" >> beam.ParDo(lambda e: [e] if e[3] == "Accounts" else None)
    # Mapping with ParDo: wrap the single (field 1, 1) output in a list.
    | "Convert by keyed tuple" >> beam.ParDo(lambda e: [(e[1], 1)])
    | "Group by key" >> beam.GroupByKey() # [Marco, [1, 1, ..., 1]]
    # Count sums the grouped 1s for each key.
    | "Count using ParDo" >> beam.ParDo(Count())
)

# Alternative: display only a few elements instead of collecting everything.
# ib.show(attendance_count, n=5)
# Run the pipeline and fetch the (name, count) pairs; rendered below as a table.
ib.collect(attendance_count)

Unnamed: 0,0,1
0,Marco,31
1,Rebekah,31
2,Itoe,31
3,Edouard,31
4,Kyle,62
5,Kumiko,31
6,Gaston,31
7,Ayumi,30
