## Simple transformation

In [1]:
import apache_beam as beam

with beam.Pipeline() as p:
    lines = (
        p | 'Create' >> beam.Create(['one', 'two', 'three', 'four'])
          | 'Uppercase' >> beam.Map(str.upper)
          | 'Print' >> beam.Map(print)
    )



ONE
TWO
THREE
FOUR


## The pipe | is actually just an operator overload to call the apply method of the pipeline

In [147]:
import apache_beam as beam

with beam.Pipeline() as p:
        lines = p | 'Create' >> beam.Create(['one', 'two', 'three', 'four'])
        lines2 = (
            p.apply(beam.Map(str.title), lines, 'titlecase')
             .apply(beam.Map(print), lines2)
        )


One
Two
Three
Four


In [2]:
! ls

JoeyDemos.ipynb       leftjoin.py    simple3-dataflow.sh  territories.py
README.md	      products.json  simple3-local.sh	  wordcount-dataflow.sh
aggregate1.py	      regions.csv    simple3.py		  wordcount.py
aggregate2.py	      simple1.py     simple3_custom.py
dataflow_template.py  simple2.py     territories.avro


## Read from CSV and use Map

In [3]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

filename = 'regions.csv'
with beam.Pipeline() as p:
    regions = (
        p | 'Read' >> ReadFromText(filename)
          | 'Split' >> beam.Map(lambda x : tuple(x.split(',')))
          | 'Transform' >> beam.Map(lambda x : (int(x[0]) * 10, x[1].upper()))
          | 'Print' >> beam.Map(print)
    )


(10, 'EASTERN')
(20, 'WESTERN')
(30, 'NORTHERN')
(40, 'SOUTHERN')


## Read from CSV and use ParDo

In [4]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplit(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        #return [(int(regionid), regionname)] # ParDo's need to return a list
        yield (int(regionid), regionname) # Can also use yield instead of returning a list

# using a ParDo and DoFn instead of a Map
filename = 'regions.csv'
with beam.Pipeline() as p:
    lines = p | 'Read' >> ReadFromText(filename)
    records = lines | 'Split' >> beam.ParDo(RegionSplit())
    records | 'Write' >> WriteToText('regions.out')



In [5]:
! cat regions.out*

('1', 'Eastern')
('2', 'Western')
('3', 'Northern')
('4', 'Southern')


## Template showing a full program that can read the command line args

In [None]:
"""A template to import the default package and parse the arguments"""

from __future__ import absolute_import

import argparse
import logging
import re

#from past.builtins import unicode

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplit(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        #return [(regionid, regionname)] # ParDo's need to return a list
        yield (int(regionid), regionname) # Can also use yield instead of returning a list

def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflowclass1-bucket/regions.csv',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      default = 'gs://dataflowclass1-bucket/regions_output',      
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=pipeline_options) as p:
    lines = p | 'Read' >> ReadFromText(known_args.input)
    records = lines | 'Split' >> beam.ParDo(RegionSplit())
    uppercase = records | 'Uppercase' >> beam.Map(lambda x : (int(x[0]), x[1].upper()))
    uppercase | 'Write' >> WriteToText(known_args.output)

if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()


# using a ParDo and DoFn instead of a Map
filename = 'regions.csv'
with beam.Pipeline() as p:
    lines = p | 'Read' >> ReadFromText(filename)
    records = lines | 'Split' >> beam.ParDo(RegionSplit())
    records | 'Write' >> WriteToText('regions2.out')



## Example of how to create a split ParDo with multiple outputs

In [15]:
import apache_beam as beam
from apache_beam import pvalue
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

class OddEvenRegionSplit(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        if int(regionid) % 2 == 0:
            yield pvalue.TaggedOutput('Even', (int(regionid), regionname, 'Even'))
        else:
            yield pvalue.TaggedOutput('Odd', (int(regionid), regionname, 'Odd'))

# using a ParDo and DoFn instead of a Map
filename = 'regions.csv'
with beam.Pipeline() as p:
    lines = p | 'Read' >> ReadFromText(filename)
    evens, odds = lines | 'Split' >> beam.ParDo(OddEvenRegionSplit()).with_outputs("Even", "Odd")
    
    print('Evens')
    evens | 'Print Evens' >> beam.Map(print)

    print('Odds')
    odds | 'Print Odds' >> beam.Map(print)
    
  

Evens
Odds
(1, 'Eastern', 'Odd')
(2, 'Western', 'Even')
(3, 'Northern', 'Odd')
(4, 'Southern', 'Even')


## Example of branching

In [109]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplit(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        #return [(int(regionid), regionname)] # ParDo's need to return a list
        yield (int(regionid), regionname) # Can also use yield instead of returning a list

# using a ParDo and DoFn instead of a Map
filename = 'regions.csv'
with beam.Pipeline() as p:
    lines = p | 'Read' >> ReadFromText(filename)
    records = lines | 'Split' >> beam.ParDo(RegionSplit())
    records | 'Write' >> WriteToText('regions.out')
    
    (records | beam.Map(lambda x : (x[0] * 10, x[1].upper()))
            | 'Print' >> beam.Map(print)
    )



(10, 'EASTERN')
(20, 'WESTERN')
(30, 'NORTHERN')
(40, 'SOUTHERN')


In [110]:
! cat regions*

1,Eastern
2,Western
3,Northern
4,Southern
(1, 'Eastern')
(2, 'Western')
(3, 'Northern')
(4, 'Southern')


## GroupByKey

In [116]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

class TerritorySplit(beam.DoFn):
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield (regionid, (territoryname, territoryid))

        territoriesfilename = 'territories.csv'

with beam.Pipeline() as p:
    territories = (
                  p | 'Read Territories' >> ReadFromText('territories.csv')
                    | 'Split Territories' >> beam.ParDo(TerritorySplit())
                    | beam.GroupByKey() 
                    | 'Print Territories' >> beam.Map(print)
                  )





('1', [('Westboro', '01581'), ('Bedford', '01730'), ('Georgetow', '01833'), ('Boston', '02116'), ('Cambridge', '02139'), ('Braintree', '02184'), ('Providence', '02903'), ('Wilton', '06897'), ('Morristown', '07960'), ('Edison', '08837'), ('New York', '10019'), ('New York', '10038'), ('Mellvile', '11747'), ('Fairport', '14450'), ('Neward', '19713'), ('Rockville', '20852'), ('Greensboro', '27403'), ('Cary', '27511'), ('Louisville', '40222')])
('3', [('Hollis', '03049'), ('Portsmouth', '03801'), ('Philadelphia', '19428'), ('Beachwood', '44122'), ('Findlay', '45839'), ('Southfield', '48075'), ('Troy', '48084'), ('Bloomfield Hills', '48304'), ('Racine', '53404'), ('Roseville', '55113'), ('Minneapolis', '55439')])
('4', [('Columbia', '29202'), ('Atlanta', '30346'), ('Savannah', '31406'), ('Orlando', '32859'), ('Tampa', '33607'), ('Bentonville', '72716'), ('Dallas', '75234'), ('Austin', '78759')])
('2', [('Hoffman Estates', '60179'), ('Chicago', '60601'), ('Denver', '80202'), ('Colorado Spring

## Flatten

In [131]:
import apache_beam as beam

with beam.Pipeline() as p:
    lines1 = p | 'Create 1' >> beam.Create(['one', 'two', 'three', 'four'])
    lines2 = p | 'Create 2' >> beam.Create(['alpha', 'beta', 'gamma', 'delta'])

    merged = ((lines1, lines2) | 'Merge PCollections' >> beam.Flatten())
    merged | beam.Map(print)


one
two
three
four
alpha
beta
gamma
delta


## Combine

In [148]:
import apache_beam as beam

with beam.Pipeline() as p:
    data = (
        p | 'Create' >> beam.Create([('a', 10), ('a', 20), ('b', 30), ('b', 40), ('c', 50), ('a', 60)])
          | 'Combine' >> beam.CombinePerKey(sum)
          | 'Print' >> beam.Map(print)
    )


('a', 90)
('b', 70)
('c', 50)


## Custom Combine Function

In [179]:
#mport apache_beam as beam

class CustomCombine(beam.CombineFn):

  def create_accumulator(self):
    return {}

  def add_input(self, accumulator, input):
    k, v = input
    x, y, z = accumulator.get(k, (0, 0, 0))

    # take the max for the first element of the tuple and sum the second element and count for the third
    accumulator[k] = (v[0] if v[0] > x else x, y + v[1], z + 1)
    return accumulator

  def merge_accumulators(self, accumulators):
    merged = {}
    for accum in accumulators:
      for k, v in accum.items():
        x, y, z = merged.get(k, (0, 0, 0))
        merged[k] = (v[0] if v[0] > x else x, y + v[1], z + v[2])
    return merged

  def extract_output(self, accumulator):
    # return the max, the sum, the count and the average for the key
    return {k : (v[0], v[1], v[2], v[1]/v[2]) for k, v in accumulator.items()}
    return accumulator
    

with beam.Pipeline() as p:
    data = (
        p | 'Create' >> beam.Create([('a', (1, 10)), ('a', (2, 20)), ('b', (3, 30)), ('b', (4, 40)), ('c', (5, 50)), ('a', (6, 60))])
          | 'Combine' >> beam.CombineGlobally(CustomCombine())
          | 'Print' >> beam.Map(print)
    )


{'a': (6, 90, 3, 30.0), 'b': (4, 70, 2, 35.0), 'c': (5, 50, 1, 50.0)}


## Map vs FlatMap

In [182]:
import apache_beam as beam

with beam.Pipeline() as pipeline:
  plants = (
      pipeline
      | 'Gardening plants' >> beam.Create(['Strawberry,Carrot,Eggplant','Tomato,Potato'])
      | 'Split words' >> beam.Map(lambda x : x.split(','))
#      | 'Split words' >> beam.FlatMap(lambda x : x.split(','))
      | beam.Map(print))

['Strawberry', 'Carrot', 'Eggplant']
['Tomato', 'Potato']


## WithKeys

In [191]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

class TerritorySplit(beam.DoFn):
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield (territoryid, territoryname, regionid)

        territoriesfilename = 'territories.csv'

with beam.Pipeline() as p:
    territories = (
                  p | 'Read Territories' >> ReadFromText('territories.csv')
                    | 'Split Territories' >> beam.ParDo(TerritorySplit())
                    | 'With Keys' >> beam.util.WithKeys(lambda x : x[2])
#                    | 'With Keys Manually' >> beam.Map(lambda x : (x[2], x))
                  )
    territories | 'Print KV' >> beam.Map(print)
    territories | beam.util.Keys() | 'Print Keys' >> beam.Map(print)
    territories | beam.util.Values() | 'Print Values' >> beam.Map(print)




('1', ('01581', 'Westboro', '1'))
('01581', 'Westboro', '1')
1
('1', ('01730', 'Bedford', '1'))
('01730', 'Bedford', '1')
1
('1', ('01833', 'Georgetow', '1'))
('01833', 'Georgetow', '1')
1
('1', ('02116', 'Boston', '1'))
('02116', 'Boston', '1')
1
('1', ('02139', 'Cambridge', '1'))
('02139', 'Cambridge', '1')
1
('1', ('02184', 'Braintree', '1'))
('02184', 'Braintree', '1')
1
('1', ('02903', 'Providence', '1'))
('02903', 'Providence', '1')
1
('3', ('03049', 'Hollis', '3'))
('03049', 'Hollis', '3')
3
('3', ('03801', 'Portsmouth', '3'))
('03801', 'Portsmouth', '3')
3
('1', ('06897', 'Wilton', '1'))
('06897', 'Wilton', '1')
1
('1', ('07960', 'Morristown', '1'))
('07960', 'Morristown', '1')
1
('1', ('08837', 'Edison', '1'))
('08837', 'Edison', '1')
1
('1', ('10019', 'New York', '1'))
('10019', 'New York', '1')
1
('1', ('10038', 'New York', '1'))
('10038', 'New York', '1')
1
('1', ('11747', 'Mellvile', '1'))
('11747', 'Mellvile', '1')
1
('1', ('14450', 'Fairport', '1'))
('14450', 'Fairport',

In [180]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplitDict(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        yield {'regionid':int(regionid), 'regionname':regionname.title()}

class TerritorySplitDict(beam.DoFn):
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield {'territoryid':int(territoryid), 'territoryname' : territoryname, 'regionid':int(regionid)}

class UnnestCoGrouped(beam.DoFn):
    def process(self, item, child_pipeline, parent_pipeline):
        k, v = item
        child_dict = v[child_pipeline]
        parent_dict = v[parent_pipeline]
        for child in child_dict:
            try:
                child.update(parent_dict[0])
                yield child
            except IndexError:
                yield child

class LeftJoin(beam.PTransform):
    def __init__(self, parent_pipeline_name, parent_data, parent_key, child_pipeline_name, child_data,  child_key):
        self.parent_pipeline_name = parent_pipeline_name
        self.parent_data = parent_data
        self.parent_key = parent_key
        self.child_pipeline_name = child_pipeline_name
        self.child_data = child_data
        self.child_key = child_key

    def expand(self, pcols):
        def _format_as_common_key_tuple(child_dict, child_key):
            return (child_dict[child_key], child_dict)

        return ({
                pipeline_name: pcol1 | f'Convert to ({self.parent_key} = {self.child_key}, object) for {pipeline_name}' 
                >> beam.Map(_format_as_common_key_tuple, self.child_key)
                for (pipeline_name, pcol1) in pcols.items()}
                | f'CoGroupByKeey {pcols.keys()}' >> beam.CoGroupByKey()
                | 'Unnest Cogrouped' >> beam.ParDo(UnnestCoGrouped(), self.child_pipeline_name, self.parent_pipeline_name)
        )
        
# using a ParDo and DoFn instead of a Map
regionsfilename = 'regions.csv'
territoriesfilename = 'territories.csv'

with beam.Pipeline() as p:
    regions = (
              p | 'Read Regions' >> ReadFromText(regionsfilename)
                | 'Split Regions' >> beam.ParDo(RegionSplitDict())
                #| 'Print Regions' >> beam.Map(print)
              )
        
    territories = (
                  p | 'Read Territories' >> ReadFromText('territories.csv')
                    | 'Split Territories' >> beam.ParDo(TerritorySplitDict())
                    #| 'Print Territories' >> beam.Map(print)
                  )

    leftjoin = {'regions':regions, 'territories':territories} | LeftJoin('regions', regions, 'regionid', 'territories', territories, 'regionid')
    leftjoin | 'print left join' >> beam.Map(print)




{'territoryid': 1581, 'territoryname': 'Westboro', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 1730, 'territoryname': 'Bedford', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 1833, 'territoryname': 'Georgetow', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2116, 'territoryname': 'Boston', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2139, 'territoryname': 'Cambridge', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2184, 'territoryname': 'Braintree', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2903, 'territoryname': 'Providence', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 6897, 'territoryname': 'Wilton', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 7960, 'territoryname': 'Morristown', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 8837, 'territoryname': 'Edison', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 10019, 'territoryname': 'New York', 'regionid': 1, 'regionname': 'Eastern'

In [212]:
import apache_beam as beam
from apache_beam.pvalue import AsIter, AsSingleton, AsList, AsDict
from apache_beam.io import ReadFromText, WriteToText
from apache_beam.io import ReadFromAvro, WriteToAvro
from collections import namedtuple
from apache_beam import coders
from apache_beam import coders
from apache_beam.typehints.decorators import with_output_types


class Region:
    def __init__(self, regionid, regionname):
        self.regionid = regionid
        self.regionname = regionname
        
    def __str__(self):
        return f'{self.regionid}|{self.regionname}'

    def encode(self, o):
        """Encode to bytes with a trace that coder was used."""
        # Our encoding prepends an 'x:' prefix.
        return b'x:%s' % o.encode('utf-8')

    def decode(self, s):
        # To decode, we strip off the prepended 'x:' prefix.
        s = s.decode('utf-8')
        #assert s[0:2] == 'x:'
        params = s[0:2].split('|')
        return Region(*params)

    def is_deterministic(self):
        # Since coded Player objects are used as keys below with
        # beam.CombinePerKey(sum), we require that this coder is deterministic
        # (i.e., two equivalent instances of the classes are encoded into the same
        # byte string) in order to guarantee consistent results.
        return True
    
class RegionSplitClass(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        yield Region(int(regionid), regionname.title())

# class RegionCoder(coders.Coder):
#   """A custom coder for the RegionSchema"""
#   def encode(self, o):
#     """Encode to bytes with a trace that coder was used."""
#     # Our encoding prepends an 'x:' prefix.
#     return b'x:%s' % o.encode('utf-8')

#   def decode(self, s):
#     # To decode, we strip off the prepended 'x:' prefix.
#     s = s.decode('utf-8')
#     #assert s[0:2] == 'x:'
#     params = s[0:2].split('|')
#     return Region(*params)

#   def is_deterministic(self):
#     # Since coded Player objects are used as keys below with
#     # beam.CombinePerKey(sum), we require that this coder is deterministic
#     # (i.e., two equivalent instances of the classes are encoded into the same
#     # byte string) in order to guarantee consistent results.
#     return True
# coders.registry.register_coder(Region, RegionCoder)

# @with_output_types(typing.Tuple[Region, int])
# def get_regions(descriptor):
#   name, points = descriptor.split(',')
#   return Player(name), int(points)


RegionSchema = namedtuple("RegionSchema", ("regionid", "regionname"))
class RegionSplitSchema(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        yield RegionSchema(int(regionid), regionname.title())

class RegionSplitDict(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        yield {'regionid': int(regionid), 'regionname': regionname.title()}


class TerritorySplit(beam.DoFn):
    # split territory into KV pair of (regionid, (territoryid, territoryname))
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield(int(territoryid), territoryname, int(regionid.title()))
#        yield (int(regionid), (territoryid, territoryname.title())) 
        
                
def lookup_region(left, right):
    territoryid, territoryname, regionid = left
    yield territoryid, territoryname, regionid
#    yield (territoryid, territorynme, regionid, right.get(regionid, 'No Region'))


def dummy(element):
    return element
#     regionid = element[0]
#     territoryid, territoryname = element[1]
#     return (territoryid, territoryname, regionid)

with beam.Pipeline() as p:
    regions = (
        p | 'Read Regions' >> ReadFromText('regions.csv')
          | 'Split Regions' >> beam.ParDo(RegionSplitClass())
          | 'Print Regions' >> beam.Map(print)
    )

#     regions = {1:"North", 2:"South", 3:"East", 4:"West"}
#     regions = p | 'Create Regions' >> beam.Create([(1, 'North'), (2, 'South')])

    
    territories =  (
        p | 'Read Territories' >> ReadFromText('territories.csv')
          | 'Split Territories' >> beam.ParDo(TerritorySplit())
          | 'Print Territories' >> beam.Map(print)
    )
    
#     join = (
#         territories
#           #| 'Lookup Region' >> beam.Map(dummy)
#           | 'Lookup Region' >> beam.Map(lookup_region, right = beam.pvalue.AsList(regions))
#           | beam.Map(print)
#     )
        


1|Eastern
2|Western
3|Northern
4|Southern
(1581, 'Westboro', 1)


TypeError: can't pickle generator objects [while running '[212]: Lookup Region']