In [1]:
import apache_beam as beam

with beam.Pipeline() as p:
    lines = (
        p | 'Create' >> beam.Create(['one', 'two', 'three', 'four'])
          | 'Uppercase' >> beam.Map(str.upper)
          | 'Print' >> beam.Map(print)
    )



ONE
TWO
THREE
FOUR


In [2]:
#! gsutil cp gs://qwiklabs-gcp-02-7b5be618aa6e/* .
! ls

JoeyDemos.ipynb       leftjoin.py    simple3-dataflow.sh  territories.py
README.md	      products.json  simple3-local.sh	  wordcount-dataflow.sh
aggregate1.py	      regions.csv    simple3.py		  wordcount.py
aggregate2.py	      simple1.py     simple3_custom.py
dataflow_template.py  simple2.py     territories.avro


In [3]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

filename = 'regions.csv'
with beam.Pipeline() as p:
    regions = (
        p | 'Read' >> ReadFromText(filename)
          | 'Split' >> beam.Map(lambda x : tuple(x.split(',')))
          | 'Transform' >> beam.Map(lambda x : (int(x[0]) * 10, x[1].upper()))
          | 'Print' >> beam.Map(print)
    )


(10, 'EASTERN')
(20, 'WESTERN')
(30, 'NORTHERN')
(40, 'SOUTHERN')


In [4]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplit(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        #return [(int(regionid), regionname)] # ParDo's need to return a list
        yield (int(regionid), regionname) # Can also use yield instead of returning a list

# using a ParDo and DoFn instead of a Map
filename = 'regions.csv'
with beam.Pipeline() as p:
    lines = p | 'Read' >> ReadFromText(filename)
    records = lines | 'Split' >> beam.ParDo(RegionSplit())
    records | 'Write' >> WriteToText('regions.out')



In [5]:
! cat regions.out*

('1', 'Eastern')
('2', 'Western')
('3', 'Northern')
('4', 'Southern')


In [None]:
"""A template to import the default package and parse the arguments"""

from __future__ import absolute_import

import argparse
import logging
import re

#from past.builtins import unicode

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplit(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        #return [(regionid, regionname)] # ParDo's need to return a list
        yield (int(regionid), regionname) # Can also use yield instead of returning a list

def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflowclass1-bucket/regions.csv',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      default = 'gs://dataflowclass1-bucket/regions_output',      
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=pipeline_options) as p:
    lines = p | 'Read' >> ReadFromText(known_args.input)
    records = lines | 'Split' >> beam.ParDo(RegionSplit())
    uppercase = records | 'Uppercase' >> beam.Map(lambda x : (int(x[0]), x[1].upper()))
    uppercase | 'Write' >> WriteToText(known_args.output)

if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()


# using a ParDo and DoFn instead of a Map
filename = 'regions.csv'
with beam.Pipeline() as p:
    lines = p | 'Read' >> ReadFromText(filename)
    records = lines | 'Split' >> beam.ParDo(RegionSplit())
    records | 'Write' >> WriteToText('regions2.out')



In [15]:
import apache_beam as beam
from apache_beam import pvalue
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

class OddEvenRegionSplit(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        if int(regionid) % 2 == 0:
            yield pvalue.TaggedOutput('Even', (int(regionid), regionname, 'Even'))
        else:
            yield pvalue.TaggedOutput('Odd', (int(regionid), regionname, 'Odd'))

# using a ParDo and DoFn instead of a Map
filename = 'regions.csv'
with beam.Pipeline() as p:
    lines = p | 'Read' >> ReadFromText(filename)
    evens, odds = lines | 'Split' >> beam.ParDo(OddEvenRegionSplit()).with_outputs("Even", "Odd")
    
    print('Evens')
    evens | 'Print Evens' >> beam.Map(print)

    print('Odds')
    odds | 'Print Odds' >> beam.Map(print)
    
  

Evens
Odds
(1, 'Eastern', 'Odd')
(2, 'Western', 'Even')
(3, 'Northern', 'Odd')
(4, 'Southern', 'Even')


In [33]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplitDict(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        yield {'regionid':int(regionid), 'regionname':regionname.title()}

class TerritorySplitDict(beam.DoFn):
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield {'territoryid':int(territoryid), 'territoryname' : territoryname, 'regionid':int(regionid)}

class UnnestCoGrouped(beam.DoFn):
    def process(self, item, child_pipeline, parent_pipeline):
        k, v = item
        child_dict = v[child_pipeline]
        parent_dict = v[parent_pipeline]
        for child in child_dict:
            try:
                child.update(parent_dict[0])
                yield child
            except IndexError:
                yield child

class LeftJoin(beam.PTransform):
    def __init__(self, parent_pipeline_name, parent_data, parent_key, child_pipeline_name, child_data,  child_key):
        self.parent_pipeline_name = parent_pipeline_name
        self.parent_data = parent_data
        self.parent_key = parent_key
        self.child_pipeline_name = child_pipeline_name
        self.child_data = child_data
        self.child_key = child_key

    def expand(self, pcols):
        def _format_as_common_key_tuple(child_dict, child_key):
            return (child_dict[child_key], child_dict)

        return ({
                pipeline_name: pcol1 | f'Convert to ({self.parent_key} = {self.child_key}, object) for {pipeline_name}' 
                >> beam.Map(_format_as_common_key_tuple, self.child_key)
                for (pipeline_name, pcol1) in pcols.items()}
                | f'CoGroupByKeey {pcols.keys()}' >> beam.CoGroupByKey()
                | 'Unnest Cogrouped' >> beam.ParDo(UnnestCoGrouped(), self.child_pipeline_name, self.parent_pipeline_name)
        )
        
# using a ParDo and DoFn instead of a Map
regionsfilename = 'regions.csv'
territoriesfilename = 'territories.csv'

with beam.Pipeline() as p:
    regions = (
              p | 'Read Regions' >> ReadFromText(regionsfilename)
                | 'Split Regions' >> beam.ParDo(RegionSplitDict())
                #| 'Print Regions' >> beam.Map(print)
              )
        
    territories = (
                  p | 'Read Territories' >> ReadFromText('territories.csv')
                    | 'Split Territories' >> beam.ParDo(TerritorySplitDict())
                    #| 'Print Territories' >> beam.Map(print)
                  )

    leftjoin = {'regions':regions, 'territories':territories} | LeftJoin('regions', regions, 'regionid', 'territories', territories, 'regionid')
    leftjoin | 'print left join' >> beam.Map(print)




{'territoryid': 1581, 'territoryname': 'Westboro', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 1730, 'territoryname': 'Bedford', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 1833, 'territoryname': 'Georgetow', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2116, 'territoryname': 'Boston', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2139, 'territoryname': 'Cambridge', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2184, 'territoryname': 'Braintree', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2903, 'territoryname': 'Providence', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 6897, 'territoryname': 'Wilton', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 7960, 'territoryname': 'Morristown', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 8837, 'territoryname': 'Edison', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 10019, 'territoryname': 'New York', 'regionid': 1, 'regionname': 'Eastern'

In [35]:
# using a ParDo and DoFn instead of a Map
regionsfilename = 'regions.csv'
territoriesfilename = 'territories.csv'

with beam.Pipeline() as p:
    regions = (
              p | 'Read Regions' >> ReadFromText(regionsfilename)
                | 'Split Regions' >> beam.ParDo(RegionSplitDict())
                #| 'Print Regions' >> beam.Map(print)
              )
        
    territories = (
                  p | 'Read Territories' >> ReadFromText('territories.csv')
                    | 'Split Territories' >> beam.ParDo(TerritorySplitDict())
                    #| 'Print Territories' >> beam.Map(print)
                  )

    leftjoin = {'regions':regions, 'territories':territories} | LeftJoin('regions', regions, 'regionid', 'territories', territories, 'regionid')
#    leftjoin = {'territories':territories, 'regions':regions} | LeftJoin('territories', territories, 'regionid', 'regions', regions, 'regionid')
    leftjoin | 'print left join' >> beam.Map(print)


{'territoryid': 1581, 'territoryname': 'Westboro', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 1730, 'territoryname': 'Bedford', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 1833, 'territoryname': 'Georgetow', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2116, 'territoryname': 'Boston', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2139, 'territoryname': 'Cambridge', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2184, 'territoryname': 'Braintree', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2903, 'territoryname': 'Providence', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 6897, 'territoryname': 'Wilton', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 7960, 'territoryname': 'Morristown', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 8837, 'territoryname': 'Edison', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 10019, 'territoryname': 'New York', 'regionid': 1, 'regionname': 'Eastern'

In [106]:
import apache_beam as beam
from apache_beam.pvalue import AsIter, AsSingleton
from apache_beam.io import ReadFromText, WriteToText
from apache_beam.io import ReadFromAvro, WriteToAvro

class RegionSplitDict(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        yield {'regionid': int(regionid), 'regionname': regionname.title()}

class TerritorySplit(beam.DoFn):
    # split territory into KV pair of (regionid, (territoryid, territoryname))
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield(int(territoryid), territoryname, int(regionid.title()))
#        yield (int(regionid), (territoryid, territoryname.title())) 
        
                
def lookup_region(left, right):
    territoryid, territoryname, regionid = left
    yield territoryid, territoryname, regionid
#    yield (territoryid, territorynme, regionid, right.get(regionid, 'No Region'))


def dummy(element):
    return element
#     regionid = element[0]
#     territoryid, territoryname = element[1]
#     return (territoryid, territoryname, regionid)

with beam.Pipeline() as p:
    regions = (
        p | 'Read Regions' >> ReadFromText('regions.csv')
          | 'Split Regions' >> beam.ParDo(RegionSplitDict())
          | 'Print Regions' >> beam.Map(print)
    )

#     regions = {1:"North", 2:"South", 3:"East", 4:"West"}
#     regions = p | 'Create Regions' >> beam.Create([(1, 'North'), (2, 'South')])

    
    territories =  (
        p | 'Read Territories' >> ReadFromText('territories.csv')
          | 'Split Territories' >> beam.ParDo(TerritorySplit())
          | 'Print Territories' >> beam.Map(print)
    )
    
#     join = (
#         territories
#           #| 'Lookup Region' >> beam.Map(dummy)
#           | 'Lookup Region' >> beam.Map(lookup_region, right = beam.pvalue.AsList(regions))
#           | beam.Map(print)
#     )
        


(1581, 'Westboro', 1)
(1730, 'Bedford', 1)
(1833, 'Georgetow', 1)
(2116, 'Boston', 1)
(2139, 'Cambridge', 1)
(2184, 'Braintree', 1)
(2903, 'Providence', 1)
(3049, 'Hollis', 3)
(3801, 'Portsmouth', 3)
(6897, 'Wilton', 1)
(7960, 'Morristown', 1)
(8837, 'Edison', 1)
(10019, 'New York', 1)
(10038, 'New York', 1)
(11747, 'Mellvile', 1)
(14450, 'Fairport', 1)
(19428, 'Philadelphia', 3)
(19713, 'Neward', 1)
(20852, 'Rockville', 1)
(27403, 'Greensboro', 1)
(27511, 'Cary', 1)
(29202, 'Columbia', 4)
(30346, 'Atlanta', 4)
(31406, 'Savannah', 4)
(32859, 'Orlando', 4)
(33607, 'Tampa', 4)
(40222, 'Louisville', 1)
(44122, 'Beachwood', 3)
(45839, 'Findlay', 3)
(48075, 'Southfield', 3)
(48084, 'Troy', 3)
(48304, 'Bloomfield Hills', 3)
(53404, 'Racine', 3)
(55113, 'Roseville', 3)
(55439, 'Minneapolis', 3)
(60179, 'Hoffman Estates', 2)
(60601, 'Chicago', 2)
(72716, 'Bentonville', 4)
(75234, 'Dallas', 4)
(78759, 'Austin', 4)
(80202, 'Denver', 2)
(80909, 'Colorado Springs', 2)
(85014, 'Phoenix', 2)
(85251,