## Simple transformation

In [1]:
import apache_beam as beam

with beam.Pipeline() as p:
    lines = (
        p | 'Create' >> beam.Create(['one', 'two', 'three', 'four'])
          | 'Uppercase' >> beam.Map(str.upper)
          | 'Print' >> beam.Map(print)
    )



ONE
TWO
THREE
FOUR


## The pipe | is actually just an operator overload to call the apply method of the pipeline

In [2]:
import apache_beam as beam

with beam.Pipeline() as p:
        lines = p | 'Create' >> beam.Create(['one', 'two', 'three', 'four'])
        lines2 = (
            p.apply(beam.Map(str.title), lines, 'titlecase')
             .apply(beam.Map(print))
        )


One
Two
Three
Four


In [1]:
! ls

JoeyDemos.ipynb       leftjoin.py    simple3-dataflow.sh  territories.csv
README.md	      products.json  simple3-local.sh	  territories.py
aggregate1.py	      regions.csv    simple3.py		  wordcount-dataflow.sh
aggregate2.py	      simple1.py     simple3_custom.py	  wordcount.py
dataflow_template.py  simple2.py     territories.avro


## Read from CSV and use Map

In [10]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

regionsfilename = 'regions.csv'
with beam.Pipeline() as p:
    regions = (
        p | 'Read' >> ReadFromText(regionsfilename)
          | 'Split' >> beam.Map(lambda x : tuple(x.split(',')))
          | 'Transform' >> beam.Map(lambda x : (int(x[0]) * 10, x[1].upper()))
          | 'Print' >> beam.Map(print)
    )


(10, 'EASTERN')
(20, 'WESTERN')
(30, 'NORTHERN')
(40, 'SOUTHERN')


## Read from CSV and use ParDo

In [11]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplit(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        #return [(int(regionid), regionname)] # ParDo's need to return a list
        yield (int(regionid), regionname) # Can also use yield instead of returning a list

regionsfilename = 'regions.csv'
with beam.Pipeline() as p:
    regions = (
        p | 'Read' >> ReadFromText(regionsfilename)
          | 'Split' >> beam.ParDo(RegionSplit())
          | 'Write' >> WriteToText('regions.out')
    )


In [5]:
! cat regions.out*

(1, 'Eastern')
(2, 'Western')
(3, 'Northern')
(4, 'Southern')


## Template showing a full program that can read the command line args

In [None]:
"""A template to import the default package and parse the arguments"""

from __future__ import absolute_import

import argparse
import logging
import re

#from past.builtins import unicode

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplit(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        #return [(regionid, regionname)] # ParDo's need to return a list
        yield (int(regionid), regionname) # Can also use yield instead of returning a list

def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflowclass1-bucket/regions.csv',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      default = 'gs://dataflowclass1-bucket/regions_output',      
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=pipeline_options) as p:
    lines = p | 'Read' >> ReadFromText(known_args.input)
    records = lines | 'Split' >> beam.ParDo(RegionSplit())
    uppercase = records | 'Uppercase' >> beam.Map(lambda x : (int(x[0]), x[1].upper()))
    uppercase | 'Write' >> WriteToText(known_args.output)

if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()


filename = 'regions.csv'
with beam.Pipeline() as p:
    lines = p | 'Read' >> ReadFromText(filename)
    records = lines | 'Split' >> beam.ParDo(RegionSplit())
    records | 'Write' >> WriteToText('regions2.out')



## Example of how to create a split ParDo with multiple outputs

In [6]:
import apache_beam as beam
from apache_beam import pvalue
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

class OddEvenRegionSplit(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        if int(regionid) % 2 == 0:
            yield pvalue.TaggedOutput('Even', (int(regionid), regionname, 'Even'))
        else:
            yield pvalue.TaggedOutput('Odd', (int(regionid), regionname, 'Odd'))

regionsfilename = 'regions.csv'
with beam.Pipeline() as p:
    lines = p | 'Read' >> ReadFromText(regionsfilename)
    evens, odds = lines | 'Split' >> beam.ParDo(OddEvenRegionSplit()).with_outputs("Even", "Odd")
    
    print('Evens')
    evens | 'Print Evens' >> beam.Map(print)

    print('Odds')
    odds | 'Print Odds' >> beam.Map(print)
    
  

Evens
Odds
(1, 'Eastern', 'Odd')
(2, 'Western', 'Even')
(3, 'Northern', 'Odd')
(4, 'Southern', 'Even')


## Example of branching

In [12]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplit(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        #return [(int(regionid), regionname)] # ParDo's need to return a list
        yield (int(regionid), regionname) # Can also use yield instead of returning a list

regionsfilename = 'regions.csv'
with beam.Pipeline() as p:
    regions = (
        p | 'Read' >> ReadFromText(regionsfilename)
          | 'Split' >> beam.ParDo(RegionSplit())
    )
    # Branch 1
    (regions 
         | 'Lowercase regions' >> beam.Map(lambda x : (x[0] * 100, x[1].lower()))
         | 'Write' >> WriteToText('regions2.out')
    )
    
    (regions 
         | 'Uppercase regions' >> beam.Map(lambda x : (x[0] * 10, x[1].upper()))
         | 'Print' >> beam.Map(print)
    )



(10, 'EASTERN')
(20, 'WESTERN')
(30, 'NORTHERN')
(40, 'SOUTHERN')


In [13]:
! cat regions2*

(100, 'eastern')
(200, 'western')
(300, 'northern')
(400, 'southern')


## WithKeys

In [14]:
import apache_beam as beam
from apache_beam.io import ReadFromText

class TerritorySplit(beam.DoFn):
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield (territoryid, territoryname, regionid)

territoriesfilename = 'territories.csv'

with beam.Pipeline() as p:
    territories = (
                  p | 'Read Territories' >> ReadFromText(territoriesfilename)
                    | 'Split Territories' >> beam.ParDo(TerritorySplit())
                    | 'Territories With Keys' >> beam.util.WithKeys(lambda x : x[2])
#                    | 'With Keys Manually' >> beam.Map(lambda x : (x[2], x))
                  )
    territories | 'Print KV' >> beam.Map(print)
    territories | beam.util.Keys() | 'Print Keys' >> beam.Map(print)
    territories | beam.util.Values() | 'Print Values' >> beam.Map(print)




('01581', 'Westboro', '1')
1
('1', ('01581', 'Westboro', '1'))
('01730', 'Bedford', '1')
1
('1', ('01730', 'Bedford', '1'))
('01833', 'Georgetow', '1')
1
('1', ('01833', 'Georgetow', '1'))
('02116', 'Boston', '1')
1
('1', ('02116', 'Boston', '1'))
('02139', 'Cambridge', '1')
1
('1', ('02139', 'Cambridge', '1'))
('02184', 'Braintree', '1')
1
('1', ('02184', 'Braintree', '1'))
('02903', 'Providence', '1')
1
('1', ('02903', 'Providence', '1'))
('03049', 'Hollis', '3')
3
('3', ('03049', 'Hollis', '3'))
('03801', 'Portsmouth', '3')
3
('3', ('03801', 'Portsmouth', '3'))
('06897', 'Wilton', '1')
1
('1', ('06897', 'Wilton', '1'))
('07960', 'Morristown', '1')
1
('1', ('07960', 'Morristown', '1'))
('08837', 'Edison', '1')
1
('1', ('08837', 'Edison', '1'))
('10019', 'New York', '1')
1
('1', ('10019', 'New York', '1'))
('10038', 'New York', '1')
1
('1', ('10038', 'New York', '1'))
('11747', 'Mellvile', '1')
1
('1', ('11747', 'Mellvile', '1'))
('14450', 'Fairport', '1')
1
('1', ('14450', 'Fairport'

## GroupByKey

In [15]:
import apache_beam as beam
from apache_beam.io import ReadFromText

class TerritorySplit(beam.DoFn):
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield (territoryid, territoryname, regionid)

territoriesfilename = 'territories.csv'

with beam.Pipeline() as p:
    territories = (
                  p | 'Read Territories' >> ReadFromText('territories.csv')
                    | 'Split Territories' >> beam.ParDo(TerritorySplit())
                    | 'Territories With Keys' >> beam.util.WithKeys(lambda x : x[2])
                    | 'Group Territories' >> beam.GroupByKey() 
                    | 'Print Territories' >> beam.Map(print)
                  )


('1', [('01581', 'Westboro', '1'), ('01730', 'Bedford', '1'), ('01833', 'Georgetow', '1'), ('02116', 'Boston', '1'), ('02139', 'Cambridge', '1'), ('02184', 'Braintree', '1'), ('02903', 'Providence', '1'), ('06897', 'Wilton', '1'), ('07960', 'Morristown', '1'), ('08837', 'Edison', '1'), ('10019', 'New York', '1'), ('10038', 'New York', '1'), ('11747', 'Mellvile', '1'), ('14450', 'Fairport', '1'), ('19713', 'Neward', '1'), ('20852', 'Rockville', '1'), ('27403', 'Greensboro', '1'), ('27511', 'Cary', '1'), ('40222', 'Louisville', '1')])
('3', [('03049', 'Hollis', '3'), ('03801', 'Portsmouth', '3'), ('19428', 'Philadelphia', '3'), ('44122', 'Beachwood', '3'), ('45839', 'Findlay', '3'), ('48075', 'Southfield', '3'), ('48084', 'Troy', '3'), ('48304', 'Bloomfield Hills', '3'), ('53404', 'Racine', '3'), ('55113', 'Roseville', '3'), ('55439', 'Minneapolis', '3')])
('4', [('29202', 'Columbia', '4'), ('30346', 'Atlanta', '4'), ('31406', 'Savannah', '4'), ('32859', 'Orlando', '4'), ('33607', 'Tampa

## Flatten

In [16]:
import apache_beam as beam

with beam.Pipeline() as p:
    lines1 = p | 'Create 1' >> beam.Create(['one', 'two', 'three', 'four'])
    lines2 = p | 'Create 2' >> beam.Create(['alpha', 'beta', 'gamma', 'delta'])

    merged = ((lines1, lines2) | 'Merge PCollections' >> beam.Flatten())
    merged | beam.Map(print)


one
two
three
four
alpha
beta
gamma
delta


## Combine

In [17]:
import apache_beam as beam

with beam.Pipeline() as p:
    data = (
        p | 'Create' >> beam.Create([('a', 10), ('a', 20), ('b', 30), ('b', 40), ('c', 50), ('a', 60)])
          | 'Combine' >> beam.CombinePerKey(sum)
          | 'Print' >> beam.Map(print)
    )


('a', 90)
('b', 70)
('c', 50)


## Custom Combine Function

In [18]:
#mport apache_beam as beam

class CustomCombine(beam.CombineFn):

  def create_accumulator(self):
    return {}

  def add_input(self, accumulator, input):
    k, v = input
    x, y, z = accumulator.get(k, (0, 0, 0))

    # take the max for the first element of the tuple and sum the second element and count for the third
    accumulator[k] = (v[0] if v[0] > x else x, y + v[1], z + 1)
    return accumulator

  def merge_accumulators(self, accumulators):
    merged = {}
    for accum in accumulators:
      for k, v in accum.items():
        x, y, z = merged.get(k, (0, 0, 0))
        merged[k] = (v[0] if v[0] > x else x, y + v[1], z + v[2])
    return merged

  def extract_output(self, accumulator):
    # return the max, the sum, the count and the average for the key
    return {k : (v[0], v[1], v[2], v[1]/v[2]) for k, v in accumulator.items()}
    return accumulator
    

with beam.Pipeline() as p:
    data = (
        p | 'Create' >> beam.Create([('a', (1, 10)), ('a', (2, 20)), ('b', (3, 30)), ('b', (4, 40)), ('c', (5, 50)), ('a', (6, 60))])
          | 'Combine' >> beam.CombineGlobally(CustomCombine())
          | 'Print' >> beam.Map(print)
    )


{'a': (6, 90, 3, 30.0), 'b': (4, 70, 2, 35.0), 'c': (5, 50, 1, 50.0)}


## Map vs FlatMap

In [20]:
import apache_beam as beam

with beam.Pipeline() as pipeline:
  plants = (
      pipeline
      | 'Gardening plants' >> beam.Create(['Strawberry,Carrot,Eggplant','Tomato,Potato'])
      | 'Split words' >> beam.Map(lambda x : x.split(','))
#      | 'Split words' >> beam.FlatMap(lambda x : x.split(','))
      | beam.Map(print))

Strawberry
Carrot
Eggplant
Tomato
Potato


## Side Inputs

In [32]:
import apache_beam as beam
from apache_beam.pvalue import AsSingleton, AsDict
from apache_beam.io import ReadFromText

class TerritorySplitTuple(beam.DoFn):
    # split territory into KV pair of (regionid, (territoryid, territoryname))
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield(int(territoryid), territoryname, int(regionid))
        
                
class LookupRegion(beam.DoFn):
    def process(self, element, uppercase = 0):
        lookuptable = {1:'North', 2:'South', 3:'East', 4:'West'}
        territoryid, territoryname, regionid = element
        region = lookuptable.get(regionid, 'No Region')
        if uppercase == 1:
            region = region.upper()
        yield(territoryid, territoryname, regionid, region)

with beam.Pipeline() as p:
    territories =  (
        p | 'Read Territories' >> ReadFromText('territories.csv')
          | 'Split Territories' >> beam.ParDo(TerritorySplitTuple())
    )
    
    lookup = (
        territories
#        | beam.ParDo(LookupRegion())
        | beam.ParDo(LookupRegion(), uppercase = 1 ) 
        #beam.pvalue.AsSingleton(int(1)))
        | 'Print Loopup' >> beam.Map(print)
    )
        


(1581, 'Westboro', 1, 'NORTH')
(1730, 'Bedford', 1, 'NORTH')
(1833, 'Georgetow', 1, 'NORTH')
(2116, 'Boston', 1, 'NORTH')
(2139, 'Cambridge', 1, 'NORTH')
(2184, 'Braintree', 1, 'NORTH')
(2903, 'Providence', 1, 'NORTH')
(3049, 'Hollis', 3, 'EAST')
(3801, 'Portsmouth', 3, 'EAST')
(6897, 'Wilton', 1, 'NORTH')
(7960, 'Morristown', 1, 'NORTH')
(8837, 'Edison', 1, 'NORTH')
(10019, 'New York', 1, 'NORTH')
(10038, 'New York', 1, 'NORTH')
(11747, 'Mellvile', 1, 'NORTH')
(14450, 'Fairport', 1, 'NORTH')
(19428, 'Philadelphia', 3, 'EAST')
(19713, 'Neward', 1, 'NORTH')
(20852, 'Rockville', 1, 'NORTH')
(27403, 'Greensboro', 1, 'NORTH')
(27511, 'Cary', 1, 'NORTH')
(29202, 'Columbia', 4, 'WEST')
(30346, 'Atlanta', 4, 'WEST')
(31406, 'Savannah', 4, 'WEST')
(32859, 'Orlando', 4, 'WEST')
(33607, 'Tampa', 4, 'WEST')
(40222, 'Louisville', 1, 'NORTH')
(44122, 'Beachwood', 3, 'EAST')
(45839, 'Findlay', 3, 'EAST')
(48075, 'Southfield', 3, 'EAST')
(48084, 'Troy', 3, 'EAST')
(48304, 'Bloomfield Hills', 3, 'EAST

In [33]:
import apache_beam as beam
from apache_beam.pvalue import AsList
from apache_beam.io import ReadFromText, WriteToText

class RegionSplitDict(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        yield {'regionid': int(regionid), 'regionname': regionname.title()}

class TerritorySplitTuple(beam.DoFn):
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield(int(territoryid), territoryname, int(regionid))
        
                
class LookupRegion(beam.DoFn):
    def process(self, element, lookuptable = [{'regionid':1, 'regionname':'North'}, {'regionid':2, 'regionname':'South'}]):
        territoryid, territoryname, regionid = element
        lookup = {e['regionid'] : e['regionname'] for e in lookuptable }
        yield(territoryid, territoryname, regionid, lookup.get(regionid, 'No Region'))

with beam.Pipeline() as p:
    regions = (
        p | 'Read Regions' >> ReadFromText('regions.csv')
          | 'Split Regions' >> beam.ParDo(RegionSplitDict())
#          | 'Print Regions' >> beam.Map(print)
    )

    territories =  (
        p | 'Read Territories' >> ReadFromText('territories.csv')
          | 'Split Territories' >> beam.ParDo(TerritorySplitTuple())
#          | 'Print Territories' >> beam.Map(print)
    )
    
    lookup = (
        territories
        | beam.ParDo(LookupRegion(), lookuptable = beam.pvalue.AsList(regions))
        | 'Print Loopup' >> beam.Map(print)
    )
        


(1581, 'Westboro', 1, 'Eastern')
(1730, 'Bedford', 1, 'Eastern')
(1833, 'Georgetow', 1, 'Eastern')
(2116, 'Boston', 1, 'Eastern')
(2139, 'Cambridge', 1, 'Eastern')
(2184, 'Braintree', 1, 'Eastern')
(2903, 'Providence', 1, 'Eastern')
(3049, 'Hollis', 3, 'Northern')
(3801, 'Portsmouth', 3, 'Northern')
(6897, 'Wilton', 1, 'Eastern')
(7960, 'Morristown', 1, 'Eastern')
(8837, 'Edison', 1, 'Eastern')
(10019, 'New York', 1, 'Eastern')
(10038, 'New York', 1, 'Eastern')
(11747, 'Mellvile', 1, 'Eastern')
(14450, 'Fairport', 1, 'Eastern')
(19428, 'Philadelphia', 3, 'Northern')
(19713, 'Neward', 1, 'Eastern')
(20852, 'Rockville', 1, 'Eastern')
(27403, 'Greensboro', 1, 'Eastern')
(27511, 'Cary', 1, 'Eastern')
(29202, 'Columbia', 4, 'Southern')
(30346, 'Atlanta', 4, 'Southern')
(31406, 'Savannah', 4, 'Southern')
(32859, 'Orlando', 4, 'Southern')
(33607, 'Tampa', 4, 'Southern')
(40222, 'Louisville', 1, 'Eastern')
(44122, 'Beachwood', 3, 'Northern')
(45839, 'Findlay', 3, 'Northern')
(48075, 'Southfiel

## Simulate an Outer Join with CoGroup

In [35]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplitDict(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        yield {'regionid':int(regionid), 'regionname':regionname.title()}

class TerritorySplitDict(beam.DoFn):
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield {'territoryid':int(territoryid), 'territoryname' : territoryname, 'regionid':int(regionid)}

class UnnestCoGrouped(beam.DoFn):
    def process(self, item, child_pipeline, parent_pipeline):
        k, v = item
        child_dict = v[child_pipeline]
        parent_dict = v[parent_pipeline]
        for child in child_dict:
            try:
                child.update(parent_dict[0])
                yield child
            except IndexError:
                yield child

class LeftJoin(beam.PTransform):
    def __init__(self, parent_pipeline_name, parent_data, parent_key, child_pipeline_name, child_data,  child_key):
        self.parent_pipeline_name = parent_pipeline_name
        self.parent_data = parent_data
        self.parent_key = parent_key
        self.child_pipeline_name = child_pipeline_name
        self.child_data = child_data
        self.child_key = child_key

    def expand(self, pcols):
        def _format_as_common_key_tuple(child_dict, child_key):
            return (child_dict[child_key], child_dict)

        return ({
                pipeline_name: pcol1 | f'Convert to ({self.parent_key} = {self.child_key}, object) for {pipeline_name}' 
                >> beam.Map(_format_as_common_key_tuple, self.child_key)
                for (pipeline_name, pcol1) in pcols.items()}
                | f'CoGroupByKeey {pcols.keys()}' >> beam.CoGroupByKey()
                | 'Unnest Cogrouped' >> beam.ParDo(UnnestCoGrouped(), self.child_pipeline_name, self.parent_pipeline_name)
        )
        
regionsfilename = 'regions.csv'
territoriesfilename = 'territories.csv'

with beam.Pipeline() as p:
    regions = (
              p | 'Read Regions' >> ReadFromText(regionsfilename)
                | 'Split Regions' >> beam.ParDo(RegionSplitDict())
                #| 'Print Regions' >> beam.Map(print)
              )
        
    territories = (
                  p | 'Read Territories' >> ReadFromText('territories.csv')
                    | 'Split Territories' >> beam.ParDo(TerritorySplitDict())
                    #| 'Print Territories' >> beam.Map(print)
                  )

    leftjoin = {'regions':regions, 'territories':territories} | LeftJoin('regions', regions, 'regionid', 'territories', territories, 'regionid')
    leftjoin | 'print left join' >> beam.Map(print)




{'territoryid': 1581, 'territoryname': 'Westboro', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 1730, 'territoryname': 'Bedford', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 1833, 'territoryname': 'Georgetow', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2116, 'territoryname': 'Boston', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2139, 'territoryname': 'Cambridge', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2184, 'territoryname': 'Braintree', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 2903, 'territoryname': 'Providence', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 6897, 'territoryname': 'Wilton', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 7960, 'territoryname': 'Morristown', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 8837, 'territoryname': 'Edison', 'regionid': 1, 'regionname': 'Eastern'}
{'territoryid': 10019, 'territoryname': 'New York', 'regionid': 1, 'regionname': 'Eastern'

## BeamSQL

In [None]:
import json
import logging

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.transforms.sql import SqlTransform
from collections import namedtuple
from apache_beam import coders


RegionSchema = namedtuple("RegionSchema", ("regionid", "regionname"))
#coders.registry.register_coder(RegionSchema, coders.RowCoder)
class RegionSplitSchema(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        yield RegionSchema(int(regionid), regionname.title())

TerritorySchema = namedtuple("TerritorySchema", ("territoryid", "territoryname", "regionid"))
#coders.registry.register_coder(TerritorySchema, coders.RowCoder)
class TerritorySplitSchema(beam.DoFn):
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield TerritorySchema(int(territoryid), territoryname.title(), int(regionid))

        
# class TerritorySplitNamedTuple(beam.DoFn):
#     def process(self, element):
#         territoryid, territoryname, regionid = element.split(',')
#         yield {'territoryid':int(territoryid), 'territoryname' : territoryname, 'regionid':int(regionid)}

regionsfilename = 'regions.csv'
territoriesfilename = 'territories.csv'
print('Start')
with beam.Pipeline() as p:
#     regions = (
#               p | 'Read Regions' >> ReadFromText(regionsfilename)
#                 | 'Split Regions' >> beam.ParDo(RegionSplitSchema()).with_output_types(RegionSchema)
#               )
        
    territories = (
                  p | 'Read Territories' >> ReadFromText('territories.csv')
                    | 'Split Territories' >> beam.ParDo(TerritorySplitSchema())
                    | 'Apply Territories Schema' >> beam.Map(lambda x : beam.Row(territoryid = int(x.territoryid)
                                                                                 , territoryname = str(x.territoryname)
                                                                                 , regionid = int(x.regionid)))
#                    | 'Convert to Dictionary' >> beam.Map(lambda row : {"regionid" : row.regionid, "territoryid" : row.territoryid, "territoryname" : row.territoryname})
                    | SqlTransform("""
                        SELECT regionid, UPPER(territoryname) as name, territoryid 
                        FROM PCOLLECTION
                        """)
#                    | 'Split Territories' >> beam.ParDo(TerritorySplitSchema()).with_output_types(TerritorySchema)
#                     | SqlTransform("""
#                         SELECT regionid, count(*) as territories
#                         FROM PCOLLECTION
#                         GROUP BY regionID
#                         ORDER BY territories DESC
#                         """)
#                    | 'Convert to dictionary' >> beam.Map(lambda row : {"regionid": row.regionid, "territories": row.territories})
                    
#             })
                  )

#     regions | 'Print regions' >> beam.Map(print)
    territories | 'Print territories' >> beam.Map(print)

print('Done')


Start


In [None]:
# with beam.Pipeline() as pipeline:
#     _ = (
#         pipeline
#         | beam.io.ReadFromPubSub(
#             topic='projects/pubsub-public-data/topics/taxirides-realtime',
#             timestamp_attribute="ts").with_output_types(bytes)
#         | "Parse JSON payload" >> beam.Map(json.loads)
#         # Use beam.Row to create a schema-aware PCollection
#         | "Create beam Row" >> beam.Map(
#             lambda x: beam.Row(
#                 ride_status=str(x['ride_status']),
#                 passenger_count=int(x['passenger_count'])))
#         # SqlTransform will computes result within an existing window
#         | "15s fixed windows" >> beam.WindowInto(beam.window.FixedWindows(15))
#         # Aggregate drop offs and pick ups that occur within each 15s window
#         | SqlTransform(
#             """
#              SELECT
#                ride_status,
#                COUNT(*) AS num_rides,
#                SUM(passenger_count) AS total_passengers
#              FROM PCOLLECTION
#              WHERE NOT ride_status = 'enroute'
#              GROUP BY ride_status""")
#         # SqlTransform yields python objects with attributes corresponding to
#         # the outputs of the query.
#         # Collect those attributes, as well as window information, into a dict
#         | "Assemble Dictionary" >> beam.Map(
#             lambda row,
#             window=beam.DoFn.WindowParam: {
#                 "ride_status": row.ride_status,
#                 "num_rides": row.num_rides,
#                 "total_passengers": row.total_passengers,
#                 "window_start": window.start.to_rfc3339(),
#                 "window_end": window.end.to_rfc3339()
#             })
#         | "Convert to JSON" >> beam.Map(json.dumps)
#         | "UTF-8 encode" >> beam.Map(lambda s: s.encode("utf-8"))
#         | beam.Map(print)
#         #| beam.io.WriteToPubSub(topic=output_topic))
#     )


# if __name__ == '__main__':
#   logging.getLogger().setLevel(logging.INFO)
#   import argparse

#   parser = argparse.ArgumentParser()
#   parser.add_argument(
#       '--output_topic',
#       dest='output_topic',
#       required=True,
#       help=(
#           'Cloud PubSub topic to write to (e.g. '
#           'projects/my-project/topics/my-topic), must be created prior to '
#           'running the pipeline.'))
#   known_args, pipeline_args = parser.parse_known_args()

#   run(known_args.output_topic, pipeline_args)


In [22]:
lookuptable = [{'regionid':1, 'regionname':'North'}, {'regionid':2, 'regionname':'South'}]
lookup = {e['regionid'] : e['regionname'] for e in lookuptable }
print(lookup)

{1: 'North', 2: 'South'}


In [None]:
import apache_beam as beam
from apache_beam.pvalue import AsIter, AsSingleton, AsList, AsDict
from apache_beam.io import ReadFromText, WriteToText
from apache_beam.io import ReadFromAvro, WriteToAvro
from collections import namedtuple
from apache_beam import coders
from apache_beam.typehints.decorators import with_output_types


class Region:
    def __init__(self, regionid, regionname):
        self.regionid = regionid
        self.regionname = regionname
        
    def __str__(self):
        return f'{self.regionid}|{self.regionname}'

#     def encode(self, o):
#         """Encode to bytes with a trace that coder was used."""
#         # Our encoding prepends an 'x:' prefix.
#         return b'x:%s' % o.encode('utf-8')

#     def decode(self, s):
#         # To decode, we strip off the prepended 'x:' prefix.
#         s = s.decode('utf-8')
#         #assert s[0:2] == 'x:'
#         params = s[0:2].split('|')
#         return Region(*params)

#     def is_deterministic(self):
#         # Since coded Player objects are used as keys below with
#         # beam.CombinePerKey(sum), we require that this coder is deterministic
#         # (i.e., two equivalent instances of the classes are encoded into the same
#         # byte string) in order to guarantee consistent results.
#         return True
    
class RegionSplitClass(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        yield Region(int(regionid), regionname.title())

# class RegionCoder(coders.Coder):
#   """A custom coder for the RegionSchema"""
#   def encode(self, o):
#     """Encode to bytes with a trace that coder was used."""
#     # Our encoding prepends an 'x:' prefix.
#     return b'x:%s' % o.encode('utf-8')

#   def decode(self, s):
#     # To decode, we strip off the prepended 'x:' prefix.
#     s = s.decode('utf-8')
#     #assert s[0:2] == 'x:'
#     params = s[0:2].split('|')
#     return Region(*params)

#   def is_deterministic(self):
#     # Since coded Player objects are used as keys below with
#     # beam.CombinePerKey(sum), we require that this coder is deterministic
#     # (i.e., two equivalent instances of the classes are encoded into the same
#     # byte string) in order to guarantee consistent results.
#     return True
# coders.registry.register_coder(Region, RegionCoder)

# @with_output_types(typing.Tuple[Region, int])
# def get_regions(descriptor):
#   name, points = descriptor.split(',')
#   return Player(name), int(points)


# RegionSchema = namedtuple("RegionSchema", ("regionid", "regionname"))
# class RegionSplitSchema(beam.DoFn):
#     def process(self, element):
#         regionid, regionname = element.split(',')
#         yield RegionSchema(int(regionid), regionname.title())

class RegionSplitDict(beam.DoFn):
    def process(self, element):
        regionid, regionname = element.split(',')
        yield {'regionid': int(regionid), 'regionname': regionname.title()}


class TerritorySplit(beam.DoFn):
    # split territory into KV pair of (regionid, (territoryid, territoryname))
    def process(self, element):
        territoryid, territoryname, regionid = element.split(',')
        yield(int(territoryid), territoryname, int(regionid))
#        yield (int(regionid), (territoryid, territoryname.title())) 
        
                
def lookup_region(left, right):
    territoryid, territoryname, regionid = left
    yield territoryid, territoryname, regionid
#    yield (territoryid, territorynme, regionid, right.get(regionid, 'No Region'))


class LookupRegion(beam.DoFn):
    def process(self, element, lookuptable = [{'regionid':1, 'regionname':'North'}, {'regionid':2, 'regionname':'South'}]):
#        yield element
        territoryid, territoryname, regionid = element
        lookup = {e['regionid'] : e['regionname'] for e in lookuptable }
        yield(territoryid, territoryname, regionid, lookup.get(regionid, 'No Region'))
# #        yield (int(regionid), (territoryid, territoryname.title())) 


# def dummy(element):
#     return element
# #     regionid = element[0]
# #     territoryid, territoryname = element[1]
# #     return (territoryid, territoryname, regionid)

with beam.Pipeline() as p:
    regions = (
        p | 'Read Regions' >> ReadFromText('regions.csv')
          | 'Split Regions' >> beam.ParDo(RegionSplitDict())
          #| 'Split Regions' >> beam.ParDo(RegionSplitClass())
#          | 'Print Regions' >> beam.Map(print)
    )

#     regions = {1:"North", 2:"South", 3:"East", 4:"West"}
#     regions = p | 'Create Regions' >> beam.Create([(1, 'North'), (2, 'South')])

    
    territories =  (
        p | 'Read Territories' >> ReadFromText('territories.csv')
          | 'Split Territories' >> beam.ParDo(TerritorySplit())
#          | 'Print Territories' >> beam.Map(print)
    )
    
    join = (
        territories
          #| 'Lookup Region' >> beam.Map(dummy)
#          | 'Lookup Region' >> beam.Map(lookup_region, right = beam.pvalue.AsList(regions))
#        | beam.ParDo(LookupRegion())
        | beam.ParDo(LookupRegion(), lookuptable = beam.pvalue.AsList(regions))
        | beam.Map(print)
    )
        
