In [1]:
import apache_beam as beam

# Minimal in-memory pipeline: create four words, upper-case each one, and
# print every element as it flows through the final step.
with beam.Pipeline() as p:
    lines = p | 'Create' >> beam.Create(['one', 'two', 'three', 'four'])
    lines = lines | 'Uppercase' >> beam.Map(str.upper)
    # print returns None, so the collection bound to `lines` holds no useful data.
    lines = lines | 'Print' >> beam.Map(print)



ONE
TWO
THREE
FOUR


In [9]:
# Disabled: when uncommented, copies every object in the lab bucket into the
# current working directory.
#! gsutil cp gs://qwiklabs-gcp-02-7b5be618aa6e/* .
# List the working directory to see which notebooks and data files are present.
! ls

01-Word_Count.ipynb		       Visualize_Data.ipynb
02-Streaming_Word_Count.ipynb	       products.json
03-Streaming_NYC_Taxi_Ride_Data.ipynb  regions.csv
Dataflow_Word_Count.ipynb	       regions.out-00000-of-00001
JoeyDemos.ipynb			       territories.avro
Use_GPUs_with_Apache_Beam.ipynb


In [12]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText

# Read the local regions file, split each line on commas, scale the numeric id
# by 10 and upper-case the name, then print every resulting tuple.
filename = 'regions.csv'
with beam.Pipeline() as p:
    regions = p | 'Read' >> ReadFromText(filename)
    regions = regions | 'Split' >> beam.Map(lambda line: tuple(line.split(',')))
    regions = regions | 'Transform' >> beam.Map(
        lambda rec: (int(rec[0]) * 10, rec[1].upper())
    )
    regions = regions | 'Print' >> beam.Map(print)


(10, 'EASTERN')
(20, 'WESTERN')
(30, 'NORTHERN')
(40, 'SOUTHERN')


In [15]:
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplit(beam.DoFn):
    """DoFn that turns one comma-separated line into a two-element tuple."""

    def process(self, element):
        """Split ``element`` on commas and emit ``(regionid, regionname)``.

        The line must contain exactly two comma-separated fields; any other
        count makes the tuple unpacking raise ``ValueError``. Both fields are
        emitted as strings, unchanged.
        """
        fields = element.split(',')
        regionid, regionname = fields
        # A ParDo's process may return a list of outputs, e.g.
        # [(regionid, regionname)], or yield them one at a time as done here.
        yield regionid, regionname

# using a ParDo and DoFn instead of a Map
# Each line read from the local CSV is passed through RegionSplit, and the
# resulting tuples are written as text to files whose names start with
# 'regions.out'.
filename = 'regions.csv'
with beam.Pipeline() as p:  # the pipeline is run when the with block exits
    lines = p | 'Read' >> ReadFromText(filename)
    records = lines | 'Split' >> beam.ParDo(RegionSplit())
    records | 'Write' >> WriteToText('regions.out')



In [16]:
# Print the contents of every file whose name starts with "regions.out".
! cat regions.out*

('1', 'Eastern')
('2', 'Western')
('3', 'Northern')
('4', 'Southern')


In [17]:
"""A template to import the default package and parse the arguments"""

from __future__ import absolute_import

import argparse
import logging
import re

#from past.builtins import unicode

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

class RegionSplit(beam.DoFn):
    """DoFn that turns one comma-separated line into a two-element tuple."""

    def process(self, element):
        """Split ``element`` on commas and emit ``(regionid, regionname)``.

        The line must contain exactly two comma-separated fields; any other
        count makes the tuple unpacking raise ``ValueError``. Both fields are
        emitted as strings, unchanged.
        """
        fields = element.split(',')
        regionid, regionname = fields
        # A ParDo's process may return a list of outputs, e.g.
        # [(regionid, regionname)], or yield them one at a time as done here.
        yield regionid, regionname

def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the regions pipeline.

  The pipeline reads text lines from ``--input``, splits each line with
  ``RegionSplit``, converts the first field to ``int`` and upper-cases the
  second, and writes the resulting tuples as text to ``--output``.

  Args:
    argv: Command-line style argument list. ``--input`` and ``--output`` are
      consumed here; every other argument is forwarded to ``PipelineOptions``.
      When ``None``, argparse falls back to the process's own arguments.
    save_main_session: Value stored on ``SetupOptions.save_main_session``.
  """
  parser = argparse.ArgumentParser()
  # (flag, default, help) for each argument this function consumes itself.
  argument_table = (
      ('--input',
       'gs://dataflowclass1-bucket/regions.csv',
       'Input file to process.'),
      ('--output',
       'gs://dataflowclass1-bucket/regions_output',
       'Output file to write results to.'),
  )
  for flag, default_value, help_text in argument_table:
    parser.add_argument(
        flag, dest=flag.lstrip('-'), default=default_value, help=help_text)
  known_args, pipeline_args = parser.parse_known_args(argv)

  # save_main_session is set because one or more DoFn's in this workflow rely
  # on global context (e.g., a module imported at module level).
  options = PipelineOptions(pipeline_args)
  setup_options = options.view_as(SetupOptions)
  setup_options.save_main_session = save_main_session

  # Leaving the with block is what actually runs the pipeline.
  with beam.Pipeline(options=options) as p:
    (
        p
        | 'Read' >> ReadFromText(known_args.input)
        | 'Split' >> beam.ParDo(RegionSplit())
        | 'Uppercase' >> beam.Map(lambda rec: (int(rec[0]), rec[1].upper()))
        | 'Write' >> WriteToText(known_args.output)
    )

# Entry point: set the root logger to INFO so pipeline log messages are shown,
# then call run() with no explicit argv, so argparse uses the process's own
# command-line arguments and the declared --input/--output defaults.
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()


# using a ParDo and DoFn instead of a Map
# Each line read from the local CSV is passed through RegionSplit, and the
# resulting tuples are written as text to files whose names start with
# 'regions2.out'.
filename = 'regions.csv'
with beam.Pipeline() as p:  # the pipeline is run when the with block exits
    lines = p | 'Read' >> ReadFromText(filename)
    records = lines | 'Split' >> beam.ParDo(RegionSplit())
    records | 'Write' >> WriteToText('regions2.out')



INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.
INFO:apache_beam.internal.gcp.auth:Setting socket default timeout to 60 seconds.
INFO:apache_beam.internal.gcp.auth:socket default timeout is 60.0 seconds.
INFO:oauth2client.transport:Attempting refresh to obtain initial access_token


BeamIOError: Match operation failed with exceptions {'gs://dataflowclass1-bucket/regions.csv': HttpForbiddenError('HttpError accessing <https://www.googleapis.com/storage/v1/b/dataflowclass1-bucket/o/regions.csv?alt=json>: response: <{\'x-guploader-uploadid\': \'ABg5-UzAM1SOZOrl9b_Po2CsZygbcsMmlbQej79BuI-X_awFRYqPPcfTE43HGNhsDR1-gQipv_JnluiV3r4vPaJM5-c\', \'content-type\': \'application/json; charset=UTF-8\', \'date\': \'Sat, 17 Apr 2021 21:02:20 GMT\', \'vary\': \'Origin, X-Origin\', \'cache-control\': \'no-cache, no-store, max-age=0, must-revalidate\', \'expires\': \'Mon, 01 Jan 1990 00:00:00 GMT\', \'pragma\': \'no-cache\', \'content-length\': \'430\', \'server\': \'UploadServer\', \'status\': \'403\'}>, content <{\n  "error": {\n    "code": 403,\n    "message": "537607295389-compute@developer.gserviceaccount.com does not have storage.objects.get access to the Google Cloud Storage object.",\n    "errors": [\n      {\n        "message": "537607295389-compute@developer.gserviceaccount.com does not have storage.objects.get access to the Google Cloud Storage object.",\n        "domain": "global",\n        "reason": "forbidden"\n      }\n    ]\n  }\n}\n>')}