In [2]:
import apache_beam as beam 

In [3]:
with beam.Pipeline() as pipeline: 
    pass 

In [5]:
# PIPELINE OPTIONS => USE THIS TO CONFIGURE DIFFERENT ASPECTS OF YOUR PIPELINE 
# IE as the pipeline runner that will execute your pipeline and any runner-specific configuration required by the chosen runner.
from apache_beam.options.pipeline_options import PipelineOptions 
beam_options = PipelineOptions() 

In [9]:
# CREATING CUSTOM OPTIONS 
# FIRST NEED TO UNDERSTAND WHAT PYTHON DECORATORS ARE 
# STATIC VS DYNAMIC METHODS 

# STATIC METHODS => CAN BE EXECUTED THROUGH THE CLASS INSTEAD OF WHEN THE CLASS HAS BEEN INSTANTIATED OR OBJECT CAN BE CREATED FROM THE CLASS 

# WITHOUT STATIC METHOD => no attributes so method run directly without the class being instantiated 
class Formula:
    def pow(self, x, y):
        return x ** y 

m = Formula()
print(m.pow(3, 3))

27


In [10]:
# WITH A STATIC METHOD 
class Formula: 
    @staticmethod 
    def pow(x, y):
        return x ** y 

print(Formula.pow(3, 3))

27


In [15]:
# ANOTHER EXAMPLE 
class Greeting: 
    def display(self, message):
        print(message) 
message = Greeting() 
print(message.display('HelloWorld')) 

HelloWorld
None


In [18]:
class Greeting: 
    @staticmethod
    def display(message):
        print(message) 
print(Greeting.display('Hello World')) 

Hello World
None


In [41]:
# WORKING WITH CLASS METHODS 
# CLASS METHODS => STATIC METHODS THAT ACCESS THE INSTANCE ATTRIBUTES & ALSO CAN BE CALLED WITHOUT INSTANTIATING THE CLASS 
class Greeting: 
    message = "Hello"
    @classmethod
    def display(mes, finalMessage):
        print(mes.message + finalMessage) 

print(Greeting.display(' World'))




Hello World
None


In [42]:
# CREATING CUSTOM OPTIONS IN ADDITION TO THE STANDARD PIPELINE OPTIONS 
from apache_beam.options.pipeline_options import PipelineOptions

class MyOptions(PipelineOptions):
    @classmethod 
    def _add_argparse_args(cls, parser):
        parser.add_argument('--input', required=True)
        parser.add_argument('--output', required=True)

In [44]:
# SAME AS ABOVE BUT HAS DEFAULT DATAFLOW LINK AND HELP TEXT 
# this allows pipeline to accept input and output as command line arguments 
from apache_beam.options.pipeline_options import PipelineOptions

class MyOptions(PipelineOptions):
    @classmethod 
    def _add_argparse_args(cls, parser):
        parser.add_argument(
        '--input',
        default= 'gs://dataflow-samples/repairs/repairsfeedback.txt',
        help='file path for the input text to process')
        parser.add_argument(
            '--output', 
            required=True, 
            help='The path prefix for the output files'
        )

In [45]:
# PCOLLECTION => DISRTIBUTED COLLECTION OF DATA 
# BEAM TRANSFORMS USE PCOLLECTIONS AS INPUTS AND OUTPUTS 
# IF YOU WANT TO WORK WITH DATA IN YOUR PIPELINE 
# IT MUST COME FROM A PCOLLECTION 

In [None]:
# READING FROM AN EXTERNAL SOURCE 
# HAVE TO PROVIDE THE LINK 
import apache_beam as beam 

lines = pipeline | 'Read the file' >> beam.io.ReadFromText('gs://dataflow-samples/repairs/repairsfeedback.txt') 

# DO NOT RUN THIS FILE
# JUST READ THE CODE 

In [None]:
# CREATE A PCOLLECTION FROM AN IN MEMORY DATA 
import apache_beam as beam 

with beam.Pipeline() as pipeline:
    lines = (
        pipeline 
        | beam.Create([
            'This is true', 
            'To be, or not to be: that is the question: ',
          "Whether 'tis nobler in the mind to suffer ",
          'The slings and arrows of outrageous fortune, ',
          'Or to take arms against a sea of troubles, ',
        ]))

# NOTE: DO NOT RUN THIS AS IT REQUIRES AN OUTPUT TO WORK 

In [52]:
# PCOLLECTION FEATURES 
# PCOLLECTION BE ALWAYS OWNED BY A SPECIFIC PIPELINE 
# MULTIPLE PIPELINES NO FIT SHARE PCOLLECTIONS 

In [53]:
# ELEMENT TYPE 
# PCOLLECTIONS FIT BE OF ANY TYPE 
# BUT IF WE DEY DO DISTRIBUTED PROCESSING, BEAM FOR ENCODE THE ELEMENT TYPE AS A BYTE STRING 
# SO SAY ELEMENTS FIT BE PASSED AROUND TO DISTRIBUTED WORKERS 

In [54]:
# ELEMENT SCHEMA => JSON, AVRO, DB RECORDS 
# SCHEMA DEY HELP MAKE WE GET MORE EXPRESSIVE AGGREGATES 
# WHAT BE AGGREGATES ?? 

In [55]:
# PCOLLECTIONS BE IMMUTABLE 
# YOU NOT FIT CHANGE, ADD OR REMOVE ELEMENTS IF YOU CREATE AM 
# BEAM TRANSFORM JUST DEY MODIFY EACH ELEMENT DN GENERATE NEW PIPELINE 
# NO DEY MEAN SAY E GO MODIFY THE ORIGINAL PCOLLECTION 

In [56]:
# PCOLLECTION FIT BE ANY SIZE 
# FIT BE BOUNDED OR UNBOUNDED => EITHER THE SIZE WE KNOW OR WE DONT KNOW 

In [57]:
# ELEMENT TIME STAMPS 
# EACH ELEMENT FOR PCOLLECTION GET SOME ASSOCIATED TIME STAMP 
# THIS BE ASSIGNED BY THE SOURCE WEY CREATE THE PCOLLECTION 
# SAY IF PIPELINE DEY READ TWEETS, EACH ELEMENT FIT USE THE TIME THE PERSON TWEET 

In [59]:
# TRANSFORMS 
# WHAT WE WANT TURN THE DATA INTO 
# WE GO GO OVER THIS FOR THE EXPERIMENT SECTION 
# PIPELINE PROCESS DEY LOOK LIKE THIS 
# DATABASE TABLE -> READ TRANSFORM -> (PCOLLECTION) -> TRANSFORM -> (PCOLLECTION) -> WRITE TRANSFORM -> DATABASE TABLE

In [60]:
# PCOLLECTION IS IMMUTABLE BY DEFINITION BUT 
# U FIT APPLY DIFFERENT TRANSFORMS FOR THE SAME PCOLLECTION TO CREATE A BRANCING PIPELINE LIKE SO 
# DATABASE TABLE -> READ DATABASE OF NAMES -> PARDO(EXTRACT STRINGS STARTING WITH A)-> A NAMES 
# DATABASE TABLE -> READ DATABASE OF NAMES -> PARDO(EXTRACT STRINGS STARTING WITH B)-> B NAMES

In [61]:
# MAIN CORE BEAM TRANSFORMS 
# PARDO => PROCESS ELEMENTS IN PARALLEL 
# GROUP BY KEY => GROUP ELEMENTS BY KEY 
# CO GROUP BY KEY => GROUP ELEMENTS BY KEY AND JOIN THEM 
# COMBINE => AGGREGATE ELEMENTS
# FLATTEN => EXPAND A COLLECTION OF COLLECTIONS INTO A SINGLE COLLECTION
# PARTITION => DIVIDE A COLLECTION INTO A NUMBER OF PARTITIONS 

In [None]:
# PARDO 
# FILTERING DATASET 
# TYPE CONVERTING EACH ELEMENT IN THE DATASET 
# EXTRACTING PARTS OF EACH ELEMENT IN THE DATASET 
# PERFORMING COMPUTATIONS ON EACH ELEMENT IN THE DATASET 

words = ... # input pcollection of strings 

# do function to perform on each element in the input pcollection
class ComputeLength(beam.DoFn):
    def process(self, element):
        return [len(element)] 

# apply the do function to the input pcollection 
word_length = words | beam.ParDo(ComputeLength()) 

In [62]:
# WHAT BE DO FUNCTION 
# THEN DEY DEFINE THE PIPLELINE EXTRACT PROCESSING TASKS 
# THE CODE FOR FULFIL THESE TWO REQUIREMENTS 
# MAKE SURE WE DEY FULFUL THIS BEFORE WE USE THE DO FUNCTIONS 
# FUNCTION OBJECT FOR BE SERLIALISABLE 
# FUNCTION OBJECT FOR THE THREAD-COMPATIBLE AND FOR KNOW SAY BEAM SDKs NO BE THREAD SAFE 
# ALSO MAKE THE FUNCTIONS IDEMPOTENT

In [None]:
# IF THE FUNCTIONS BE STRAIGHFOWARD 
# WE FIT PROVIDE LIGHT WEIGHT DO FUNCTIONS ONE LINERS 
word_length = words | beam.ParDo(lambda x: [len(x)]) 

In [63]:
# DO FUNCTION LIFECYCLE 
# DO DEEP INTO SERLLIALISATION AND DESERIALISATION

In [64]:
# GROUP BY KEY AND UNBOUNDED PCOLLECTIONS 


# U FOR USE NON-GLOBAL KEY WINDOWING OR AGGREGATION TRIGGER TO PERFORM GROUPBYKEY OR COGROUPBYKEY 
# UNBOUNDED COLLECTIONS 
# WINDOWING OR TRIGGERS FOR ALLOW GROUPING TO OPERATE ON LOGICAL FINITE BUNDLES OF DATA WITHIN THE 
# UNBOUNDED DATA STREAMS 

# IF WE NO USE AM E GO THROW THIS ERROR =>  IllegalStateException error 

In [65]:
# COGROUPBY KEY => JOIN KEY VALUE PAIRS WEY BE THE SAME TYPE 


In [66]:
# COMBINE => COMBINING ELEMENTS OR VALUES IN PCOLLECTIOONS 
pc = [1, 10, 100, 1000] 

def boundedsum(values, bounds=500):
    return min(sum(values), bounds) 

small_sum = pc | beam.CombineGlobally(boundedsum)
large_sum = pc | beam.CombineGlobally(boundedsum, bounds=10000)

usage: ipykernel_launcher.py [-h] [--runner RUNNER] [--streaming]
                             [--resource_hint RESOURCE_HINTS]
                             [--beam_services BEAM_SERVICES]
                             [--type_check_strictness {ALL_REQUIRED,DEFAULT_TO_ANY}]
                             [--type_check_additional TYPE_CHECK_ADDITIONAL]
                             [--no_pipeline_type_check] [--runtime_type_check]
                             [--performance_runtime_type_check]
                             [--allow_non_deterministic_key_coders]
                             [--allow_unsafe_triggers]
                             [--no_direct_runner_use_stacked_bundle]
                             [--direct_runner_bundle_repeat DIRECT_RUNNER_BUNDLE_REPEAT]
                             [--direct_num_workers DIRECT_NUM_WORKERS]
                             [--direct_running_mode {in_memory,multi_threading,multi_processing}]
                             [--direct_embed_docker_pyth

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# ADVANCED COMBINATIONS USING COMBINEFN 
# NB: NEED TO UNDERSTAND COMBINERS WELL 
# NOTE: ALL COMBINERS SHOULD HAVE A MORE SOPHISTICATED ACCUMULATOR 


# FOR MORE COMPLEX FUNCTIONS, DEFINE A SUBCLASS OF COMBINEFN 
# YOU SHOULD USE A COMBINE FN IF FUNCTION REQUIRES A MORE SOPHISTICATED ACCUMULATOR 

# GENERAL COMBINING OPERATION CONSISTS OF FOUR STEPS 
# 1. CREATE AN ACCUMULATOR 2. ADD INPUT 3. MERGE ACCUMULATORS 4. EXTRACT OUTPUT 

pc = ... 

class AverageFn(beam.CombineFn):
    def create_accumulator(self):
        return (0.0, 0) 

    def add_input(self, sum_count, input):
        (sum, count) = sum_count 
        return sum + input, count + 1 

    def merge_accumulators(self, accumulators):
        sums, counts = zip(*accumulators) 
        return sum(sums), sum(counts) 

    def extract_output(self, sum_count):
        (sum, count) = sum_count 
        return sum / count

In [None]:
# COMBINING ALL PCOLLCTIONS INTO A SINGLE VALUE 

pc = ... 

average = pc | beam.CombineGlobally(AverageFn()) 

In [68]:
# COMBINE AND GLOBAL WINDOWING ?? COME BACK TO THIS PART 
# what is global windowing in the first place ?? 
# => If your input PCollection uses the default global windowing, the default behavior is to return a
#  PCollection containing one item.

In [None]:
# COMBINE VALUES in a keyed PCOLLECTION ?? COME BACK TO THIS PART TOO  
player_accuracies = ...

averageaccuracyperplayer = (
    player_accuracies 
    | beam.CombinePerKey(beam.combiners.MeanCombineFn())
)


In [None]:
# FLATTEN => MERGE MULTIPLE PCOLLECTIONS INTO A SINGLE PCOLLECTION 