In [None]:
curl -s -X POST -H "Content-Type: application/json" --data-binary @request.json \
"https://speech.googleapis.com/v1/speech:recognize?key=${API_KEY}"

In [None]:
# start dataflow job
python dataflow_python_examples/data_transformation.py \
  --project=$PROJECT \
  --region=us-east5 \
  --runner=DataflowRunner \
  --staging_location=gs://$PROJECT/test \
  --temp_location gs://$PROJECT/test \
  --input gs://$PROJECT/data_files/head_usa_names.csv \
  --save_main_session

In [3]:
# !pip install "apache-beam[interactive]"

In [5]:
import apache_beam as beam
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.options.pipeline_options import PipelineOptions

In [208]:
# customizing pipeline options
class Myoptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument(
            '--input', 
            help='Input for the pipeline', 
            default='gs://dataflow-samples/shakespeare/kinglear.txt')
        
        parser.add_argument(
            '--output',
            help='Output for the pipeline',
            default='gs://$PROJECT/output/')

# beam.Row and infering schema

In [14]:
import typing

class Transaction(typing.NamedTuple):
    bank: str
    purchase_amount: float

p = beam.Pipeline()
output = (
          p
          | beam.Create([{"bank": "Wells Fargo", "purchase_amount": 103.30}])
          | beam.Map(lambda item : beam.Row(bank=item["bank"], purchase_amount = item["purchase_amount"])).with_output_types(Transaction)
          | beam.Map(print)
        )

p.run()

Row(bank='Wells Fargo', purchase_amount=103.3)


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7fe212ecaeb0>

# beam.Select and infering schema

In [15]:
p = beam.Pipeline()

output = (
          p
          | beam.Create([{"bank": "Wells Fargo", "purchase_amount": 103.30}])
          | beam.Select(bank = lambda item: item["bank"], purchase_amount = lambda item: item["purchase_amount"]).with_output_types(Transaction)
          | beam.Map(print)
        )

p.run()

Row(bank='Wells Fargo', purchase_amount=103.3)


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7fe212b5e7f0>

# creating a DoFn

In [367]:
class ComputeWordLength(beam.DoFn):
    def process(self, element):
        return [len(element)]
    
p = beam.Pipeline()
# creating Pcollection from data in memory
words = beam.Create('who is the man from the moon'.split())
wordlengths = p | words | beam.ParDo(ComputeWordLength()) | beam.Map(print)
p.run()

3
2
3
3
4
3
4


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7fa29434e070>

# passing side inputs to ParDo

In [421]:
text = """
Lorem ipsum dolor sit amet consectetur adipisicing elit. Quisquam vitae unde voluptatibus dolores perspiciatis, quis amet eveniet aperiam atque placeat laborum? Consequuntur illo accusamus, praesentium doloremque eaque recusandae earum perspiciatis!."""

p = beam.Pipeline()

words = p | beam.Create(text.split())

def filter_using_length(word, lower_bound, upper_bound=float('inf')):
    if lower_bound <= len(word) <= upper_bound:
        yield word

average_word_len = (
    words
    | beam.Map(len)
    | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

small_words = words | "small words" >> beam.FlatMap(filter_using_length, lower_bound=1, upper_bound=3)
larger_words = words | "large words" >> beam.FlatMap(filter_using_length, lower_bound= beam.pvalue.AsSingleton(average_word_len))
p.run()

# tagging multiple outputs

In [436]:
p = beam.Pipeline()
numbers = p | beam.Create([1,2,3,4,5,6,7,8,9,10])

def even_odd_filter(x):
    yield beam.pvalue.TaggedOutput('odd' if x % 2 else 'even', x)
    if x % 10 == 0:
        yield x

results = numbers | beam.FlatMap(even_odd_filter).with_outputs()#('odd', 'even')
p.run()

<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7fa297107310>

# creating composite transforms

In [441]:
class ComputeWordLength(beam.PTransform):
    def expand(self, pcoll):
        return pcoll | beam.Map(lambda x: len(x))
    
    p = beam.Pipeline()

# Creating a CombineFn

In [369]:
class ComputeAverag(beam.CombineFn):
    def create_accumulator(self):
        return (0.0, 0)
    
    def add_input(self, sum_count, input):
        (sum, count) = sum_count
        return sum + input, count + 1
    
    def merge_accumulators(self, accumulators):
        sums, counts = zip(*accumulators)
        return sum(sums), sum(counts)
    
    def extract_output(self, sum_count):
        (sum, count) = sum_count
        return sum/count if count else float('NaN')

p = beam.Pipeline()
# creating Pcollection from data in memory
pcoll = beam.Create([1, 2, 3, 4, 5])
average = p | pcoll | beam.CombineGlobally(ComputeAverag()) | beam.Map(print)
p.run()

15


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7fa2c2c13220>

In [259]:
p = beam.Pipeline()

emails_list = [
    ('amy', 'amy@example.com'),
    ('carl', 'carl@example.com'),
    ('julia', 'julia@example.com'),
    ('carl', 'carl@email.com'),
]
phones_list = [
    ('amy', '111-222-3333'),
    ('james', '222-333-4444'),
    ('amy', '333-444-5555'),
    ('carl', '444-555-6666'),
]

emails = p | 'CreateEmails' >> beam.Create(emails_list)
phones = p | 'CreatePhones' >> beam.Create(phones_list)

results = ({'emails': emails, 'phones': phones} | beam.CoGroupByKey()) #| beam.Map(print)

In [296]:
c.kv.get("106")

('1', None)

In [256]:
p.run()

<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7fa2ac0de5b0>

In [487]:
input_data = 'data/IBM.csv'
pipeline = beam.Pipeline()

outputs = (
    pipeline
    | 'ReadData' >> beam.io.ReadFromText(input_data, skip_header_lines=1)
    | 'SplitData' >> beam.Map(lambda x: x.split(','))
    | 'FilterData' >> beam.Filter(lambda x: x[1] == 'Yes')
    | 'flatmap' >> beam.FlatMap(lambda x: x)
    | 'pair words with 1' >> beam.Map(lambda x: (x, 1))
    | 'group and sum' >> beam.CombinePerKey(sum)
    | 'print' >> beam.Map(print)
)
pipeline.run()

('41', 6)
('Yes', 237)
('Sales', 92)
('1', 377)
('2', 262)
('Life Sciences', 89)
('4', 242)
('Single', 120)
('5993', 1)
('8', 25)
('6', 32)
('37', 6)
('Research & Development', 133)
('Other', 11)
('3', 411)
('2090', 1)
('0', 39)
('28', 16)
('24', 20)
('2028', 1)
('5', 52)
('36', 6)
('9', 38)
('3407', 1)
('7', 39)
('34', 9)
('Medical', 63)
('2960', 1)
('32', 12)
('16', 8)
('3919', 1)
('10', 29)
('39', 6)
('Technical Degree', 32)
('Married', 84)
('2086', 1)
('2293', 2)
('50', 5)
('Marketing', 35)
('2683', 1)
('26', 15)
('25', 12)
('12', 6)
('19545', 1)
('22', 12)
('48', 2)
('5381', 1)
('3441', 1)
('3388', 1)
('46', 4)
('9619', 1)
('Human Resources', 19)
('Divorced', 33)
('2073', 1)
('20', 11)
('2926', 1)
('5744', 1)
('6074', 1)
('56', 3)
('14', 6)
('4963', 1)
('31', 19)
('6172', 1)
('58', 5)
('23', 10)
('10312', 1)
('40', 6)
('19', 10)
('1675', 1)
('4559', 1)
('51', 2)
('10650', 1)
('4200', 1)
('2325', 1)
('1102', 1)
('3140', 1)
('35', 10)
('5916', 1)
('38', 2)
('29', 23)
('6673', 1)
('2

<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7fa2c124c370>

In [358]:
ib.show_graph(p)

/Users/hardey/opt/anaconda3/envs/bigdata/bin/dot


In [348]:
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
p = beam.Pipeline(InteractiveRunner())

In [357]:
pc = p | beam.Create([1, 10, 100, 1000])

def bounded_sum(values, bound=500):
  return min(sum(values), bound)

small_sum = pc | "small sum" >> beam.CombineGlobally(bounded_sum) | beam.Map(print)  # [500]
large_sum = pc | "large sum" >> beam.Map(lambda x : x**2) | beam.Map(print)

RuntimeError: A transform with label "[357]: Map(print)" already exists in the pipeline. To apply a transform with a specified label write pvalue | "label" >> transform

In [356]:
large_sum.element

AttributeError: 'PCollection' object has no attribute 'element'