In [None]:
!pip freeze | grep apache-beam==2.10.0 || pip install apache-beam[gcp]

In [None]:
# Install TensorFlow 1.12.0 only when `pip freeze` does not already list that
# exact version.
!pip freeze | grep tensorflow==1.12.0 || pip install tensorflow==1.12.0

In [None]:
# Write the installed Pillow line(s) from `pip freeze` into requirements.txt;
# preprocess() hands this file to the pipeline as its `requirements_file`.
# NOTE(review): if Pillow is not installed, grep matches nothing and the file
# is left empty -- confirm Pillow is present in this environment.
!pip freeze | grep Pillow > requirements.txt

In [None]:
import apache_beam as beam
import datetime
import os

In [None]:
# GCP project used by the pipeline. It is also exported to the environment so
# that shell cells can read $PROJECT; CLOUDSDK_PYTHON is exported alongside it.
PROJECT = 'qwiklabs-gcp-3f19cbba7aa3ae63'
os.environ.update(PROJECT=PROJECT, CLOUDSDK_PYTHON='python3')

In [None]:
%%bash
# Make the project exported as $PROJECT (set from Python via os.environ) the
# active project for the gcloud CLI.
gcloud config set project $PROJECT

In [None]:
# BigQuery standard-SQL query selecting each distinct (product_id, image_url)
# pair from the raw table; used as the pipeline's source in preprocess().
query = (
    "\n"
    "#standardSQL\n"
    "SELECT DISTINCT product_id, image_url "
    "FROM `qwiklabs-gcp-3f19cbba7aa3ae63.project.raw`\n"
)

In [None]:
# This will take the image url and product id, and fetch the actual image
class TransformImages(beam.DoFn):
  """Downloads a product image and re-encodes it as JPEG.

  Input elements are dict-like rows with the keys ``image_url`` and
  ``product_id``. For every image that can be fetched and decoded, a
  ``(product_id, jpeg_bytes)`` tuple is emitted. Rows whose image cannot be
  retrieved or decoded are logged and dropped, so one bad URL does not fail
  the whole pipeline.
  """

  # Upper bound (seconds) for a single HTTP request. Without a timeout,
  # requests waits indefinitely on an unresponsive host and stalls the worker.
  REQUEST_TIMEOUT_SECONDS = 30

  def process(self, element):
    """Fetches one image and yields ``(product_id, jpeg_bytes)``.

    Args:
      element: row providing ``image_url`` and ``product_id``.

    Yields:
      A single ``(product_id, bytes)`` tuple, or nothing if the URL is
      missing or the image could not be downloaded or decoded.
    """
    # Imports are local to process(), as in the original cell.
    import requests
    from PIL import Image
    import logging
    import io

    uri = element['image_url']
    product_id = element['product_id']

    # A NULL/empty url cannot be fetched; skip it instead of raising outside
    # the try block below, which would fail the bundle.
    if not uri:
      logging.warning('Skipping product %s: no image url', product_id)
      return

    # some urls are missing the "http:" part
    if uri.startswith('//'):
      uri = 'http:' + uri

    try:
      res = requests.get(url=uri, timeout=self.REQUEST_TIMEOUT_SECONDS)
      # Surface HTTP errors (404, 500, ...) explicitly rather than trying to
      # decode an error page as an image.
      res.raise_for_status()
      img = Image.open(io.BytesIO(res.content)).convert('RGB')

      # if you want to resize the image, uncomment the next line
      #img = img.resize((250, 250), Image.ANTIALIAS)
    except Exception as e:
      logging.exception('Error processing image %s: %s', uri, str(e))
      return

    # Re-encode as JPEG in memory; convert('RGB') above guarantees a mode the
    # JPEG encoder accepts.
    output = io.BytesIO()
    img.save(output, 'jpeg')
    yield product_id, output.getvalue()
    
# This will write the image bytes to GCS
class WriteToStorage(beam.DoFn):
  """Writes each ``(name, image_bytes)`` element to a ``.jpeg`` object on GCS.

  The destination is ``gs://project-sample/dataset1/<name>.jpeg``. Nothing is
  emitted downstream.
  """

  def process(self, element):
    """Writes one image.

    Args:
      element: ``(name, image_bytes)`` tuple; ``name`` becomes the object's
        base name and ``image_bytes`` its content.
    """
    from apache_beam.io import filesystems
    (name, image_bytes) = element
    path = 'gs://project-sample/dataset1/{}.jpeg'.format(name)
    # Declare the real content type; the default is application/octet-stream.
    writer = filesystems.FileSystems.create(path, mime_type='image/jpeg')
    try:
      writer.write(image_bytes)
    finally:
      # Always release the writer, even if write() raises.
      writer.close()


def preprocess(runner):
  """Builds and runs the image-preprocessing Beam pipeline.

  The pipeline reads (product_id, image_url) rows from BigQuery using the
  module-level `query`, fetches each image with TransformImages and stores
  the result with WriteToStorage.

  Args:
    runner: name of the Beam runner to use, e.g. "DirectRunner" or
      "DataflowRunner".
  """
  timestamp = datetime.datetime.now().strftime('%y%m%d-%H%M%S')
  job_name = '-'.join(['test-preprocess-images', timestamp])
  print('Launching Dataflow job {} ... hang on'.format(job_name))

  # Pipeline options are passed directly as keyword arguments.
  pipeline_options = beam.pipeline.PipelineOptions(
    flags=[],
    staging_location='gs://project-sample/out/tmp/staging',
    temp_location='gs://project-sample/out/tmp',
    job_name=job_name,
    requirements_file='requirements.txt',
    project=PROJECT,
    runner=runner,
  )

  bigquery_rows = beam.io.BigQuerySource(query=query, use_standard_sql=True)

  # The pipeline is executed when the `with` block exits.
  with beam.Pipeline(options=pipeline_options) as pipeline:
    rows = pipeline | 'read' >> beam.io.Read(bigquery_rows)
    images = rows | 'transform' >> beam.ParDo(TransformImages())
    images | 'write' >> beam.ParDo(WriteToStorage())
  print("Done")

In [None]:
# Run the pipeline with Beam's DirectRunner.
preprocess(runner="DirectRunner")

In [None]:
# Run the same pipeline with the DataflowRunner.
preprocess(runner="DataflowRunner")