In [1]:
!pip freeze | grep apache-beam==2.10.0 || pip install apache-beam[gcp]

Collecting apache-beam[gcp]
  Downloading https://files.pythonhosted.org/packages/cf/d5/bd533f864219da7251be3497e622f45f1ac56b1ee2eaa1b601b6b222044f/apache_beam-2.11.0-cp27-cp27mu-manylinux1_x86_64.whl (2.5MB)
[K    100% |████████████████████████████████| 2.5MB 566kB/s 
[?25hCollecting avro<2.0.0,>=1.8.1; python_version < "3.0" (from apache-beam[gcp])
  Downloading https://files.pythonhosted.org/packages/eb/27/143f124a7498f841317a92ced877150c5cb8d28a4109ec39666485925d00/avro-1.8.2.tar.gz (43kB)
[K    100% |████████████████████████████████| 51kB 12.4MB/s 
[?25hCollecting pyyaml<4.0.0,>=3.12 (from apache-beam[gcp])
  Downloading https://files.pythonhosted.org/packages/9e/a3/1d13970c3f36777c583f136c136f804d70f500168edc1edea6daa7200769/PyYAML-3.13.tar.gz (270kB)
[K    100% |████████████████████████████████| 276kB 5.1MB/s 
[?25hCollecting protobuf<4,>=3.5.0.post1 (from apache-beam[gcp])
  Downloading https://files.pythonhosted.org/packages/ea/72/5eadea03b06ca1320be2433ef2236155da17806

In [2]:
!pip freeze | grep tensorflow==1.12.0 || pip install tensorflow==1.12.0

Collecting tensorflow==1.12.0
  Downloading https://files.pythonhosted.org/packages/bd/68/ec26b2cb070a5760707ec8d9491a24e5be72f4885f265bb04abf70c0f9f1/tensorflow-1.12.0-cp27-cp27mu-manylinux1_x86_64.whl (83.1MB)
[K    100% |████████████████████████████████| 83.1MB 15kB/s 
[?25hCollecting keras-applications>=1.0.6 (from tensorflow==1.12.0)
  Downloading https://files.pythonhosted.org/packages/90/85/64c82949765cfb246bbdaf5aca2d55f400f792655927a017710a78445def/Keras_Applications-1.0.7-py2.py3-none-any.whl (51kB)
[K    100% |████████████████████████████████| 61kB 14.7MB/s 
[?25hCollecting mock>=2.0.0 (from tensorflow==1.12.0)
  Using cached https://files.pythonhosted.org/packages/e6/35/f187bdf23be87092bd0f1200d43d23076cee4d0dec109f195173fd3ebc79/mock-2.0.0-py2.py3-none-any.whl
Collecting grpcio>=1.8.6 (from tensorflow==1.12.0)
  Using cached https://files.pythonhosted.org/packages/b8/be/3bb6d8241b5ed1f8437169df53e7dd6ca986174e022585de15087a848c99/grpcio-1.19.0-cp27-cp27mu-manylinux1_x8

In [3]:
!pip freeze > requirements.txt

In [4]:
import apache_beam as beam
import datetime
import os

ImportError: No module named 'apache_beam'

In [None]:
PROJECT = 'qwiklabs-gcp-3f19cbba7aa3ae63'
os.environ['PROJECT'] = PROJECT
os.environ['CLOUDSDK_PYTHON'] = 'python3'

In [None]:
%%bash
gcloud config set project $PROJECT

In [None]:
query = """
#standardSQL
SELECT DISTINCT product_id, image_url FROM `qwiklabs-gcp-3f19cbba7aa3ae63.project.raw`
"""

In [None]:
# This will take the image url and product id, and fetch the actual image
class TransformImages(beam.DoFn):
  def process(self, element):
    import requests
    from PIL import Image
    import logging
    import io
    
    uri = element['image_url']
    id = element['product_id']
    
    # some urls are missing the "http:" part
    if uri[:2] == '//':
        uri = 'http:' + uri
    
    try:
      res = requests.get(url=uri)
      image_bytes = res.content
      img = Image.open(io.BytesIO(image_bytes)).convert('RGB')
    
      # if you want to resize the image, uncomment the next line
      #img = img.resize((250, 250), Image.ANTIALIAS)
    except Exception as e:
      logging.exception('Error processing image %s: %s', uri, str(e))
      return

    output = io.BytesIO()
    img.save(output, 'jpeg')
    image_bytes = output.getvalue()
    yield id, image_bytes
    
# This will write the image bytes to GCS
class WriteToStorage(beam.DoFn):
  def process(self, element):
    from apache_beam.io import filesystems
    (name, image_bytes) = element
    path = 'gs://project-sample/dataset1/{}.jpeg'.format(name)
    writer = filesystems.FileSystems.create(path)
    writer.write(image_bytes)
    writer.close()


def preprocess(runner):
  job_name = 'test-preprocess-images' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')
  print('Launching Dataflow job {} ... hang on'.format(job_name))

  options = {
    'staging_location': 'gs://project-sample/out/tmp/staging',
    'temp_location': 'gs://project-sample/out/tmp',
    'job_name': job_name,
    'requirements_file': 'requirements.txt',
    'project': PROJECT,
    'runner': runner,
  }
  
  #instantiate PipelineOptions object using options dictionary
  opts = beam.pipeline.PipelineOptions(flags=[], **options)

  #instantantiate Pipeline object using PipelineOptions
  with beam.Pipeline(options=opts) as p:
    (
      p | 'read' >> beam.io.Read(beam.io.BigQuerySource(query=query, use_standard_sql=True))
        | 'transform' >> beam.ParDo(TransformImages())
        | 'write' >> beam.ParDo(WriteToStorage())
    )
  print("Done")