In [None]:
# %%bash

# pip install -U protobuf
# pip install -U apache_beam

In [1]:
import pandas
import time
import datetime
from google.cloud import pubsub
import json
import apache_beam as beam
import os
print(beam.__version__)

2.2.0


No handlers could be found for logger "oauth2client.contrib.multistore_file"


In [8]:
# Notebook-wide configuration used by the cells below.
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'  # strftime pattern for the source_timestamp attached to each published message
RUNNER = "Dataflow"  # runner name handed to beam.Pipeline
PROJECT = 'ksalama-gcp-playground'  # GCP project used for Pub/Sub, Dataflow and BigQuery
DATASET = 'playground_ds'  # BigQuery dataset the pipeline writes to
TABLE = 'babyweight_estimates'  # BigQuery table the pipeline writes to
STG_BUCKET = 'stagging-ksalama-gcs-cloudml'  # GCS bucket holding the Dataflow staging/temp folders
REGION = 'europe-west1'  # NOTE(review): not referenced by any cell visible here - confirm it is still needed
TOPIC = 'babyweights'  # Pub/Sub topic the data points are published to
SUBSCRIPTION='babyweights-sub'  # Pub/Sub subscription read by the pipeline and by the consumer cell

## Submit Dataflow Stream Processing Job

In [25]:
# Fully-qualified Pub/Sub resource names built from the notebook configuration.
pubsub_subscription = "projects/{}/subscriptions/{}".format(PROJECT,SUBSCRIPTION)
pubsub_topic = "projects/{}/topics/{}".format(PROJECT,TOPIC)

print(pubsub_subscription)
# Format explicitly: print(a, b) under Python 2 prints the tuple repr
# "('a', 'b')" rather than the two values.
print("{}.{}".format(DATASET, TABLE))


def estimate_weight(json_message):
    """Score one JSON message with the deployed model and return the enriched record.

    The message is parsed, its ``source_id`` and ``source_timestamp`` fields are
    set aside, and the remaining fields are sent as a single instance to the
    ``ml`` v1 ``projects().predict`` endpoint. The first returned score is
    multiplied by 0.453592 (the pound-to-kilogram factor, so the model output is
    assumed to be in pounds - TODO confirm), rounded to two decimals and stored
    under ``estimated_weight_kg``.

    Args:
        json_message: JSON object string containing the model features plus
            ``source_id`` and ``source_timestamp``.

    Returns:
        dict: the parsed features with ``estimated_weight_kg``, ``source_id``
        and ``source_timestamp`` added.

    Raises:
        KeyError: if the message lacks ``source_id`` / ``source_timestamp`` or
            the response has no ``predictions`` / ``scores`` entry.
    """
    # Constants and imports are deliberately function-local; presumably so the
    # function is self-contained when it is shipped to the Dataflow workers
    # (the pipeline does not save the main session) - keep them here.
    PROJECT='ksalama-gcp-playground'
    MODEL_NAME='babyweight_estimator'
    VERSION='v1'
    LB_TO_KG = 0.453592

    import json
    from googleapiclient import discovery
    from oauth2client.client import GoogleCredentials

    credentials = GoogleCredentials.get_application_default()
    api = discovery.build('ml', 'v1', credentials=credentials,
                discoveryServiceUrl='https://storage.googleapis.com/cloud-ml/discovery/ml_v1_discovery.json')

    instance = json.loads(json_message)
    # The tracking fields are not model features: remove them before scoring
    # and put them back on the output record afterwards.
    source_id = instance.pop('source_id')
    source_timestamp = instance.pop('source_timestamp')

    request_data = {'instances': [instance]}

    model_url = 'projects/{}/models/{}/versions/{}'.format(PROJECT, MODEL_NAME, VERSION)
    response = api.projects().predict(body=request_data, name=model_url).execute()

    estimates = [round(item["scores"], 2) for item in response["predictions"]]

    # Convert the fractional estimate directly. The previous int() cast
    # truncated it to a whole number first, discarding the fractional part
    # (e.g. 7.89 -> 7) before the conversion to kilograms.
    estimated_weight_kg = round(estimates[0] * LB_TO_KG, 2)

    instance['estimated_weight_kg'] = estimated_weight_kg
    instance['source_id'] = source_id
    instance['source_timestamp'] = source_timestamp

    return instance

# def process_events(json_message):
#     import json
#     message = json.loads(json_message)
#     message['estimated_weight_kg'] = 7
#     return message
  
def run_babyweight_estimates_streaming_pipeline():
    """Build and submit the streaming pipeline that scores Pub/Sub messages into BigQuery.

    The pipeline reads strings from ``pubsub_subscription``, maps each one
    through ``estimate_weight`` and writes the resulting records to the
    ``DATASET.TABLE`` BigQuery table in ``PROJECT``. The job name is suffixed
    with the current local time so that every submission gets a distinct name.

    The function calls ``pipeline.run()`` and returns ``None``; it does not wait
    on the run result.
    """
    job_name = 'ingest-babyweight-estimates-{}'.format(datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
    # Call form of print: same output under Python 2, valid under Python 3,
    # and consistent with the other cells of this notebook.
    print('Launching Dataflow job {}'.format(job_name))
    print('Check the Dataflow jobs on Google Cloud Console...')

    STG_DIR = 'gs://{}/babyweight'.format(STG_BUCKET)

    # GCS object paths always use '/', so build them with plain formatting
    # instead of os.path.join, whose separator depends on the local OS.
    options = {
        'staging_location': '{}/tmp/staging'.format(STG_DIR),
        'temp_location': '{}/tmp'.format(STG_DIR),
        'job_name': job_name,
        'project': PROJECT,
        'streaming': True,
        'teardown_policy': 'TEARDOWN_ALWAYS',
        # NOTE(review): passed through unchanged; confirm the installed Beam
        # version recognizes this key.
        'no_save_main_session': True
      }

    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    pipeline = beam.Pipeline(RUNNER, options=opts)

    (
      pipeline | 'Read data from PubSub' >> beam.io.ReadStringsFromPubSub(subscription=pubsub_subscription)
               | 'Process message' >> beam.Map(estimate_weight)
               | 'Write to BigQuery' >> beam.io.WriteToBigQuery(project=PROJECT, dataset=DATASET, table=TABLE)
    )

    pipeline.run()

projects/ksalama-gcp-playground/subscriptions/babyweights-sub
('playground_ds', 'babyweight_estimates')


In [26]:
# Submit the streaming job. The function calls pipeline.run() without waiting
# on the result; follow the job's progress in the Cloud Console.
run_babyweight_estimates_streaming_pipeline()

Launching Dataflow job ingest-babyweight-estimates-180210-000925
Check the Dataflow jobs on Google Cloud Console...


In [27]:
# Sample feature records used as the payloads published to Pub/Sub.
# source_id and source_timestamp are not part of these records; they are added
# at publish time.
# Boolean-like features are given as the strings 'True'/'False' - presumably
# the form the deployed model expects; verify against the model's input schema.
instances =  [
      {
        'is_male': 'True',
        'mother_age': 26.0,
        'mother_race': 'Asian Indian',
        'plurality': 1.0,
        'gestation_weeks': 39,
        'mother_married': 'True',
        'cigarette_use': 'False',
        'alcohol_use': 'False'
      },
      {
        'is_male': 'False',
        'mother_age': 29.0,
        'mother_race': 'Asian Indian',
        'plurality': 1.0,
        'gestation_weeks': 38,
        'mother_married': 'True',
        'cigarette_use': 'False',
        'alcohol_use': 'False'
      },
      {
        'is_male': 'True',
        'mother_age': 26.0,
        'mother_race': 'White',
        'plurality': 1.0,
        'gestation_weeks': 39,
        'mother_married': 'True',
        'cigarette_use': 'False',
        'alcohol_use': 'False'
      },
      {
        'is_male': 'True',
        'mother_age': 26.0,
        'mother_race': 'White',
        'plurality': 2.0,
        'gestation_weeks': 37,
        'mother_married': 'True',
        'cigarette_use': 'False',
        'alcohol_use': 'True'
      }
  ]

## Send Data Points to Pub/Sub

In [28]:
from random import shuffle

# Publish `iterations` batches of the sample instances to the Pub/Sub topic,
# pausing `sleep_time` seconds between batches.
iterations = 100
sleep_time = 1

client = pubsub.Client()
topic = client.topic(TOPIC)

if not topic.exists():
    print ('Creating pub/sub topic {}...'.format(TOPIC))
    topic.create()

print ('Pub/sub topic {} is up and running'.format(TOPIC))
print("")

for i in range(iterations):

    # Shuffle a copy so the shared `instances` list keeps its original order
    # and the cell behaves the same way every time it is re-run.
    batch = list(instances)
    shuffle(batch)

    # Stays None when there is nothing to send, so the summary print below
    # cannot hit an unbound name.
    message = None

    for data_point in batch:

        source_timestamp = datetime.datetime.now().strftime(TIME_FORMAT)
        # The batch index is mixed into the hash input so the same data point
        # gets a different input in every batch, even if two batches fall
        # within the same second.
        source_id = str(abs(hash(str(data_point) + str(source_timestamp) + str(i))) % (10 ** 10))

        # Build a fresh payload rather than writing the tracking fields into
        # the shared sample record, which would leak earlier ids/timestamps
        # into later messages' hash inputs.
        payload = dict(data_point)
        payload['source_id'] = source_id
        payload['source_timestamp'] = source_timestamp

        message = json.dumps(payload)
        topic.publish(message=message, source_id=source_id, source_timestamp=source_timestamp)

    print("Batch {} was sent. Last Message was: {}".format(i, message))
    print("")

    time.sleep(sleep_time)

print("Done!")

Pub/sub topic babyweights is up and running

Batch 0 was sent. Last Message was: {"mother_race": "White", "cigarette_use": "False", "source_timestamp": "2018-02-10 00:09:44", "plurality": 1.0, "gestation_weeks": 39, "mother_married": "True", "is_male": "True", "source_id": "1539574372", "mother_age": 26.0, "alcohol_use": "False"}

Batch 1 was sent. Last Message was: {"mother_race": "Asian Indian", "cigarette_use": "False", "source_timestamp": "2018-02-10 00:09:46", "plurality": 1.0, "gestation_weeks": 39, "mother_married": "True", "is_male": "True", "source_id": "3937756422", "mother_age": 26.0, "alcohol_use": "False"}

Batch 2 was sent. Last Message was: {"mother_race": "White", "cigarette_use": "False", "source_timestamp": "2018-02-10 00:09:47", "plurality": 2.0, "gestation_weeks": 37, "mother_married": "True", "is_male": "True", "source_id": "4943153006", "mother_age": 26.0, "alcohol_use": "True"}

Batch 3 was sent. Last Message was: {"mother_race": "Asian Indian", "cigarette_use": 

Batch 28 was sent. Last Message was: {"mother_race": "Asian Indian", "cigarette_use": "False", "source_timestamp": "2018-02-10 00:10:21", "plurality": 1.0, "gestation_weeks": 38, "mother_married": "True", "is_male": "False", "source_id": "4723452901", "mother_age": 29.0, "alcohol_use": "False"}

Batch 29 was sent. Last Message was: {"mother_race": "Asian Indian", "cigarette_use": "False", "source_timestamp": "2018-02-10 00:10:22", "plurality": 1.0, "gestation_weeks": 39, "mother_married": "True", "is_male": "True", "source_id": "421366883", "mother_age": 26.0, "alcohol_use": "False"}

Batch 30 was sent. Last Message was: {"mother_race": "White", "cigarette_use": "False", "source_timestamp": "2018-02-10 00:10:24", "plurality": 1.0, "gestation_weeks": 39, "mother_married": "True", "is_male": "True", "source_id": "2729298372", "mother_age": 26.0, "alcohol_use": "False"}

Batch 31 was sent. Last Message was: {"mother_race": "White", "cigarette_use": "False", "source_timestamp": "2018-02-10

Batch 57 was sent. Last Message was: {"mother_race": "Asian Indian", "cigarette_use": "False", "source_timestamp": "2018-02-10 00:10:59", "plurality": 1.0, "gestation_weeks": 38, "mother_married": "True", "is_male": "False", "source_id": "653479650", "mother_age": 29.0, "alcohol_use": "False"}

Batch 58 was sent. Last Message was: {"mother_race": "Asian Indian", "cigarette_use": "False", "source_timestamp": "2018-02-10 00:11:00", "plurality": 1.0, "gestation_weeks": 38, "mother_married": "True", "is_male": "False", "source_id": "4281173090", "mother_age": 29.0, "alcohol_use": "False"}

Batch 59 was sent. Last Message was: {"mother_race": "White", "cigarette_use": "False", "source_timestamp": "2018-02-10 00:11:02", "plurality": 1.0, "gestation_weeks": 39, "mother_married": "True", "is_male": "True", "source_id": "1864781930", "mother_age": 26.0, "alcohol_use": "False"}

Batch 60 was sent. Last Message was: {"mother_race": "Asian Indian", "cigarette_use": "False", "source_timestamp": "20

Batch 85 was sent. Last Message was: {"mother_race": "White", "cigarette_use": "False", "source_timestamp": "2018-02-10 00:11:36", "plurality": 1.0, "gestation_weeks": 39, "mother_married": "True", "is_male": "True", "source_id": "4241160443", "mother_age": 26.0, "alcohol_use": "False"}

Batch 86 was sent. Last Message was: {"mother_race": "Asian Indian", "cigarette_use": "False", "source_timestamp": "2018-02-10 00:11:37", "plurality": 1.0, "gestation_weeks": 38, "mother_married": "True", "is_male": "False", "source_id": "8573732661", "mother_age": 29.0, "alcohol_use": "False"}

Batch 87 was sent. Last Message was: {"mother_race": "Asian Indian", "cigarette_use": "False", "source_timestamp": "2018-02-10 00:11:39", "plurality": 1.0, "gestation_weeks": 38, "mother_married": "True", "is_male": "False", "source_id": "5709891883", "mother_age": 29.0, "alcohol_use": "False"}

Batch 88 was sent. Last Message was: {"mother_race": "Asian Indian", "cigarette_use": "False", "source_timestamp": "2

## Consume the Pub/Sub Topic

In [None]:
# Pull from the subscription and show the first received message.
# NOTE(review): this is the same subscription the Dataflow pipeline reads
# from, and the pulled message is not acknowledged here - confirm that this
# peek does not interfere with the running job.
client = pubsub.Client()
topic = client.topic(TOPIC)
subscription = topic.subscription(SUBSCRIPTION)
message = subscription.pull()

if message:
    # Each pulled entry is indexed as (ack id, message object).
    ack_id, received = message[0]
    # Format explicitly: print("label", value) under Python 2 prints a tuple
    # repr instead of "label: value".
    print("source_id: {}".format(received.attributes["source_id"]))
    print("source_timestamp: {}".format(received.attributes["source_timestamp"]))
    print("")
    print(received.data)
else:
    # An empty pull previously raised IndexError on message[0].
    print("No messages were returned by the pull.")