## Scaling training data

To train an ML model on a large dataset, the dataset needs to be sharded into CSV files and stored on Google Cloud Storage (GCS).

Now we are going to export the data (1.5M training records and 500k validation records) from BigQuery to GCS using Dataflow.

In [None]:
import os
import time
import tensorflow as tf
import apache_beam as beam
import shutil
# Print the version of the imported Apache Beam package.
print(beam.__version__)

## Set environment variables

In [None]:
# Runner name handed to beam.Pipeline() in run_pipeline.
RUNNER = 'Dataflow'
# Google Cloud project passed to the pipeline options as 'project'.
PROJECT = "ksalama-gcp-playground"
# GCS bucket under which the exported files and temp/staging data are written.
BUCKET = "ksalama-gcs-cloudml"
# Cloud region; not referenced by the code visible in this notebook section.
REGION = "europe-west1"

## Prepare BigQuery query

In [None]:
# Row count that run_pipeline divides by the requested sample size to derive
# the sampling modulus (@STEP_SIZE).
# NOTE(review): presumably the number of rows matching the filters in the
# query below (without the MOD sampling clause) -- TODO confirm.
VALID_ROW_COUNT = 75280784

# Query template for the export. The backslash at the end of each line is a
# line continuation inside the string literal, so the query is one long
# single-line string. @STEP_SIZE and @PHASE are plain-text placeholders that
# run_pipeline substitutes with str.replace (they are not BigQuery query
# parameters). The final MOD(...) clause keeps only rows whose fingerprint of
# pickup_datetime falls in the selected bucket.
sql = "\
SELECT \
    pickup_datetime, \
    EXTRACT(DAYOFWEEK FROM pickup_datetime) AS pickup_dayofweek, \
    EXTRACT(HOUR FROM pickup_datetime) AS pickup_hour, \
    pickup_longitude, \
    pickup_latitude, \
    dropoff_longitude, \
    dropoff_latitude, \
    passenger_count, \
    tolls_amount + fare_amount AS fare_amount \
FROM `nyc-tlc.yellow.trips` \
WHERE \
    trip_distance > 0 \
AND fare_amount >= 2.5 \
AND pickup_longitude  > -78 \
AND pickup_longitude  < -70 \
AND dropoff_longitude  > -78 \
AND dropoff_longitude  < -70 \
AND pickup_latitude  > 37 \
AND pickup_latitude < 45 \
AND dropoff_latitude  > 37 \
AND dropoff_latitude  < 45 \
AND passenger_count  > 0 \
AND EXTRACT(YEAR FROM pickup_datetime) = 2015 \
AND MOD(ABS(FARM_FINGERPRINT(STRING(pickup_datetime))),@STEP_SIZE) = @PHASE \
"

## Define Dataflow pipeline logic to export data from BigQuery to Google Cloud Storage

In [None]:
def to_csv(row_dictionary):
    """Format one query result row as a comma-separated line.

    The numeric ``pickup_dayofweek`` value is replaced by a three-letter day
    name (1 -> 'Sun', ..., 7 -> 'Sat'); every other column is rendered with
    ``str()``. Columns are emitted in the fixed order given by ``HEADER``.
    The input dictionary is not modified.

    Args:
        row_dictionary: mapping from column name to value for a single row;
            it must contain every column listed in ``HEADER``.

    Returns:
        A string with the column values joined by commas (no header line,
        no quoting or escaping).

    Raises:
        KeyError: if a column listed in ``HEADER`` is missing from the row.
        IndexError: if ``pickup_dayofweek`` is outside the lookup table.
    """
    # The lookup tables are deliberately local to the function.
    # NOTE(review): presumably so the function stays self-contained when it is
    # shipped to remote workers -- keep them here rather than at module level.
    days = ['null', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']

    HEADER = ['pickup_datetime',
              'pickup_dayofweek',
              'pickup_hour',
              'pickup_longitude',
              'pickup_latitude',
              'dropoff_longitude',
              'dropoff_latitude',
              'passenger_count',
              'fare_amount']

    # Resolve the day name once instead of deep-copying the whole row just to
    # overwrite a single field; this function runs once per exported record.
    day_name = days[row_dictionary['pickup_dayofweek']]

    return ','.join(
        day_name if column == 'pickup_dayofweek' else str(row_dictionary[column])
        for column in HEADER
    )

def run_pipeline(phase,sample_size):
    """Launch a Beam job that exports a sample of the taxi data to GCS as CSV.

    The module-level ``sql`` template is specialised by replacing
    ``@STEP_SIZE`` with ``int(VALID_ROW_COUNT / sample_size) - 1`` and
    ``@PHASE`` with ``phase``. The query results are converted to CSV lines
    with ``to_csv`` and written under
    ``gs://<BUCKET>/data/nyc-taxifare/big/`` with the prefix
    ``training.csv`` (when ``phase == 2``) or ``validation.csv`` (any other
    phase).

    NOTE(review): calls with different sample sizes use different moduli, so
    the rows selected for two phases are not guaranteed to be disjoint.

    Args:
        phase: bucket index selected by the MOD clause of the query; the
            value 2 labels the export "training", anything else
            "validation".
        sample_size: approximate number of rows wanted; used only to derive
            the sampling modulus.

    Raises:
        ValueError: if ``sample_size`` is not positive, or if ``phase`` is not
            in ``[0, step_size)`` -- in that case the MOD clause would divide
            by zero or could never match a row.
    """
    # Local import: `datetime` is not imported by the notebook's import cell,
    # so the original raised NameError when building the job name.
    import datetime

    phase_string = "training" if phase == 2 else "validation"
    print("Phase:{}".format(phase_string))
    print("Sample Size:{}".format(sample_size))

    # Validate before launching anything so a bad argument fails fast locally
    # instead of producing a failed or empty remote job.
    if sample_size <= 0:
        raise ValueError(
            "sample_size must be positive, got {}".format(sample_size))

    step_size = int(VALID_ROW_COUNT / sample_size) - 1
    if not 0 <= phase < step_size:
        raise ValueError(
            "phase {} is outside [0, {}) for sample_size {}".format(
                phase, step_size, sample_size))

    job_name = 'export-taxi-{}-data-{}'.format(
        phase_string, datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
    # Single-argument call form prints identically on Python 2 and Python 3.
    print('Launching Dataflow job {}'.format(job_name))
    print('Check the Dataflow jobs on Google Cloud Console...')

    OUTPUT_DIR = 'gs://{0}/data/nyc-taxifare/big'.format(BUCKET)

    options = {
        'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
        'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
        'job_name': job_name,
        'project': PROJECT,
        'teardown_policy': 'TEARDOWN_ALWAYS',
        'no_save_main_session': True
    }

    opts = beam.pipeline.PipelineOptions(flags=[], **options)
    p = beam.Pipeline(RUNNER, options=opts)

    # Placeholders are substituted textually; both values are numbers
    # computed or passed in above.
    query = sql.replace('@STEP_SIZE', str(step_size)).replace('@PHASE', str(phase))

    sink = os.path.join(OUTPUT_DIR, '{}.csv'.format(phase_string))

    (
      p | 'Read data from BigQuery' >> beam.io.Read(beam.io.BigQuerySource(query=query, use_standard_sql=True))
        | 'Convert to csv' >> beam.Map(to_csv) # map, filter, group, windowing, etc.
        | 'Write csv file to GCS' >> beam.io.Write(beam.io.WriteToText(sink))
    )

    # The result of run() is not waited on here.
    p.run()

## Run Dataflow pipelines to process data to GCS 

In [None]:
# Requested approximate number of rows for each export.
train_size = 1500000
valid_size = 500000

# Phase 2 is labelled "training" by run_pipeline; any other phase (3 here)
# is labelled "validation".
run_pipeline(phase=2, sample_size=train_size)
run_pipeline(phase=3, sample_size=valid_size)
