In [1]:
import json
import os
import tensorflow as tf

from absl import app
from absl import flags
from absl import logging

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow_io import bigquery as tfio_bq

In [2]:
class FLAGS:
    epochs=1
    units=32
    per_replica_batch_size=128
    dropout_ratio=0.5
    training_table='jk-mlops-dev.chicago_taxi_ml.training'
    validation_table='jk-mlops-dev.chicago_taxi_ml.training'
    testing_table='jk-mlops-dev.chicago_taxi_ml.testing'

In [3]:
LOCAL_MODEL_DIR = '/tmp/saved_model'
LOCAL_TB_DIR = '/tmp/logs'
LOCAL_CHECKPOINT_DIR = '/tmp/checkpoints'
EVALUATION_FILE_NAME = 'evaluations.json'

FEATURES = {
    "tip_bin": ("categorical", tf.int64),
    "trip_month": ("categorical", tf.int64),
    "trip_day": ("categorical", tf.int64),
    "trip_day_of_week": ("categorical", tf.int64),
    "trip_hour": ("categorical", tf.int64),
    "payment_type": ("categorical", tf.string),
    "pickup_grid": ("categorical", tf.string),
    "dropoff_grid": ("categorical", tf.string),
    "euclidean": ("numeric", tf.double),
    "trip_seconds": ("numeric", tf.int64),
    "trip_miles": ("numeric", tf.double),
}

TARGET_FEATURE_NAME = "tip_bin"
TARGET_LABELS = ["tip<20%", "tip>=20%"]


def set_job_dirs():
    """Sets job directories based on env variables set by Vertex AI."""
    
    model_dir = os.getenv('AIP_MODEL_DIR', LOCAL_MODEL_DIR)
    tb_dir = os.getenv('AIP_TENSORBOARD_LOG_DIR', LOCAL_TB_DIR)
    checkpoint_dir = os.getenv('AIP_CHECKPOINT_DIR', LOCAL_CHECKPOINT_DIR)
    
    return model_dir, tb_dir, checkpoint_dir


def get_bq_dataset(table_name, selected_fields, target_feature='tip_bin', batch_size=32):
    
    def _transform_row(row_dict):
        trimmed_dict = {column:
                       (tf.strings.strip(tensor) if tensor.dtype == 'string' else tensor) 
                       for (column,tensor) in row_dict.items()
                       }
        target = trimmed_dict.pop(target_feature)
        return (trimmed_dict, target)

    project_id, dataset_id, table_id = table_name.split('.')
    
    client = tfio_bq.BigQueryClient()
    parent = f'projects/{project_id}'

    read_session = client.read_session(
        parent=parent,
        project_id=project_id,
        table_id=table_id,
        dataset_id=dataset_id,
        selected_fields=selected_fields,
    )

    dataset = read_session.parallel_read_rows().map(_transform_row).batch(batch_size)
    
    return dataset


def get_category_encoding_layer(name, dataset, dtype):
    """Creates a CategoryEncoding layer for a given feature."""

    if dtype == tf.string:
      index = preprocessing.StringLookup()
    else:
      index = preprocessing.IntegerLookup()

    feature_ds = dataset.map(lambda x, y: x[name])
    index.adapt(feature_ds)
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

    return lambda feature: encoder(index(feature))


def get_normalization_layer(name, dataset):
  """"Creates a Normalization layer for a given feature."""
  normalizer = preprocessing.Normalization()

  feature_ds = dataset.map(lambda x, y: x[name])
  normalizer.adapt(feature_ds)

  return normalizer


def create_model(dataset, input_features, units, dropout_ratio):
    """Creates a binary classifier for Chicago Taxi tip prediction task."""
    
    all_inputs = []
    encoded_features = []
    for feature_name, feature_info in input_features.items():
        col = tf.keras.Input(shape=(1,), name=feature_name, dtype=feature_info[1])
        if feature_info[0] == 'categorical':
            
            encoding_layer = get_category_encoding_layer(feature_name, 
                                                         dataset,
                                                         feature_info[1])
        else:
            encoding_layer = get_normalization_layer(feature_name,
                                                     dataset) 
        encoded_col = encoding_layer(col)
        all_inputs.append(col)
        encoded_features.append(encoded_col)
        
    all_features = tf.keras.layers.concatenate(encoded_features)
    
    x = tf.keras.layers.Dense(units, activation="relu")(all_features)
    x = tf.keras.layers.Dropout(dropout_ratio)(x)
    output = tf.keras.layers.Dense(1)(x)
    model = tf.keras.Model(all_inputs, output)
    
    return model


def main(argv):
    del argv
    
    # Set distribution strategy
    strategy = tf.distribute.MirroredStrategy()
    
    global_batch_size = (strategy.num_replicas_in_sync *
                         FLAGS.per_replica_batch_size)
    
    # Prepare datasets
    selected_fields = {key: {'output_type': value[1]} for key, value in FEATURES.items()}
    validation_ds = get_bq_dataset(FLAGS.validation_table, 
                                   selected_fields, 
                                   batch_size=global_batch_size)
    training_ds = get_bq_dataset(FLAGS.training_table,
                                 selected_fields,
                                 batch_size=global_batch_size)
    
    
    # Prepare the model
    input_features = {key: value for key, value in FEATURES.items() if key != TARGET_FEATURE_NAME}
    logging.info('Creating the model ...')
    
    with strategy.scope():
        model = create_model(training_ds, input_features, FLAGS.units, FLAGS.dropout_ratio)
        model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    
    # Configure Keras callbacks
    model_dir, tb_dir, checkpoint_dir = set_job_dirs()
    callbacks = [tf.keras.callbacks.experimental.BackupAndRestore(backup_dir=checkpoint_dir)]
    callbacks.append(tf.keras.callbacks.TensorBoard(
            log_dir=tb_dir, update_freq='batch'))
    
    logging.info('Starting training ...')
    model.fit(training_ds, 
              epochs=FLAGS.epochs, 
              validation_data=validation_ds,
              callbacks=callbacks)
    
    # Save trained model
    logging.info('Training completed. Saving the trained model to: {}'.format(model_dir))
    model.save(model_dir, save_traces=True)

In [4]:
logging.set_verbosity(logging.INFO)
argv = []
main(argv)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


INFO:absl:Creating the model ...


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:absl:Starting training ...


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).




INFO:absl:Training completed. Saving the trained model to: /tmp/saved_model


INFO:tensorflow:Assets written to: /tmp/saved_model/assets


INFO:tensorflow:Assets written to: /tmp/saved_model/assets


In [5]:
LOCAL_MODEL_DIR

'/tmp/saved_model'

In [6]:
model = tf.keras.models.load_model(LOCAL_MODEL_DIR)













In [7]:

global_batch_size = 32
selected_fields = {key: {'output_type': value[1]} for key, value in FEATURES.items()}
testing_ds = get_bq_dataset(FLAGS.testing_table,
                                selected_fields,
                                batch_size=global_batch_size)

In [8]:
model.evaluate(testing_ds)



[0.26938238739967346, 0.8825347423553467]

In [None]:
tf.__version__

In [57]:
import os
import sys
import logging
import uuid
import kfp
import json

import kfp.v2.dsl as dsl

from datetime import datetime
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2 import compiler
from kfp.v2.google.client import AIPlatformClient
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, Metrics, ClassificationMetrics)


from typing import NamedTuple, List


In [34]:
PROJECT = 'jk-mlops-dev'
STAGING_BUCKET = 'gs://jk-vertex-workshop-bucket'
REGION = 'us-central1'
PIPELINES_SA = 'pipelines-sa@jk-mlops-dev.iam.gserviceaccount.com'

BQ_DATASET_NAME = 'chicago_taxi_training_dataset' # Change to your BQ datasent name.
BQ_TRAIN_SPLIT_NAME = 'train_split'
BQ_VALID_SPLIT_NAME = 'valid_split'
BQ_TEST_SPLIT_NAME = 'test_split'
BQ_LOCATION = 'US'
SAMPLE_SIZE = 1000000
YEAR = 2020

In [35]:
api_client = AIPlatformClient(
    project_id=PROJECT,
    region=REGION,
)

In [None]:
    #args = [
    #            '--epochs=' + str(epochs), 
    #            '--per_replica_batch_size=' + str(per_replica_batch_size),
    #            '--training_table=' + training_table,
    #            '--validation_table=' + validation_table,
    #        ],
    

In [62]:
args_str = json.dumps(args)
args_str

'[["--epochs=2", "--per_replica_batch_size=128", "--training_table=jk-mlops-dev.chicago_taxi_ml.training", "--validation_table=jk-mlops-dev.chicago_taxi_ml.validation"]]'

In [71]:
args

['--epochs=2',
 '--per_replica_batch_size=128',
 '--training_table=jk-mlops-dev.chicago_taxi_ml.training',
 '--validation_table=jk-mlops-dev.chicago_taxi_ml.validation']

In [107]:
PIPELINE_NAME = 'test-pipeline'

training_container_image: str = 'gcr.io/jk-mlops-dev/taxi_classifier_trainer'
serving_container_image: str = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-4:latest'
cmd = ["python", "train.py"]

cmd = ["python", "train.py"]
args = [
    '--epochs=2', 
    '--per_replica_batch_size=128',
    '--training_table=jk-mlops-dev.chicago_taxi_ml.training',
    '--validation_table=jk-mlops-dev.chicago_taxi_ml.validation',
]

accelerator_count = 1
accelerator_type = 'NVIDIA_TESLA_T4'
replica_count = 1
machine_type = 'n1-standard-4'


@dsl.pipeline(name=PIPELINE_NAME)
def test_pipeline(
    project: str,
    location: str,
#    model_display_name: str,
    epochs: int,
    per_replica_batch_size: int,
    training_table: str,
    validation_table: str,
#    replica_count: int,
#    machine_type: str,
#    accelerator_count: int,
#    accelerator_type: str
):
    
    cmd = ["python", "train.py"]
    args = [
        '--epochs', str(epochs),
        '--per_replica_batch_size', str(per_replica_batch_size),
        '--training_table', str(training_table),
        '--validation_table',  str(validation_table),
    ]
    
    staging_bucket = 'gs://jk-vertex-workshop-bucket/test'
    
    train = gcc_aip.CustomContainerTrainingJobRunOp(
        project=project,
        location=location,
        model_display_name=model_display_name,
        display_name=model_display_name,
        container_uri=training_container_image,
        command = cmd,
        args = args,
        replica_count=replica_count,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        staging_bucket=staging_bucket,
        model_serving_container_image_uri=serving_container_image
    )
    


In [108]:
package_path = 'test_pipeline.json'
compiler.Compiler().compile(
    pipeline_func=test_pipeline,
    package_path=package_path
)

In [110]:
pipeline_root = f'{STAGING_BUCKET}/pipelines'
model_display_name = 'Taxi tip predictor'

schema = 'gs://jk-vertex-workshop-bucket/schema/schema.pbtxt'


parameter_values = {
    'project': PROJECT,
    'location': REGION,
    'epochs': 2,
    'training_table': 'jk-mlops-dev.chicago_taxi_ml.training',
    'validation_table': 'jk-mlops-dev.chicago_taxi_ml.validation',
    'per_replica_batch_size': 128,
#    'replica_count': 1,
#    'machine_type': 'n1-standard-4',
#    'accelerator_type': 'NVIDIA_TESLA_T4',
#    'accelerator_count': 1
}

response = api_client.create_run_from_job_spec(
    package_path,
    pipeline_root=pipeline_root,
    parameter_values=parameter_values,
    enable_caching=False,
    service_account=PIPELINES_SA
)