# Training and deploying a tabular model using a Vertex AI custom training job - Part 1

![Training pipeline](../images/custom-tabular.png)

In [31]:
import os
import pprint
import pandas as pd
import tensorflow as tf
import time
import matplotlib.pyplot as plt

from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform_v1beta1 import types
from google.cloud.aiplatform import hyperparameter_tuning as hpt

from google.cloud.aiplatform.utils import JobClientWithOverride



## Configure GCP settings

*Before running the notebook, make sure to follow the repo's README file to install the prerequisites.*

In [32]:
# GCP project ID and region used for the Vertex AI SDK, the container image
# name and the BigQuery table names built later in this notebook.
PROJECT = 'jk-mlops-dev'
REGION = 'us-central1'

# Cloud Storage bucket handed to the Vertex AI SDK as the staging bucket and
# used as the root of each job's output directory.
STAGING_BUCKET = 'gs://jk-vertex-us-central1'
# Service account passed to `job.run` when the custom job is submitted.
VERTEX_SA = f'vertex-sa@{PROJECT}.iam.gserviceaccount.com'

# Resource name of a Vertex AI TensorBoard instance. It is only referenced by
# a commented-out `tensorboard=` argument in the job submission cell.
# NOTE(review): the project number in this resource name differs from the one
# shown in the captured job output further down - confirm it is still valid.
TENSORBOARD = 'projects/1026026909625/locations/us-central1/tensorboards/1351678022536658944'

### Prepare a training script

In [33]:
# Local folder that holds the training script and the Dockerfile; it is also
# used as the docker build context later on.
SCRIPT_FOLDER = 'test'
file_path = os.path.join(SCRIPT_FOLDER, 'train.py')

# Always start from an empty folder so stale files never end up in the image.
if tf.io.gfile.exists(SCRIPT_FOLDER):
    tf.io.gfile.rmtree(SCRIPT_FOLDER)
tf.io.gfile.mkdir(SCRIPT_FOLDER)

In [86]:
%%writefile {file_path}


# Copyright 2021 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os
import logging
import time

def get_args():
    """Defines and parses the command-line arguments.

    Returns:
        The `argparse.Namespace` produced from `sys.argv`. It carries a single
        attribute, `input_path`, which defaults to "/tmp".
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--input_path", type=str, default="/tmp")

    parsed_args = arg_parser.parse_args()
    return parsed_args

def main():
    """Lists the input directory, then emits heartbeat output for a while.

    This is a smoke-test entry point: it prints the content of `--input_path`
    once and then writes a marker to stdout and to the log 20 times, sleeping
    10 seconds after each one.
    """
    args = get_args()

    logging.info('####################')
    print('***********************')
    print(os.listdir(args.input_path))

    heartbeat_count = 20
    for _ in range(heartbeat_count):
        print('##########')
        logging.info('####################')
        time.sleep(10)
    
    
# Script entry point: raise the root logger to INFO so that the
# `logging.info` calls in `main` are actually emitted, then run.
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    main()

Overwriting test/train.py


In [97]:
%%writefile {file_path}


# Copyright 2021 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import hypertune
import json
import os
import re
import tensorflow as tf

from absl import app
from absl import flags
from absl import logging

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow_io import bigquery as tfio_bq

from tensorboard.plugins.hparams import api as tb_hp


# Command-line flags of the training script.
# The two table flags are mandatory and are expected in the
# `project.dataset.table` form (they are split on '.' when the data is read).
FLAGS = flags.FLAGS
flags.DEFINE_integer('epochs', 3, 'Number of epochs')
flags.DEFINE_integer('units', 32, 'Number of units in the hidden layer')
flags.DEFINE_integer('per_replica_batch_size', 128, 'Per replica batch size')
flags.DEFINE_float('dropout_ratio', 0.5, 'Dropout ratio')
flags.DEFINE_string('training_table', None, 'Training table name')
flags.DEFINE_string('validation_table', None, 'Validation table name')
flags.mark_flag_as_required('training_table')
flags.mark_flag_as_required('validation_table')

# Local fallbacks used by `set_job_dirs` when the corresponding AIP_*
# environment variables are not set (e.g. when running outside Vertex AI).
LOCAL_MODEL_DIR = '/tmp/saved_model'
LOCAL_TB_DIR = '/tmp/logs'
LOCAL_CHECKPOINT_DIR = '/tmp/checkpoints'
# NOTE(review): not referenced anywhere in the visible part of this script.
EVALUATION_FILE_NAME = 'evaluations.json'

# Define features.
# Maps each BigQuery column name to a (kind, dtype) pair:
#   * kind  - "categorical" columns get a lookup + category encoding layer,
#             any other kind ("numeric") gets a normalization layer;
#   * dtype - the TensorFlow dtype used both as the BigQuery read output type
#             and as the dtype of the matching Keras input.
# The target column is listed here as well so that it is read from BigQuery;
# it is removed from the model inputs in `main`.
FEATURES = {
    "tip_bin": ("categorical", tf.int64),
    "trip_month": ("categorical", tf.int64),
    "trip_day": ("categorical", tf.int64),
    "trip_day_of_week": ("categorical", tf.int64),
    "trip_hour": ("categorical", tf.int64),
    "payment_type": ("categorical", tf.string),
    "pickup_grid": ("categorical", tf.string),
    "dropoff_grid": ("categorical", tf.string),
    "euclidean": ("numeric", tf.double),
    "trip_seconds": ("numeric", tf.int64),
    "trip_miles": ("numeric", tf.double),
}
# Name of the column used as the label.
TARGET_FEATURE_NAME = 'tip_bin'

# Set hparams for TensorBoard and the Vertex AI hyperparameter tuner
# Hyperparameters tracked in the TensorBoard HParams plugin; their values are
# taken from the `dropout_ratio` and `units` flags in `main`.
HP_DROPOUT = tb_hp.HParam("dropout")
HP_UNITS = tb_hp.HParam("units")
HPARAMS = [
    HP_UNITS,
    HP_DROPOUT,
]
# Metric declared to the HParams plugin via `hparams_config`.
METRICS = [
    tb_hp.Metric(
        "epoch_accuracy",
        group="validation",
        display_name="epoch accuracy"),
]
# Used in `main` both as the tag reported to the hyperparameter tuner and as
# the key looked up in the Keras epoch logs.
HPTUNE_METRIC = 'val_accuracy'
    

def set_job_dirs():
    """Sets job directories and hyperparameter tuning trial id
    based on env variables set by Vertex AI.

    Each directory is read from its AIP_* environment variable and falls back
    to the matching LOCAL_* constant when the variable is not set.

    The trial id is taken from the second-to-last component of the
    TensorBoard log directory when that component consists of digits only;
    otherwise '0' is used.
    NOTE(review): this presumes Vertex AI hyperparameter tuning places the
    trial id at that position of AIP_TENSORBOARD_LOG_DIR - confirm.

    Returns:
        A (model_dir, tb_dir, checkpoint_dir, trial_id) tuple of strings.
    """

    model_dir = os.getenv('AIP_MODEL_DIR', LOCAL_MODEL_DIR)
    tb_dir = os.getenv('AIP_TENSORBOARD_LOG_DIR', LOCAL_TB_DIR)
    checkpoint_dir = os.getenv('AIP_CHECKPOINT_DIR', LOCAL_CHECKPOINT_DIR)

    # normpath drops a trailing separator so the last component is never ''.
    path_parts = os.path.normpath(tb_dir).split(os.sep)
    # A log directory with a single component (e.g. 'logs') has no parent
    # component to inspect; treat it like a non-tuning run instead of
    # failing with an IndexError.
    parent_part = path_parts[-2] if len(path_parts) >= 2 else ''
    trial_id = parent_part if re.fullmatch('[0-9]+', parent_part) else '0'
    logging.info(trial_id)

    return model_dir, tb_dir, checkpoint_dir, trial_id


def get_bq_dataset(table_name, selected_fields, target_feature='tip_bin', batch_size=32):
    """Builds a batched dataset of (features, target) pairs from a BigQuery table.

    Args:
        table_name: Table in the `project.dataset.table` form.
        selected_fields: Columns to read, passed through unchanged to the
            BigQuery read session.
        target_feature: Name of the column that is split off as the target.
        batch_size: Number of rows per batch.

    Returns:
        The dataset produced by the read session, with every row mapped to a
        (features dict, target) tuple and then batched.

    Raises:
        ValueError: If `table_name` does not consist of exactly three
            dot-separated parts.
    """

    def _split_features_and_target(row):
        # String columns are stripped of surrounding whitespace; every other
        # column is passed through untouched.
        features = {
            name: (tf.strings.strip(value) if value.dtype == 'string' else value)
            for (name, value) in row.items()
        }
        label = features.pop(target_feature)
        return (features, label)

    project_id, dataset_id, table_id = table_name.split('.')

    bq_client = tfio_bq.BigQueryClient()
    session = bq_client.read_session(
        parent=f'projects/{project_id}',
        project_id=project_id,
        table_id=table_id,
        dataset_id=dataset_id,
        selected_fields=selected_fields,
    )

    rows = session.parallel_read_rows()
    return rows.map(_split_features_and_target).batch(batch_size)


def get_category_encoding_layer(name, dataset, dtype):
    """Creates a category encoder for a given feature.

    Args:
        name: Key of the feature in the dataset's feature dictionaries.
        dataset: Dataset of (features, target) pairs the vocabulary is
            learned from.
        dtype: Feature dtype; `tf.string` selects a string lookup, anything
            else an integer lookup.

    Returns:
        A callable that maps a feature tensor through the adapted lookup
        layer and then through a `CategoryEncoding` layer.
    """
    lookup_cls = (preprocessing.StringLookup if dtype == tf.string
                  else preprocessing.IntegerLookup)
    lookup = lookup_cls()

    # Learn the vocabulary from this feature's values only.
    lookup.adapt(dataset.map(lambda features, _target: features[name]))
    category_encoder = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())

    def _encode(feature):
        return category_encoder(lookup(feature))

    return _encode


def get_normalization_layer(name, dataset):
    """Creates a Normalization layer for a given feature.

    Args:
        name: Key of the feature in the dataset's feature dictionaries.
        dataset: Dataset of (features, target) pairs the layer is adapted on.

    Returns:
        A `Normalization` layer adapted to the values of the named feature.
    """
    feature_values = dataset.map(lambda features, _target: features[name])

    layer = preprocessing.Normalization()
    layer.adapt(feature_values)
    return layer


def create_model(dataset, input_features, units, dropout_ratio):
    """Creates a binary classifier for Chicago Taxi tip prediction task.

    Args:
        dataset: Dataset of (features, target) pairs used to adapt the
            per-feature preprocessing layers.
        input_features: Mapping of feature name to a (kind, dtype) pair;
            'categorical' features are category-encoded, all others are
            normalized.
        units: Width of the single hidden layer.
        dropout_ratio: Dropout rate applied after the hidden layer.

    Returns:
        An uncompiled Keras model with one input per feature and a single
        output unit without activation.
    """
    model_inputs = []
    encoded_columns = []
    for name, info in input_features.items():
        kind, dtype = info[0], info[1]
        feature_input = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

        if kind == 'categorical':
            encode = get_category_encoding_layer(name, dataset, dtype)
        else:
            encode = get_normalization_layer(name, dataset)

        encoded_columns.append(encode(feature_input))
        model_inputs.append(feature_input)

    merged = layers.concatenate(encoded_columns)

    hidden = layers.Dense(units, activation="relu")(merged)
    hidden = layers.Dropout(dropout_ratio)(hidden)
    logits = layers.Dense(1)(hidden)

    return tf.keras.Model(model_inputs, logits)


class HptuneCallback(tf.keras.callbacks.Callback):
    """
    A custom Keras callback class that reports a metric to hypertuner
    at the end of each epoch.
    """

    def __init__(self, metric_tag, metric_value):
        """Initializes the callback.

        Args:
            metric_tag: Tag under which the metric is reported to the tuner.
            metric_value: Key of the metric in the Keras epoch `logs` dict.
        """
        super().__init__()
        self.metric_tag = metric_tag
        self.metric_value = metric_value
        self.hpt = hypertune.HyperTune()

    def on_epoch_end(self, epoch, logs=None):
        """Reports the configured metric for the epoch that just ended.

        Args:
            epoch: Index of the finished epoch; reported as the global step.
            logs: Metric results for the epoch. May be None or lack the
                configured key (e.g. when no validation data is supplied),
                in which case nothing is reported.
        """
        value = (logs or {}).get(self.metric_value)
        if value is None:
            # Skip reporting rather than abort the whole training run with a
            # TypeError/KeyError because one metric is unavailable.
            logging.warning(
                'Metric %s not found in epoch logs; skipping hypertune report.',
                self.metric_value)
            return

        self.hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag=self.metric_tag,
            metric_value=value,
            global_step=epoch)
        

def main(argv):
    """Trains the tip classifier on BigQuery data and saves the model.

    Reads the training and validation tables named by the flags, builds and
    trains the model under a `MirroredStrategy`, logs to TensorBoard, reports
    the validation accuracy to the hyperparameter tuner after each epoch and
    finally saves the model to the job's model directory.

    Args:
        argv: Positional command-line arguments handed over by `app.run`;
            unused.
    """
    del argv

    # Set distribution strategy
    strategy = tf.distribute.MirroredStrategy()

    global_batch_size = (strategy.num_replicas_in_sync *
                         FLAGS.per_replica_batch_size)

    # Prepare datasets
    selected_fields = {key: {'output_type': value[1]} for key, value in FEATURES.items()}
    validation_ds = get_bq_dataset(FLAGS.validation_table,
                                   selected_fields,
                                   target_feature=TARGET_FEATURE_NAME,
                                   batch_size=global_batch_size)
    training_ds = get_bq_dataset(FLAGS.training_table,
                                 selected_fields,
                                 target_feature=TARGET_FEATURE_NAME,
                                 batch_size=global_batch_size)

    # Configure Tensorboard hparams
    model_dir, tb_dir, checkpoint_dir, trial_id = set_job_dirs()
    with tf.summary.create_file_writer(tb_dir).as_default():
        tb_hp.hparams_config(hparams=HPARAMS, metrics=METRICS)

    hparams = {
        HP_UNITS: FLAGS.units,
        HP_DROPOUT: FLAGS.dropout_ratio
    }

    # Create the model
    input_features = {key: value for key, value in FEATURES.items() if key != TARGET_FEATURE_NAME}
    logging.info('Creating the model ...')
    with strategy.scope():
        model = create_model(training_ds, input_features, hparams[HP_UNITS], hparams[HP_DROPOUT])
        # The model emits raw logits, so the accuracy threshold must be 0.0
        # (sigmoid(0) == 0.5). The plain 'accuracy' string would threshold
        # the logits at 0.5 and misreport the metric. The metric keeps the
        # name 'accuracy' so that the 'val_accuracy' log key and the
        # 'epoch_accuracy' TensorBoard tag stay unchanged.
        model.compile(
            optimizer='adam',
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
            metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

    # Configure training regimen
    callbacks = [tf.keras.callbacks.experimental.BackupAndRestore(backup_dir=checkpoint_dir)]
    callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=tb_dir,
                                                    update_freq='batch',
                                                    profile_batch=0))
    callbacks.append(tb_hp.KerasCallback(writer=tb_dir,
                                         hparams=hparams,
                                         trial_id=trial_id))
    callbacks.append(HptuneCallback(HPTUNE_METRIC, HPTUNE_METRIC))

    # Start training
    logging.info('Starting training ...')
    model.fit(training_ds,
              epochs=FLAGS.epochs,
              validation_data=validation_ds,
              callbacks=callbacks)

    # Save trained model
    logging.info('Training completed. Saving the trained model to: %s', model_dir)
    model.save(model_dir)
    
    
# Script entry point: enable INFO-level logging and let `app.run` parse the
# command-line flags before it calls `main`.
if __name__ == '__main__':
    logging.set_verbosity(logging.INFO)
    app.run(main)

Overwriting test/train.py


### Initialize Vertex AI SDK

In [98]:
# Initialize the Vertex AI SDK with the project, region and staging bucket
# configured at the top of the notebook.
vertex_ai.init(
    project=PROJECT,
    location=REGION,
    staging_bucket=STAGING_BUCKET
)

In [103]:
# BigQuery dataset and split table names; together with PROJECT they form the
# `project.dataset.table` names passed to the training script.
BQ_DATASET_NAME = 'chicago_taxi_dataset' 
BQ_TRAIN_SPLIT_NAME = 'training'
BQ_VALID_SPLIT_NAME = 'validation'
# NOTE(review): the four settings below are not referenced in the visible
# part of this notebook - presumably they are used by a data preparation
# step elsewhere; confirm before removing.
BQ_TEST_SPLIT_NAME = 'testing'
BQ_LOCATION = 'US'
SAMPLE_SIZE = 500000
YEAR = 2020

### Configure and submit a Vertex job using a custom container

#### Create a docker file

In [108]:
# Base image the training container is built from, and the name the built
# image is tagged and pushed under. The commented-out lines are alternative
# values kept from earlier experiments.
BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-gpu.2-4'
#BASE_IMAGE = 'nvcr.io/nvidia/merlin/merlin-training:0.5.3'
TRAIN_IMAGE = f'gcr.io/{PROJECT}/test_2'
#TRAIN_IMAGE = f'gcr.io/{PROJECT}/test_1'

# Dockerfile content: copy train.py into /trainer and reset the entrypoint.
# With an empty ENTRYPOINT the command and args to run come from the job's
# container spec rather than from the image.
dockerfile = f'''
FROM {BASE_IMAGE}

WORKDIR /trainer

# Copies the trainer code to the docker image.
COPY train.py .

ENTRYPOINT []
'''

# Write the Dockerfile next to train.py so that SCRIPT_FOLDER can be used as
# the docker build context.
with open(os.path.join(SCRIPT_FOLDER, 'Dockerfile'), 'w') as f:
    f.write(dockerfile)

#### Build a container image

In [109]:
# Build the training image from SCRIPT_FOLDER and tag it as TRAIN_IMAGE.
! docker build -t {TRAIN_IMAGE} {SCRIPT_FOLDER}

Sending build context to Docker daemon  22.02kB
Step 1/4 : FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-4
 ---> ab93ebea3c35
Step 2/4 : WORKDIR /trainer
 ---> Using cache
 ---> e3ee9c630f47
Step 3/4 : COPY train.py .
 ---> Using cache
 ---> 8b83241f20de
Step 4/4 : ENTRYPOINT []
 ---> Running in 5bf481130e60
Removing intermediate container 5bf481130e60
 ---> 0a242dc083b8
Successfully built 0a242dc083b8
Successfully tagged gcr.io/jk-mlops-dev/test_2:latest


In [110]:
# Push the image to the registry so that the Vertex AI job can pull it.
! docker push {TRAIN_IMAGE}

Using default tag: latest
The push refers to repository [gcr.io/jk-mlops-dev/test_2]

[1Bc6006e37: Preparing 
[1Bded0109c: Preparing 
[1B7b2f60d1: Preparing 
[1Beb2eb480: Preparing 
[1Bc3eb7ae8: Preparing 
[1B1f0fa8db: Preparing 
[1B1854aed3: Preparing 
[1B3f816411: Preparing 
[1Bbf49b163: Preparing 
[1Bae69fc5d: Preparing 
[1B7465dde9: Preparing 
[1B3d1ada9a: Preparing 
[1Bf07e787b: Preparing 
[1Bbe96190a: Preparing 
[1B4bbff46e: Preparing 
[1B9cefae00: Preparing 
[1B0ad88149: Preparing 
[1B3ab21099: Preparing 
[1B09736a4b: Preparing 
[1Be844d06f: Preparing 
[1B54c6ced7: Preparing 
[1B34b5cf74: Preparing 
[1B0a9a6a11: Preparing 
[1B7e8b38e6: Preparing 
[1B8f196cf4: Preparing 
[12Bbbff46e: Waiting g 
[1B31d2d72b: Preparing 
[1Ba966f459: Preparing 
[14Bcefae00: Waiting g 
[1B49f5bf51: Preparing 
[25B854aed3: Waiting g 
[25Bf816411: Waiting g 
[16Bab21099: Waiting g 
[1B09cad0ba: Layer already exists [27A[2K[26A[2K[23A[2K[19A[2K[18A[2K[11A[2K

#### Prepare worker pool specification

In [111]:
# A single worker pool: one n1-standard-4 replica with one T4 GPU, running
# `python train.py` from the custom training image. The table flags use the
# `project.dataset.table` form expected by the training script.
worker_pool_specs = [
    {
        'machine_spec': {
            'machine_type': 'n1-standard-4',
            'accelerator_type': 'NVIDIA_TESLA_T4',
            'accelerator_count': 1,
        },
        'replica_count': 1,
        'container_spec': {
            'image_uri': TRAIN_IMAGE,
            'command': ['python'],
            'args': [
                'train.py',
                '--epochs=2',
                '--per_replica_batch_size=128',
                f'--training_table={PROJECT}.{BQ_DATASET_NAME}.{BQ_TRAIN_SPLIT_NAME}',
                f'--validation_table={PROJECT}.{BQ_DATASET_NAME}.{BQ_VALID_SPLIT_NAME}',
            ],
        },
    }
]

print(worker_pool_specs)

[{'machine_spec': {'machine_type': 'n1-standard-4', 'accelerator_type': 'NVIDIA_TESLA_T4', 'accelerator_count': 1}, 'replica_count': 1, 'container_spec': {'image_uri': 'gcr.io/jk-mlops-dev/test_2', 'command': ['python'], 'args': ['train.py', '--epochs=2', '--per_replica_batch_size=128', '--training_table=jk-mlops-dev.chicago_taxi_dataset.training', '--validation_table=jk-mlops-dev.chicago_taxi_dataset.validation']}}]


#### Submit and monitor the job

In [112]:
# Give every submission a unique, timestamped name and its own output
# directory under the staging bucket.
submit_timestamp = time.strftime("%Y%m%d_%H%M%S")
job_name = f'CUSTOM_CONTAINER_{submit_timestamp}'
base_output_dir = f'{STAGING_BUCKET}/jobs/{job_name}'

job = vertex_ai.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=base_output_dir,
)

# Submit without blocking the notebook; the SDK logs the job state as it
# changes. The TensorBoard integration is currently switched off - pass
# `tensorboard=TENSORBOARD` to `run` to enable it.
job.run(
    service_account=VERTEX_SA,
    sync=False,
)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/7163947175611727872
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/7163947175611727872')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/7163947175611727872?project=895222332033
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/7163947175611727872 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/7163947175611727872 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/