# 02.1 - ML Experimentation with Custom Model

The purpose of this notebook is to use [custom training](https://cloud.google.com/ai-platform-unified/docs/training/custom-training) to train a Keras classifier that predicts whether a given trip will result in a tip > 20%.

## Setup

In [2]:
%load_ext autoreload
%autoreload 2?

In [3]:
import os
import time
import logging
from datetime import datetime
import numpy as np
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow.keras as keras
from google.cloud.aiplatform import gapic as aip
import apache_beam as beam

from model_src import data, features, preprocessing, model, defaults, trainer, exporter
from dataflow_src import data_prep

# Report the library versions this notebook is running against.
print(
    f"TensorFlow: {tf.__version__}",
    f"TensorFlow Transform: {tft.__version__}",
    f"Apache Beam: {beam.__version__}",
    sep="\n",
)

INFO:apache_beam.typehints.native_type_compatibility:Using Any for unsupported type: typing.Sequence[~T]
TensorFlow: 2.3.0
TensorFlow Transform: 0.26.0
Apache Beam: 2.28.0


In [4]:
# --- Google Cloud settings ---
PROJECT = 'ksalama-cloudml'
REGION = 'us-central1'
BUCKET = 'ksalama-cloudml-us'

# --- AI Platform resource names and client configuration ---
DATASET_DISPLAYNAME = 'chicago_taxi_tips'
CUSTOM_MODEL_DISPLAYNAME = f'{DATASET_DISPLAYNAME}_classifier_custom'
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"
PARENT = f"projects/{PROJECT}/locations/{REGION}"
client_options = dict(api_endpoint=API_ENDPOINT)

# --- Local paths used while testing on this machine ---
LOCAL_WORKSPACE = '_workspace'
RAW_SCHEMA_DIR = 'model_src/raw_schema/schema.pbtxt'
PREPROCESSING_DIR = os.path.join(LOCAL_WORKSPACE, 'preprocessing_output')
TRAINING_DIR = os.path.join(LOCAL_WORKSPACE, 'training_output')

In [None]:
# Set to False to keep the contents of an existing local workspace.
REMOVE_WORKSPACE = True

if REMOVE_WORKSPACE and tf.io.gfile.exists(LOCAL_WORKSPACE):
    print("Removing previous local workspace...")
    tf.io.gfile.rmtree(LOCAL_WORKSPACE)

print("Creating new local workspace...")
tf.io.gfile.mkdir(LOCAL_WORKSPACE)

## Get Source Query from Managed Dataset

In [None]:
def get_source_query(dataset_display_name, data_split, limit):
    """Build the BigQuery SQL that reads one data split of a managed dataset.

    The managed dataset is looked up by display name under PARENT, and the
    BigQuery table backing it is read from the dataset's
    metadata['inputConfig']['bigquerySource']['uri'] entry.

    Args:
        dataset_display_name: Display name of the managed dataset to look up.
        data_split: Value matched against the table's `data_split` column.
        limit: Maximum number of rows the query returns.

    Returns:
        The SQL query as a string. NULL numeric columns are replaced with -1
        and NULL string columns with 'NA'.

    Raises:
        ValueError: If no dataset with the given display name exists, or if
            the BigQuery source URI does not contain a dataset and table name.
    """
    dataset_client = aip.DatasetServiceClient(client_options=client_options)

    dataset_uri = None
    for dataset in dataset_client.list_datasets(parent=PARENT):
        if dataset.display_name == dataset_display_name:
            dataset_uri = dataset.name
            break

    # Fail with a clear message instead of an unbound-variable error below.
    if dataset_uri is None:
        raise ValueError(
            f"No dataset with display name '{dataset_display_name}' found in {PARENT}.")

    dataset = dataset_client.get_dataset(name=dataset_uri)
    bq_source_uri = dataset.metadata['inputConfig']['bigquerySource']['uri']

    # Strip the "bq://" scheme and keep only the last two dot-separated parts
    # (dataset and table), so a project id containing dots does not break the
    # unpacking.
    table_path_parts = bq_source_uri.replace("bq://", "").rsplit('.', 2)
    if len(table_path_parts) < 2:
        raise ValueError(f"Unexpected BigQuery source URI: '{bq_source_uri}'.")
    bq_dataset_name, bq_table_name = table_path_parts[-2:]

    query = f'''
        SELECT 
            CAST(trip_start_timestamp AS STRING) trip_start_timestamp,
            IF(trip_month IS NULL, -1, trip_month) trip_month,	
            IF(trip_day IS NULL, -1, trip_day) trip_day,
            IF(trip_day_of_week IS NULL, -1, trip_day_of_week) trip_day_of_week,
            IF(trip_hour IS NULL, -1, trip_hour) trip_hour,	
            IF(trip_seconds IS NULL, -1, trip_seconds) trip_seconds,
            IF(trip_miles IS NULL, -1, trip_miles) trip_miles,
            IF(payment_type IS NULL, 'NA', payment_type) payment_type,
            IF(pickup_grid IS NULL, 'NA', pickup_grid) pickup_grid,
            IF(dropoff_grid IS NULL, 'NA', dropoff_grid) dropoff_grid,
            IF(euclidean IS NULL, -1, euclidean) euclidean,
            IF(loc_cross IS NULL, 'NA', loc_cross) loc_cross,
            tip_bin
        FROM {bq_dataset_name}.{bq_table_name} 
        WHERE data_split = '{data_split}' LIMIT {limit}
    '''
    return query


## Test Data Preprocessing Locally

In [None]:
# Locations, under PREPROCESSING_DIR, for the raw export, the transformed
# data and the transform artifacts.
EXPORTED_DATA_PREFIX, TRANSFORMED_DATA_PREFIX, TRANSFORM_ARTEFACTS_DIR = (
    os.path.join(PREPROCESSING_DIR, leaf_name)
    for leaf_name in ('exported_data', 'transformed_data', 'transform_artifacts')
)

In [None]:
# Recreate the preprocessing output directory from scratch.
if tf.io.gfile.exists(PREPROCESSING_DIR):
    print("Removing previous preprocessing outputs...")
    tf.io.gfile.rmtree(PREPROCESSING_DIR)
print("Creating preprocessing outputs directory...")
tf.io.gfile.mkdir(PREPROCESSING_DIR)

# Small sample so the local run stays quick.
DATA_SPLIT = 'UNASSIGNED'
LIMIT = 5120

raw_data_query = get_source_query(DATASET_DISPLAYNAME, DATA_SPLIT, LIMIT)

# Arguments for the transform pipeline, executed with Beam's DirectRunner.
args = dict(
    runner='DirectRunner',
    raw_schema_location=RAW_SCHEMA_DIR,
    raw_data_query=raw_data_query,
    write_raw_data=True,
    exported_data_prefix=EXPORTED_DATA_PREFIX,
    transformed_data_prefix=TRANSFORMED_DATA_PREFIX,
    transform_artefact_dir=TRANSFORM_ARTEFACTS_DIR,
    temporary_dir=os.path.join(LOCAL_WORKSPACE, 'tmp'),
    gcs_location=f'gs://{BUCKET}/bq_tmp',
    project=PROJECT,
)

In [None]:
# Restrict TensorFlow's logger to ERROR-level messages while the pipeline runs.
tf.get_logger().setLevel('ERROR')

# Run the transform pipeline with the `args` dictionary defined in the previous cell.
print("Data preprocessing started...")
data_prep.run_transform_pipeline(args)
print("Data preprocessing completed.")

In [None]:
# List what the pipeline wrote to the local preprocessing directory.
!ls {PREPROCESSING_DIR}

## Test the model locally

In [None]:
# Sub-directories of TRAINING_DIR for training logs and the exported model.
LOG_DIR = os.path.join(TRAINING_DIR, 'logs')
EXPORT_DIR = os.path.join(TRAINING_DIR, 'export')

### Read transformed data

In [None]:
# Load the transform artifacts written by the preprocessing pipeline and
# display the feature spec of the transformed data.
tft_output = tft.TFTransformOutput(TRANSFORM_ARTEFACTS_DIR)
transform_feature_spec = tft_output.transformed_feature_spec()
transform_feature_spec

In [None]:
# File patterns of the transformed train/eval splits.
train_data_file_pattern = os.path.join(TRANSFORMED_DATA_PREFIX,'train/data-*.gz')
eval_data_file_pattern = os.path.join(TRANSFORMED_DATA_PREFIX,'eval/data-*.gz')

# Print a single batch of three examples as a sanity check.
sample_batches = data.get_dataset(
    train_data_file_pattern, transform_feature_spec, batch_size=3
).take(1)

for input_features, target in sample_batches:
    for key in input_features:
        feature_values = input_features[key]
        print(f"{key} ({feature_values.dtype}): {feature_values.numpy().tolist()}")
    print(f"target: {target.numpy().tolist()}")

### Create model inputs

In [None]:
# Build the model's input layers and display them.
input_layers = model.create_model_inputs()
input_layers

In [None]:
# Pass the explicit overrides through defaults.update_hyperparams and display the result.
hyperparams = defaults.update_hyperparams({"hidden_units": [64, 32]})
hyperparams

In [None]:
# Build the binary classifier from the transform output and hyperparameters, then print its summary.
classifier = model.create_binary_classifier(tft_output, hyperparams)
classifier.summary()

In [None]:
# Render the model graph, annotated with tensor shapes.
keras.utils.plot_model(classifier, show_shapes=True)

In [None]:
# Call the classifier on a batch of transformed features.
# NOTE: `input_features` is the loop variable left over from the
# "Read transformed data" cell above, so that cell must have been run first.
classifier(input_features)

In [None]:
# Show INFO-level messages from the root logger during training.
logging.getLogger().setLevel(logging.INFO)

# Settings for a short local run.
hyperparams["learning_rate"] = 0.001
hyperparams["num_epochs"] = 3
hyperparams["batch_size"] = 512

classifier = trainer.train(
    hyperparams=hyperparams,
    train_data_dir=train_data_file_pattern,
    eval_data_dir=eval_data_file_pattern,
    tft_output_dir=TRANSFORM_ARTEFACTS_DIR,
    raw_schema_dir=RAW_SCHEMA_DIR,
    log_dir=LOG_DIR,
)

In [None]:
saved_model_dir = EXPORT_DIR

# Remove any earlier export before writing a new one.
if tf.io.gfile.exists(saved_model_dir):
    tf.io.gfile.rmtree(saved_model_dir)

exporter.export_serving_model(
    classifier=classifier,
    serving_model_dir=saved_model_dir,
    tft_output_dir=TRANSFORM_ARTEFACTS_DIR,
    raw_schema_dir=RAW_SCHEMA_DIR,
)

In [None]:
# Inspect the `serving_tf_example` signature of the exported SavedModel.
!saved_model_cli show --dir={saved_model_dir} --tag_set=serve --signature_def=serving_tf_example

In [None]:
# Inspect the `serving_default` signature of the exported SavedModel.
!saved_model_cli show --dir={saved_model_dir} --tag_set=serve --signature_def=serving_default

In [None]:
# Load the exported SavedModel so its serving signatures can be called below.
serving_model = tf.saved_model.load(saved_model_dir)

In [None]:
# Feed one batch of three records from the exported TFRecord files
# to the `serving_tf_example` signature.
file_names = tf.data.TFRecordDataset.list_files(EXPORTED_DATA_PREFIX + '-*.tfrecord')
for batch in tf.data.TFRecordDataset(file_names).batch(3).take(1):
    predictions = serving_model.signatures['serving_tf_example'](batch)
    for key, value in predictions.items():
        print(f"{key}: {value}")

In [None]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so all dependencies are visible in one place.
import tensorflow_data_validation as tfdv
from tensorflow_transform.tf_metadata import schema_utils

# Load the raw schema and derive the feature spec used below to pick a dtype for each feature.
raw_schema = tfdv.load_schema_text(RAW_SCHEMA_DIR)
raw_feature_spec = schema_utils.schema_as_feature_spec(raw_schema).feature_spec

In [None]:
# One hand-written raw example, keyed by raw feature name.
instance = {
    "dropoff_grid": "POINT(-87.6 41.9)",
    "euclidean": 2064.2696,
    "loc_cross": "",
    "payment_type": "Credit Card",
    "pickup_grid": "POINT(-87.6 41.9)",
    "trip_miles": 1.37,
    "trip_day": 12,
    "trip_hour": 6,
    "trip_month": 2,
    "trip_day_of_week": 4,
    "trip_seconds": 555,
}

# Replace every value with a [[value]] constant tensor of the dtype declared in the raw feature spec.
for feature_name, raw_value in list(instance.items()):
    instance[feature_name] = tf.constant(
        [[raw_value]], raw_feature_spec[feature_name].dtype)

In [None]:
# Call the `serving_default` signature with the feature tensors as keyword arguments.
predictions = serving_model.signatures['serving_default'](**instance)
for key, value in predictions.items():
    print(f"{key}: {value.numpy()}")

## Train the Model on AI Platform

In [6]:
# From here on the notebook works against Cloud Storage: the directory
# constants below are rebound from their local values to paths under GCS_WORKSPACE.
# NOTE: LOG_DIR and EXPORT_DIR are not rebound here, so they keep the values
# derived earlier from the local TRAINING_DIR.
GCS_WORKSPACE = f"gs://{BUCKET}/ucaip_demo/chicago_taxi"
PREPROCESSING_DIR = os.path.join(GCS_WORKSPACE, 'preprocessing_output')
TRAINING_DIR = os.path.join(GCS_WORKSPACE, 'training_output')

EXPORTED_DATA_PREFIX = os.path.join(PREPROCESSING_DIR, 'exported_data')
TRANSFORMED_DATA_PREFIX = os.path.join(PREPROCESSING_DIR, 'transformed_data')
TRANSFORM_ARTEFACTS_DIR = os.path.join(PREPROCESSING_DIR, 'transform_artifacts')

### Preprocess the data using Dataflow

In [None]:
# Recreate the preprocessing output location on Cloud Storage from scratch.
if tf.io.gfile.exists(PREPROCESSING_DIR):
    print("Removing previous preprocessing outputs...")
    tf.io.gfile.rmtree(PREPROCESSING_DIR)
print("Creating preprocessing outputs directory...")
tf.io.gfile.mkdir(PREPROCESSING_DIR)

DATA_SPLIT = 'UNASSIGNED'
LIMIT = 1000000

# Arguments for the transform pipeline. The local test above passes 'runner'
# explicitly; this section is meant to run on Dataflow, so the runner is set
# here as well instead of relying on whatever the pipeline does when the key
# is missing.
args = {
    'runner': 'DataflowRunner',
    'raw_schema_location': RAW_SCHEMA_DIR,
    'raw_data_query': get_source_query(DATASET_DISPLAYNAME, DATA_SPLIT, LIMIT),
    'exported_data_prefix': EXPORTED_DATA_PREFIX,
    'transformed_data_prefix': TRANSFORMED_DATA_PREFIX,
    'transform_artefact_dir': TRANSFORM_ARTEFACTS_DIR,
    'write_raw_data': False,
    'temporary_dir': os.path.join(GCS_WORKSPACE, 'tmp'),
    'gcs_location': os.path.join(GCS_WORKSPACE, 'bq_tmp'),
    'project': PROJECT,
    'region': REGION,
    'setup_file': './setup.py'
}

In [None]:
# Only show ERROR-level messages from the root logger while the pipeline runs.
logging.getLogger().setLevel(logging.ERROR)

# Run the transform pipeline with the `args` dictionary defined in the previous cell.
print("Data preprocessing started...")
data_prep.run_transform_pipeline(args)
print("Data preprocessing completed.")

In [None]:
# List what the pipeline wrote to the preprocessing location on Cloud Storage.
!gsutil ls {PREPROCESSING_DIR}

### Prepare training package

In [None]:
# Run the trainer module entry point for a single epoch against the transformed data.
# NOTE(review): EXPORT_DIR and LOG_DIR were derived from the local TRAINING_DIR earlier
# in the notebook and were not rebound when the other paths switched to GCS — confirm
# that writing the model and logs locally is intended here.
!python -m model_src.task \
    --model-dir={EXPORT_DIR} \
    --log-dir={LOG_DIR} \
    --train-data-dir={TRANSFORMED_DATA_PREFIX}/train/* \
    --eval-data-dir={TRANSFORMED_DATA_PREFIX}/eval/*  \
    --tft-output-dir={TRANSFORM_ARTEFACTS_DIR} \
    --num-epochs=1 \
    --hidden-units=32,32

In [None]:
TRAINER_PACKAGE_DIR = os.path.join(GCS_WORKSPACE, 'trainer_packages')

!rm -r model_src/__pycache__/
!rm -r model_src/.ipynb_checkpoints/
!rm -r model_src/raw_schema/.ipynb_checkpoints/
!rm -f custom_job.tar custom_job.tar.gz
!mkdir custom_job
!cp setup.py custom_job/
!cp -r model_src custom_job/
!tar cvf custom_job.tar custom_job
!gzip custom_job.tar
!gsutil cp custom_job.tar.gz {TRAINER_PACKAGE_DIR}/
!rm -r custom_job

### Submit AI Platform custom training job

In [None]:
TRAIN_RUNTIME='tf-cpu.2-3'
TRAIN_IMAGE = f"gcr.io/cloud-aiplatform/training/{TRAIN_RUNTIME}:latest"

def submit_custom_job(
    job_client, 
    model_display_name,
    trainer_package_uri,
    training_dir,
    trainer_args,
):
    """Create a custom training job that runs the `model_src.task` module.

    The job uses a single n1-standard-4 replica without accelerators and the
    TRAIN_IMAGE executor image.

    Args:
        job_client: Job service client used to create the job. If None, a
            client is created from the notebook's `client_options`.
        model_display_name: Used to build the job display name
            (`train_<model_display_name>_<timestamp>`).
        trainer_package_uri: URI of the trainer Python package archive.
        training_dir: Passed as the job's base output directory prefix.
        trainer_args: List of command-line arguments for the trainer module.

    Returns:
        The resource name of the created custom job.
    """
    # Use the caller's client; previously it was always replaced by a new one,
    # which made the parameter meaningless.
    if job_client is None:
        job_client = aip.JobServiceClient(client_options=client_options)

    job_name = f"train_{model_display_name}_{datetime.now().strftime('%Y%m%d%H%M%S')}"

    worker_pool_spec = [
        {
            "replica_count": 1,
            "machine_spec": {
                "machine_type": 'n1-standard-4',
                "accelerator_count": 0
            },
            "python_package_spec": {
                "executor_image_uri": TRAIN_IMAGE,
                "package_uris": [trainer_package_uri],
                "python_module": "model_src.task",
                "args": trainer_args,
            }
        }
    ]

    custom_job = {
        "display_name": job_name,
        "job_spec": {
            "worker_pool_specs": worker_pool_spec,
            "base_output_directory": {
                "output_uri_prefix": training_dir
            }
        }
    }

    response = job_client.create_custom_job(
        parent=PARENT, custom_job=custom_job)

    print("name:", response.name)
    print("display_name:", response.display_name)
    print("state:", response.state)
    print("create_time:", response.create_time)
    print("update_time:", response.update_time)
    return response.name


In [None]:
# Remove the outputs of any earlier run under TRAINING_DIR before submitting.
if tf.io.gfile.exists(TRAINING_DIR):
    print("Removing previous training outputs...")
    tf.io.gfile.rmtree(TRAINING_DIR)

job_client = aip.JobServiceClient(client_options=client_options)

# Command-line arguments handed to the trainer module.
trainer_args = [
    f'--train-data-dir={TRANSFORMED_DATA_PREFIX + "/train/*"}',
    f'--eval-data-dir={TRANSFORMED_DATA_PREFIX + "/eval/*"}',
    f'--tft-output-dir={TRANSFORM_ARTEFACTS_DIR}',
    f'--num-epochs={10}',
    f'--learning-rate={0.001}',
    f'--hidden-units=64,32'
]

job_id = submit_custom_job(
    job_client=job_client,
    model_display_name=CUSTOM_MODEL_DISPLAYNAME,
    trainer_package_uri=os.path.join(TRAINER_PACKAGE_DIR, 'custom_job.tar.gz'),
    training_dir=TRAINING_DIR,
    trainer_args=trainer_args,
)

In [None]:
# Poll the job once a minute until it reaches a terminal state.
while True:
    response = job_client.get_custom_job(name=job_id)
    if response.state == aip.JobState.JOB_STATE_SUCCEEDED:
        print("Training job completed. - Training Time:", response.update_time - response.create_time)
        break
    elif response.state == aip.JobState.JOB_STATE_FAILED:
        print("Training job failed!")
        break
    elif response.state == aip.JobState.JOB_STATE_CANCELLED:
        # A cancelled job never succeeds or fails; stop instead of polling forever.
        print("Training job was cancelled.")
        break
    else:
        # Covers queued, pending, running, etc. — report the actual state.
        print("Training job has not completed:", response.state)
    time.sleep(60)

## Upload exported model to AI Platform Models

In [7]:
# Location under TRAINING_DIR that is uploaded as the model artifact below; list its contents.
exported_model_dir = os.path.join(TRAINING_DIR, 'model')

!gsutil ls {exported_model_dir}

gs://ksalama-cloudml-us/ucaip_demo/chicago_taxi/training_output/model/
gs://ksalama-cloudml-us/ucaip_demo/chicago_taxi/training_output/model/saved_model.pb
gs://ksalama-cloudml-us/ucaip_demo/chicago_taxi/training_output/model/assets/
gs://ksalama-cloudml-us/ucaip_demo/chicago_taxi/training_output/model/variables/


### Upload predict schemata YAML files to Cloud Storage

In [None]:
PREDICT_SCHEMATA_DIR = os.path.join(GCS_WORKSPACE, 'predict_schemata')
!gsutil cp predict_schemata/* {PREDICT_SCHEMATA_DIR}
!gsutil ls {PREDICT_SCHEMATA_DIR}

In [8]:
SERVING_RUNTIME = 'tf2-cpu.2-3'
SERVING_IMAGE = f"gcr.io/cloud-aiplatform/prediction/{SERVING_RUNTIME}:latest"

# Description of the model resource to upload.
# NOTE(review): the name `model` rebinds the `model` module imported from
# model_src at the top of the notebook; earlier cells that call
# `model.create_...` will fail if re-run after this cell without a restart.
model = dict(
    display_name=CUSTOM_MODEL_DISPLAYNAME,
    artifact_uri=exported_model_dir,
    # Optional, currently disabled:
    # predict_schemata={
    #     "instance_schema_uri": os.path.join(PREDICT_SCHEMATA_DIR, 'instance_schema.yaml'),
    #     "prediction_schema_uri": os.path.join(PREDICT_SCHEMATA_DIR, 'prediction_schema.yaml'),
    # },
    container_spec=dict(
        image_uri=SERVING_IMAGE,
        command=[],
        args=[],
        env=[dict(name="env_name", value="env_value")],
        ports=[dict(container_port=8080)],
        predict_route="",
        health_route="",
    ),
    # metadata_schema_uri="gs://google-cloud-aiplatform/schema/model/metadata/automl_tabular_1.0.0.yaml",
)

### Upload model

In [9]:
# Client for the model service, configured with the regional API endpoint.
model_client = aip.ModelServiceClient(client_options=client_options)

# Start the upload of the model description defined above. The returned object
# exposes .result(), which the next cell calls to obtain the uploaded model's name.
response = model_client.upload_model(
    model=model,
    parent=PARENT
)

# response.result()

model: "projects/900786220115/locations/us-central1/models/2233169688664211456"

In [14]:
# Retrieve the upload result and display the resource name of the uploaded model.
response_result = response.result()
response_result.model

'projects/900786220115/locations/us-central1/models/2233169688664211456'

In [None]:
# Look up the resource name of the uploaded model by its display name.
model_list = model_client.list_models(parent=PARENT)

# Reset first so a value left over from an earlier run is never reported.
model_uri = None
for entry in model_list:
    if entry.display_name == CUSTOM_MODEL_DISPLAYNAME:
        model_uri = entry.name
        break

if model_uri is None:
    raise ValueError(
        f"No model with display name '{CUSTOM_MODEL_DISPLAYNAME}' found in {PARENT}.")

print(model_uri)