# 02 - ML Experimentation with Custom Model

The purpose of this notebook is to use [custom training](https://cloud.google.com/ai-platform-unified/docs/training/custom-training) to train a keras classifier to predict whether a given trip will result in a tip > 20%. The notebook covers the following tasks:
1. Preprocess the data locally using Apache Beam.
2. Train and test a custom model locally using a Keras implementation.
3. Submit a Dataflow job to preprocess the data at scale.
4. Submit a custom training job to Vertex AI using a [pre-built container](https://cloud.google.com/ai-platform-unified/docs/training/pre-built-containers).
5. Upload the trained model to Vertex AI.
6. Extract and visualize experiment parameters from [Vertex AI Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction).

## Setup

In [None]:
%load_ext autoreload
%autoreload 2?

In [None]:
import os
import time
import logging
from datetime import datetime
import numpy as np

import tensorflow as tf
import tensorflow_transform as tft
import tensorflow.keras as keras

from src.common import features
from src.model_training import data, model, defaults, trainer, exporter
from src.preprocessing import etl

# Let INFO-level records through the root logger for the cells that follow.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

# Report the library versions in use for this run.
print("TensorFlow: " + tf.__version__)
print("TensorFlow Transform: " + tft.__version__)

In [None]:
# Google Cloud settings.
PROJECT = 'ksalama-cloudml'  # Change to your project Id.
REGION = 'us-central1'
BUCKET = 'ksalama-cloudml-us'  # Change to your bucket.

# Display names for the dataset and the model trained from it.
DATASET_DISPLAY_NAME = 'chicago_taxi_tips'
MODEL_DISPLAY_NAME = DATASET_DISPLAY_NAME + '_classifier_v1'

# Cloud Storage workspace and the sub-locations used by this notebook.
WORKSPACE = 'gs://' + BUCKET + '/ucaip_demo/chicago_taxi/experiments'
RAW_SCHEMA_LOCATION = 'src/raw_schema/schema.pbtxt'
TRAINING_DIR, PREPROCESSING_DIR = (
    os.path.join(WORKSPACE, leaf)
    for leaf in ('training_output', 'preprocessing_output')
)

# The experiment name is the dataset name with dashes in place of underscores.
EXPERIMENT_NAME = DATASET_DISPLAY_NAME.replace("_", "-") + '-experiment'

In [None]:
# Set to False to keep the contents of an existing workspace.
REMOVE_WORKSPACE = True

# WORKSPACE is a gs:// path, so the removal below acts on Cloud Storage.
if tf.io.gfile.exists(WORKSPACE):
    if REMOVE_WORKSPACE:
        print("Removing previous local workspace...")
        tf.io.gfile.rmtree(WORKSPACE)

print("Creating new local workspace...")
tf.io.gfile.mkdir(WORKSPACE)

In [None]:
from src.utils.vertex_utils import VertexUtils
# Project helper bound to the configured project, region and bucket; later cells
# call it for experiment tracking, job submission and model upload.
vertex_utils = VertexUtils(PROJECT, REGION, BUCKET)

## Initialize Vertex AI Experiment

In [None]:
# Select the experiment, then open a run with a timestamped "run-local-" name.
# NOTE(review): the Dataflow section opens its run with vertex_utils.start_run(...)
# while this cell calls start_experiment_run(...) -- confirm which method
# VertexUtils actually defines and use it in both places.
vertex_utils.set_experiment(EXPERIMENT_NAME)
vertex_utils.start_experiment_run(f"run-local-{datetime.now().strftime('%Y%m%d%H%M%S')}")

## 1. Preprocess the data using Apache Beam

The Apache Beam pipeline of data preprocessing is implemented in the [preprocessing](src/preprocessing) directory.

In [None]:
# Output locations under PREPROCESSING_DIR: raw exported data, transformed data,
# and the transform artifacts.
EXPORTED_DATA_PREFIX, TRANSFORMED_DATA_PREFIX, TRANSFORM_ARTEFACTS_DIR = (
    os.path.join(PREPROCESSING_DIR, leaf)
    for leaf in ('exported_data', 'transformed_data', 'transform_artifacts')
)

### Get Source Query from Managed Dataset

In [None]:
from src.utils import datasource_utils

In [None]:
# Use a small row limit for the local dry run.
DATA_SPLIT = 'UNASSIGNED'
LIMIT = 5120

# Build the source query for the managed dataset from the settings above.
query_options = dict(
    project=PROJECT,
    region=REGION,
    dataset_display_name=DATASET_DISPLAY_NAME,
    data_split=DATA_SPLIT,
    limit=LIMIT,
)
raw_data_query = datasource_utils.get_training_source_query(**query_options)

print(raw_data_query)

### Test Data Preprocessing Locally

In [None]:
# Clear outputs of any earlier preprocessing run before recreating the directory.
preprocessing_dir_exists = tf.io.gfile.exists(PREPROCESSING_DIR)
if preprocessing_dir_exists:
    print("Removing previous preprocessing outputs...")
    tf.io.gfile.rmtree(PREPROCESSING_DIR)
print("Creating preprocessing outputs directory...")
tf.io.gfile.mkdir(PREPROCESSING_DIR)

# Arguments for the local (DirectRunner) execution of the transform pipeline.
args = dict(
    runner='DirectRunner',
    raw_data_query=raw_data_query,
    write_raw_data=True,
    exported_data_prefix=EXPORTED_DATA_PREFIX,
    transformed_data_prefix=TRANSFORMED_DATA_PREFIX,
    transform_artefact_dir=TRANSFORM_ARTEFACTS_DIR,
    temporary_dir=os.path.join(WORKSPACE, 'tmp'),
    gcs_location='gs://' + BUCKET + '/bq_tmp',
    project=PROJECT,
)

In [None]:
# Log the local preprocessing arguments through the Vertex helper
# (presumably as parameters of the active experiment run -- confirm in VertexUtils).
vertex_utils.log_params(args)

In [None]:
# Run the transform pipeline with the arguments assembled for the local run.
print("Data preprocessing started...")
etl.run_transform_pipeline(args)
print("Data preprocessing completed.")

In [None]:
# List what the preprocessing run wrote under the output location.
!gsutil ls {PREPROCESSING_DIR}

In [None]:
# Log where the preprocessing outputs were written.
vertex_utils.log_params({"preprocessing_output_dir": PREPROCESSING_DIR})

## 2. Train a Custom Model Locally using a Keras Implementation

The Keras implementation of the custom model is in the [model_training](src/model_training) directory.

In [None]:
# Locations under TRAINING_DIR for the training logs and the exported model.
LOG_DIR = os.path.join(TRAINING_DIR, 'logs')
EXPORT_DIR = os.path.join(TRAINING_DIR, 'export')

### Read transformed data

In [None]:
# Load the transform artifacts written by preprocessing and display the
# feature spec of the transformed data.
tft_output = tft.TFTransformOutput(TRANSFORM_ARTEFACTS_DIR)
transform_feature_spec = tft_output.transformed_feature_spec()
transform_feature_spec

In [None]:
# File patterns of the transformed train and eval splits.
train_data_file_pattern = os.path.join(TRANSFORMED_DATA_PREFIX, 'train/data-*.gz')
eval_data_file_pattern = os.path.join(TRANSFORMED_DATA_PREFIX, 'eval/data-*.gz')

# Print one batch of three examples. `input_features` stays bound after the
# loop and is fed to the classifier in a later cell.
sample_dataset = data.get_dataset(
    train_data_file_pattern, transform_feature_spec, batch_size=3)
for input_features, target in sample_dataset.take(1):
    for key in input_features:
        feature_tensor = input_features[key]
        print(f"{key} {feature_tensor.dtype}: {feature_tensor.numpy().tolist()}")
    print(f"target: {target.numpy().tolist()}")

### Create model inputs

In [None]:
# Build the model input layers from the project's model module and display them.
input_layers = model.create_model_inputs()
input_layers

### Create hyperparameters

In [None]:
# Start from the explicit hidden-layer sizes and pass them through the
# project's defaults helper, then display the resulting hyperparameters.
hyperparams = defaults.update_hyperparams({"hidden_units": [64, 32]})
hyperparams

### Create and test model inputs and outputs

In [None]:
# Build the binary classifier from the transform output and hyperparameters,
# then print its layer summary.
classifier = model.create_binary_classifier(tft_output, hyperparams)
classifier.summary()

In [None]:
# Render the model graph, annotating each layer with its shapes and dtypes.
keras.utils.plot_model(classifier, show_shapes=True, show_dtype=True)

In [None]:
# Run the untrained classifier on the sample batch; `input_features` is the
# loop variable left bound by the earlier cell that printed one training batch.
classifier(input_features)

### Train the model locally.

In [None]:
logging.getLogger().setLevel(logging.INFO)

# Override the training settings for the short local run, then log them.
for name, value in (
    ("learning_rate", 0.001),
    ("num_epochs", 3),
    ("batch_size", 512),
):
    hyperparams[name] = value

vertex_utils.log_params(hyperparams)

In [None]:
# Train on the transformed data; the returned model replaces the untrained
# `classifier` and is reused by the evaluation and export cells.
classifier = trainer.train(
    train_data_dir=train_data_file_pattern,
    eval_data_dir=eval_data_file_pattern,
    raw_schema_location=RAW_SCHEMA_LOCATION,
    tft_output_dir=TRANSFORM_ARTEFACTS_DIR,
    hyperparams=hyperparams,
    log_dir=LOG_DIR,
)

In [None]:
# Evaluate the trained model on the eval split; the two returned values are
# unpacked as loss and accuracy.
val_loss, val_accuracy = trainer.evaluate(
    model=classifier,
    data_dir=eval_data_file_pattern,
    raw_schema_location=RAW_SCHEMA_LOCATION,
    tft_output_dir=TRANSFORM_ARTEFACTS_DIR,
    hyperparams=hyperparams,
)

In [None]:
# Log the evaluation results under the keys "val_loss" and "val_accuracy".
vertex_utils.log_metrics(
    {"val_loss": val_loss, "val_accuracy": val_accuracy})

### Export the trained model

In [None]:
# The SavedModel is written directly to EXPORT_DIR; clear any earlier export first.
saved_model_dir = EXPORT_DIR
if tf.io.gfile.exists(saved_model_dir):
    tf.io.gfile.rmtree(saved_model_dir)

# Export the trained classifier for serving.
export_options = dict(
    classifier=classifier,
    serving_model_dir=saved_model_dir,
    raw_schema_location=RAW_SCHEMA_LOCATION,
    tft_output_dir=TRANSFORM_ARTEFACTS_DIR,
)
exporter.export_serving_model(**export_options)

In [None]:
# Inspect the "serving_tf_example" signature of the exported SavedModel.
!saved_model_cli show --dir={saved_model_dir} --tag_set=serve --signature_def=serving_tf_example

In [None]:
# Inspect the "serving_default" signature of the exported SavedModel.
!saved_model_cli show --dir={saved_model_dir} --tag_set=serve --signature_def=serving_default

### Test the exported SavedModel

In [None]:
# Load the exported SavedModel so its serving signatures can be called below.
serving_model = tf.saved_model.load(saved_model_dir)

In [None]:
# Feed one batch of three records from the exported raw data to the
# "serving_tf_example" signature and print every output.
tf_example_signature = serving_model.signatures['serving_tf_example']
raw_example_files = tf.data.TFRecordDataset.list_files(EXPORTED_DATA_PREFIX + '-*.tfrecord')
for serialized_examples in tf.data.TFRecordDataset(raw_example_files).batch(3).take(1):
    predictions = tf_example_signature(serialized_examples)
    for output_name, output_value in predictions.items():
        print(f"{output_name}: {output_value}")

In [None]:
import tensorflow_data_validation as tfdv
from tensorflow_transform.tf_metadata import schema_utils

# Parse the raw schema file and derive a feature spec from it; the next cell
# uses the spec to pick the dtype of each raw input feature.
raw_schema = tfdv.load_schema_text(RAW_SCHEMA_LOCATION)
raw_feature_spec = schema_utils.schema_as_feature_spec(raw_schema).feature_spec

In [None]:
# One hand-written raw example, keyed by raw feature name.
raw_instance = {
    "dropoff_grid": "POINT(-87.6 41.9)",
    "euclidean": 2064.2696,
    "loc_cross": "",
    "payment_type": "Credit Card",
    "pickup_grid": "POINT(-87.6 41.9)",
    "trip_miles": 1.37,
    "trip_day": 12,
    "trip_hour": 6,
    "trip_month": 2,
    "trip_day_of_week": 4,
    "trip_seconds": 555,
}

# Wrap each value as a [[value]] constant using the dtype the raw feature spec
# declares for that feature.
instance = {
    feature_name: tf.constant([[raw_value]], raw_feature_spec[feature_name].dtype)
    for feature_name, raw_value in raw_instance.items()
}

In [None]:
# Call the "serving_default" signature with the instance tensors as keyword
# arguments and print each output as a NumPy value.
predictions = serving_model.signatures['serving_default'](**instance)
for key in predictions:
    print(f"{key}: {predictions[key].numpy()}")

## 3. Submit a Data Processing Job to Dataflow

In [None]:
# Open a run with a timestamped "run-gcp-" name for the cloud executions below.
# NOTE(review): the local run earlier in the notebook is opened with
# vertex_utils.start_experiment_run(...) while this cell calls start_run(...) --
# confirm which method VertexUtils actually defines and use it in both places.
vertex_utils.start_run(f"run-gcp-{datetime.now().strftime('%Y%m%d%H%M%S')}")

In [None]:
# Clear the outputs of the earlier local preprocessing run so the Dataflow job
# writes into an empty location.
if tf.io.gfile.exists(PREPROCESSING_DIR):
    print("Removing previous preprocessing outputs...")
    tf.io.gfile.rmtree(PREPROCESSING_DIR)
print("Creating preprocessing outputs directory...")
tf.io.gfile.mkdir(PREPROCESSING_DIR)

In [None]:
# Same source query as the local run, with a much larger row limit.
DATA_SPLIT = 'UNASSIGNED'
LIMIT = 1000000
raw_data_query = datasource_utils.get_training_source_query(
    project=PROJECT,
    region=REGION,
    dataset_display_name=DATASET_DISPLAY_NAME,
    data_split=DATA_SPLIT,
    limit=LIMIT,
)

# Arguments for the Dataflow execution; unlike the local run, raw data is not
# written out and the BigQuery temp location sits under WORKSPACE.
args = dict(
    runner='DataflowRunner',
    raw_data_query=raw_data_query,
    exported_data_prefix=EXPORTED_DATA_PREFIX,
    transformed_data_prefix=TRANSFORMED_DATA_PREFIX,
    transform_artefact_dir=TRANSFORM_ARTEFACTS_DIR,
    write_raw_data=False,
    temporary_dir=os.path.join(WORKSPACE, 'tmp'),
    gcs_location=os.path.join(WORKSPACE, 'bq_tmp'),
    project=PROJECT,
    region=REGION,
    setup_file='./setup.py',
)

In [None]:
# Log the Dataflow preprocessing arguments through the Vertex helper.
vertex_utils.log_params(args)

In [None]:
# Raise the root logger threshold to ERROR while the pipeline runs.
logging.getLogger().setLevel(logging.ERROR)

# Run the transform pipeline with the DataflowRunner arguments.
print("Data preprocessing started...")
etl.run_transform_pipeline(args)
print("Data preprocessing completed.")

In [None]:
# List what the Dataflow job wrote under the preprocessing output location.
!gsutil ls {PREPROCESSING_DIR}

## 4. Submit a Custom Training Job to Vertex AI

### Test the training task locally

In [None]:
# Run the training entry point as a module from the notebook environment,
# using the same style of command-line flags that the custom job passes.
!python -m src.model_training.task \
    --model-dir={EXPORT_DIR} \
    --log-dir={LOG_DIR} \
    --train-data-dir={TRANSFORMED_DATA_PREFIX}/train/* \
    --eval-data-dir={TRANSFORMED_DATA_PREFIX}/eval/*  \
    --tft-output-dir={TRANSFORM_ARTEFACTS_DIR} \
    --num-epochs=5 \
    --hidden-units=32,32 \
    --experiment-name={EXPERIMENT_NAME} \
    --project={PROJECT} \
    --region={REGION} \
    --staging-bucket={BUCKET}

### Prepare training package

In [None]:
TRAINER_PACKAGE_DIR = os.path.join(WORKSPACE, 'trainer_packages')

!rm -r src/__pycache__/
!rm -r src/.ipynb_checkpoints/
!rm -r src/raw_schema/.ipynb_checkpoints/
!rm -f custom_job.tar custom_job.tar.gz

!mkdir custom_job

!cp setup.py custom_job/
!cp -r src custom_job/
!tar cvf custom_job.tar custom_job
!gzip custom_job.tar
!gsutil cp custom_job.tar.gz {TRAINER_PACKAGE_DIR}/
!rm -r custom_job

### Prepare the training job

In [None]:
# Container image used to run the training package; the image tag is built
# from the runtime name.
TRAIN_RUNTIME = 'tf-cpu.2-4'
TRAIN_IMAGE = f"gcr.io/cloud-aiplatform/training/{TRAIN_RUNTIME}:latest"

In [None]:
# Hyperparameters for the cloud training job.
num_epochs = 10
learning_rate = 0.001
hidden_units = "64,64"  # comma-separated layer sizes, same format as the local task run

# Command-line arguments passed to src.model_training.task by the custom job.
# --hidden-units is included so the value defined above is actually applied;
# without it the job silently trains with the task's own default.
trainer_args = [
    f'--train-data-dir={TRANSFORMED_DATA_PREFIX}/train/*',
    f'--eval-data-dir={TRANSFORMED_DATA_PREFIX}/eval/*',
    f'--tft-output-dir={TRANSFORM_ARTEFACTS_DIR}',
    f'--num-epochs={num_epochs}',
    f'--learning-rate={learning_rate}',
    f'--hidden-units={hidden_units}',
    f'--project={PROJECT}',
    f'--region={REGION}',
    f'--staging-bucket={BUCKET}',
    f'--experiment-name={EXPERIMENT_NAME}'
]

In [None]:
# One replica with no accelerators.
worker_machine_spec = {
    "machine_type": 'n1-standard-4',
    "accelerator_count": 0,
}

# Run the packaged trainer module inside the training image.
trainer_package_spec = {
    "executor_image_uri": TRAIN_IMAGE,
    "package_uris": [os.path.join(TRAINER_PACKAGE_DIR, 'custom_job.tar.gz')],
    "python_module": "src.model_training.task",
    "args": trainer_args,
}

training_spec = [
    {
        "replica_count": 1,
        "machine_spec": worker_machine_spec,
        "python_package_spec": trainer_package_spec,
    }
]

### Submit the training job

In [None]:
# Clear outputs of any earlier training run before submitting a new job.
if tf.io.gfile.exists(TRAINING_DIR):
    print("Removing previous training outputs...")
    tf.io.gfile.rmtree(TRAINING_DIR)

# Submit the job; the returned object's name is used by the monitoring cell.
print("Submitting a custom training job...")
job = vertex_utils.submit_custom_job(
    model_display_name=MODEL_DISPLAY_NAME,
    training_spec=training_spec,
    training_dir=TRAINING_DIR,
)
print(f"Job {job.name} submitted.")

### Monitor job state

In [None]:
# Poll the job once a minute until it reaches a terminal state.
# Cancelled and expired jobs are terminal too; without handling them the loop
# would poll forever.
while True:
    response = vertex_utils.get_custom_training_job_by_uri(job.name)
    state_name = response.state.name
    if state_name == 'JOB_STATE_SUCCEEDED':
        print("Training job completed. - Training Time:", response.update_time - response.create_time)
        break
    if state_name == 'JOB_STATE_FAILED':
        print("Training job failed!")
        break
    if state_name in ('JOB_STATE_CANCELLED', 'JOB_STATE_EXPIRED'):
        print(f"Training job ended without completing: {state_name}.")
        break
    print(f"Training job state is: {state_name}.")
    time.sleep(60)

## 5. Upload exported model to Vertex AI

### Prepare the model upload specs

In [None]:
# Location of the model artifacts to upload.
# NOTE(review): presumably the custom job writes its SavedModel to "model"
# under TRAINING_DIR -- confirm against src.model_training.task.
exported_model_dir = os.path.join(TRAINING_DIR, 'model')

In [None]:
# Container image used to serve the uploaded model; the image tag is built
# from the runtime name.
SERVING_RUNTIME='tf2-cpu.2-4'
SERVING_IMAGE = f"gcr.io/cloud-aiplatform/prediction/{SERVING_RUNTIME}:latest"

### Upload model

In [None]:
# Upload the exported model with its serving image; no prediction schemata
# are supplied.
response = vertex_utils.upload_model(
    model_display_name=MODEL_DISPLAY_NAME,
    model_artifact_uri=exported_model_dir,
    serving_image_uri=SERVING_IMAGE,
    predict_schemata=None
)

In [None]:
# Presumably `response` is a long-running operation and result() blocks until
# the upload finishes -- confirm against VertexUtils.upload_model.
response.result()

## 6. Extract and Visualize Experiment Parameters

In [None]:
# Fetch the experiment's runs as a DataFrame and display it.
experiment_df = vertex_utils.get_experiment_df(EXPERIMENT_NAME)
experiment_df

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

plt.rcParams["figure.figsize"] = [15, 5]

# Plot one line per run across the selected parameters and metrics.
# The metric column must match the key it was logged under ("val_accuracy");
# a misspelled column name raises a KeyError.
ax = pd.plotting.parallel_coordinates(
    experiment_df.reset_index(level=0),
    "run_name",
    cols=[
        "param.num_epochs",
        "param.hidden_units",
        "param.learning_rate",
        "metric.val_loss",
        "metric.val_accuracy",
    ],
)
# Symmetric log scale keeps small values such as the learning rate visible
# next to larger ones such as the epoch count.
ax.set_yscale("symlog")
ax.legend(bbox_to_anchor=(1.0, 0.5))

In [None]:
# Print a console link to this experiment's metrics page.
# The region is a path segment after "locations/"; without the slash the
# link is malformed.
print("Vertex AI Experiments:")
print(
    f"https://console.cloud.google.com/vertex-ai/locations/{REGION}/experiments/{EXPERIMENT_NAME}/metrics?project={PROJECT}"
)