# 02 - Experimentation - Vertex AI

This notebook covers the following steps:

1. Build the Docker container image.
2. Submit a `Vertex AI` custom job to prepare the data.
3. Submit a `Vertex AI` custom job to train and export the model.
4. Upload the exported model as a Vertex AI model resource.

## Setup

In [1]:
import os
import logging
from datetime import datetime

import tensorflow as tf
from google.cloud import aiplatform as vertex_ai

logging.getLogger().setLevel(logging.INFO)
tf.get_logger().setLevel('INFO')

print(f"TensorFlow: {tf.__version__}")

TensorFlow: 2.4.1


In [2]:
PROJECT = 'merlin-on-gcp'
REGION = 'us-central1'
BUCKET = 'merlin-on-gcp'
VERTEX_SERVICE_ACCOUNT = f'vertex-sa-mlops@{PROJECT}.iam.gserviceaccount.com'

MOVIES_DATASET_DISPLAY_NAME = 'movielens25m-movies'
RATINGS_DATASET_DISPLAY_NAME = 'movielens25m-ratings'

MODEL_DISPLAY_NAME = f'movielens25m-recommender'

WORKSPACE = f"gs://{BUCKET}/movielens25m"
EXPERIMENT_ARTIFACTS_DIR = os.path.join(WORKSPACE, 'experiments')

TENSORBOARD_DISPLAY_NAME = f'tb-{PROJECT}'
EXPERIMENT_NAME = f'{MODEL_DISPLAY_NAME}-experiment'

In [3]:
vertex_ai.init(
    project=PROJECT,
    location=REGION,
    staging_bucket=BUCKET,
    experiment=EXPERIMENT_NAME,
)

## Create Vertex TensorBoard Instance

In [None]:
!gcloud beta ai tensorboards create --display-name={TENSORBOARD_DISPLAY_NAME} \
  --project={PROJECT} --region={REGION}

In [4]:
TENSORBOARD_RESOURCE_NAME = "projects/659831510405/locations/us-central1/tensorboards/4450717516120981504"

## Initialize Vertex AI Experiment

In [5]:
REMOVE_EXPERIMENT_ARTIFACTS = False
if tf.io.gfile.exists(EXPERIMENT_ARTIFACTS_DIR) and REMOVE_EXPERIMENT_ARTIFACTS:
    print("Removing previous experiment artifacts...")
    tf.io.gfile.rmtree(EXPERIMENT_ARTIFACTS_DIR)

if not tf.io.gfile.exists(EXPERIMENT_ARTIFACTS_DIR):
    print("Creating new experiment artifacts directory...")
    tf.io.gfile.mkdir(EXPERIMENT_ARTIFACTS_DIR)

print("Workspace is ready.")

run_id = f"run-local-{datetime.now().strftime('%Y%m%d%H%M%S')}"
vertex_ai.start_run(run_id)

EXPERIMENT_RUN_DIR = os.path.join(EXPERIMENT_ARTIFACTS_DIR, EXPERIMENT_NAME, run_id)
print("Experiment run directory:", EXPERIMENT_RUN_DIR)

Workspace is ready.
INFO:root:Resource movielens25m-recommender-experiment-run-local-20210624112636 not found.
INFO:root:Creating Resource movielens25m-recommender-experiment-run-local-20210624112636
INFO:root:Resource movielens25m-recommender-experiment-run-local-20210624112636-metrics not found.
INFO:root:Creating Resource movielens25m-recommender-experiment-run-local-20210624112636-metrics
Experiment run directory: gs://merlin-on-gcp/movielens25m/experiments/movielens25m-recommender-experiment/run-local-20210624112636


In [7]:
def get_dataset_gcs_location(dataset_display_name):
    datasets = vertex_ai.TabularDataset.list()
    dataset = None
    for entry in datasets:
        if entry.display_name == dataset_display_name:
            dataset = entry
            break

    if not dataset:
        raise ValueError(f"Dataset with display name {dataset_display_name} does not exist!")
    
    return dataset.gca_resource.metadata['inputConfig']['gcsSource']['uri'][0]

In [10]:
movies_csv_dataset_location = get_dataset_gcs_location(MOVIES_DATASET_DISPLAY_NAME)
ratings_csv_dataset_location = get_dataset_gcs_location(RATINGS_DATASET_DISPLAY_NAME)
print("Movies CSV dataset location:", ratings_csv_dataset_location)
print("Ratings CSV dataset location:", ratings_csv_dataset_location)

Movies CSV dataset location: gs://merlin-on-gcp/movielens25m/dataset/ratings.csv
Ratings CSV dataset location: gs://merlin-on-gcp/movielens25m/dataset/ratings.csv


## 3. Build training container image

In [11]:
IMAGE_NAME="nvt-cuda11.0-tf2.4"
IMAGE_URI=f"gcr.io/{PROJECT}/{IMAGE_NAME}"
print(IMAGE_URI)

gcr.io/merlin-on-gcp/nvt-cuda11.0-tf2.4


In [None]:
! gcloud builds submit --tag {IMAGE_URI} . --timeout=45m --machine-type=e2-highcpu-8

## 4. Submit Vertex AI Custom Job for ETL

### Prepare worker pool specification

In [None]:
worker_pool_specs =  [
    {
        "machine_spec": {
            "machine_type": "n1-standard-4",
            "accelerator_type": "NVIDIA_TESLA_V100",
            "accelerator_count": 1,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": IMAGE_URI,
            "command": ["python", "-m", "src.data_preprocessing.task"],
            "args": [
                f'--project={PROJECT}', 
                f'--region={REGION}',
                f'--movies-dataset-display=name='{MOVIES_DATASET_DISPLAY_NAME}',
                f'--ratings-dataset-display=name='{RATINGS_DATASET_DISPLAY_NAME}',
                f'--etl-output-dir='{ETL_OUTPUT_DIR}',
            ],
        },
    }
]


### Submit and monitor the job

In [None]:
job_name = "movielens-nvt-etl-{}".format(time.strftime("%Y%m%d_%H%M%S"))

job = vertex_ai.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
)

job.run(
    sync=True, 
    service_account=VERTEX_SERVICE_ACCOUNT,
    tensorboard=TENSORBOARD_RESOURCE_NAME
)

## 5. Submit Vertex AI Custom Job for Model Training

### Prepare worker pool specification

In [None]:
worker_pool_specs =  [
    {
        "machine_spec": {
            "machine_type": "n1-standard-4",
            "accelerator_type": "NVIDIA_TESLA_V100",
            "accelerator_count": 1,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": IMAGE_URI,
            "command": ["python", "-m", "src.model_training.task"],
            "args": [
                f'--project={PROJECT}', 
                f'--region={REGION}',
                f'--movies-dataset-display=name='{MOVIES_DATASET_DISPLAY_NAME}',
                f'--ratings-dataset-display=name='{RATINGS_DATASET_DISPLAY_NAME}',
                f'--etl-output-dir='{ETL_OUTPUT_DIR}',
            ],
        },
    }
]

### Submit and monitor the job

In [None]:
job_name = "movielens-tf-training-{}".format(time.strftime("%Y%m%d_%H%M%S"))

job = vertex_ai.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
)

job.run(
    sync=True, 
    service_account=VERTEX_SERVICE_ACCOUNT,
    tensorboard=TENSORBOARD_RESOURCE_NAME
)

## 6. Upload the model Vertex AI

In [None]:
TRITON_SERVER_IMAGE = f"gcr.io/{PROJECT}/triton-{MODEL_DISPLAY_NAME}:latest"

In [None]:
vertex_ai.Model.upload(
    display_name=MODEL_DISPLAY_NAME,
    artifact_uri=EXPORT_DIR,
    serving_container_image_uri=TRITON_SERVER_IMAGE
)