# 02 - Experimentation

This notebook covers the following steps:

1. Prepare the data using `NVTabular` locally.
2. Train and export a `TensorFlow` model locally.
3. Build the Docker container image.
4. Submit a `Vertex AI` custom job to prepare the data.
5. Submit a `Vertex AI` custom job to train and export the model.

## Setup

In [None]:
import os
import time
import logging
from datetime import datetime

import cudf
import nvtabular as nvt

import tensorflow as tf
import tensorflow.keras as keras

from src.common import features, utils
from src.data_preprocessing import etl

from google.cloud import aiplatform as vertex_ai

# Emit INFO-level records from the root Python logger and from TensorFlow's logger.
logging.root.setLevel(logging.INFO)
tf.get_logger().setLevel('INFO')

# Record the TensorFlow version this notebook was run with.
print("TensorFlow: " + str(tf.__version__))

In [None]:
# --- Google Cloud environment ------------------------------------------------
PROJECT = 'merlin-on-gcp'
REGION = 'us-central1'
BUCKET = 'merlin-on-gcp'  # bare bucket name; the gs:// scheme is added in WORKSPACE
VERTEX_SERVICE_ACCOUNT = f'vertex-sa-mlops@{PROJECT}.iam.gserviceaccount.com'

# --- Vertex AI resource display names ------------------------------------------
MOVIES_DATASET_DISPLAY_NAME = 'movielens25m-movies'
RATINGS_DATASET_DISPLAY_NAME = 'movielens25m-ratings'
MODEL_DISPLAY_NAME = 'movielens25m-recommender'
TENSORBOARD_DISPLAY_NAME = f'tb-{PROJECT}'
EXPERIMENT_NAME = f'{MODEL_DISPLAY_NAME}-experiment'

# --- Cloud Storage locations ---------------------------------------------------
WORKSPACE = f"gs://{BUCKET}/movielens25m"
EXPERIMENT_ARTIFACTS_DIR = os.path.join(WORKSPACE, 'experiments')

In [None]:
# Configure the Vertex AI SDK defaults (project, region, staging location) and
# select the experiment that subsequent runs are recorded under.
vertex_ai.init(
    project=PROJECT,
    location=REGION,
    # `staging_bucket` is documented as a `gs://...` URI, whereas BUCKET holds a
    # bare bucket name; add the scheme unless it is already present.
    staging_bucket=BUCKET if BUCKET.startswith("gs://") else f"gs://{BUCKET}",
    experiment=EXPERIMENT_NAME,
)

## Create Vertex TensorBoard Instance

In [None]:
# Create a Vertex AI TensorBoard instance named TENSORBOARD_DISPLAY_NAME in the
# configured project and region.
# NOTE(review): presumably every execution creates another instance — skip this
# cell when one already exists (confirm against gcloud behaviour).
!gcloud beta ai tensorboards create --display-name={TENSORBOARD_DISPLAY_NAME} \
  --project={PROJECT} --region={REGION}

In [None]:
# Full resource name of the Vertex TensorBoard instance passed to the custom jobs.
# NOTE(review): hardcoded project number and instance ID — presumably copied from
# the output of the `tensorboards create` command; replace with your own value.
TENSORBOARD_RESOURCE_NAME = "projects/659831510405/locations/us-central1/tensorboards/4450717516120981504"

## Initialize Vertex AI Experiment

In [None]:
# Flip to True to wipe artifacts left behind by earlier experiments.
REMOVE_EXPERIMENT_ARTIFACTS = False

artifacts_dir_present = tf.io.gfile.exists(EXPERIMENT_ARTIFACTS_DIR)
if artifacts_dir_present and REMOVE_EXPERIMENT_ARTIFACTS:
    print("Removing previous experiment artifacts...")
    tf.io.gfile.rmtree(EXPERIMENT_ARTIFACTS_DIR)

# Re-check after the optional removal and (re)create the directory if needed.
artifacts_dir_present = tf.io.gfile.exists(EXPERIMENT_ARTIFACTS_DIR)
if not artifacts_dir_present:
    print("Creating new experiment artifacts directory...")
    tf.io.gfile.mkdir(EXPERIMENT_ARTIFACTS_DIR)

print("Workspace is ready.")

# Start a new experiment run identified by a second-resolution local timestamp.
run_id = datetime.now().strftime('run-local-%Y%m%d%H%M%S')
vertex_ai.start_run(run_id)

# All artifacts of this run are grouped under <artifacts>/<experiment>/<run>.
EXPERIMENT_RUN_DIR = os.path.join(
    EXPERIMENT_ARTIFACTS_DIR,
    EXPERIMENT_NAME,
    run_id,
)
print(f"Experiment run directory: {EXPERIMENT_RUN_DIR}")

## 1. Prepare the data using NVTabular

In [None]:
# Location, under the current run directory, handed to the ETL step as its output directory.
ETL_OUTPUT_DIR = os.path.join(
    EXPERIMENT_RUN_DIR,
    "etl_output",
)

In [None]:
# Run the ETL in this kernel: project/region, the two dataset display names,
# and the destination directory are passed positionally.
etl.run_etl(
    PROJECT, REGION,
    MOVIES_DATASET_DISPLAY_NAME, RATINGS_DATASET_DISPLAY_NAME,
    ETL_OUTPUT_DIR,
)

In [None]:
# List the contents of the ETL output location to check what was written.
!gsutil ls {ETL_OUTPUT_DIR}

## 2. Train a TensorFlow model

In [None]:
# Per-run locations for training logs and the model, both under the run directory.
LOG_DIR, EXPORT_DIR = (
    os.path.join(EXPERIMENT_RUN_DIR, subdir) for subdir in ('logs', 'model')
)

In [None]:
# Locations of the ETL outputs consumed by training: file patterns for the
# transformed train/test parquet files and the transform workflow directory.
experiment_inputs = {
    'train_data_files': os.path.join(ETL_OUTPUT_DIR, 'transformed_data', 'train', '*.parquet'),
    'test_data_files': os.path.join(ETL_OUTPUT_DIR, 'transformed_data', 'test', '*.parquet'),
    'transform_workflow_dir': os.path.join(ETL_OUTPUT_DIR, 'transform_workflow'),
}

# Hyperparameters handed to the trainer.
hyperparams = {
    'learning_rate': 0.001,
    'batch_size': 1024 * 32,
    'hidden_units': [128, 128],
    'num_epochs': 1
}

# `log_params` is documented to accept only int/float/str values, so any other
# value (here the `hidden_units` list) is recorded as its string form. The
# `hyperparams` dict itself is left untouched for the trainer.
vertex_ai.log_params({
    name: value if isinstance(value, (int, float, str)) else str(value)
    for name, value in {**experiment_inputs, **hyperparams}.items()
})

In [None]:
# `imp` is deprecated and was removed in Python 3.12; `importlib.reload` is the
# supported replacement.
import importlib

from src.model_training import trainer, model
from src.common import utils, features

# Pick up local edits to the project modules without restarting the kernel.
# `trainer` is reloaded last so that it re-binds against the freshly reloaded
# modules (assumes trainer is the top-level consumer of the others — TODO confirm).
for module in (features, utils, model, trainer):
    importlib.reload(module)


In [None]:
# Drop any stale local copies from a previous execution...
for stale_dir in ('data', 'transform_workflow'):
    if tf.io.gfile.exists(stale_dir):
        tf.io.gfile.rmtree(stale_dir)

# ...then recreate the empty local layout for the train/test files.
for fresh_dir in ('data', 'data/train', 'data/test'):
    tf.io.gfile.mkdir(fresh_dir)

In [None]:
# Copy the train and test files matched by the experiment input patterns
# into their local folders.
for split in ('train', 'test'):
    utils.copy_files(experiment_inputs[f'{split}_data_files'], f'data/{split}')

# Download the transform workflow directory into the current working directory.
utils.download_directory(experiment_inputs['transform_workflow_dir'], '.')

In [None]:
# Train on the locally copied training files using the downloaded workflow;
# logs are written to LOG_DIR.
recommendation_model = trainer.train(
    hyperparams=hyperparams,
    train_data_file_pattern='data/train/*.parquet',
    nvt_workflow_dir='transform_workflow',
    log_dir=LOG_DIR,
)

In [None]:
# Evaluate the trained model on the locally copied test files.
evaluation_metric = trainer.evaluate(
    recommendation_model,
    hyperparams=hyperparams,
    eval_data_file_pattern='data/test/*.parquet',
)

# Bare expression so the notebook displays the result.
evaluation_metric

## 3. Build training container image

In [None]:
# Name and full gcr.io URI of the container image used by the custom jobs.
IMAGE_NAME = "nvt-cuda11.0-tf2.4"
IMAGE_URI = "/".join(("gcr.io", PROJECT, IMAGE_NAME))
print(IMAGE_URI)

In [None]:
# Build the container image from the current directory with Cloud Build and tag
# it as IMAGE_URI (30-minute timeout, e2-highcpu-8 build machine).
! gcloud builds submit --tag {IMAGE_URI} . --timeout=30m --machine-type=e2-highcpu-8

## 4. Submit Vertex AI Custom Job for ETL

### Prepare worker pool specification

In [None]:
# Worker pool for the ETL custom job: one replica on an n1-standard-4 machine
# with a single V100 GPU, running the data preprocessing task from IMAGE_URI.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "n1-standard-4",
            "accelerator_type": "NVIDIA_TESLA_V100",
            "accelerator_count": 1,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": IMAGE_URI,
            "command": ["python", "src/data_preprocessing/task.py"],
            "args": [
                f'--project={PROJECT}',
                f'--region={REGION}',
                # The original f-strings were split by a stray quote (a syntax
                # error) and spelled the flags `display=name`.
                # NOTE(review): flag names below assume task.py defines
                # `--movies-dataset-display-name` / `--ratings-dataset-display-name`
                # — confirm against its argument parser.
                f'--movies-dataset-display-name={MOVIES_DATASET_DISPLAY_NAME}',
                f'--ratings-dataset-display-name={RATINGS_DATASET_DISPLAY_NAME}',
                f'--etl-output-dir={ETL_OUTPUT_DIR}',
            ],
        },
    }
]


### Submit and monitor the job

In [None]:
# Timestamped display name so repeated submissions are distinguishable.
job_name = time.strftime("movielens-nvt-etl-%Y%m%d_%H%M%S")

job = vertex_ai.CustomJob(display_name=job_name, worker_pool_specs=worker_pool_specs)

# Run synchronously under the Vertex service account, attached to the
# TensorBoard instance.
job.run(
    service_account=VERTEX_SERVICE_ACCOUNT,
    tensorboard=TENSORBOARD_RESOURCE_NAME,
    sync=True,
)

## 5. Submit Vertex AI Custom Job for Model Training

### Prepare worker pool specification

In [None]:
# Worker pool for the model training custom job: one replica on an
# n1-standard-4 machine with a single V100 GPU, using the image at IMAGE_URI.
#
# TODO(review): the command and args below are identical to the ETL worker pool
# in section 4 — this looks like a copy-paste. A training job presumably needs
# the training entry point (the notebook imports `src.model_training`) and its
# own arguments; confirm and update before submitting.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "n1-standard-4",
            "accelerator_type": "NVIDIA_TESLA_V100",
            "accelerator_count": 1,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": IMAGE_URI,
            "command": ["python", "src/data_preprocessing/task.py"],
            "args": [
                f'--project={PROJECT}',
                f'--region={REGION}',
                # The original f-strings were split by a stray quote (a syntax
                # error) and spelled the flags `display=name`.
                # NOTE(review): flag names assume `--...-dataset-display-name`
                # — confirm against the task's argument parser.
                f'--movies-dataset-display-name={MOVIES_DATASET_DISPLAY_NAME}',
                f'--ratings-dataset-display-name={RATINGS_DATASET_DISPLAY_NAME}',
                f'--etl-output-dir={ETL_OUTPUT_DIR}',
            ],
        },
    }
]

### Submit and monitor the job

In [None]:
# Timestamped display name so repeated submissions are distinguishable.
job_name = time.strftime("movielens-tf-training-%Y%m%d_%H%M%S")

job = vertex_ai.CustomJob(display_name=job_name, worker_pool_specs=worker_pool_specs)

# Run synchronously under the Vertex service account, attached to the
# TensorBoard instance.
job.run(
    service_account=VERTEX_SERVICE_ACCOUNT,
    tensorboard=TENSORBOARD_RESOURCE_NAME,
    sync=True,
)