# Training an NVIDIA HugeCTR model with Vertex AI Training



In [1]:
import json
import os
import time
import nvtabular as nvt
import shutil

from nvtabular.columns.schema import ColumnSchema, Schema
from nvtabular.tags import Tags
from google.cloud import aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt

## Configure notebook settings
### Set project, region, and Vertex AI settings

In [2]:
# Google Cloud project and region handed to aiplatform.init
PROJECT = 'jk-mlops-dev'
REGION = 'us-central1'

# Cloud Storage bucket used as the Vertex AI staging bucket and as the root
# of each job's base output directory
VERTEX_STAGING_BUCKET = 'gs://jk-vertex-merlin'
# Service account the submitted jobs run as
VERTEX_SA = 'vertex-sa@jk-mlops-dev.iam.gserviceaccount.com'
# Local scratch directory; it is deleted and recreated when the notebook runs
LOCAL_STAGING_PATH = '/home/jupyter/staging'

### Initialize Vertex AI SDK

In [3]:
# Set the SDK-wide defaults (project, region, staging bucket) so the job
# objects created later do not have to repeat them
aiplatform.init(
    project=PROJECT,
    location=REGION,
    staging_bucket=VERTEX_STAGING_BUCKET
)

### Prepare a local staging area

In [4]:
# Start from an empty staging directory: remove whatever an earlier run left
# behind, then recreate the folder.
# WARNING: everything under LOCAL_STAGING_PATH is deleted.
if os.path.isdir(LOCAL_STAGING_PATH):
    shutil.rmtree(LOCAL_STAGING_PATH)
os.makedirs(LOCAL_STAGING_PATH)

### Set paths to training and validation datasets

In [5]:
# Cloud Storage root of the preprocessed Criteo dataset
DATA_ROOT = 'gs://jk-criteo-bucket/criteo_processed_parquet'
# Per-split file lists passed to the trainer as --train_data / --valid_data
# (presumably listing the Parquet files of each split; confirm against the trainer)
TRAIN_DATA = f'{DATA_ROOT}/train/_file_list.txt'
VALID_DATA = f'{DATA_ROOT}/valid/_file_list.txt'
# Schema stored next to the training split; read later to get the cardinalities
SCHEMA_PATH = f'{DATA_ROOT}/train/schema.pbtxt'

## Submit a Vertex custom training job

### Prepare a custom training container

In [6]:
# Name and Container Registry URI of the custom training image
IMAGE_NAME = 'hugectr_deepfm'
IMAGE_URI = f'gcr.io/{PROJECT}/{IMAGE_NAME}'
# Despite the name, this is the build-context directory given to the image
# build command, not the path of a Dockerfile
DOCKERFILE = 'src/training/hugectr'

In [7]:
# Alternative to the local `docker build` + `docker push` cells: uncomment to
# build and push the image with Cloud Build instead.
#! gcloud builds submit --tag {IMAGE_URI} {DOCKERFILE}

In [8]:
# Build the training image locally, using DOCKERFILE as the build context
!docker build -t {IMAGE_URI} {DOCKERFILE}

Sending build context to Docker daemon  97.28kB
Step 1/4 : FROM nvcr.io/nvidia/merlin/merlin-training:21.09
 ---> 8f6ef763d770
Step 2/4 : RUN pip3 install cloudml-hypertune
 ---> Using cache
 ---> 7be51676d034
Step 3/4 : WORKDIR /src
 ---> Using cache
 ---> 8e80619f2291
Step 4/4 : COPY trainer ./trainer
 ---> Using cache
 ---> b84e9d24f841
Successfully built b84e9d24f841
Successfully tagged gcr.io/jk-mlops-dev/hugectr_deepfm:latest


In [9]:
# Push the image to the registry so the training job can pull it
!docker push {IMAGE_URI}

Using default tag: latest
The push refers to repository [gcr.io/jk-mlops-dev/hugectr_deepfm]

[1B0b00f7fb: Preparing 
[1B8170b157: Preparing 
[1B96e6939c: Preparing 
[1B1f36f93d: Preparing 
[1Be0fa48ac: Preparing 
[1Bd4cd8fbb: Preparing 
[1B2cf6a026: Preparing 
[1B26a95f43: Preparing 
[1Bba5d6668: Preparing 
[1Ba9641b18: Preparing 
[1B42012c23: Preparing 
[1Bbd3e801f: Preparing 
[1B7039f00c: Preparing 
[1B023e019c: Preparing 
[1Bf9e4afbc: Preparing 
[1Be9115178: Preparing 
[1B220871e3: Preparing 
[1B76810591: Preparing 
[1B7b7db32e: Preparing 
[1B176606fb: Preparing 
[1B44765ddc: Preparing 
[1B295911d7: Preparing 
[1B1a643992: Preparing 
[1B6f8bf009: Preparing 
[1Bd3ad3992: Preparing 
[1Bd0bdb6b2: Preparing 
[1B59647bc1: Preparing 
[1B9b0b029e: Preparing 
[1B3c6245a7: Preparing 
[1B9f6effa5: Preparing 
[1B43806c89: Preparing 
[1B31c5510f: Preparing 
[1B4c5d4460: Preparing 
[1Be3d1aa10: Preparing 
[1Bc1212f82: Preparing 
[1B6fc769b7: Preparing 
[1B58

### Configure a custom training job

#### Retrieve cardinalities for categorical columns

In [10]:
# Local destination of the schema file inside the staging directory
LOCAL_SCHEMA_PATH = f'{LOCAL_STAGING_PATH}/schema.pbtxt'

# Copy the schema from Cloud Storage so it can be parsed locally
!gsutil cp {SCHEMA_PATH} {LOCAL_SCHEMA_PATH}

Copying gs://jk-criteo-bucket/criteo_processed_parquet/train/schema.pbtxt...
/ [1 files][ 20.8 KiB/ 20.8 KiB]                                                
Operation completed over 1 objects/20.8 KiB.                                     


In [11]:
# Parse the schema file that was copied into the local staging directory
schema = Schema.load_protobuf(LOCAL_SCHEMA_PATH)

In [12]:
def retrieve_cardinalities(schema):
    """Collect the embedding cardinality of every categorical column.

    Args:
        schema: Object exposing ``column_schemas``, a mapping from column
            name to a column schema that has ``tags`` and ``properties``.

    Returns:
        dict: Column name -> ``properties['embedding_sizes']['cardinality']``
        for each column tagged ``Tags.CATEGORICAL``, in the iteration order
        of ``schema.column_schemas``.
    """
    cardinalities = {}
    for column_name, column_schema in schema.column_schemas.items():
        # Only categorical columns have an embedding table to size
        if Tags.CATEGORICAL not in column_schema.tags:
            continue
        embedding_sizes = column_schema.properties['embedding_sizes']
        cardinalities[column_name] = embedding_sizes['cardinality']

    return cardinalities
    
    
# Build the column -> cardinality mapping and display it as the cell output
cardinalities = retrieve_cardinalities(schema)
cardinalities

{'C1': 18792578.0,
 'C2': 35176.0,
 'C3': 17091.0,
 'C4': 7383.0,
 'C5': 20154.0,
 'C6': 4.0,
 'C7': 7075.0,
 'C8': 1403.0,
 'C9': 63.0,
 'C10': 12687136.0,
 'C11': 1054830.0,
 'C12': 297377.0,
 'C13': 11.0,
 'C14': 2209.0,
 'C15': 10933.0,
 'C16': 113.0,
 'C17': 4.0,
 'C18': 972.0,
 'C19': 15.0,
 'C20': 19550853.0,
 'C21': 5602712.0,
 'C22': 16779972.0,
 'C23': 375290.0,
 'C24': 12292.0,
 'C25': 101.0,
 'C26': 35.0}

#### Set HugeCTR model and trainer configuration

In [13]:
# Module the container starts with `python -m`
TRAINING_MODULE = 'trainer.task'

# Training schedule; each value is forwarded to the trainer as a CLI flag
NUM_EPOCHS = 0                # --num_epochs
MAX_ITERATIONS = 50000        # --max_iter
EVAL_INTERVAL = 1000          # --eval_interval
EVAL_BATCHES = 500            # --max_eval_batches
EVAL_BATCHES_FINAL = 2500     # --eval_batches
DISPLAY_INTERVAL = 200        # --display_interval
SNAPSHOT_INTERVAL = 0         # --snapshot

# Resource and optimizer settings
WORKSPACE_SIZE_PER_GPU = 61   # --workspace_size_per_gpu
PER_GPU_BATCHSIZE = 2048      # multiplied by the GPU count to get --batchsize
LR = 0.001                    # --lr
DROPOUT_RATE = 0.5            # --dropout_rate
NUM_WORKERS = 12              # --num_workers

# Compact JSON list (no spaces) holding one integer cardinality per
# categorical column, in the order of the `cardinalities` dict
SLOT_SIZE_ARRAY = json.dumps(
    list(map(int, cardinalities.values())),
    separators=(',', ':'),
)

#### Set training node configuration

In [14]:
# Hardware of the single training node
MACHINE_TYPE = 'a2-highgpu-4g'
ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
# Also drives the global batch size and the GPU index list sent to the trainer.
# NOTE(review): presumably has to match the GPU count implied by MACHINE_TYPE; confirm.
ACCELERATOR_NUM = 4

#### Configure worker pool specifications

In [15]:
# Global batch size: per-GPU batch size times the number of GPUs on the node
batchsize = PER_GPU_BATCHSIZE * ACCELERATOR_NUM
# Compact JSON such as [[0,1,2,3]]: one inner list holding every GPU index
gpus = json.dumps([list(range(ACCELERATOR_NUM))], separators=(',', ':'))

# A single worker pool with one replica that runs the training module inside
# the custom container.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": MACHINE_TYPE,
            "accelerator_type": ACCELERATOR_TYPE,
            "accelerator_count": ACCELERATOR_NUM,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": IMAGE_URI,
            "command": ["python", "-m", TRAINING_MODULE],
            "args": [
                f'--batchsize={batchsize}',
                # gs:// URIs are rewritten to /gcs/ paths (presumably the
                # Cloud Storage FUSE mount inside the training container)
                f'--train_data={TRAIN_DATA.replace("gs://", "/gcs/")}',
                f'--valid_data={VALID_DATA.replace("gs://", "/gcs/")}',
                f'--slot_size_array={SLOT_SIZE_ARRAY}',
                f'--max_iter={MAX_ITERATIONS}',
                f'--max_eval_batches={EVAL_BATCHES}',
                f'--eval_batches={EVAL_BATCHES_FINAL}',
                f'--dropout_rate={DROPOUT_RATE}',
                f'--lr={LR}',
                f'--num_workers={NUM_WORKERS}',
                f'--num_epochs={NUM_EPOCHS}',
                f'--eval_interval={EVAL_INTERVAL}',
                f'--snapshot={SNAPSHOT_INTERVAL}',
                f'--display_interval={DISPLAY_INTERVAL}',
                f'--workspace_size_per_gpu={WORKSPACE_SIZE_PER_GPU}',
                f'--gpus={gpus}',
            ],
        },
    }
]

### Submit and monitor the job

In [17]:
# A timestamp (to the second) in the name gives each submission its own
# display name and output directory
job_name = f'HUGECTR_{time.strftime("%Y%m%d_%H%M%S")}'
base_output_dir = '/'.join([VERTEX_STAGING_BUCKET, 'job_dir', job_name])

job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
    base_output_dir=base_output_dir,
)

# sync=True keeps the cell busy until the job reaches a final state
job.run(
    service_account=VERTEX_SA,
    restart_job_on_worker_restart=False,
    sync=True,
)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/219403147276189696
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/219403147276189696')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/219403147276189696?project=895222332033
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/219403147276189696 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/219403147276189696 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/custo

## Submit and monitor a Vertex hyperparameter tuning job

### Configure a hyperparameter job

#### Set HugeCTR model and trainer configuration

In [18]:
# Module the container starts with `python -m`
TRAINING_MODULE = 'trainer.task'

# Training schedule for each tuning trial; every value is forwarded to the
# trainer as a CLI flag. LR and DROPOUT_RATE are not set here because the
# tuning job's parameter_spec defines them.
NUM_EPOCHS = 0                # --num_epochs
MAX_ITERATIONS = 10000        # --max_iter
EVAL_INTERVAL = 1000          # --eval_interval
EVAL_BATCHES = 500            # --max_eval_batches
EVAL_BATCHES_FINAL = 2500     # --eval_batches
DISPLAY_INTERVAL = 200        # --display_interval
SNAPSHOT_INTERVAL = 0         # --snapshot

# Resource settings
WORKSPACE_SIZE_PER_GPU = 61   # --workspace_size_per_gpu
PER_GPU_BATCHSIZE = 2048      # multiplied by the GPU count to get --batchsize
NUM_WORKERS = 12              # --num_workers

# Compact JSON list (no spaces) holding one integer cardinality per
# categorical column, in the order of the `cardinalities` dict
SLOT_SIZE_ARRAY = json.dumps(
    list(map(int, cardinalities.values())),
    separators=(',', ':'),
)

#### Set training node configuration

In [19]:
# Hardware of the node that runs each tuning trial
MACHINE_TYPE = 'a2-highgpu-4g'
ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
# Also drives the global batch size and the GPU index list sent to the trainer.
# NOTE(review): presumably has to match the GPU count implied by MACHINE_TYPE; confirm.
ACCELERATOR_NUM = 4

#### Configure worker pool specification

In [20]:
# Global batch size: per-GPU batch size times the number of GPUs on the node
batchsize = PER_GPU_BATCHSIZE * ACCELERATOR_NUM
# Compact JSON such as [[0,1,2,3]]: one inner list holding every GPU index
gpus = json.dumps([list(range(ACCELERATOR_NUM))], separators=(',', ':'))

# Worker pool used for every trial. --lr and --dropout_rate are deliberately
# absent from the args: they are the hyperparameters listed in parameter_spec.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": MACHINE_TYPE,
            "accelerator_type": ACCELERATOR_TYPE,
            "accelerator_count": ACCELERATOR_NUM,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": IMAGE_URI,
            "command": ["python", "-m", TRAINING_MODULE],
            "args": [
                f'--batchsize={batchsize}',
                # gs:// URIs are rewritten to /gcs/ paths (presumably the
                # Cloud Storage FUSE mount inside the training container)
                f'--train_data={TRAIN_DATA.replace("gs://", "/gcs/")}',
                f'--valid_data={VALID_DATA.replace("gs://", "/gcs/")}',
                f'--slot_size_array={SLOT_SIZE_ARRAY}',
                f'--max_iter={MAX_ITERATIONS}',
                f'--max_eval_batches={EVAL_BATCHES}',
                f'--eval_batches={EVAL_BATCHES_FINAL}',
                f'--num_workers={NUM_WORKERS}',
                f'--num_epochs={NUM_EPOCHS}',
                f'--eval_interval={EVAL_INTERVAL}',
                f'--snapshot={SNAPSHOT_INTERVAL}',
                f'--display_interval={DISPLAY_INTERVAL}',
                f'--workspace_size_per_gpu={WORKSPACE_SIZE_PER_GPU}',
                f'--gpus={gpus}',
            ],
        },
    }
]

#### Configure hyperparameter and metric specs

In [21]:
# Tuning objective: metric id -> optimization goal.
# NOTE(review): the trainer has to report a metric under the id 'AUC' (the
# image installs cloudml-hypertune for this, presumably); confirm in trainer.task.
metric_spec = {'AUC': 'maximize'}

# Search space: learning rate sampled on a log scale between 0.001 and 0.01,
# dropout rate chosen from three discrete values
parameter_spec = {
    'lr': hpt.DoubleParameterSpec(min=0.001, max=0.01, scale='log'),
    'dropout_rate': hpt.DiscreteParameterSpec(values=[0.4, 0.5, 0.6], scale=None),
}

### Submit and monitor the job

In [23]:
# A timestamp (to the second) in the name gives each submission its own
# display name and output directory
job_name = f'HUGECTR_HTUNING_{time.strftime("%Y%m%d_%H%M%S")}'
base_output_dir = '/'.join([VERTEX_STAGING_BUCKET, 'job_dir', job_name])

# Template job: every trial runs this worker pool specification
custom_job = aiplatform.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
    base_output_dir=base_output_dir,
)

# Four trials in total, two at a time; search_algorithm=None leaves the
# choice of search algorithm to the SDK/service default
hp_job = aiplatform.HyperparameterTuningJob(
    display_name=job_name,
    custom_job=custom_job,
    metric_spec=metric_spec,
    parameter_spec=parameter_spec,
    max_trial_count=4,
    parallel_trial_count=2,
    search_algorithm=None,
)

# sync=True keeps the cell busy until the tuning job reaches a final state
hp_job.run(
    service_account=VERTEX_SA,
    restart_job_on_worker_restart=False,
    sync=True,
)

INFO:google.cloud.aiplatform.jobs:Creating HyperparameterTuningJob
INFO:google.cloud.aiplatform.jobs:HyperparameterTuningJob created. Resource name: projects/895222332033/locations/us-central1/hyperparameterTuningJobs/2160454586672873472
INFO:google.cloud.aiplatform.jobs:To use this HyperparameterTuningJob in another session:
INFO:google.cloud.aiplatform.jobs:hpt_job = aiplatform.HyperparameterTuningJob.get('projects/895222332033/locations/us-central1/hyperparameterTuningJobs/2160454586672873472')
INFO:google.cloud.aiplatform.jobs:View HyperparameterTuningJob:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/2160454586672873472?project=895222332033
INFO:google.cloud.aiplatform.jobs:HyperparameterTuningJob projects/895222332033/locations/us-central1/hyperparameterTuningJobs/2160454586672873472 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:HyperparameterTuningJob projects/895222332033/locations/us-central1/hyperparameterTuningJobs/

### Retrieve trial results

In [68]:
# Display every trial with its state, parameter values and final measurement
hp_job.trials

[id: "1"
 state: SUCCEEDED
 parameters {
   parameter_id: "dropout_rate"
   value {
     number_value: 0.5
   }
 }
 parameters {
   parameter_id: "lr"
   value {
     number_value: 0.0031622776601683794
   }
 }
 final_measurement {
   step_count: 10000
   metrics {
     metric_id: "AUC"
     value: 0.6418014764785767
   }
 }
 start_time {
   seconds: 1634230319
   nanos: 373439804
 }
 end_time {
   seconds: 1634231353
 },
 id: "2"
 state: SUCCEEDED
 parameters {
   parameter_id: "dropout_rate"
   value {
     number_value: 0.4
   }
 }
 parameters {
   parameter_id: "lr"
   value {
     number_value: 0.0019070834044189785
   }
 }
 final_measurement {
   step_count: 10000
   metrics {
     metric_id: "AUC"
     value: 0.6543273329734802
   }
 }
 start_time {
   seconds: 1634230319
   nanos: 373603250
 }
 end_time {
   seconds: 1634231349
 },
 id: "3"
 state: SUCCEEDED
 parameters {
   parameter_id: "dropout_rate"
   value {
     number_value: 0.5
   }
 }
 parameters {
   parameter_id: "l

#### Find the best trial

In [69]:
def retrieve_auc(trial, metric_id='AUC'):
    """Return the final value of a tuning metric for one trial.

    The metric is looked up by its id instead of by position, so the result
    does not depend on how many metrics a trial reports or in which order.

    Args:
        trial: Trial object exposing ``final_measurement.metrics``, an
            iterable of entries with ``metric_id`` and ``value``.
        metric_id: Id of the metric to read. Defaults to ``'AUC'``, the
            metric this notebook optimizes.

    Returns:
        float: The metric value, or ``float('-inf')`` when the trial has no
        such metric (for example a trial that did not finish), so that such
        trials rank last when maximizing instead of raising ``IndexError``.
    """
    for metric in trial.final_measurement.metrics:
        if metric.metric_id == metric_id:
            return metric.value

    # No measurement for this metric: rank the trial below every real score
    return float('-inf')
 
# Pick the trial with the highest AUC; max() returns the same trial as
# sorting in descending order and taking the first element, without the sort
best_trial = max(hp_job.trials, key=retrieve_auc)

# Index the tuned values by parameter id so the report does not depend on
# the order in which the service lists the parameters
best_parameters = {
    parameter.parameter_id: parameter.value
    for parameter in best_trial.parameters
}

print("Best trial ID:", best_trial.id)
print("   AUC:", retrieve_auc(best_trial))
print("   LR:", best_parameters['lr'])
print("   Dropout rate:", best_parameters['dropout_rate'])

Best trial ID: 4
   AUC: 0.6567307710647583
   LR: 0.00347031635066004
   Dropout rate: 0.4
