# Training an NVIDIA HugeCTR model with Vertex AI Training



In [30]:
import json
import time

from google.cloud import aiplatform

## Configure environment settings


In [37]:
# Google Cloud project and region passed to the Vertex AI SDK initializer.
PROJECT = "jk-mlops-dev"
REGION = "us-central1"
# Cloud Storage location given to the SDK as its staging bucket.
STAGING_BUCKET = "gs://jk-vertex-merlin"
# Service account supplied to the custom training job when it is run.
VERTEX_SA = "vertex-sa@jk-mlops-dev.iam.gserviceaccount.com"

## Prepare a custom training container

In [15]:
# Name and full gcr.io URI of the custom training container image.
IMAGE_NAME = "hugectr_deepfm"
IMAGE_URI = "gcr.io/{}/{}".format(PROJECT, IMAGE_NAME)
# Despite the name, this is the source directory submitted to Cloud Build,
# not the path of a Dockerfile.
DOCKERFILE = "src/training/hugectr"

In [9]:
# Build the training image remotely with Cloud Build: the DOCKERFILE directory
# is uploaded as the build source and the result is tagged as IMAGE_URI.
! gcloud builds submit --tag {IMAGE_URI} {DOCKERFILE}

Creating temporary tarball archive of 13 file(s) totalling 61.9 KiB before compression.
Uploading tarball of [src/training/hugectr] to [gs://jk-mlops-dev_cloudbuild/source/1633542474.644166-246cadc794014925a531b26e22581c4e.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/jk-mlops-dev/locations/global/builds/0f824ff0-98ad-43a1-94e3-2750cd7df757].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/0f824ff0-98ad-43a1-94e3-2750cd7df757?project=895222332033].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "0f824ff0-98ad-43a1-94e3-2750cd7df757"

FETCHSOURCE
Fetching storage object: gs://jk-mlops-dev_cloudbuild/source/1633542474.644166-246cadc794014925a531b26e22581c4e.tgz#1633542477005923
Copying gs://jk-mlops-dev_cloudbuild/source/1633542474.644166-246cadc794014925a531b26e22581c4e.tgz#1633542477005923...
/ [1 files][ 13.5 KiB/ 13.5 KiB]                                                
Operation completed over 

## Configure a training job

### Initialize Vertex AI SDK

In [33]:
# Register the project, region and staging bucket with the Vertex AI SDK.
aiplatform.init(project=PROJECT, location=REGION, staging_bucket=STAGING_BUCKET)

### Training and validation datasets

In [16]:
# File lists for the training and validation splits, forwarded to the trainer
# as --train_data / --valid_data.
# NOTE(review): the /gcs/<bucket>/ prefix presumably relies on Cloud Storage
# being mounted inside the training container -- confirm.
TRAIN_DATA = "/gcs/jk-criteo-bucket/criteo_processed_parquet/train/_file_list.txt"
VALID_DATA = "/gcs/jk-criteo-bucket/criteo_processed_parquet/valid/_file_list.txt"

### Training node configuration

In [22]:
# Machine shape of the training replica: machine type, accelerator type, and
# the number of accelerators attached to it.
MACHINE_TYPE = "a2-highgpu-4g"
ACCELERATOR_TYPE = "NVIDIA_TESLA_A100"
ACCELERATOR_NUM = 4

### Training module

In [23]:
# Python module the container launches with `python -m`.
TRAINING_MODULE = "trainer.train"

### HugeCTR solver configuration

In [25]:
# Values forwarded to the trainer as command-line flags.
# NOTE(review): NUM_EPOCHS = 0 presumably selects iteration-bounded training
# driven by MAX_ITERATIONS -- confirm against the trainer.
NUM_EPOCHS = 0
MAX_ITERATIONS = 5000
EVAL_INTERVAL = 500
DISPLAY_INTERVAL = 100
# Passed as --snapshot.
SNAPSHOT_INTERVAL = 0
# NOTE(review): unit is not shown in this notebook -- confirm.
WORKSPACE_SIZE_PER_GPU = 61
# Per-GPU batch size; multiplied by the accelerator count before being passed on.
PER_GPU_BATCHSIZE = 2048
# NOTE(review): LR is not referenced by any other cell visible in this notebook.
LR = 0.001
# Kept as a string because it is appended verbatim to --slot_size_array=.
SLOT_SIZE_ARRAY = "[18792578,35176,17091,7383,20154,4,7075,1403,63,12687136,1054830,297377,11,2209,10933,113,4,972,15,19550853,5602712,16779972,375290,12292,101,35]"

### Configure worker pool specifications

In [34]:
# Global batch size: the per-GPU batch multiplied by the number of GPUs.
batchsize = PER_GPU_BATCHSIZE * ACCELERATOR_NUM
# Whitespace-free JSON for one list of GPU ids, e.g. '[[0,1,2,3]]' for 4 GPUs.
gpus = json.dumps([list(range(ACCELERATOR_NUM))], separators=(',', ':'))

# A single worker pool with one replica; the container runs the training
# module and receives all settings as --flag=value arguments.
# NOTE(review): LR is defined in the solver configuration but is not forwarded
# here -- confirm whether the trainer is meant to receive it.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": MACHINE_TYPE,
            "accelerator_type": ACCELERATOR_TYPE,
            "accelerator_count": ACCELERATOR_NUM,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": IMAGE_URI,
            "command": ["python", "-m", TRAINING_MODULE],
            "args": [
                f"--batchsize={batchsize}",
                f"--train_data={TRAIN_DATA}",
                f"--valid_data={VALID_DATA}",
                f"--slot_size_array={SLOT_SIZE_ARRAY}",
                f"--max_iter={MAX_ITERATIONS}",
                f"--num_epochs={NUM_EPOCHS}",
                f"--eval_interval={EVAL_INTERVAL}",
                f"--snapshot={SNAPSHOT_INTERVAL}",
                f"--display_interval={DISPLAY_INTERVAL}",
                f"--workspace_size_per_gpu={WORKSPACE_SIZE_PER_GPU}",
                f"--gpus={gpus}",
            ],
        },
    }
]

### Submit and monitor a training job

In [38]:
# Display name is "HUGECTR_" followed by the local submission timestamp.
job_name = time.strftime("HUGECTR_%Y%m%d_%H%M%S")

# Describe the custom job from the worker pool specification.
job = aiplatform.CustomJob(display_name=job_name, worker_pool_specs=worker_pool_specs)

# Run the job as VERTEX_SA, with restart-on-worker-restart disabled.
# NOTE(review): sync=True is expected to block this cell until the job ends --
# confirm against the SDK documentation.
job.run(service_account=VERTEX_SA, restart_job_on_worker_restart=False, sync=True)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/7049921222741590016
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/7049921222741590016')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/7049921222741590016?project=895222332033
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/7049921222741590016 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/7049921222741590016 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/