# Using Vertex AI for online serving with NVIDIA Triton

- Demonstrates serving an ensemble model (NVTabular preprocessing followed by a HugeCTR recommender) on a Vertex AI endpoint, using a custom NVIDIA Triton container



In [1]:
import base64
import json
import os
import random
import sys

import google.auth
from google.cloud import aiplatform
from google.cloud.aiplatform import gapic as aip
from google.cloud.aiplatform import hyperparameter_tuning as hpt
from google.protobuf.json_format import MessageToDict

### Configure notebook settings

In [2]:
# Google Cloud project, region, and Cloud Storage bucket used by the cells below.
PROJECT_ID = "jk-mlops-dev"
REGION = "us-central1"
BUCKET_NAME = "gs://jk-merlin-dev"

### Initialize Vertex AI SDK

In [3]:
# Configure the SDK with the project, region, and staging bucket defined above.
aiplatform.init(
    staging_bucket=BUCKET_NAME,
    location=REGION,
    project=PROJECT_ID,
)

### Build a custom prediction container

In [9]:
# Image name, the Dockerfile that builds it, and the registry URI it is pushed to.
IMAGE_NAME = "triton-hugectr"
DOCKERFILE = "src/Dockerfile.triton"
IMAGE_URI = "gcr.io/{}/{}".format(PROJECT_ID, IMAGE_NAME)

In [10]:
# Build the serving image using `src` as the build context, then push it to the registry.
!docker build --file {DOCKERFILE} --tag {IMAGE_URI} src
!docker push {IMAGE_URI}

Sending build context to Docker daemon    448kB
Step 1/9 : FROM gcr.io/merlin-on-gcp/dongm-merlin-inference-hugectr:v0.6.1
 ---> fb6f7db2d7fd
Step 2/9 : EXPOSE 8000
 ---> Using cache
 ---> 748ffab38b92
Step 3/9 : EXPOSE 8001
 ---> Using cache
 ---> b2636665a789
Step 4/9 : EXPOSE 8002
 ---> Using cache
 ---> ab2a74e7be2d
Step 5/9 : RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg  add - && apt-get update -y && apt-get install google-cloud-sdk -y
 ---> Running in 7f8ad347f4d2
deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main
[91m  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2537  1

### Register the model resource

In [13]:
# Display metadata for the model resource.
VERSION = 10
model_display_name = f"{IMAGE_NAME}-deepfm-v{VERSION}"
model_description = "Serving with Triton inference server using a custom container"

# HTTP routes and port handed to Vertex AI as the serving container's
# health-check and prediction endpoints.
health_route = "/v2/health/ready"
# Plain string literal: there are no placeholders, so no f-prefix is needed.
predict_route = "/v2/models/deepfm_ens/infer"
serving_container_ports = [8000]

# Cloud Storage location passed as the model's artifact URI.
model_ensemble_location = "gs://jk-criteo-bucket/models"

In [14]:
# Register the custom-container model using the settings defined earlier.
model = aiplatform.Model.upload(
    artifact_uri=model_ensemble_location,
    serving_container_image_uri=IMAGE_URI,
    serving_container_ports=serving_container_ports,
    serving_container_health_route=health_route,
    serving_container_predict_route=predict_route,
    display_name=model_display_name,
    description=model_description,
)

model.wait()

# Show the display name and the full resource name, one per line.
print(model.display_name, model.resource_name, sep="\n")

INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/895222332033/locations/us-central1/models/1570071818138550272/operations/70631115139842048
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/895222332033/locations/us-central1/models/1570071818138550272
INFO:google.cloud.aiplatform.models:To use this Model in another session:
INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/895222332033/locations/us-central1/models/1570071818138550272')
triton-hugectr-deepfm-v10
projects/895222332033/locations/us-central1/models/1570071818138550272


### Create an endpoint


In [15]:
# Create the endpoint; its display name carries the same version number as the model.
endpoint_display_name = "{}-endpoint-{}".format(IMAGE_NAME, VERSION)
endpoint = aiplatform.Endpoint.create(display_name=endpoint_display_name)

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/895222332033/locations/us-central1/endpoints/3705908339465519104/operations/4260104668501245952
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/895222332033/locations/us-central1/endpoints/3705908339465519104
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/895222332033/locations/us-central1/endpoints/3705908339465519104')


### Deploy a model

In [16]:
# Machine and accelerator for the serving node.
machine_type = "n1-standard-16"
accelerator_type = "NVIDIA_TESLA_T4"
accelerator_count = 1

# Same minimum and maximum replica count.
min_replica_count = 1
max_replica_count = 1

# Traffic share for this deployment; the deployed model reuses the model's display name.
traffic_percentage = 100
deployed_model_display_name = model_display_name

# Forwarded as the `sync` argument of the deploy call.
sync = True

In [None]:
# Deploy the registered model to the endpoint with the settings from the previous cell.
model.deploy(
    endpoint=endpoint,
    deployed_model_display_name=deployed_model_display_name,
    traffic_percentage=traffic_percentage,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    min_replica_count=min_replica_count,
    max_replica_count=max_replica_count,
    sync=sync,
)

INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/895222332033/locations/us-central1/endpoints/3705908339465519104
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/895222332033/locations/us-central1/endpoints/3705908339465519104/operations/4683443033474072576


### Test the deployed model

In [None]:
# Send the request body in criteo.json to the endpoint's rawPredict method.
# The URL is built from REGION, PROJECT_ID and the `endpoint` object created
# earlier, so the API host and the `locations/` path segment always name the
# same region and the request targets the endpoint from this session
# (previously the path said europe-west2 while the host was us-central1).
!curl -X POST \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  -H "Content-Type: application/json"  \
  https://{REGION}-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/{REGION}/endpoints/{endpoint.name}:rawPredict \
  -d @criteo.json