# Training and deploying a tabular model using Vertex custom training job - Part 1

![Training pipeline](../images/custom-tabular.png)

In [1]:
import os
import pprint
import pandas as pd
import tensorflow as tf
import time
import matplotlib.pyplot as plt

from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform_v1beta1 import types
from google.cloud.aiplatform import hyperparameter_tuning as hpt

from google.cloud.aiplatform.utils import JobClientWithOverride

2021-08-05 23:35:30.343972: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Configure GCP settings

*Before running the notebook make sure to follow the repo's README file to install the pre-requisites.*

In [2]:
PROJECT = 'jk-mlops-dev'
REGION = 'us-central1'

STAGING_BUCKET = 'gs://jk-vertex-us-central1'

#VERTEX_SA = f'vertex-sa@{PROJECT}.iam.gserviceaccount.com'

### Initialize Vertex AI SDK

In [3]:
vertex_ai.init(
    project=PROJECT,
    location=REGION,
    staging_bucket=STAGING_BUCKET
)

### Prepare a training script

In [4]:
SCRIPT_FOLDER = 'trainer'
if tf.io.gfile.exists(SCRIPT_FOLDER):
    tf.io.gfile.rmtree(SCRIPT_FOLDER)
tf.io.gfile.mkdir(SCRIPT_FOLDER)
file_path = os.path.join(SCRIPT_FOLDER, 'train.py')

In [5]:
%%writefile {file_path}


# Copyright 2021 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and

import argparse
import json
import os
import logging
import time


def get_args():
    """Defines and parse commandline arguments."""

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--output_path",
        default="/tmp",
        type=str,
    )
    
    parser.add_argument(
        "--input_path",
        default="/tmp",
        type=str,
    )

    return parser.parse_args()

def main():
    args = get_args()
    

    logging.info('****Entering****')

    print(os.listdir(args.input_path))
    
    logging.info('**** Exiting ****')
    
    
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    main()

Writing trainer/train.py


### Configure and submit a Vertex job using a custom container

#### Create a docker file

In [6]:
#BASE_IMAGE = 'us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-4:latest'
#BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-gpu.2-4'
BASE_IMAGE = 'nvcr.io/nvidia/merlin/merlin-training:0.5.3'

TRAIN_IMAGE = f'gcr.io/{PROJECT}/merlin-train'

dockerfile = f'''
FROM {BASE_IMAGE}

WORKDIR /trainer

# Copies the trainer code to the docker image.
COPY train.py .

'''

with open(os.path.join(SCRIPT_FOLDER, 'Dockerfile'), 'w') as f:
    f.write(dockerfile)

#### Build a container image

In [7]:
! docker build -t {TRAIN_IMAGE} {SCRIPT_FOLDER}

Sending build context to Docker daemon  4.096kB
Step 1/3 : FROM nvcr.io/nvidia/merlin/merlin-training:0.5.3
 ---> 332a8cffc9df
Step 2/3 : WORKDIR /trainer
 ---> Using cache
 ---> e6e49164504a
Step 3/3 : COPY train.py .
 ---> Using cache
 ---> 944773da3e81
Successfully built 944773da3e81
Successfully tagged gcr.io/jk-mlops-dev/merlin-train:latest


In [8]:
! docker push {TRAIN_IMAGE}

Using default tag: latest
The push refers to repository [gcr.io/jk-mlops-dev/merlin-train]

[1Bb8ac654e: Preparing 
[1B2e09c4ff: Preparing 
[1B714d2463: Preparing 
[1Bd2ff5b33: Preparing 
[1B37647ed0: Preparing 
[1B9f4cda5f: Preparing 
[1B21185477: Preparing 
[1Bb702c731: Preparing 
[1B976c398b: Preparing 
[1Bf82b8797: Preparing 
[1Bb5e2597b: Preparing 
[1B7dc1f6bf: Preparing 
[1B777553e2: Preparing 
[1B45ef7765: Preparing 
[1Bf02461c8: Preparing 
[1B49407eae: Preparing 
[1Bb75a89f5: Preparing 
[1B814e7f5b: Preparing 
[1B61fe548e: Preparing 
[1Bdac4ffdd: Preparing 
[1B218e5cc3: Preparing 
[1B668c53f6: Preparing 
[1B5d313a81: Preparing 
[1B010e7779: Preparing 
[1Bc68e5aca: Preparing 
[1B71f76135: Preparing 
[1Bfb51e15b: Preparing 
[1B4afa0df1: Preparing 
[1B9161954b: Preparing 
[23B702c731: Waiting g 
[1B367789f5: Preparing 
[1Ba0c42d3d: Preparing 
[25B76c398b: Waiting g 
[1B303ebf75: Preparing 
[1Bbafdc7ee: Layer already exists [33A[2K[30A[2K[27A

#### Prepare worker pool specification

In [9]:
job_name = 'MERLIN_CONTAINER_TEST_{}'.format(time.strftime("%Y%m%d_%H%M%S"))
base_output_dir = '{}/jobs/{}/test.txt'.format(STAGING_BUCKET, job_name)
input_path = '/gcs/{}'.format(STAGING_BUCKET[4:])

worker_pool_specs =  [
    {
        "machine_spec": {
            "machine_type": "n1-standard-4",
            "accelerator_type": "NVIDIA_TESLA_T4",
            "accelerator_count": 1,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": TRAIN_IMAGE,
            "command": ["python", "train.py",],
            "args": [             
                '--input_path=' + input_path, 
            ],
        },
    }
]

print(worker_pool_specs)

[{'machine_spec': {'machine_type': 'n1-standard-4', 'accelerator_type': 'NVIDIA_TESLA_T4', 'accelerator_count': 1}, 'replica_count': 1, 'container_spec': {'image_uri': 'gcr.io/jk-mlops-dev/merlin-train', 'command': ['python', 'train.py'], 'args': ['--input_path=/gcs//jk-vertex-us-central1']}}]


#### Submit and monitor the job

In [10]:
job = vertex_ai.CustomJob(
    display_name=job_name,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=base_output_dir
)

job.run(sync=False, 
#        service_account=VERTEX_SA,
#        tensorboard=TENSORBOARD
)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/8556746933027209216
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/8556746933027209216')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/8556746933027209216?project=895222332033
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/8556746933027209216 current state:
JobState.JOB_STATE_QUEUED
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/8556746933027209216 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/c

## Configure and submit a Vertex job using a GAPIC API

In [None]:
from google.cloud.aiplatform.gapic import \
    JobServiceClient

In [None]:
job_name = 'GAPIC_CUSTOM_CONTAINER_{}'.format(time.strftime("%Y%m%d_%H%M%S"))

custom_job_spec = {
    'display_name': job_name,
    'job_spec': {
        'worker_pool_specs': worker_pool_specs
    }
}

print(custom_job_spec)

In [None]:
API_ENDPOINT = f'{REGION}-aiplatform.googleapis.com'

options = dict(api_endpoint=API_ENDPOINT)
client = JobServiceClient(client_options=options)

parent = f"projects/{PROJECT}/locations/{REGION}"

response = client.create_custom_job(
    parent=parent, custom_job=custom_job_spec
)

response

### Configure and submit a Vertex job using `aiplatform.CustomJob.from_local_script`

In [153]:
job_name = 'CUSTOM_SCRIPT_GPU_GPU{}'.format(time.strftime("%Y%m%d_%H%M%S"))
base_output_dir = '{}/jobs/{}'.format(STAGING_BUCKET, job_name)
output_path = f'{base_output_dir}/test.txt'

container_uri = 'us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-4:latest'
args = [
    '--output_path=' + output_path, 
]

machine_type = 'n1-standard-4'
accelerator_type = 'NVIDIA_TESLA_T4'
accelerator_count = 1

job = vertex_ai.CustomJob.from_local_script(
    display_name=job_name,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    script_path=file_path,
    container_uri=container_uri,
    args=args,
    staging_bucket=base_output_dir
)

job.run(sync=False, 
#        service_account=VERTEX_SA,
#        tensorboard=TENSORBOARD,
        )

INFO:google.cloud.aiplatform.utils.source_utils:Training script copied to:
gs://jk-vertex-us-central1/jobs/CUSTOM_SCRIPT_GPU_GPU20210720_213359/aiplatform-2021-07-20-21:34:00.095-aiplatform_custom_trainer_script-0.1.tar.gz.
INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/6593538135307583488
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/6593538135307583488')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6593538135307583488?project=895222332033
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/6593538135307583488 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatf