In [16]:
import tensorflow as tf

In [17]:
from google.cloud import aiplatform as vertex_ai
import time

In [18]:
# Google Cloud project and region handed to `vertex_ai.init(...)`.
PROJECT = "renatoleite-mldemos"
REGION = "us-central1"

# Cloud Storage location used as the Vertex AI SDK staging bucket and as the
# root for per-job output directories.
STAGING_BUCKET = "gs://renatoleite-staging"

# Service account passed to `job.run(...)` when the custom jobs are submitted.
VERTEX_SA = "464015718044-compute@developer.gserviceaccount.com"

In [19]:
# Container image used as `image_uri` in the worker pool specs below.
TRAIN_IMAGE = "gcr.io/renatoleite-mldemos/nvtabular-bench"

In [20]:
# Echo the configured image and staging bucket, one per line, as a sanity check.
print(TRAIN_IMAGE, STAGING_BUCKET, sep="\n")

gcr.io/renatoleite-mldemos/nvtabular-bench
gs://renatoleite-staging


In [21]:
# Initialise the Vertex AI SDK with the project, region and staging bucket
# defined in the configuration cell.
vertex_ai.init(project=PROJECT, location=REGION, staging_bucket=STAGING_BUCKET)

## Submit Vertex job with FUSE paths (/gcs/)

In [8]:
# The timestamp suffix makes the job name (and the paths derived from it)
# different on every submission.
job_name = "NVT_BENCHMARK_FUSE_" + time.strftime("%Y%m%d_%H%M%S")
# gs:// location passed to CustomJob as its staging bucket.
base_output_dir = f"{STAGING_BUCKET}/jobs/{job_name}/"

# In this variant the benchmark reads and writes through /gcs/ paths
# rather than gs:// URIs.
training_data = "/gcs/workshop-datasets/criteo-parque/"
output_path = f"/gcs/renatoleite-staging/nvt-benchmark/{job_name}/"

In [22]:
# Show the resolved job name and paths, one per line, before building the spec.
print(job_name, base_output_dir, training_data, output_path, sep="\n")

NVT_BENCHMARK_FUSE_20210812_204827
gs://renatoleite-staging/jobs/NVT_BENCHMARK_FUSE_20210812_204827/
/gcs/workshop-datasets/criteo-parque/
/gcs/renatoleite-staging/nvt-benchmark/NVT_BENCHMARK_FUSE_20210812_204827/


In [23]:
# One worker pool with a single a2-highgpu-2g replica (two A100 accelerators)
# that runs the benchmark script from TRAIN_IMAGE against the /gcs/ paths.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "a2-highgpu-2g",
            "accelerator_type": "NVIDIA_TESLA_A100",
            "accelerator_count": 2,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": TRAIN_IMAGE,
            "command": ["python", "dask-nvtabular-criteo-benchmark.py"],
            "args": [
                f"--data-path={training_data}",
                f"--out-path={output_path}",
                "--profile=/gcs/renatoleite-staging/nvt-bench-testing/result-fuse.html",
                "--devices=0,1",
            ],
        },
    }
]

# Dump the fully resolved spec so the submitted arguments are visible.
print(worker_pool_specs)

[{'machine_spec': {'machine_type': 'a2-highgpu-2g', 'accelerator_type': 'NVIDIA_TESLA_A100', 'accelerator_count': 2}, 'replica_count': 1, 'container_spec': {'image_uri': 'gcr.io/renatoleite-mldemos/nvtabular-bench', 'command': ['python', 'dask-nvtabular-criteo-benchmark.py'], 'args': ['--data-path=/gcs/workshop-datasets/criteo-parque/', '--out-path=/gcs/renatoleite-staging/nvt-benchmark/NVT_BENCHMARK_FUSE_20210812_204827/', '--profile=/gcs/renatoleite-staging/nvt-bench-testing/result-fuse.html', '--devices=0,1']}}]


In [24]:
# Build the custom job from the FUSE worker pool spec; its artifacts are staged
# under the per-job directory computed earlier.
job = vertex_ai.CustomJob(
    display_name=job_name,
    staging_bucket=base_output_dir,
    worker_pool_specs=worker_pool_specs,
)

# sync=False submits the job without making this cell wait for it to finish.
job.run(service_account=VERTEX_SA, sync=False)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob


## Submit Vertex job with GCS paths (gs://)

In [21]:
# The timestamp suffix makes the job name (and the paths derived from it)
# different on every submission.
job_name = 'NVT_BENCHMARK_GCS_{}'.format(time.strftime("%Y%m%d_%H%M%S"))
# Passed to CustomJob as `staging_bucket`, so it must be a directory prefix.
# It previously ended in a stray 'test.txt'; it now has the same
# '<bucket>/jobs/<job_name>/' layout as the FUSE submission.
base_output_dir = '{}/jobs/{}/'.format(STAGING_BUCKET, job_name)

# In this variant the benchmark reads and writes gs:// URIs directly.
training_data = 'gs://workshop-datasets/criteo-parque/'
# NOTE(review): output goes to the 'renatoleite-merlin-staging' bucket, not
# STAGING_BUCKET — confirm this is intentional.
output_path = f'gs://renatoleite-merlin-staging/nvt-bench-testing/{job_name}'

# One worker pool with a single a2-highgpu-2g replica (two A100 accelerators)
# that runs the benchmark script from TRAIN_IMAGE against the gs:// paths.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "a2-highgpu-2g",
            "accelerator_type": "NVIDIA_TESLA_A100",
            "accelerator_count": 2,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": TRAIN_IMAGE,
            "command": ["python", "dask-nvtabular-criteo-benchmark.py"],
            "args": [
                f"--data-path={training_data}",
                f"--out-path={output_path}",
                "--profile=gs://renatoleite-merlin-staging/nvt-bench-testing/result-gcs.html",
                "--devices=0,1",
                "--device-limit-frac=0.8",
                "--device-pool-frac=0.9",
                "--num-io-threads=0",
                "--part-mem-frac=0.125",
            ],
        },
    }
]

# Dump the fully resolved spec so the submitted arguments are visible.
print(worker_pool_specs)

[{'machine_spec': {'machine_type': 'a2-highgpu-2g', 'accelerator_type': 'NVIDIA_TESLA_A100', 'accelerator_count': 2}, 'replica_count': 1, 'container_spec': {'image_uri': 'gcr.io/merlin-on-gcp/nvt-conda-build', 'command': ['python', 'dask-nvtabular-criteo-benchmark.py'], 'args': ['--data-path=gs://workshop-datasets/criteo-parque/', '--out-path=gs://renatoleite-merlin-staging/nvt-bench-testing/NVT_BENCHMARK_GCS_20210811_204445', '--profile=gs://renatoleite-merlin-staging/nvt-bench-testing/result-gcs.html', '--devices=0,1', '--device-limit-frac=0.8', '--device-pool-frac=0.9', '--num-io-threads=0', '--part-mem-frac=0.125']}}]


In [22]:
# Build the custom job from the GCS worker pool spec; its artifacts are staged
# under the location held in base_output_dir.
job = vertex_ai.CustomJob(
    display_name=job_name,
    staging_bucket=base_output_dir,
    worker_pool_specs=worker_pool_specs,
)

# sync=False submits the job without making this cell wait for it to finish.
job.run(service_account=VERTEX_SA, sync=False)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/659831510405/locations/us-central1/customJobs/6034502443281154048
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/659831510405/locations/us-central1/customJobs/6034502443281154048')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6034502443281154048?project=659831510405
INFO:google.cloud.aiplatform.jobs:CustomJob projects/659831510405/locations/us-central1/customJobs/6034502443281154048 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/659831510405/locations/us-central1/customJobs/6034502443281154048 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/659831510405/locations/us-central1/