## Create the dockerfile

In [1]:
%%writefile Dockerfile
FROM gcr.io/deeplearning-platform-release/tf-gpu.2-10

WORKDIR /

# Copies the trainer code to the docker image.
COPY two_tower_src /two_tower_src

RUN pip install tensorflow-recommenders

Overwriting Dockerfile


### Build and push the custom training image using cloud build

In [2]:
PROJECT = 'hybrid-vertex'  # <--- TODO: CHANGE THIS
REGION = 'us-central1' 
BUCKET = 'gs://spotify-beam-v3'
REPOSITORY = 'spotify-tfrs-custom-train'
SERVER_IMAGE = "spotify-single-node-train"  # @param {type:"string"} 
REMOTE_IMAGE_NAME=f"{REGION}-docker.pkg.dev/{PROJECT}/{REPOSITORY}/{SERVER_IMAGE}"

In [3]:
# Create the repo if needed for the artifacts

! gcloud beta artifacts repositories create {REPOSITORY} \
    --repository-format=docker \
    --location=$REGION

[1;31mERROR:[0m (gcloud.beta.artifacts.repositories.create) ALREADY_EXISTS: the repository already exists


In [4]:
! gcloud auth configure-docker {REGION}-docker.pkg.dev --quiet


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.


In [5]:
!gcloud builds submit . -t $REMOTE_IMAGE_NAME

Creating temporary tarball archive of 129 file(s) totalling 84.5 MiB before compression.
Some files were not included in the source upload.

Check the gcloud log [/home/jupyter/.config/gcloud/logs/2022.11.16/14.41.49.250696.log] to see which files and the contents of the
default gcloudignore file used (see `$ gcloud topic gcloudignore` to learn
more).

Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1668609709.330399-ded9b8f2e78e4253be54f8e49021c314.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/fba8183c-d6a4-4e26-b895-1054257be4be].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/fba8183c-d6a4-4e26-b895-1054257be4be?project=934903580331 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "fba8183c-d6a4-4e26-b895-1054257be4be"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1668609709.330399-ded9b8f2e78e4253be54f8e490

In [6]:
def prepare_worker_pool_specs(
    image_uri,
    args,
    cmd,
    replica_count=1,
    machine_type="n1-standard-16",
    accelerator_count=1,
    accelerator_type="ACCELERATOR_TYPE_UNSPECIFIED",
    reduction_server_count=0,
    reduction_server_machine_type="n1-highcpu-16",
    reduction_server_image_uri=b"us-docker.pkg.dev/vertex-ai-restricted/training/reductionserver:latest",
):

    if accelerator_count > 0:
        machine_spec = {
            "machine_type": machine_type,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        }
    else:
        machine_spec = {"machine_type": machine_type}

    container_spec = {
        "image_uri": image_uri,
        "args": args,
        "command": cmd,
    }

    chief_spec = {
        "replica_count": 1,
        "machine_spec": machine_spec,
        "container_spec": container_spec,
    }

    worker_pool_specs = [chief_spec]
    if replica_count > 1:
        workers_spec = {
            "replica_count": replica_count - 1,
            "machine_spec": machine_spec,
            "container_spec": container_spec,
        }
        worker_pool_specs.append(workers_spec)
    if reduction_server_count > 1:
        workers_spec = {
            "replica_count": reduction_server_count,
            "machine_spec": {
                "machine_type": reduction_server_machine_type,
            },
            "container_spec": {"image_uri": reduction_server_image_uri},
        }
        worker_pool_specs.append(workers_spec)

    return worker_pool_specs

In [17]:
import time

STAGING_BUCKET = 'gs://spotify-beam-v3'

invoke_time = time.strftime("%Y%m%d-%H%M%S")

worker_args = ['--NUM_EPOCHS=100',
              '--EXPERIMENT_NAME=spotify-single-node-train-full-data-v10-01',
              '--ARCH=[512,256]',
              '--BATCH_SIZE=16000']

WORKER_CMD = ['python', 'two_tower_src/task.py']
WORKER_ARGS = worker_args
REPLICA_COUNT = 1
WORKER_MACHINE_TYPE = "a2-highgpu-1g"
ACCELERATOR_TYPE = "NVIDIA_TESLA_A100"
PER_MACHINE_ACCELERATOR_COUNT = 1


JOB_NAME = f'spotify_tfrs_single_node_train'
base_output_dir = f'{STAGING_BUCKET}/jobs/{JOB_NAME}/{invoke_time}'


worker_pool_specs = prepare_worker_pool_specs(
    image_uri=REMOTE_IMAGE_NAME,
    args=WORKER_ARGS,
    cmd=WORKER_CMD,
    replica_count=REPLICA_COUNT,
    machine_type=WORKER_MACHINE_TYPE,
    accelerator_count=PER_MACHINE_ACCELERATOR_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
)

In [18]:
worker_pool_specs

[{'replica_count': 1,
  'machine_spec': {'machine_type': 'a2-highgpu-1g',
   'accelerator_type': 'NVIDIA_TESLA_A100',
   'accelerator_count': 1},
  'container_spec': {'image_uri': 'us-central1-docker.pkg.dev/hybrid-vertex/spotify-tfrs-custom-train/spotify-single-node-train',
   'args': ['--NUM_EPOCHS=1',
    '--EXPERIMENT_NAME=spotify-single-node-train-full-data-v9-01',
    '--ARCH=[512,256]',
    '--BATCH_SIZE=16000'],
   'command': ['python', 'two_tower_src/task.py']}}]

In [19]:
from google.cloud import aiplatform as vertex_ai
SA = '934903580331-compute@developer.gserviceaccount.com'
job = vertex_ai.CustomJob(
    display_name=JOB_NAME,
    worker_pool_specs=worker_pool_specs,
    staging_bucket=base_output_dir,
    
)
job.run(sync=False,
       service_account=SA,
       enable_web_access = True)

Creating CustomJob
CustomJob created. Resource name: projects/934903580331/locations/us-central1/customJobs/7919452126504288256
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/934903580331/locations/us-central1/customJobs/7919452126504288256')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/7919452126504288256?project=934903580331
CustomJob projects/934903580331/locations/us-central1/customJobs/7919452126504288256 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/934903580331/locations/us-central1/customJobs/7919452126504288256 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/934903580331/locations/us-central1/customJobs/7919452126504288256 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/934903580331/locations/us-central1/customJobs/7919452126504288256 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/934903580331/locations/us-central1/customJobs/7919452

In [None]:
?job.run

[0;31mSignature:[0m
[0mjob[0m[0;34m.[0m[0mrun[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mservice_account[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnetwork[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtimeout[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrestart_job_on_worker_restart[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0menable_web_access[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtensorboard[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;3