# Training pipeline for the TFRS two-tower model

```
tensorflow==2.10.1
tensorflow-cloud==0.1.16
tensorflow-datasets==4.6.0
tensorflow-estimator==2.10.0
tensorflow-hub==0.12.0
tensorflow-io==0.27.0
tensorflow-io-gcs-filesystem==0.27.0
tensorflow-metadata==1.8.0
tensorflow-probability==0.18.0
tensorflow-recommenders==0.7.2
tensorflow-serving-api==2.8.3
tensorflow-transform==1.8.0
```

In [None]:
# !pip install kfp
# !pip install google-cloud-pipeline-components 

In [None]:
# ! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
# ! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"
# ! python3 -c "import google.cloud.aiplatform; print('aiplatform SDK version: {}'.format(google.cloud.aiplatform.__version__))"

In [1]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")

VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1


In [2]:
import os
import json
from datetime import datetime
from time import time
import pandas as pd
# silence log records at WARNING level and below (see logging.disable(...) further down)
import logging
import time
from pprint import pprint

logging.disable(logging.WARNING)

from google.cloud import aiplatform as vertex_ai
from google.cloud import storage

# Pipelines
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.types import artifact_types

# Kubeflow SDK
# TODO: fix these
from kfp.v2 import dsl
import kfp
import kfp.v2.dsl
from kfp.v2.google import client as pipelines_client
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component)

# Point the Vertex AI SDK at the project/region resolved above and create the
# Cloud Storage client for the same project.
vertex_ai.init(project=PROJECT_ID, location=LOCATION)

storage_client = storage.Client(project=PROJECT_ID)

In [3]:
# PREFIX = 'spotify-2tower'
# Building blocks of the resource name shared by this model/pipeline iteration.
APP = 'sp'
MODEL_TYPE = '2tower'
FRAMEWORK = 'tfrs'
MODEL_VERSION = 'jtv13'
PIPELINE_VERSION = 'v11'

# Hyphen-joined root name, later used as the prefix of the training image name.
MODEL_ROOT_NAME = '-'.join(
    [APP, MODEL_TYPE, FRAMEWORK, MODEL_VERSION, PIPELINE_VERSION]
)

print("MODEL_ROOT_NAME: {}".format(MODEL_ROOT_NAME))

MODEL_ROOT_NAME: sp-2tower-tfrs-jtv13-v11


## Write Train files

In [11]:
# Folder (relative to the repo root) that the files written below are placed in.
REPO_DOCKER_PATH_PREFIX = 'src'

# Optional: recreate the training subfolder from scratch.
# ! rm -rf {REPO_DOCKER_PATH_PREFIX}/trainer
# ! mkdir {REPO_DOCKER_PATH_PREFIX}/trainer
# ! touch {REPO_DOCKER_PATH_PREFIX}/trainer/__init__.py

In [12]:
# Docker / Cloud Build settings for the training image.
REPO_DOCKER_PATH_PREFIX = 'src'
FILE_LOCATION = './src'            # build context handed to Cloud Build
DOCKERNAME = 'tfrs'                # suffix of the Dockerfile (Dockerfile.<DOCKERNAME>)
MACHINE_TYPE = 'e2-highcpu-32'     # Cloud Build machine type

# Image name and registry URI derived from the model root name.
IMAGE_NAME = MODEL_ROOT_NAME + '-training'
IMAGE_URI = 'gcr.io/{}/{}'.format(PROJECT_ID, IMAGE_NAME)

print("IMAGE_URI: {}".format(IMAGE_URI))

IMAGE_URI: gcr.io/hybrid-vertex/sp-2tower-tfrs-jtv13-v11-training


In [13]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/cloudbuild.yaml

# Cloud Build config with a single docker build step.
# $_IMAGE_URI, $_FILE_LOCATION and $_DOCKERNAME are user substitutions that the
# notebook passes with `gcloud builds submit --substitutions ...`.
steps:
- name: 'gcr.io/cloud-builders/docker'
  # Build context is $_FILE_LOCATION; the Dockerfile is Dockerfile.<_DOCKERNAME> inside it.
  args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/Dockerfile.$_DOCKERNAME']
# Image(s) to push once the build steps complete.
images:
- '$_IMAGE_URI'

Overwriting src/cloudbuild.yaml


In [14]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/two_tower_jt/train_config.py

# Static configuration module for the two_tower_jt package.
PROJECT_ID = 'hybrid-vertex'

# NOTE(review): NEW_ADAPTS and USE_DROPOUT are strings while USE_CROSS_LAYER is
# a bool. A non-empty string such as 'False' is truthy, so `if USE_DROPOUT:`
# would evaluate to True -- confirm how the consumers compare these values
# before changing their type.
NEW_ADAPTS = 'True'
USE_CROSS_LAYER = False
USE_DROPOUT = 'False'

# Random seed (previously assigned twice in this file with the same value).
SEED = 1234

MAX_PLAYLIST_LENGTH = 5         # this should improve performance vs 375
EMBEDDING_DIM = 128
PROJECTION_DIM = 25
DROPOUT_RATE = 0.33
MAX_TOKENS = 20000

Overwriting src/two_tower_jt/train_config.py


In [15]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/two_tower_jt/requirements.txt
google-cloud-aiplatform[cloud_profiler]>=1.20.0
tensorflow-recommenders==0.7.2
tensorboard==2.10.1
tensorboard-data-server==0.6.1
tensorboard-plugin-profile==2.11.1
tensorflow-io==0.27.0

Overwriting src/two_tower_jt/requirements.txt


In [16]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/Dockerfile.{DOCKERNAME}

# FROM tensorflow/tensorflow:2.10.1-gpu
FROM gcr.io/deeplearning-platform-release/tf-gpu.2-10

WORKDIR /src

# GPU monitoring tool. apt-get is used because `apt` is meant for interactive
# use; the package lists are removed afterwards to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends nvtop \
    && rm -rf /var/lib/apt/lists/*

# Install the Python dependencies before copying the rest of the code so this
# layer can be reused when only source files change.
COPY two_tower_jt/requirements.txt two_tower_jt/requirements.txt
RUN pip install --no-cache-dir -r two_tower_jt/requirements.txt

# Copies the trainer code to the docker image. The directory form keeps any
# sub-folders in place, whereas a `two_tower_jt/*` glob copies the contents of
# matched sub-directories straight into the target.
COPY two_tower_jt/ two_tower_jt/

# # Sets up the entry point to invoke the trainer.
# # ENTRYPOINT ["python", "-m", "two_tower_jt.task"]

Overwriting src/Dockerfile.tfrs


## Build Custom Train Image

In [17]:
# Echo the values handed to Cloud Build before submitting the build.
print(
    "DOCKERNAME: {}".format(DOCKERNAME),
    "IMAGE_URI: {}".format(IMAGE_URI),
    "FILE_LOCATION: {}".format(FILE_LOCATION),
    "MACHINE_TYPE: {}".format(MACHINE_TYPE),
    sep="\n",
)

DOCKERNAME: tfrs
IMAGE_URI: gcr.io/hybrid-vertex/sp-2tower-tfrs-jtv13-v11-training
FILE_LOCATION: ./src
MACHINE_TYPE: e2-highcpu-32


In [18]:
!pwd

/home/jupyter/jw-repo/spotify_mpd_two_tower


In [19]:
!tree /home/jupyter/jw-repo/spotify_mpd_two_tower/src

[01;34m/home/jupyter/jw-repo/spotify_mpd_two_tower/src[00m
├── Dockerfile.tfrs
├── cloudbuild.yaml
├── [01;34mtrain_pipes[00m
│   ├── build_custom_image.py
│   └── train_custom_model.py
├── [01;34mtwo_tower_jt[00m
│   ├── __init__.py
│   ├── [01;34m__pycache__[00m
│   │   ├── __init__.cpython-37.pyc
│   │   ├── train_config.cpython-37.pyc
│   │   ├── two_tower.cpython-37.pyc
│   │   └── two_tower_lite.cpython-37.pyc
│   ├── data-pipeline.py
│   ├── interactive_train.py
│   ├── requirements.txt
│   ├── task.py
│   ├── train_config.py
│   ├── two_tower.py
│   └── two_tower_lite.py
└── [01;34mvocab_pipes[00m
    ├── [01;34m__pycache__[00m
    ├── adapt_fixed_text_layer_vocab.py
    ├── adapt_ragged_text_layer_vocab.py
    ├── config.py
    └── create_master_vocab.py

5 directories, 20 files


### Optionally include a `.gcloudignore` file 

* limits the files submitted to Cloud Build
* see [gcloudignore](https://cloud.google.com/sdk/gcloud/reference/topic/gcloudignore) for details

In [20]:
! gcloud config set gcloudignore/enabled true

Updated property [gcloudignore/enabled].


In [21]:
%%writefile .gcloudignore
# Paths excluded from the source upload that `gcloud builds submit` sends to Cloud Build.
.gcloudignore
/local_files/
/img/
*.pkl
*.png
.git
.github
.ipynb_checkpoints/*
*__pycache__
spotipy_secret_creds.py
candidate_embs_local_v6_20230112-180944.json

Overwriting .gcloudignore


In [22]:
# !gcloud meta list-files-for-upload
# !ls

In [23]:
# Submit the image build to Cloud Build. The --substitutions values fill the
# $_DOCKERNAME / $_IMAGE_URI / $_FILE_LOCATION placeholders in src/cloudbuild.yaml.
! gcloud builds submit --config src/cloudbuild.yaml \
    --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION \
    --timeout=2h \
    --machine-type=$MACHINE_TYPE

Creating temporary tarball archive of 61 file(s) totalling 1.8 MiB before compression.
Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1673606563.320645-46ebda4c43014d47b0b155bd5656694f.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/bcca1ea0-0d76-4630-927e-fa1b077cc00e].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/bcca1ea0-0d76-4630-927e-fa1b077cc00e?project=934903580331 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "bcca1ea0-0d76-4630-927e-fa1b077cc00e"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1673606563.320645-46ebda4c43014d47b0b155bd5656694f.tgz#1673606563965303
Copying gs://hybrid-vertex_cloudbuild/source/1673606563.320645-46ebda4c43014d47b0b155bd5656694f.tgz#1673606563965303...
/ [1 files][374.1 KiB/374.1 KiB]                                                
Operation completed over 1 objects/374.

# Pipeline Components

In [24]:
os.getcwd()

'/home/jupyter/jw-repo/spotify_mpd_two_tower'

In [25]:
REPO_DOCKER_PATH_PREFIX = 'src'
PIPELINES_SUB_DIR = 'train_pipes'

In [26]:
! rm -rf {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}
! mkdir {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}

## Build Custom Image

In [27]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}/build_custom_image.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)
@kfp.v2.dsl.component(
    base_image="gcr.io/google.com/cloudsdktool/cloud-sdk:latest",
    packages_to_install=[
        "google-cloud-build"
    ],
)
def build_custom_image(
    project: str,
    artifact_gcs_path: str,
    docker_name: str,
    app_dir_name: str,
    custom_image_uri: str,
) -> NamedTuple('Outputs', [
    ('custom_image_uri', str),
]):
    """
    Custom pipeline component that builds and pushes a container image with
    Cloud Build, using the application code and Dockerfile stored under
    `artifact_gcs_path`.

    Args:
        project: project ID the Cloud Build job is created in.
        artifact_gcs_path: GCS location holding the Dockerfile and the
            application code folder.
        docker_name: name of the Dockerfile object under `artifact_gcs_path`
            (e.g. Dockerfile.XXXXX).
        app_dir_name: name of the application code folder under
            `artifact_gcs_path` (e.g. "trainer").
        custom_image_uri: image URI to tag the build with and push to.

    Returns:
        Outputs tuple whose single field `custom_image_uri` echoes the URI
        that was built and pushed.
    """
    # TODO: make output Artifact for image_uri

    import logging
    import os

    from google.cloud.devtools import cloudbuild_v1 as cloudbuild
    from google.protobuf.duration_pb2 import Duration

    # initialize client for cloud build
    logging.getLogger().setLevel(logging.INFO)
    build_client = cloudbuild.services.cloud_build.CloudBuildClient()

    # parse step inputs to get path to Dockerfile and application code
    _gcs_dockerfile_path = os.path.join(artifact_gcs_path, docker_name)        # Dockerfile.XXXXX
    _gcs_script_dir_path = os.path.join(artifact_gcs_path, f"{app_dir_name}/") # "trainer/"

    logging.info(f"_gcs_dockerfile_path: {_gcs_dockerfile_path}")
    logging.info(f"_gcs_script_dir_path: {_gcs_script_dir_path}")

    # define build steps to pull the application code and Dockerfile
    # and build/push the custom container image
    build = cloudbuild.Build()
    build.steps = [
        {
            "name": "gcr.io/cloud-builders/gsutil",
            "args": ["cp", "-r", _gcs_script_dir_path, "."],
        },
        {
            "name": "gcr.io/cloud-builders/gsutil",
            "args": ["cp", _gcs_dockerfile_path, "Dockerfile"],
        },
        # Alternative (disabled): Kaniko executor, which caches intermediate
        # layers and pushes the image automatically to Container Registry
        # https://cloud.google.com/build/docs/kaniko-cache
        # {
        #     "name": "gcr.io/kaniko-project/executor:latest",
        #     # "name": "gcr.io/kaniko-project/executor:v1.8.0",        # TODO; downgraded to avoid error in build
        #     # "args": [f"--destination={custom_image_uri}", "--cache=true"],
        #     "args": [f"--destination={custom_image_uri}", "--cache=false"],
        # },
        {
            "name": "gcr.io/cloud-builders/docker",
            "args": ['build', '-t', f'{custom_image_uri}', '.'],
        },
        {
            "name": "gcr.io/cloud-builders/docker",
            "args": ['push', f'{custom_image_uri}'],
        },
    ]
    # override default timeout of 10min
    timeout = Duration()
    timeout.seconds = 7200
    build.timeout = timeout

    # create build
    operation = build_client.create_build(project_id=project, build=build)
    logging.info("IN PROGRESS:")
    logging.info(operation.metadata)

    # get build status (blocks until the build operation finishes)
    result = operation.result()
    # A positional logging argument needs a matching placeholder; without the
    # "%s" the record fails to format and the status is never logged.
    logging.info("RESULT: %s", result.status)

    # return step outputs
    return (
        custom_image_uri,
    )

Writing src/train_pipes/build_custom_image.py


## Custom train job

In [28]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}/train_custom_model.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)
@kfp.v2.dsl.component(
    base_image='python:3.9',
    packages_to_install=[
        'google-cloud-aiplatform==1.18.1',
        # 'tensorflow==2.9.2',
        # 'tensorflow-recommenders==0.7.0',
        'numpy',
        'google-cloud-storage',
    ],
    # output_component_file="./pipelines/train_custom_model.yaml",
)
def train_custom_model(
    project: str,
    model_version: str,
    pipeline_version: str,
    model_name: str, 
    worker_pool_specs: dict,
    # vocab_dict_uri: str, 
    train_output_gcs_bucket: str,                         # change to workdir?
    training_image_uri: str,
    tensorboard_resource_name: str,
    service_account: str,
    experiment_name: str,
    experiment_run: str,
) -> NamedTuple('Outputs', [
    ('job_dict_uri', str),
    ('query_tower_dir_uri', str),
    ('candidate_tower_dir_uri', str),
    # ('candidate_index_dir_uri', str),
]):
    """
    Run a Vertex AI CustomJob built from `worker_pool_specs`, wait for it to
    finish, upload a pickled description of the job to GCS and return the
    locations of the job description and the two tower directories.

    Args:
        project: project ID used for the Vertex AI SDK and the storage client.
        model_version: not referenced in the body of this component.
        pipeline_version: not referenced in the body of this component.
        model_name: suffix of the job display name (`train-<model_name>`).
        worker_pool_specs: worker pool specification passed unchanged to
            `vertex_ai.CustomJob`.
        train_output_gcs_bucket: bucket name (no `gs://` prefix) that receives
            the job output and the pickled job description.
        training_image_uri: not referenced in the body of this component (the
            image is taken from `worker_pool_specs`).
        tensorboard_resource_name: Tensorboard resource passed to `job.run`.
        service_account: service account passed to `job.run`.
        experiment_name: Vertex AI experiment name; also the first path
            segment under the output bucket.
        experiment_run: run name; the second path segment under the bucket.

    Returns:
        Outputs tuple of (job_dict_uri, query_tower_dir_uri,
        candidate_tower_dir_uri), all `gs://` URIs.
    """

    import logging
    import pickle as pkl

    from google.cloud import aiplatform as vertex_ai
    from google.cloud import storage

    vertex_ai.init(
        project=project,
        location='us-central1',
        experiment=experiment_name,
    )

    storage_client = storage.Client(project=project)

    JOB_NAME = f'train-{model_name}'
    logging.info(f'JOB_NAME: {JOB_NAME}')

    BASE_OUTPUT_DIR = f'gs://{train_output_gcs_bucket}/{experiment_name}/{experiment_run}'
    logging.info(f'BASE_OUTPUT_DIR: {BASE_OUTPUT_DIR}')

    # logging.info(f'vocab_dict_uri: {vocab_dict_uri}')

    logging.info(f'tensorboard_resource_name: {tensorboard_resource_name}')
    logging.info(f'service_account: {service_account}')
    logging.info(f'worker_pool_specs: {worker_pool_specs}')

    # ====================================================
    # Launch Vertex job
    # ====================================================

    job = vertex_ai.CustomJob(
        display_name=JOB_NAME,
        worker_pool_specs=worker_pool_specs,
        base_output_dir=BASE_OUTPUT_DIR,
        staging_bucket=f"{BASE_OUTPUT_DIR}/staging",
    )

    logging.info('Submitting train job to Vertex AI...')

    # NOTE: an earlier version wrapped this call in try/except and only logged
    # the exception ("may fail in multi-worker to find startup script"); the
    # call is left unguarded so failures surface in the pipeline.
    job.run(
        tensorboard=tensorboard_resource_name,
        service_account=f'{service_account}',
        restart_job_on_worker_restart=False,
        enable_web_access=True,
        sync=False,
    )

    # submitted with sync=False above, so wait explicitly for the job to complete
    job.wait()

    # ====================================================
    # Save job details
    # ====================================================

    train_job_dict = job.to_dict()
    logging.info(f'train_job_dict: {train_job_dict}')

    # pkl dict to GCS
    logging.info("Write pickled dict to GCS...")
    TRAIN_DICT_LOCAL = 'train_job_dict.pkl'
    TRAIN_DICT_GCS_OBJ = f'{experiment_name}/{experiment_run}/{TRAIN_DICT_LOCAL}' # destination folder prefix and blob name

    logging.info(f"TRAIN_DICT_LOCAL: {TRAIN_DICT_LOCAL}")
    logging.info(f"TRAIN_DICT_GCS_OBJ: {TRAIN_DICT_GCS_OBJ}")

    # pickle; the context manager closes the file even if dump() raises,
    # so the upload below never reads a half-open handle
    with open(TRAIN_DICT_LOCAL, 'wb') as filehandler:
        pkl.dump(train_job_dict, filehandler)

    # upload to GCS
    bucket_client = storage_client.bucket(train_output_gcs_bucket)
    blob = bucket_client.blob(TRAIN_DICT_GCS_OBJ)
    blob.upload_from_filename(TRAIN_DICT_LOCAL)

    job_dict_uri = f'gs://{train_output_gcs_bucket}/{TRAIN_DICT_GCS_OBJ}'
    logging.info(f"{TRAIN_DICT_LOCAL} uploaded to {job_dict_uri}")

    # ====================================================
    # Model and index artifact uris
    # ====================================================

    # The URIs are assembled from the inputs following this layout:
    #   gs://<bucket>/<experiment_name>/<experiment_run>/model-dir/query_model
    #   gs://<bucket>/<experiment_name>/<experiment_run>/model-dir/candidate_model
    # NOTE(review): presumably the training job writes the towers to these
    # paths -- this component does not check that they exist.

    query_tower_dir_uri = f"gs://{train_output_gcs_bucket}/{experiment_name}/{experiment_run}/model-dir/query_model" 
    candidate_tower_dir_uri = f"gs://{train_output_gcs_bucket}/{experiment_name}/{experiment_run}/model-dir/candidate_model"
    # candidate_index_dir_uri = f"gs://{output_dir_gcs_bucket_name}/{experiment_name}/{experiment_run}/candidate_model"

    logging.info(f'query_tower_dir_uri: {query_tower_dir_uri}')
    logging.info(f'candidate_tower_dir_uri: {candidate_tower_dir_uri}')
    # logging.info(f'candidate_index_dir_uri: {candidate_index_dir_uri}')

    return (
        f'{job_dict_uri}',
        f'{query_tower_dir_uri}',
        f'{candidate_tower_dir_uri}',
        # f'{candidate_index_dir_uri}',
    )

Writing src/train_pipes/train_custom_model.py


# Prepare Job Specs

## Vertex Train: workerpool specs

In [29]:
def prepare_worker_pool_specs(
    image_uri,
    args,
    cmd,
    replica_count=1,
    machine_type="n1-standard-16",
    accelerator_count=1,
    accelerator_type="ACCELERATOR_TYPE_UNSPECIFIED",
    reduction_server_count=0,
    reduction_server_machine_type="n1-highcpu-16",
    reduction_server_image_uri="us-docker.pkg.dev/vertex-ai-restricted/training/reductionserver:latest",
):
    """Build the `worker_pool_specs` list for a Vertex AI custom training job.

    Args:
        image_uri: training container image used by the chief and the workers.
        args: container arguments.
        cmd: container command.
        replica_count: total number of training replicas, chief included. One
            replica always becomes the chief; the remaining
            `replica_count - 1` form the worker pool.
        machine_type: machine type for the chief and the workers.
        accelerator_count: accelerators per machine; when 0 the machine spec
            carries no accelerator fields.
        accelerator_type: accelerator type for the chief and the workers.
        reduction_server_count: number of reduction server replicas; a pool is
            added for any positive value.
        reduction_server_machine_type: machine type of the reduction servers.
        reduction_server_image_uri: container image of the reduction servers.

    Returns:
        List of worker pool spec dicts: chief first, then the worker pool
        (if `replica_count > 1`), then the reduction server pool
        (if `reduction_server_count > 0`).
    """
    machine_spec = {"machine_type": machine_type}
    if accelerator_count > 0:
        machine_spec["accelerator_type"] = accelerator_type
        machine_spec["accelerator_count"] = accelerator_count

    container_spec = {
        "image_uri": image_uri,
        "args": args,
        "command": cmd,
    }

    # chief pool: always exactly one replica
    worker_pool_specs = [
        {
            "replica_count": 1,
            "machine_spec": machine_spec,
            "container_spec": container_spec,
        }
    ]

    if replica_count > 1:
        worker_pool_specs.append(
            {
                "replica_count": replica_count - 1,
                "machine_spec": machine_spec,
                "container_spec": container_spec,
            }
        )

    # `> 0` rather than `> 1`: a request for exactly one reduction server used
    # to be dropped without any warning.
    # NOTE(review): Vertex AI reads worker pools by position; when
    # replica_count == 1 this pool lands directly after the chief -- confirm
    # whether an empty placeholder pool is needed in that case.
    if reduction_server_count > 0:
        worker_pool_specs.append(
            {
                "replica_count": reduction_server_count,
                "machine_spec": {
                    "machine_type": reduction_server_machine_type,
                },
                "container_spec": {"image_uri": reduction_server_image_uri},
            }
        )

    return worker_pool_specs

## Accelerators and Device Strategy

In [30]:
# # Single machine, single GPU
# Active hardware configuration. These values are passed to
# prepare_worker_pool_specs(...) when the worker pool specs are built, and
# DISTRIBUTE_STRATEGY is forwarded to the trainer as --distribute.
WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
REPLICA_COUNT = 1
ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
PER_MACHINE_ACCELERATOR_COUNT = 1
REDUCTION_SERVER_COUNT = 0                                                      
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
DISTRIBUTE_STRATEGY = 'single'

# # # Single Machine; multiple GPU
# WORKER_MACHINE_TYPE = 'a2-highgpu-4g' # a2-ultragpu-4g
# REPLICA_COUNT = 1
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
# PER_MACHINE_ACCELERATOR_COUNT = 4
# REDUCTION_SERVER_COUNT = 0                                                      
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'mirrored'

# # # # Multiple Machines; 2 GPUs per machine
# WORKER_MACHINE_TYPE = 'a2-highgpu-2g' # a2-ultragpu-4g
# REPLICA_COUNT = 2
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
# PER_MACHINE_ACCELERATOR_COUNT = 2
# REDUCTION_SERVER_COUNT = 4                                                      
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'multiworker'

# # # Multiple Machines, 1 GPU per Machine
# WORKER_MACHINE_TYPE = 'n1-standard-16'
# REPLICA_COUNT = 9
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_T4'
# PER_MACHINE_ACCELERATOR_COUNT = 1
# REDUCTION_SERVER_COUNT = 10                                                      
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'multiworker'

## Vertex AI Experiments

In [48]:
# Identifiers for Vertex AI Experiments.
EXPERIMENT_PREFIX = 'test-full-v14-data'   # custom identifier for organizing experiments
# EXPERIMENT_NAME=f'{EXPERIMENT_PREFIX}-{MODEL_TYPE}-{FRAMEWORK}-{MODEL_VERSION}'
EXPERIMENT_NAME = EXPERIMENT_PREFIX + '-' + MODEL_VERSION

# Run name stamped with the local time at which this cell executes.
RUN_NAME = 'run-' + time.strftime("%Y%m%d-%H%M%S")

print("EXPERIMENT_NAME: {}".format(EXPERIMENT_NAME))
print("RUN_NAME: {}".format(RUN_NAME))

EXPERIMENT_NAME: test-full-v14-data-jtv13
RUN_NAME: run-20230113-112521


## Managed Tensorboard

In [49]:
# use existing TB instance
# TB_RESOURCE_NAME = 'projects/934903580331/locations/us-central1/tensorboards/636071874714927104'

# # create new TB instance
# NOTE(review): every execution of this cell requests a new Tensorboard
# instance -- switch to the "use existing" line above when re-running.
TENSORBOARD_DISPLAY_NAME=f"{EXPERIMENT_PREFIX}-v1"
tensorboard = vertex_ai.Tensorboard.create(
    display_name=TENSORBOARD_DISPLAY_NAME,
    project=PROJECT_ID,
    location=LOCATION,   # same region constant used for vertex_ai.init(...)
)
TB_RESOURCE_NAME = tensorboard.resource_name


print(f"TB_RESOURCE_NAME: {TB_RESOURCE_NAME}")

TB_RESOURCE_NAME: projects/934903580331/locations/us-central1/tensorboards/3550569336693325824


In [50]:
tensorboard.display_name

'test-full-v14-data-v1'

## Training Config

* see [src code](https://github.com/googleapis/python-aiplatform/blob/e7bf0d83d8bb0849a9bce886c958d13f5cbe5fab/google/cloud/aiplatform/utils/worker_spec_utils.py#L153) for worker_pool_spec

In [51]:
# train_samples = 60_733_427
# valid_samples = 613_001

# Example counts and batch size used to size the training / validation loops.
train_samples = 8_205_265
valid_samples = 82_959
batch_size = 4096

# Whole batches per pass over each split (any partial final batch is dropped).
train_steps = divmod(train_samples, batch_size)[0]
val_steps = divmod(valid_samples, batch_size)[0]

print("train_steps: {}".format(train_steps))
print("val_steps: {}".format(val_steps))

train_steps: 2003
val_steps: 20


In [54]:
train_sample_cnt = 8_205_265
valid_samples_cnt = 82_959

# ====================================================
# Pipeline output repo 
# ====================================================
OUTPUT_BUCKET = 'jt-tfrs-central-v2'
OUTPUT_GCS_URI =f'gs://{OUTPUT_BUCKET}'

# Stores pipeline executions for each run
# PIPELINE_ROOT_PATH = f'gs://{OUTPUT_BUCKET}/{MODEL_ROOT_NAME}/pipeline_root'
PIPELINE_ROOT_PATH = f'gs://{OUTPUT_BUCKET}/{EXPERIMENT_NAME}/{RUN_NAME}/pipeline_root'
print('PIPELINE_ROOT_PATH: {}'.format(PIPELINE_ROOT_PATH))

# =================================================
# Data sources
# =================================================
CANDIDATE_FILE_DIR = 'spotify-data-regimes'
# CANDIDATE_PREFIX = 'jtv10/candidates'
CANDIDATE_PREFIX = 'jtv14-8m/candidates'

TRAIN_DIR = 'spotify-data-regimes'
# TRAIN_DIR_PREFIX = 'jtv10/train_v9' # full 65m
# TRAIN_DIR_PREFIX = 'jtv10/valid_v9'  # test small data
TRAIN_DIR_PREFIX = 'jtv14-8m/train_v14' # full 8m

VALID_DIR = 'spotify-data-regimes'
# VALID_DIR_PREFIX = 'jtv10/valid_v9'
VALID_DIR_PREFIX = 'jtv14-8m/valid_v14'

# =================================================
# train image
# =================================================
# IMAGE_URI is the image name defined in the Docker definitions cell; it is
# reused here unchanged (the former `IMAGE_URI = f"{IMAGE_URI}"` was a no-op).
# Assign a different URI here to train with an existing image instead.
print(f"IMAGE_URI: {IMAGE_URI}")

# =================================================
# train job config
# =================================================
SEED = 1234
TF_GPU_THREAD_COUNT='8' # '1' | '4' | '8'

BLOCK_LENGTH = 64 # 1, 8, 16, 32, 64

NUM_DATA_SHARDS=4 # 2, 4, 8, 16, 32, 64

# training hparams
NUM_EPOCHS = 10
BATCH_SIZE = 4096  # 4096, 2048, 1024, 512 
LEARNING_RATE = 0.01

# regularization
# NOTE(review): DROPOUT_RATE is not forwarded in WORKER_ARGS below -- confirm
# whether the trainer is expected to pick it up from somewhere else.
DROPOUT_RATE = 0.33

# model size
EMBEDDING_DIM = 128
PROJECTION_DIM = 50
LAYER_SIZES = '[64,32]'
MAX_TOKENS = 20000 # vocab
# MAX_PADDING = 375

# validation & evaluation
VALID_FREQUENCY = 51
VALID_STEPS = valid_samples_cnt // BATCH_SIZE # 100
EPOCH_STEPS = train_sample_cnt // BATCH_SIZE

# tensorboard
EMBED_FREQUENCY=0
HISTOGRAM_FREQUENCY=0
CHECKPOINT_FREQ='epoch'

WORKER_CMD = ["python", "two_tower_jt/task.py"]
# WORKER_CMD = ["python", "-m", "trainer.task"]

# Command-line flags handed to the training container.
WORKER_ARGS = [
    f'--project={PROJECT_ID}',
    f'--train_output_gcs_bucket={OUTPUT_BUCKET}',
    f'--train_dir={TRAIN_DIR}',
    f'--train_dir_prefix={TRAIN_DIR_PREFIX}',
    f'--valid_dir={VALID_DIR}',
    f'--valid_dir_prefix={VALID_DIR_PREFIX}',
    f'--candidate_file_dir={CANDIDATE_FILE_DIR}',
    f'--candidate_files_prefix={CANDIDATE_PREFIX}',
    f'--experiment_name={EXPERIMENT_NAME}',
    f'--experiment_run={RUN_NAME}',
    f'--num_epochs={NUM_EPOCHS}',
    f'--batch_size={BATCH_SIZE}',
    f'--embedding_dim={EMBEDDING_DIM}',
    f'--projection_dim={PROJECTION_DIM}',
    f'--layer_sizes={LAYER_SIZES}',
    f'--learning_rate={LEARNING_RATE}',
    f'--valid_frequency={VALID_FREQUENCY}',
    f'--valid_steps={VALID_STEPS}',
    f'--epoch_steps={EPOCH_STEPS}',
    f'--distribute={DISTRIBUTE_STRATEGY}',
    f'--model_version={MODEL_VERSION}',
    f'--pipeline_version={PIPELINE_VERSION}',
    f'--seed={SEED}',
    f'--max_tokens={MAX_TOKENS}',
    f'--tb_resource_name={TB_RESOURCE_NAME}',
    f'--embed_frequency={EMBED_FREQUENCY}',
    f'--hist_frequency={HISTOGRAM_FREQUENCY}',
    f'--tf_gpu_thread_count={TF_GPU_THREAD_COUNT}',
    f'--block_length={BLOCK_LENGTH}',
    f'--num_data_shards={NUM_DATA_SHARDS}',
    f'--chkpt_freq={CHECKPOINT_FREQ}',
    # f'--cache_train', # uncomment to cache train_dataset
    # f'--evaluate_model',  # uncomment to run model.eval()
    # f'--write_embeddings', # uncomment to write embeddings index in train job
    f'--profiler',
    # f'--set_jit',
]

WORKER_POOL_SPECS = prepare_worker_pool_specs(
    image_uri=IMAGE_URI,
    args=WORKER_ARGS,
    cmd=WORKER_CMD,
    replica_count=REPLICA_COUNT,
    machine_type=WORKER_MACHINE_TYPE,
    accelerator_count=PER_MACHINE_ACCELERATOR_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

# pprint is already imported in the notebook's import cell
pprint(WORKER_POOL_SPECS)

PIPELINE_ROOT_PATH: gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root
IMAGE_URI: gcr.io/hybrid-vertex/sp-2tower-tfrs-jtv13-v11-training
[{'container_spec': {'args': ['--project=hybrid-vertex',
                              '--train_output_gcs_bucket=jt-tfrs-central-v2',
                              '--train_dir=spotify-data-regimes',
                              '--train_dir_prefix=jtv14-8m/train_v14',
                              '--valid_dir=spotify-data-regimes',
                              '--valid_dir_prefix=jtv14-8m/valid_v14',
                              '--candidate_file_dir=spotify-data-regimes',
                              '--candidate_files_prefix=jtv14-8m/candidates',
                              '--experiment_name=test-full-v14-data-jtv13',
                              '--experiment_run=run-20230113-112521',
                              '--num_epochs=10',
                              '--batch_size=4096',
                       

In [55]:
!export PWD=pwd
!export PIPELINE_ROOT_PATH=PIPELINE_ROOT_PATH
!export REPO_DOCKER_PATH_PREFIX=REPO_DOCKER_PATH_PREFIX

! echo $PWD
! echo $PIPELINE_ROOT_PATH
! echo $REPO_DOCKER_PATH_PREFIX

/home/jupyter/jw-repo/spotify_mpd_two_tower
gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root
src


### copy train package to GCS

In [56]:
# copy training Dockerfile
!gsutil cp $REPO_DOCKER_PATH_PREFIX/cloudbuild.yaml $PIPELINE_ROOT_PATH/cloudbuild.yaml
!gsutil cp $REPO_DOCKER_PATH_PREFIX/Dockerfile.tfrs $PIPELINE_ROOT_PATH/Dockerfile.tfrs

# # # copy training application code
! gsutil -m cp -r $REPO_DOCKER_PATH_PREFIX/two_tower_jt/* $PIPELINE_ROOT_PATH/trainer

print(f"\n Copied training package and Dockerfile to {PIPELINE_ROOT_PATH}\n")

Copying file://src/cloudbuild.yaml [Content-Type=application/octet-stream]...
/ [1 files][  178.0 B/  178.0 B]                                                
Operation completed over 1 objects/178.0 B.                                      
Copying file://src/Dockerfile.tfrs [Content-Type=application/octet-stream]...
/ [1 files][  387.0 B/  387.0 B]                                                
Operation completed over 1 objects/387.0 B.                                      
Copying file://src/two_tower_jt/__init__.py [Content-Type=text/x-python]...
Copying file://src/two_tower_jt/__pycache__/train_config.cpython-37.pyc [Content-Type=application/x-python-code]...
Copying file://src/two_tower_jt/__pycache__/two_tower_lite.cpython-37.pyc [Content-Type=application/x-python-code]...
Copying file://src/two_tower_jt/__pycache__/__init__.cpython-37.pyc [Content-Type=application/x-python-code]...
Copying file://src/two_tower_jt/__pycache__/two_tower.cpython-37.pyc [Content-Type=application/x

In [57]:
! gsutil ls -Rl $PIPELINE_ROOT_PATH/trainer

gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root/trainer/:
         0  2023-01-13T11:32:28Z  gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root/trainer/__init__.py
       835  2023-01-13T11:32:28Z  gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root/trainer/data-pipeline.py
        44  2023-01-13T11:32:28Z  gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root/trainer/interactive_train.py
       219  2023-01-13T11:32:28Z  gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root/trainer/requirements.txt
     24848  2023-01-13T11:32:28Z  gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root/trainer/task.py
       277  2023-01-13T11:32:28Z  gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root/trainer/train_config.py
     56895  2023-01-13T11:32:28Z  gs://jt-tfrs-central-v2/test-ful

# Build & Submit Pipeline

In [58]:
# Tag identifying this pipeline revision.
PIPELINE_TAG = '2tower-' + PIPELINE_VERSION
print("PIPELINE_TAG:", PIPELINE_TAG)

# Pipeline name; underscores are replaced with hyphens.
PIPELINE_NAME = '-'.join(['trainer', MODEL_VERSION, PIPELINE_TAG]).replace('_', '-')
print("PIPELINE_NAME:", PIPELINE_NAME)

PIPELINE_TAG: 2tower-v11
PIPELINE_NAME: trainer-jtv13-2tower-v11


## Create pipeline

In [59]:
from src.train_pipes import build_custom_image, train_custom_model

@kfp.v2.dsl.pipeline(
    name=PIPELINE_NAME.replace('_', '-')
)
def pipeline(
    project: str,
    project_number: str,
    location: str,
    service_account: str,
    model_version: str,
    pipeline_version: str,
    train_image_uri: str,
    train_output_gcs_bucket: str,
    gcs_train_script_path: str,
    model_display_name: str,
    train_dockerfile_name: str,
    train_dir: str,
    train_dir_prefix: str,
    valid_dir: str,
    valid_dir_prefix: str,
    candidate_file_dir: str,
    candidate_files_prefix: str,
    tensorboard_resource_name: str,
    experiment_name: str,
    experiment_run: str,
    register_model: bool,
):
    """Train the two-tower model and optionally upload both towers.

    Steps defined here:
      1. Run the ``train_custom_model`` component.
      2. Import the query and candidate tower directories it outputs as
         ``UnmanagedContainerModel`` artifacts.
      3. Inside a condition on ``register_model``, upload both artifacts
         with ``ModelUploadOp``.

    ``WORKER_POOL_SPECS`` is read from the notebook namespace when the
    pipeline is compiled; it is not a runtime pipeline parameter.

    Args:
        project: Project passed to the training and upload steps.
        project_number: Not read by the pipeline body.
        location: Location passed to the upload steps.
        service_account: Service account passed to the training step.
        model_version: Forwarded to the training step.
        pipeline_version: Forwarded to the training step.
        train_image_uri: Training image URI forwarded to the training step.
        train_output_gcs_bucket: Forwarded to the training step.
        gcs_train_script_path: Only referenced by the disabled image-build step.
        model_display_name: Forwarded to the training step as ``model_name``
            and used as the suffix of both uploaded model display names.
        train_dockerfile_name: Only referenced by the disabled image-build step.
        train_dir: Not read by the pipeline body.
        train_dir_prefix: Not read by the pipeline body.
        valid_dir: Not read by the pipeline body.
        valid_dir_prefix: Not read by the pipeline body.
        candidate_file_dir: Not read by the pipeline body.
        candidate_files_prefix: Not read by the pipeline body.
        tensorboard_resource_name: Forwarded to the training step.
        experiment_name: Forwarded to the training step.
        experiment_run: Forwarded to the training step.
        register_model: Gates the two model upload steps.
    """

    from kfp.v2.components import importer_node
    from google_cloud_pipeline_components.types import artifact_types

    # Serving image recorded in the metadata of both imported tower artifacts.
    serving_container_spec = {
        'containerSpec': {
            'imageUri': 'us-docker.pkg.dev/vertex-ai/prediction/tf2-gpu.2-10:latest',
        },
    }

    # ========================================================================
    # Build Custom Train Image (disabled: a prebuilt image URI is passed in)
    # ========================================================================

    # build_custom_train_image_op = (
    #     build_custom_train_image.build_custom_train_image(
    #         project=project,
    #         gcs_train_script_path=gcs_train_script_path,
    #         training_image_uri=train_image_uri,
    #         train_dockerfile_name=train_dockerfile_name,
    #     )
    #     .set_display_name("Build custom train image")
    #     .set_caching_options(False)
    # )

    # ========================================================================
    # Train
    # ========================================================================

    run_train_task_op = (
        train_custom_model.train_custom_model(
            project=project,
            model_version=model_version,
            pipeline_version=pipeline_version,
            model_name=model_display_name,
            worker_pool_specs=WORKER_POOL_SPECS,
            train_output_gcs_bucket=train_output_gcs_bucket,
            # vocab_dict_uri=build_vocabs_string_lookups_op.outputs['vocab_gcs_uri'],
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            training_image_uri=train_image_uri,  # build_custom_train_image_op.outputs['training_image_uri'],
            tensorboard_resource_name=tensorboard_resource_name,  # create_tensorboard_op.outputs['tensorboard_resource_name'],
            service_account=service_account,
        )
        .set_display_name("2Tower Training")
        .set_caching_options(True)
        # .after(build_custom_train_image_op)
    )

    # ========================================================================
    # Import trained Query and Candidate Towers to this DAG (metadata)
    # ========================================================================

    import_unmanaged_query_model_task = (
        importer_node.importer(
            artifact_uri=run_train_task_op.outputs['query_tower_dir_uri'],
            artifact_class=artifact_types.UnmanagedContainerModel,
            metadata=serving_container_spec,
        )
        .set_display_name("Import Query Tower")
        .after(run_train_task_op)
        .set_caching_options(True)
    )

    import_unmanaged_candidate_model_task = (
        importer_node.importer(
            artifact_uri=run_train_task_op.outputs['candidate_tower_dir_uri'],
            artifact_class=artifact_types.UnmanagedContainerModel,
            metadata=serving_container_spec,
        )
        .set_display_name("Import Candidate Tower")
        .after(run_train_task_op)
        .set_caching_options(True)
    )

    # ========================================================================
    # Conditional: Upload models to Vertex model registry
    # ========================================================================
    # NOTE(review): `register_model` is declared as bool but compared against
    # the string "True". Confirm with the installed KFP version that passing
    # True actually enables this branch.
    with kfp.v2.dsl.Condition(register_model == "True", name="Register Models"):

        query_model_upload_op = (
            gcc_aip.ModelUploadOp(
                project=project,
                location=location,
                display_name=f'query-tower-{model_display_name}',
                unmanaged_container_model=import_unmanaged_query_model_task.outputs["artifact"],
                labels={"tower": "query"},
            )
            .set_display_name("Upload Query Tower")
            .set_caching_options(True)
        )

        candidate_model_upload_op = (
            gcc_aip.ModelUploadOp(
                project=project,
                location=location,
                display_name=f'candidate-tower-{model_display_name}',
                unmanaged_container_model=import_unmanaged_candidate_model_task.outputs["artifact"],
                labels={"tower": "candidate"},
            )
            # Previously labelled "Upload Query Tower to Vertex" (copy-paste slip).
            .set_display_name("Upload Candidate Tower")
            .set_caching_options(True)
        )
        
        

In [60]:
# ! rm -f custom_container_pipeline_spec.json

PIPELINE_JSON_SPEC_LOCAL = "custom_pipeline_spec.json"

! rm -f $PIPELINE_JSON_SPEC_LOCAL

kfp.v2.compiler.Compiler().compile(
    pipeline_func=pipeline, package_path=PIPELINE_JSON_SPEC_LOCAL,
)

### Save pipeline spec JSON

In [61]:
# !gsutil cp custom_container_pipeline_spec.json $PIPELINE_ROOT_PATH/pipeline_spec.json

PIPELINES_FILEPATH = f'{PIPELINE_ROOT_PATH}/pipeline_spec.json'
print("PIPELINES_FILEPATH:", PIPELINES_FILEPATH)

!gsutil cp $PIPELINE_JSON_SPEC_LOCAL $PIPELINES_FILEPATH

PIPELINES_FILEPATH: gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root/pipeline_spec.json
Copying file://custom_pipeline_spec.json [Content-Type=application/json]...
/ [1 files][ 36.6 KiB/ 36.6 KiB]                                                
Operation completed over 1 objects/36.6 KiB.                                     


In [62]:
!gsutil ls $PIPELINE_ROOT_PATH

gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root/Dockerfile.tfrs
gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root/cloudbuild.yaml
gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root/pipeline_spec.json
gs://jt-tfrs-central-v2/test-full-v14-data-jtv13/run-20230113-112521/pipeline_root/trainer/


## Submit pipeline to Vertex

In [63]:
# Display the pipeline name that is used as the job's display name below.
PIPELINE_NAME

'trainer-jtv13-2tower-v11'

In [64]:
# Reuse the project number resolved via gcloud in the first notebook cell
# rather than repeating it as a literal that can drift from PROJECT_ID.
PROJECT_NUMBER = PROJECT_NUM
vpc_network_name = 'ucaip-haystack-vpc-network'
# SERVICE_ACCOUNT = '934903580331-compute@developer.gserviceaccount.com'
SERVICE_ACCOUNT = 'notebooksa@hybrid-vertex.iam.gserviceaccount.com'

# GCS location of the trainer package staged earlier in the notebook.
TRAIN_APP_CODE_PATH = f'{PIPELINE_ROOT_PATH}/trainer'

# Build the pipeline job from the compiled spec uploaded to GCS.
job = vertex_ai.PipelineJob(
    display_name=PIPELINE_NAME,
    template_path=PIPELINES_FILEPATH,
    pipeline_root=PIPELINE_ROOT_PATH,
    failure_policy='fast', # slow | fast
    # enable_caching=False,
    parameter_values={
        'project': PROJECT_ID,
        'project_number': PROJECT_NUMBER,
        'location': LOCATION,
        'model_version': MODEL_VERSION,
        'pipeline_version': PIPELINE_VERSION,
        'model_display_name': MODEL_ROOT_NAME,
        # 'pipeline_tag': PIPELINE_TAG,
        'gcs_train_script_path': TRAIN_APP_CODE_PATH,
        'train_image_uri': IMAGE_URI,
        'train_output_gcs_bucket': OUTPUT_BUCKET,
        'train_dir': TRAIN_DIR,
        'train_dir_prefix': TRAIN_DIR_PREFIX,
        'valid_dir': VALID_DIR,
        'valid_dir_prefix': VALID_DIR_PREFIX,
        'candidate_file_dir': CANDIDATE_FILE_DIR,
        'candidate_files_prefix': CANDIDATE_PREFIX,
        'tensorboard_resource_name': TB_RESOURCE_NAME,
        'train_dockerfile_name': DOCKERNAME,
        'experiment_name': EXPERIMENT_NAME,
        'experiment_run': RUN_NAME,
        'service_account': SERVICE_ACCOUNT,
        # NOTE(review): the pipeline gates model upload on
        # `register_model == "True"`; confirm a Python bool is the value that
        # comparison expects before switching this to True.
        'register_model': False,
    },
)

# Submit without blocking the notebook; the job runs under SERVICE_ACCOUNT and
# is attached to the VPC network named above.
job.run(
    sync=False,
    service_account=SERVICE_ACCOUNT,
    network=f'projects/{PROJECT_NUMBER}/global/networks/{vpc_network_name}'
)

#### Clean up

In [32]:
# ! rm -rf custom_pipeline_spec.json