# Scaling Two-Tower training with Vertex AI

In [1]:
# Naming prefix shared by the resources of this notebook series
# (interpolated into the staging bucket name in the next cell).
PREFIX = "twotower-v1"

In [2]:
# Bootstrap cell: derive the staging bucket from the active gcloud project,
# then load the shared notebook environment stored in that bucket.
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch config/notebook_env.py from the bucket, echo it, and execute it in
# this notebook's namespace (`config.n` is the captured output as one
# newline-joined string).
# NOTE(review): exec() runs whatever the bucket object contains - only point
# this at a bucket you control. If the gsutil call fails, the captured error
# text is what gets exec'd, which presumably surfaces as a SyntaxError.
# NOTE(review): judging by the echoed output, the executed file re-assigns
# PROJECT_ID, PREFIX, BUCKET_NAME and BUCKET_URI set above - confirm.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "twotower-v1"
VERSION                  = "v1"
DATA_VERSION             = "v1-0-0"

BUCKET_NAME              = "twotower-v1-hybrid-vertex-bucket"
BUCKET_URI               = "gs://twotower-v1-hybrid-vertex-bucket"
SOURCE_BUCKET            = "spotify-million-playlist-dataset"

DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://twotower-v1-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BQ_DATASET               = "spotify_e2e_test"

REPO_SRC                 = "sr

In [3]:
# GCP_PROJECTS = !gcloud config get-value project
# PROJECT_ID = GCP_PROJECTS[0]
# PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
# PROJECT_NUM = PROJECT_NUM[0]
# REGION = 'us-central1'

# print(f"PROJECT_ID: {PROJECT_ID}")
# print(f"PROJECT_NUM: {PROJECT_NUM}")
# print(f"REGION: {REGION}")

# VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'

In [4]:
import os
import json
from datetime import datetime
from time import time
import pandas as pd
import logging
import time
from pprint import pprint
import pickle as pkl

logging.disable(logging.WARNING)

from google.cloud import aiplatform as vertex_ai
from google.cloud import storage

In [5]:
# Set the project / region defaults used by subsequent Vertex AI SDK calls.
vertex_ai.init(location=REGION, project=PROJECT_ID)

# Cloud Storage client bound to the same project.
storage_client = storage.Client(project=PROJECT_ID)

### update vars

In [6]:
# VERSION= "trainerv7"
# APP='sp'
# MODEL_TYPE='2tower'
# FRAMEWORK = 'tfrs'
# MODEL_ROOT_NAME = f'{APP}-{MODEL_TYPE}-{FRAMEWORK}-{VERSION}'

# print(f"MODEL_ROOT_NAME: {MODEL_ROOT_NAME}")

## Create training package

In [7]:
# REPO_DOCKER_PATH_PREFIX = 'src'

In [8]:
# # Docker definitions for training
# IMAGE_NAME = f'{MODEL_ROOT_NAME}-tr'
# IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'

# DOCKERNAME = 'tfrs'
# REPO_DOCKER_PATH_PREFIX = 'src'
# MACHINE_TYPE ='e2-highcpu-32'
# FILE_LOCATION = './src'

# print(f"IMAGE_URI: {IMAGE_URI}")

In [9]:
# %%writefile {REPO_DOCKER_PATH_PREFIX}/cloudbuild.yaml

# steps:
# - name: 'gcr.io/cloud-builders/docker'
#   args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/Dockerfile_$_DOCKERNAME']
# images:
# - '$_IMAGE_URI'

In [10]:
# %%writefile {REPO_DOCKER_PATH_PREFIX}/two_tower_jt/train_config.py
# PROJECT_ID='hybrid-vertex'
# MAX_PLAYLIST_LENGTH = 5

In [11]:
# %%writefile {REPO_DOCKER_PATH_PREFIX}/two_tower_jt/requirements.txt
# # google-cloud-aiplatform>=1.25.0
# google-cloud-aiplatform[cloud_profiler]>=1.25.0
# tensorflow-recommenders==0.7.2
# tensorboard==2.10.1
# # tensorboard==2.11.2 
# tensorboard-data-server==0.6.1
# tensorboard-plugin-profile==2.11.1
# tensorflow-io==0.27.0

In [12]:
# %%writefile {REPO_DOCKER_PATH_PREFIX}/Dockerfile_{DOCKERNAME}

# # FROM tensorflow/tensorflow:2.10.1-gpu
# FROM gcr.io/deeplearning-platform-release/tf-gpu.2-11

# WORKDIR /src

# # Copies the trainer code to the docker image.
# # COPY src/two_tower_jt/* src/two_tower_jt/ 
# COPY two_tower_jt/* ./ 

# # RUN pip install -r two_tower_jt/requirements.txt
# RUN pip install -r ./requirements.txt

# RUN apt update && apt -y install nvtop

## Build Training image with Cloud Build

In [13]:
# print(f"DOCKERNAME: {DOCKERNAME}")
# print(f"IMAGE_URI: {IMAGE_URI}")
# print(f"FILE_LOCATION: {FILE_LOCATION}")
# print(f"MACHINE_TYPE: {MACHINE_TYPE}")

### set `gcloudignore`

In [14]:
# ! gcloud config set gcloudignore/enabled true

In [15]:
# %%writefile .gcloudignore
# .gcloudignore
# /local_files/
# img/*
# *.pkl
# *.png
# *.ipynb
# .git
# .github
# .ipynb_checkpoints/*
# *__pycache__
# *cpython-37.pyc
# spotipy_secret_creds.py
# custom_pipeline_spec.json
# /WIP/*
# beam_candidates/*
# beam_training/*
# learning/*
# src/vocab_pipes/*
# src/train_pipes/*
# src/feature_pipes/*
# test_root/*
# custom_track_meta_pipeline_spec.json
# pip_freeze.txt
# README.md
# .gitignore
# .DS_Store

In [16]:
# !gcloud meta list-files-for-upload

### submit job to Cloud Build

In [17]:
# ! gcloud builds submit --config src/cloudbuild.yaml \
#     --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION \
#     --timeout=2h \
#     --machine-type=$MACHINE_TYPE

## Prepare Train Job Specs

In [18]:
# Load the vocabulary dictionary from the local pickle file.
# The context manager closes the file even when unpickling raises, which the
# previous open()/close() pair did not guarantee.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# use a vocab_dict.pkl that comes from a trusted source.
with open('vocab_dict.pkl', 'rb') as vocab_file:
    vocab_dict = pkl.load(vocab_file)

In [19]:
# Print every top-level key of the loaded vocabulary, one per line.
for feature_name in list(vocab_dict):
    print(feature_name)

pl_name_src
track_name_pl
artist_name_pl
album_name_pl
artist_genres_pl
tracks_playlist_titles_pl
track_name_can
artist_name_can
album_name_can
artist_genres_can
track_pl_titles_can


### Training Accelerators

In [88]:
# Accelerator profiles for the training worker pool.
# Exactly one profile should be active; the others are kept as commented
# alternatives that can be swapped in.

# --- A100 (40GB) ---
# WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
# REPLICA_COUNT = 1
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
# PER_MACHINE_ACCELERATOR_COUNT = 1
# REDUCTION_SERVER_COUNT = 0
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'single'

# --- A100 (80GB) ---
# WORKER_MACHINE_TYPE = 'a2-ultragpu-1g'
# REPLICA_COUNT = 1
# ACCELERATOR_TYPE = 'NVIDIA_A100_80GB'
# PER_MACHINE_ACCELERATOR_COUNT = 1
# REDUCTION_SERVER_COUNT = 0
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'single'

# --- Tesla T4 (active) ---
# accelerator attached to each worker (alternatives: NVIDIA_TESLA_T4, NVIDIA_TESLA_V100)
ACCELERATOR_TYPE = "NVIDIA_TESLA_T4"
PER_MACHINE_ACCELERATOR_COUNT = 1

# worker machines
WORKER_MACHINE_TYPE = "n1-standard-16"
REPLICA_COUNT = 1
DISTRIBUTE_STRATEGY = "single"

# reduction servers (count is 0 for this profile)
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
REDUCTION_SERVER_COUNT = 0

### Vertex Experiments

#### create an experiment and experiment run

In [89]:
# Experiment name = custom prefix + VERSION; the run name carries a
# timestamp taken when this cell executes, so every execution yields a new run.
EXPERIMENT_PREFIX = "test-v3"  # custom identifier for organizing experiments
run_timestamp = time.strftime("%Y%m%d-%H%M%S")

EXPERIMENT_NAME = "{}-{}".format(EXPERIMENT_PREFIX, VERSION)
RUN_NAME = "run-" + run_timestamp

print(
    f"EXPERIMENT_NAME: {EXPERIMENT_NAME}",
    f"RUN_NAME: {RUN_NAME}",
    sep="\n",
)

EXPERIMENT_NAME: test-v3-v1
RUN_NAME: run-20230919-031924


#### create Managed TensorBoard instance

In [90]:
# Create a new managed TensorBoard instance named after the experiment.
# NOTE(review): every execution of this cell creates another instance.
TENSORBOARD_DISPLAY_NAME = "{}".format(EXPERIMENT_NAME)

tensorboard = vertex_ai.Tensorboard.create(
    location=REGION,
    project=PROJECT_ID,
    display_name=TENSORBOARD_DISPLAY_NAME,
)
TB_RESOURCE_NAME = tensorboard.resource_name

# Alternative: reuse an existing instance by setting its resource name directly
# (the display-name print below still needs the `tensorboard` object).
# TB_RESOURCE_NAME = 'projects/934903580331/locations/us-central1/tensorboards/6924469145035603968'

print("TB_RESOURCE_NAME: {}".format(TB_RESOURCE_NAME))
print("TB display name: {}".format(tensorboard.display_name))

TB_RESOURCE_NAME: projects/934903580331/locations/us-central1/tensorboards/7279356314109607936
TB display name: test-v3-v1-v1


### training config

In [91]:
# Training configuration. Each constant below is forwarded to the trainer
# as a command-line argument when the worker args are assembled.

SEED = 1234

# ---- GPU ----
TF_GPU_THREAD_COUNT = "8"  # options tried: '1' | '4' | '8'

# ---- data input pipeline ----
NUM_DATA_SHARDS = 4  # options: 2, 4, 8, 16, 32, 64
BLOCK_LENGTH = 64    # options: 1, 8, 16, 32, 64
# TRAIN_PREFETCH=3

# ---- training hyper-parameters ----
BATCH_SIZE = 4096  # options: 8192, 4096, 2048, 1024, 512
NUM_EPOCHS = 10
LEARNING_RATE = 0.01
DROPOUT_RATE = 0.33

# ---- model size ----
MAX_TOKENS = 20000  # vocab
EMBEDDING_DIM = 128
PROJECTION_DIM = 50
LAYER_SIZES = "[512,256,128]"  # kept as a string; it is passed on the command line

# ---- TensorBoard ----
CHECKPOINT_FREQ = "epoch"
HISTOGRAM_FREQUENCY = 0
EMBED_FREQUENCY = 0

In [92]:
# Steps per epoch / per validation pass, derived from the sample counts.
# Floor division: a trailing partial batch is not counted as a step.
train_sample_cnt = 8_205_265
valid_samples_cnt = 82_959

# validation & evaluation
VALID_FREQUENCY = 20

EPOCH_STEPS, VALID_STEPS = (
    sample_count // BATCH_SIZE
    for sample_count in (train_sample_cnt, valid_samples_cnt)
)

print(
    f"VALID_STEPS: {VALID_STEPS}",
    f"EPOCH_STEPS: {EPOCH_STEPS}",
    sep="\n",
)

VALID_STEPS: 20
EPOCH_STEPS: 2003


### data sources

In [93]:
# ---- GCS output location ----
OUTPUT_BUCKET = "jt-tfrs-central-v3"  # TODO: change this
OUTPUT_GCS_URI = "gs://" + OUTPUT_BUCKET

# ---- input data ----
# Previous data regime, kept for reference:
# BUCKET_DATA_DIR = 'spotify-data-regimes'
# # data strategy: 08m
# CANDIDATE_PREFIX = 'jtv15-8m/candidates'
# TRAIN_DIR_PREFIX = 'jtv15-8m/train'     # train | train_v14
# VALID_DIR_PREFIX = 'jtv15-8m/valid'     # valid_v14

BUCKET_DATA_DIR = "matching-engine-content"
# DATA_VERSION = 'v1-0-0'

# Prefixes inside BUCKET_DATA_DIR, all rooted at the data version.
# NOTE(review): the train prefix currently points at the `valid` split
# (the `train` suffix is commented out) - presumably a small subset for quick
# test runs; switch back to `train` for a full training job.
CANDIDATE_PREFIX = "{}/candidates".format(DATA_VERSION)
VALID_DIR_PREFIX = "{}/valid".format(DATA_VERSION)  # valid_v9 | train_v9
TRAIN_DIR_PREFIX = "{}/valid".format(DATA_VERSION)  # train  (subset: valid_v9 | train_v9)

### training args

In [94]:
# Command executed inside the training container.
# Earlier variants: ["python", "two_tower_jt/task.py"], ["python", "./task.py"]
WORKER_CMD = ["python", "-m", "task"]

# `--name=value` arguments, listed in the order they are passed to the trainer.
worker_arg_values = {
    'project': PROJECT_ID,
    'train_output_gcs_bucket': OUTPUT_BUCKET,
    'train_dir': BUCKET_DATA_DIR,
    'train_dir_prefix': TRAIN_DIR_PREFIX,
    'valid_dir': BUCKET_DATA_DIR,
    'valid_dir_prefix': VALID_DIR_PREFIX,
    'candidate_file_dir': BUCKET_DATA_DIR,
    'candidate_files_prefix': CANDIDATE_PREFIX,
    'experiment_name': EXPERIMENT_NAME,
    'experiment_run': RUN_NAME,
    'num_epochs': NUM_EPOCHS,
    'batch_size': BATCH_SIZE,
    'embedding_dim': EMBEDDING_DIM,
    'projection_dim': PROJECTION_DIM,
    'layer_sizes': LAYER_SIZES,
    'learning_rate': LEARNING_RATE,
    'valid_frequency': VALID_FREQUENCY,
    'valid_steps': VALID_STEPS,
    'epoch_steps': EPOCH_STEPS,
    'distribute': DISTRIBUTE_STRATEGY,
    'model_version': VERSION,
    'pipeline_version': VERSION,
    'seed': SEED,
    'max_tokens': MAX_TOKENS,
    'tb_resource_name': TB_RESOURCE_NAME,
    'embed_frequency': EMBED_FREQUENCY,
    'hist_frequency': HISTOGRAM_FREQUENCY,
    'tf_gpu_thread_count': TF_GPU_THREAD_COUNT,
    'block_length': BLOCK_LENGTH,
    'num_data_shards': NUM_DATA_SHARDS,
    'chkpt_freq': CHECKPOINT_FREQ,
    'dropout_rate': DROPOUT_RATE,
}

# Boolean switches: including the bare flag passes a value of True.
# Uncomment an entry to enable it.
worker_flags = [
    # 'cache_train',        # caches train_dataset
    # 'evaluate_model',     # runs model.eval()
    # 'write_embeddings',   # writes embeddings index in train job
    # 'profiler',           # runs TB profiler
    # 'set_jit',            # enables XLA
    'compute_batch_metrics',
    'use_cross_layer',
    'use_dropout',
]

WORKER_ARGS = [
    f'--{arg_name}={arg_value}'
    for arg_name, arg_value in worker_arg_values.items()
]
WORKER_ARGS += [f'--{flag_name}' for flag_name in worker_flags]


In [95]:
from pprint import pprint

from util import workerpool_specs

# Assemble the worker pool specification from the container image, the
# command/args defined above and the selected accelerator profile,
# then pretty-print it for inspection.
worker_pool_options = dict(
    image_uri="{}".format(REMOTE_IMAGE_NAME),
    cmd=WORKER_CMD,
    args=WORKER_ARGS,
    machine_type=WORKER_MACHINE_TYPE,
    replica_count=REPLICA_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=PER_MACHINE_ACCELERATOR_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
    reduction_server_count=REDUCTION_SERVER_COUNT,
)
WORKER_POOL_SPECS = workerpool_specs.prepare_worker_pool_specs(**worker_pool_options)

pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--project=hybrid-vertex',
                              '--train_output_gcs_bucket=jt-tfrs-central-v3',
                              '--train_dir=matching-engine-content',
                              '--train_dir_prefix=v1-0-0/valid',
                              '--valid_dir=matching-engine-content',
                              '--valid_dir_prefix=v1-0-0/valid',
                              '--candidate_file_dir=matching-engine-content',
                              '--candidate_files_prefix=v1-0-0/candidates',
                              '--experiment_name=test-v3-v1',
                              '--experiment_run=run-20230919-031924',
                              '--num_epochs=10',
                              '--batch_size=4096',
                              '--embedding_dim=128',
                              '--projection_dim=50',
                              '--layer_sizes=[512,256,128]',
                              '--learning_ra

### copy training package to GCS

In [96]:
BASE_OUTPUT_DIR = f'gs://{OUTPUT_BUCKET}/{EXPERIMENT_NAME}/{RUN_NAME}'

# copy training Dockerfile
# !gsutil -q cp $REPO_SRC/cloudbuild.yaml $BASE_OUTPUT_DIR/cloudbuild.yaml
!gsutil -q cp $REPO_SRC/Dockerfile_tfrs $BASE_OUTPUT_DIR/Dockerfile_tfrs
!gsutil -q cp vocab_dict.pkl $BASE_OUTPUT_DIR/vocab_dict.pkl

# # # copy training application code
! gsutil -q -m cp -r $REPO_SRC/two_tower_jt/* $BASE_OUTPUT_DIR/trainer

print(f"\n Copied training package and Dockerfile to {BASE_OUTPUT_DIR}\n")


 Copied training package and Dockerfile to gs://jt-tfrs-central-v3/test-v3-v1/run-20230919-031924



In [97]:
! gsutil ls $BASE_OUTPUT_DIR

gs://jt-tfrs-central-v3/test-v3-v1/run-20230919-031924/Dockerfile_tfrs
gs://jt-tfrs-central-v3/test-v3-v1/run-20230919-031924/vocab_dict.pkl
gs://jt-tfrs-central-v3/test-v3-v1/run-20230919-031924/trainer/


## submit training job to Vertex

In [98]:
# Re-initialise the SDK, this time also setting the experiment.
vertex_ai.init(experiment=EXPERIMENT_NAME, location=REGION, project=PROJECT_ID)

# Job display name: "train-<VERSION>-<RUN_NAME>"
JOB_NAME = "train-{}-{}".format(VERSION, RUN_NAME)
print("JOB_NAME: {}".format(JOB_NAME))

JOB_NAME: train-v1-run-20230919-031924


In [99]:
# Define the custom training job; outputs and staging files both live under
# this run's BASE_OUTPUT_DIR. The job is launched separately via job.run().
custom_job_options = dict(
    display_name=JOB_NAME,
    worker_pool_specs=WORKER_POOL_SPECS,
    base_output_dir=BASE_OUTPUT_DIR,
    staging_bucket="{}/staging".format(BASE_OUTPUT_DIR),
)
job = vertex_ai.CustomJob(**custom_job_options)

In [100]:
# Launch the training job with the managed TensorBoard and the Vertex
# service account attached. sync=False is passed so the call is not made
# synchronously.
job_run_options = {
    "tensorboard": TB_RESOURCE_NAME,
    "service_account": VERTEX_SA,
    "restart_job_on_worker_restart": False,
    "enable_web_access": True,
    "sync": False,
}
job.run(**job_run_options)

## TensorBoard Profiler

Once the profiler has uploaded trace logs to `BASE_OUTPUT_DIR/logs`, we can use the in-notebook TensorBoard extension to view the profiler

<img
  src="img/tfrs-train-profiler-v1.png"
  alt="Alt text"
  title="train profiler"
  style="display: inline-block; margin: 0 auto; max-width: 1200px">

In [101]:
# Set TF_CPP_MIN_LOG_LEVEL before the tensorflow import below.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '2'})

import tensorflow as tf

# Log directory of this run, used as the TensorBoard logdir.
TB_LOGS_PATH = "{}/logs".format(BASE_OUTPUT_DIR)
print("TB_LOGS_PATH: {}".format(TB_LOGS_PATH))

TB_LOGS_PATH: gs://jt-tfrs-central-v3/test-v3-v1/run-20230919-031924/logs


In [102]:
%load_ext tensorboard
# %reload_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [103]:
%tensorboard --logdir=$TB_LOGS_PATH

# Notes

In [34]:
# Interactive help only: shows the signature/docstring of job.run.
# Requires `job` to exist in the kernel; has no effect on the training job.
?job.run

[0;31mSignature:[0m
[0mjob[0m[0;34m.[0m[0mrun[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mservice_account[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnetwork[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtimeout[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrestart_job_on_worker_restart[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0menable_web_access[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mexperiment[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mForwardRef[0m[0;34m([0m[0;34m'aiplatform.Experiment'[0m[0;34m)

**Finished**