# Scaling Two-Tower training with Vertex AI

## Load env config

In [1]:
# Naming convention shared by all cloud resources used in this notebook.
# VERSION is embedded in PREFIX, so changing it renames everything downstream.
VERSION = "v1"  # TODO
PREFIX = f'ndr-{VERSION}'  # TODO

print(f"PREFIX = {PREFIX}")

PREFIX = ndr-v1


In [2]:
# staging GCS
# `!cmd` captures the shell output as a list of lines; the first line of
# `gcloud config get-value project` is taken as the active project id.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
# The bucket name is derived from PREFIX (set in the first cell) and the project id.
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the shared environment file from the bucket; `config.n` is the captured
# output joined into one newline-separated string.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
# NOTE(review): exec() runs whatever the bucket object contains inside this
# kernel's namespace and overwrites names defined above (PROJECT_ID, PREFIX,
# BUCKET_NAME, ...). Only point this at a bucket you control.
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "ndr-v1"
VERSION                  = "v1"

APP                      = "sp"
MODEL_TYPE               = "2tower"
FRAMEWORK                = "tfrs"
DATA_VERSION             = "v1"
TRACK_HISTORY            = "5"

BUCKET_NAME              = "ndr-v1-hybrid-vertex-bucket"
BUCKET_URI               = "gs://ndr-v1-hybrid-vertex-bucket"
SOURCE_BUCKET            = "spotify-million-playlist-dataset"

DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://ndr-v1-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

CANDIDATE_PREFIX         = "candidates"
TRAIN_DIR_PREFIX      

In [3]:
import os
import json
from datetime import datetime
# NOTE(review): this binds the name `time` to the function time.time, but the
# `import time` a few lines below rebinds `time` to the module. Later cells call
# `time.strftime(...)`, so they depend on the module import coming last;
# do not reorder these two lines.
from time import time
import pandas as pd
import logging
import time
from pprint import pprint
import pickle as pkl

# Silence log records of severity WARNING and below for the whole kernel.
logging.disable(logging.WARNING)

from google.cloud import aiplatform as vertex_ai
from google.cloud import storage

In [4]:
# Cloud Storage client bound to the configured project.
storage_client = storage.Client(
    project=PROJECT_ID,
)

# Point the Vertex AI SDK at the same project and region.
vertex_ai.init(
    project=PROJECT_ID,
    location=REGION,
)

## Prepare Train Job Specs

In [5]:
# Echo the vocab filename that the environment config defined.
VOCAB_FILENAME

'vocab_dict.pkl'

In [6]:
# Load the vocabulary dictionary from the local pickle file.
# The context manager closes the file even when unpickling raises, which the
# manual open()/close() pair did not guarantee.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a vocab file that comes from a trusted source.
with open(VOCAB_FILENAME, 'rb') as vocab_file:
    vocab_dict = pkl.load(vocab_file)

In [7]:
# Show which keys the vocabulary dictionary contains, one per line.
for vocab_key in vocab_dict:
    print(vocab_key)

pl_name_src
track_name_pl
artist_name_pl
album_name_pl
artist_genres_pl
tracks_playlist_titles_pl
track_name_can
artist_name_can
album_name_can
artist_genres_can
track_pl_titles_can


### Training Accelerators

In [8]:
# Accelerator presets for the training worker pool. One preset is active at a
# time; the others stay commented out so they can be swapped in quickly.

### A100 (40GB)
# WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
# REPLICA_COUNT = 1
# ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
# PER_MACHINE_ACCELERATOR_COUNT = 1
# REDUCTION_SERVER_COUNT = 0
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'single'

### A100 (80GB)
# WORKER_MACHINE_TYPE = 'a2-ultragpu-1g'
# REPLICA_COUNT = 1
# ACCELERATOR_TYPE = 'NVIDIA_A100_80GB'
# PER_MACHINE_ACCELERATOR_COUNT = 1
# REDUCTION_SERVER_COUNT = 0
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'single'

### Tesla T4 (active preset)
# machine and accelerator
WORKER_MACHINE_TYPE = "n1-standard-16"
ACCELERATOR_TYPE = "NVIDIA_TESLA_T4"  # options noted: NVIDIA_TESLA_T4, NVIDIA_TESLA_V100
PER_MACHINE_ACCELERATOR_COUNT = 1

# replication and distribution
REPLICA_COUNT = 1
DISTRIBUTE_STRATEGY = "single"

# reduction servers (count of 0 here)
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
REDUCTION_SERVER_COUNT = 0

### Vertex Experiments

#### create an experiment and experiment run

In [9]:
# Experiment name = custom prefix + VERSION, used to organize experiments.
EXPERIMENT_PREFIX = 'scale-training'
EXPERIMENT_NAME = f'{EXPERIMENT_PREFIX}-{VERSION}'

# Each execution of this cell stamps a new run name with the current local time,
# so re-running the cell starts a differently named run.
RUN_NAME = f'run-{time.strftime("%Y%m%d-%H%M%S")}'

print(f"EXPERIMENT_NAME: {EXPERIMENT_NAME}")
print(f"RUN_NAME: {RUN_NAME}")

EXPERIMENT_NAME: scale-training-v1
RUN_NAME: run-20230919-154220


#### create Managed TensorBoard instance

In [10]:
# Option A (active): create a new managed TensorBoard instance named after the
# experiment. Re-running this cell creates another instance each time.
TENSORBOARD_DISPLAY_NAME = f"{EXPERIMENT_NAME}"

tensorboard = vertex_ai.Tensorboard.create(
    project=PROJECT_ID,
    location=REGION,
    display_name=TENSORBOARD_DISPLAY_NAME,
)
TB_RESOURCE_NAME = tensorboard.resource_name

# Option B: reuse an existing TB instance.
# NOTE(review): the last print below reads `tensorboard.display_name`, so it still
# needs the `tensorboard` object even if only this line is enabled.
# TB_RESOURCE_NAME = 'projects/934903580331/locations/us-central1/tensorboards/6924469145035603968'

print(f"TB_RESOURCE_NAME: {TB_RESOURCE_NAME}")
print(f"TB display name: {tensorboard.display_name}")

TB_RESOURCE_NAME: projects/934903580331/locations/us-central1/tensorboards/3637070115473719296
TB display name: scale-training-v1


### training config

In [11]:
# Seed forwarded to the training job via --seed.
SEED = 1234

# -------------------------------------------------
# trainconfig: GPU related
# -------------------------------------------------
# Kept as a string; values tried: '1' | '4' | '8'
TF_GPU_THREAD_COUNT = "8"

# -------------------------------------------------
# trainconfig: data input pipeline
# -------------------------------------------------
BLOCK_LENGTH = 64        # values tried: 1, 8, 16, 32, 64
NUM_DATA_SHARDS = 4      # values tried: 2, 4, 8, 16, 32, 64
# TRAIN_PREFETCH=3

# -------------------------------------------------
# trainconfig: training hparams
# -------------------------------------------------
NUM_EPOCHS = 5
LEARNING_RATE = 0.01
BATCH_SIZE = 4096        # values tried: 8192, 4096, 2048, 1024, 512
DROPOUT_RATE = 0.33      # dropout

# model size
EMBEDDING_DIM = 128
PROJECTION_DIM = 50
LAYER_SIZES = "[512,256,128]"   # stored as a string, not a list
MAX_TOKENS = 20000              # vocab

# -------------------------------------------------
# trainconfig: tensorboard
# -------------------------------------------------
EMBED_FREQUENCY = 0
HISTOGRAM_FREQUENCY = 0
CHECKPOINT_FREQ = "epoch"

In [14]:
# -------------------------------------------------
# trainconfig: train & valid steps
# -------------------------------------------------
# Example counts for each split.
train_sample_cnt = 8_205_265
valid_samples_cnt = 82_959

# validation & evaluation
VALID_FREQUENCY = 20

# Steps are the number of full batches in each split (floor division drops
# the final partial batch).
EPOCH_STEPS = train_sample_cnt // BATCH_SIZE
VALID_STEPS = valid_samples_cnt // BATCH_SIZE  # alternative noted by the author: 100

print(f"VALID_STEPS: {VALID_STEPS}")
print(f"EPOCH_STEPS: {EPOCH_STEPS}")

VALID_STEPS: 20
EPOCH_STEPS: 2003


### data sources

In [15]:
# -------------------------------------------------
# trainconfig: Data sources
# -------------------------------------------------
# Object prefixes inside the bucket, all keyed by DATA_VERSION.
# NOTE(review): TRAIN_DIR_PREFIX currently points at the `valid` split (the
# author's trailing note names `train` as the alternative). Meanwhile
# EPOCH_STEPS is derived from the train-split sample count. Confirm this is
# an intentional small run before launching a real training job.
TRAIN_DIR_PREFIX = f'data/{DATA_VERSION}/valid'  # train
VALID_DIR_PREFIX = f'data/{DATA_VERSION}/valid'
CANDIDATE_PREFIX = f'data/{DATA_VERSION}/candidates'

### training args

In [16]:
# Command run by each training worker.
WORKER_CMD = ["python", "-m", "src.two_tower_jt.task"]
# WORKER_CMD = ["python", "./task.py"]
# WORKER_CMD = ["python", "-m", "task"]

# Command-line arguments for the worker, assembled group by group.
# The groups are concatenated in the order listed below.
WORKER_ARGS = (
    # project and bucket locations of the train / valid / candidate data
    [
        f'--project={PROJECT_ID}',
        f'--train_output_gcs_bucket={BUCKET_NAME}',
        f'--train_dir={BUCKET_NAME}',
        f'--train_dir_prefix={TRAIN_DIR_PREFIX}',
        f'--valid_dir={BUCKET_NAME}',
        f'--valid_dir_prefix={VALID_DIR_PREFIX}',
        f'--candidate_file_dir={BUCKET_NAME}',
        f'--candidate_files_prefix={CANDIDATE_PREFIX}',
    ]
    # experiment / run identifiers
    + [
        f'--experiment_name={EXPERIMENT_NAME}',
        f'--experiment_run={RUN_NAME}',
    ]
    # hyperparameters and runtime settings
    + [
        f'--num_epochs={NUM_EPOCHS}',
        f'--batch_size={BATCH_SIZE}',
        f'--embedding_dim={EMBEDDING_DIM}',
        f'--projection_dim={PROJECTION_DIM}',
        f'--layer_sizes={LAYER_SIZES}',
        f'--learning_rate={LEARNING_RATE}',
        f'--valid_frequency={VALID_FREQUENCY}',
        f'--valid_steps={VALID_STEPS}',
        f'--epoch_steps={EPOCH_STEPS}',
        f'--distribute={DISTRIBUTE_STRATEGY}',
        f'--model_version={VERSION}',
        f'--pipeline_version={VERSION}',
        f'--seed={SEED}',
        f'--max_tokens={MAX_TOKENS}',
        f'--tb_resource_name={TB_RESOURCE_NAME}',
        f'--embed_frequency={EMBED_FREQUENCY}',
        f'--hist_frequency={HISTOGRAM_FREQUENCY}',
        f'--tf_gpu_thread_count={TF_GPU_THREAD_COUNT}',
        f'--block_length={BLOCK_LENGTH}',
        f'--num_data_shards={NUM_DATA_SHARDS}',
        f'--chkpt_freq={CHECKPOINT_FREQ}',
        f'--dropout_rate={DROPOUT_RATE}',
    ]
    # boolean switches: a flag that is present passes True
    + [
        # uncomment these to pass value of True (bool)
        # f'--cache_train',                              # caches train_dataset
        # f'--evaluate_model',                           # runs model.eval()
        # f'--write_embeddings',                         # writes embeddings index in train job
        f'--profiler',                                   # runs TB profiler
        # f'--set_jit',                                  # enables XLA
        f'--compute_batch_metrics',
        f'--use_cross_layer',
        f'--use_dropout',
    ]
)


In [17]:
from pprint import pprint

from util import workerpool_specs

# Build the worker pool specs from the accelerator preset and the worker
# command/arguments defined in earlier cells.
WORKER_POOL_SPECS = workerpool_specs.prepare_worker_pool_specs(
    # container
    image_uri=f"{REMOTE_IMAGE_NAME}:latest",
    cmd=WORKER_CMD,
    args=WORKER_ARGS,
    # workers
    machine_type=WORKER_MACHINE_TYPE,
    replica_count=REPLICA_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=PER_MACHINE_ACCELERATOR_COUNT,
    # reduction servers
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
    reduction_server_count=REDUCTION_SERVER_COUNT,
)

# Print the resulting specs for a visual check before submitting.
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--project=hybrid-vertex',
                              '--train_output_gcs_bucket=ndr-v1-hybrid-vertex-bucket',
                              '--train_dir=ndr-v1-hybrid-vertex-bucket',
                              '--train_dir_prefix=data/v1/valid',
                              '--valid_dir=ndr-v1-hybrid-vertex-bucket',
                              '--valid_dir_prefix=data/v1/valid',
                              '--candidate_file_dir=ndr-v1-hybrid-vertex-bucket',
                              '--candidate_files_prefix=data/v1/candidates',
                              '--experiment_name=scale-training-v1',
                              '--experiment_run=run-20230919-154220',
                              '--num_epochs=5',
                              '--batch_size=4096',
                              '--embedding_dim=128',
                              '--projection_dim=50',
                              '--layer_sizes=[512,256,128]',
              

### copy training package to GCS

In [18]:
# Per-run output location: gs://<bucket>/<experiment>/<run>.
BASE_OUTPUT_DIR = f'gs://{BUCKET_NAME}/{EXPERIMENT_NAME}/{RUN_NAME}'

# copy training Dockerfile
# NOTE(review): REPO_SRC is not defined in any visible cell; presumably it
# comes from the exec'd environment config. Confirm before running.
# !gsutil -q cp $REPO_SRC/cloudbuild.yaml $BASE_OUTPUT_DIR/cloudbuild.yaml
!gsutil -q cp $REPO_SRC/Dockerfile_tfrs $BASE_OUTPUT_DIR/Dockerfile_tfrs
# NOTE(review): the vocab filename is hardcoded here, while the load cell reads
# it from VOCAB_FILENAME; the two agree only while the config keeps that value.
!gsutil -q cp vocab_dict.pkl $BASE_OUTPUT_DIR/vocab_dict.pkl

# copy training application code (recursive, parallel copy into <run>/trainer)
! gsutil -q -m cp -r $REPO_SRC/two_tower_jt/* $BASE_OUTPUT_DIR/trainer

print(f"\n Copied training package and Dockerfile to {BASE_OUTPUT_DIR}\n")


 Copied training package and Dockerfile to gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230919-154220



In [19]:
# List the run directory to confirm the files copied above are present.
! gsutil ls $BASE_OUTPUT_DIR

gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230919-154220/Dockerfile_tfrs
gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230919-154220/vocab_dict.pkl
gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230919-154220/trainer/


## submit training job to Vertex

In [20]:
# Re-initialize the SDK, this time also passing the experiment name.
vertex_ai.init(project=PROJECT_ID, location=REGION, experiment=EXPERIMENT_NAME)

# Display name for the custom job: version + run name.
JOB_NAME = f'train-{VERSION}-{RUN_NAME}'
print(f"JOB_NAME: {JOB_NAME}")

JOB_NAME: train-v1-run-20230919-154220


In [21]:
# Define the custom training job; outputs and staging files both live under
# this run's BASE_OUTPUT_DIR.
job = vertex_ai.CustomJob(
    display_name=JOB_NAME,
    base_output_dir=BASE_OUTPUT_DIR,
    staging_bucket=f"{BASE_OUTPUT_DIR}/staging",
    worker_pool_specs=WORKER_POOL_SPECS,
)

In [22]:
# Submit the job, attaching the TensorBoard instance and running as VERTEX_SA.
# NOTE(review): with sync=False the call presumably returns before the job
# finishes; confirm against the Vertex AI SDK docs before relying on it.
job.run(
    service_account=VERTEX_SA,
    tensorboard=TB_RESOURCE_NAME,
    enable_web_access=True,
    restart_job_on_worker_restart=False,
    sync=False,
)

## TensorBoard Profiler

Once the profiler has uploaded trace logs to `BASE_OUTPUT_DIR/logs`, we can use the in-notebook TensorBoard extension to view the profiler.

<img
  src="img/tfrs-train-profiler-v1.png"
  alt="Alt text"
  title="train profiler"
  style="display: inline-block; margin: 0 auto; max-width: 1200px">

In [23]:
# Set TensorFlow's C++ log level before the import below so it is in the
# environment when TensorFlow loads; keep this line ahead of the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# NOTE(review): imported mid-notebook rather than in the imports cell, presumably
# so the environment variable above is set first.
import tensorflow as tf

# Log directory under this run's output dir, passed to the TensorBoard magic below.
TB_LOGS_PATH = f'{BASE_OUTPUT_DIR}/logs' # 
print(f"TB_LOGS_PATH: {TB_LOGS_PATH}")

TB_LOGS_PATH: gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230919-154220/logs


In [24]:
# Load the TensorBoard notebook extension (provides the %tensorboard magic).
%load_ext tensorboard
# Use the reload variant instead if the extension is already loaded in this kernel.
# %reload_ext tensorboard

In [25]:
# Launch TensorBoard inline against this run's logs directory.
%tensorboard --logdir=$TB_LOGS_PATH

# Notes

In [34]:
# NOTE(review): interactive introspection left over from development; it needs
# `job` to exist and its execution count is out of sequence with the cells above.
# Consider removing it before sharing the notebook.
?job.run

Signature:
job.run(
    service_account: Union[str, NoneType] = None,
    network: Union[str, NoneType] = None,
    timeout: Union[int, NoneType] = None,
    restart_job_on_worker_restart: bool = False,
    enable_web_access: bool = False,
    experiment: Union[ForwardRef('aiplatform.Experiment')

**Finished**