# Scaling TFRS Ranking model

In [1]:
! python3 -c "import google.cloud.aiplatform; print('aiplatform SDK version: {}'.format(google.cloud.aiplatform.__version__))"

aiplatform SDK version: 1.26.1


In [2]:
import os

# Run the rest of the notebook from the repository root: later cells use
# relative paths such as 'src/...' and 'vocab_dict.pkl'.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
root_path = '/home/jupyter/jw-repo2/spotify_mpd_two_tower'
os.chdir(root_path)
os.getcwd()

'/home/jupyter/jw-repo2/spotify_mpd_two_tower'

In [3]:
# Resolve the active GCP project from the local gcloud configuration.
# The `name = !cmd` form captures the command output as a list of lines,
# so the first element is taken as the value.
# creds, PROJECT_ID = google.auth.default()
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# Look up the numeric project number; the name is first bound to the captured
# output lines and then rebound to the first line.
PROJECT_NUM              = !gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NUM              = PROJECT_NUM[0]

# Service account address derived from the project number; it is passed to
# job.run(service_account=...) when the training job is submitted.
VERTEX_SA                = f'{PROJECT_NUM}-compute@developer.gserviceaccount.com' # 934903580331
# Default region. NOTE(review): REGION is reassigned in the
# "Training Accelerators" cell further down.
REGION                   = 'us-central1'

print(f"PROJECT_ID     = {PROJECT_ID}")
print(f"PROJECT_NUM    = {PROJECT_NUM}")
print(f"VERTEX_SA      = {VERTEX_SA}")

PROJECT_ID     = hybrid-vertex
PROJECT_NUM    = 934903580331
VERTEX_SA      = 934903580331-compute@developer.gserviceaccount.com


In [4]:
import os
import json
import random
import string
import logging
import pickle as pkl
import pandas as pd
from pprint import pprint

from datetime import datetime
# NOTE(review): `import time` below rebinds the name `time` to the module, so the
# function imported by `from time import time` is shadowed. Later cells call
# `time.strftime(...)`, which relies on the module binding — keep `import time` last.
from time import time
import time

from google.cloud import aiplatform as vertex_ai
from google.cloud import storage

logging.disable(logging.WARNING)

In [5]:
# Cloud Storage client bound to the active project.
storage_client = storage.Client(project=PROJECT_ID)

# Set the default project and region for subsequent Vertex AI SDK calls.
vertex_ai.init(
    project=PROJECT_ID,
    location=REGION,
)

### update vars

In [6]:
DOCKERNAME                = 'Dockerfile_rank'
print(f"DOCKERNAME        = {DOCKERNAME}")

DOCKERNAME        = Dockerfile_rank


In [7]:
# Naming components for this model build; joined with '-' into the root name
# that image names and job names are derived from.
VERSION                   = "v9"
APP                       = 'sp'
MODEL_TYPE                = 'rank'
FRAMEWORK                 = 'tfrs'
MODEL_ROOT_NAME           = '-'.join([APP, MODEL_TYPE, FRAMEWORK, VERSION])

print("MODEL_ROOT_NAME: " + MODEL_ROOT_NAME)

MODEL_ROOT_NAME: sp-rank-tfrs-v9


## Create training package

In [8]:
REPO_DOCKER_PATH_PREFIX = 'src'
TRAIN_SUBFOLDER = 'ranking'

# ! rm -rf $REPO_DOCKER_PATH_PREFIX/$TRAIN_SUBFOLDER
# ! mkdir -p $REPO_DOCKER_PATH_PREFIX/$TRAIN_SUBFOLDER
# ! touch $REPO_DOCKER_PATH_PREFIX/$TRAIN_SUBFOLDER/__init__.py

### train config

In [9]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUBFOLDER}/train_config.py
PROJECT_ID='hybrid-vertex'
MAX_PLAYLIST_LENGTH = 5

Overwriting src/ranking/train_config.py


### requirements

In [10]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{TRAIN_SUBFOLDER}/requirements.txt
google-cloud-aiplatform>=1.26.1
tensorflow-recommenders==0.7.2
tensorboard==2.10.1
tensorboard-data-server==0.6.1
tensorboard-plugin-profile==2.11.1
tensorflow-io==0.27.0
google-cloud-aiplatform[cloud_profiler]>=1.26.1

Overwriting src/ranking/requirements.txt


### dockerfile

In [11]:
!pwd
# docker rm $(docker ps -aq)

/home/jupyter/jw-repo2/spotify_mpd_two_tower


In [12]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{DOCKERNAME}

# FROM tensorflow/tensorflow:2.10.1-gpu
FROM gcr.io/deeplearning-platform-release/tf-gpu.2-11

ENV PYTHONUNBUFFERED True

# Copies the trainer code to the docker image.
# COPY ranking/* ./
COPY ranking /ranking

WORKDIR /ranking

RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r ./requirements.txt

RUN apt update && apt -y install nvtop

# RUN export PYTHONPATH=${PYTHONPATH}:${APP_HOME}/

Overwriting src/Dockerfile_rank


## (Optional) Build Training image with Docker

In [13]:
# Artifact Registry coordinates of the training image.
REPOSITORY                = "tfrs-ranking"  # f'{APP}-{FRAMEWORK}'
IMAGE_NAME                = f'{MODEL_ROOT_NAME}'

# <region>-docker.pkg.dev/<project>/<repository>/<image>
REMOTE_IMAGE_NAME         = f"{REGION}-docker.pkg.dev/" + f"{PROJECT_ID}/{REPOSITORY}/{IMAGE_NAME}"

# Echo the resolved values in an aligned "NAME = value" layout.
for _label, _value in (
    ("REPOSITORY", REPOSITORY),
    ("IMAGE_NAME", IMAGE_NAME),
    ("REMOTE_IMAGE_NAME", REMOTE_IMAGE_NAME),
):
    print(f"{_label:<17} = {_value}")

REPOSITORY        = tfrs-ranking
IMAGE_NAME        = sp-rank-tfrs-v9
REMOTE_IMAGE_NAME = us-central1-docker.pkg.dev/hybrid-vertex/tfrs-ranking/sp-rank-tfrs-v9


#### Create Artifact Repository
If you don't have an existing artifact repository, create one using the gcloud command below

In [14]:
# ! gcloud artifacts repositories create $REPOSITORY --repository-format=docker --location=$REGION

#### config docker

In [15]:
! gcloud auth configure-docker $REGION-docker.pkg.dev --quiet


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.


#### local build

In [16]:
# Print shell `export` lines so the same values can be set when the image is
# built from a terminal instead of from this notebook.
print(f"export REMOTE_IMAGE_NAME={REMOTE_IMAGE_NAME}")
print(f"export DOCKERNAME={DOCKERNAME}")

# Local build command, left disabled.
# ! docker build -t $REMOTE_IMAGE_NAME -f $DOCKERNAME .

export REMOTE_IMAGE_NAME=us-central1-docker.pkg.dev/hybrid-vertex/tfrs-ranking/sp-rank-tfrs-v9
export DOCKERNAME=Dockerfile_rank


### Push image to registry

In [18]:
# ! docker push $REMOTE_IMAGE_NAME

## (Optional) Build Training image with Cloud Build

In [22]:
# Image coordinates and build settings for the Cloud Build path.
IMAGE_NAME               = f'{MODEL_ROOT_NAME}'
IMAGE_URI                = 'gcr.io/' + f'{PROJECT_ID}/{IMAGE_NAME}'

DOCKERNAME               = 'Dockerfile_rank'
MACHINE_TYPE             = 'e2-highcpu-32'
FILE_LOCATION            = './src'

# Echo the resolved values in an aligned "NAME : value" layout.
for _label, _value in (
    ("DOCKERNAME", DOCKERNAME),
    ("IMAGE_URI", IMAGE_URI),
    ("MACHINE_TYPE", MACHINE_TYPE),
    ("FILE_LOCATION", FILE_LOCATION),
):
    print(f"{_label:<17}: {_value}")

DOCKERNAME       : Dockerfile_rank
IMAGE_URI        : gcr.io/hybrid-vertex/sp-rank-tfrs-v6
MACHINE_TYPE     : e2-highcpu-32
FILE_LOCATION    : ./src


### CloudBuild YAML

In [None]:
# %%writefile {REPO_DOCKER_PATH_PREFIX}/cloudbuild.yaml

# steps:
# - name: 'gcr.io/cloud-builders/docker'
#   args: ['build', '-t', '$_IMAGE_URI', '$_FILE_LOCATION', '-f', '$_FILE_LOCATION/Dockerfile.$_DOCKERNAME']
# images:
# - '$_IMAGE_URI'

# NOTE(review): this notebook sets DOCKERNAME = 'Dockerfile_rank' and writes the
# Dockerfile to 'src/Dockerfile_rank', so '$_FILE_LOCATION/Dockerfile.$_DOCKERNAME'
# would resolve to './src/Dockerfile.Dockerfile_rank'. Confirm the intended
# Dockerfile naming before enabling the Cloud Build path.

### set gcloudignore

In [21]:
# ! gcloud config set gcloudignore/enabled true

In [20]:
# !gcloud meta list-files-for-upload

In [19]:
# ! gcloud builds submit --config src/cloudbuild.yaml \
#     --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION \
#     --timeout=2h \
#     --machine-type=$MACHINE_TYPE

## Prepare Train Job Specs

In [17]:
# Load the pickled vocabulary dictionary from the working directory.
# The context manager closes the file handle even if unpickling raises, which
# the previous explicit open()/close() pair did not guarantee.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open('vocab_dict.pkl', 'rb') as filehandler:
    vocab_dict = pkl.load(filehandler)

In [18]:
# Print each key of the vocabulary dictionary on its own line.
for feature_name in vocab_dict.keys():
    print(feature_name)

pl_name_src
track_name_pl
artist_name_pl
album_name_pl
artist_genres_pl
tracks_playlist_titles_pl
track_name_can
artist_name_can
album_name_can
artist_genres_can
track_pl_titles_can


### Training Accelerators

In [19]:
# Worker pool hardware for the training job; these values are handed to
# prepare_worker_pool_specs(...) and DISTRIBUTE_STRATEGY is forwarded to the
# trainer as --distribute.
WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
REPLICA_COUNT = 1
ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'
PER_MACHINE_ACCELERATOR_COUNT = 1
REDUCTION_SERVER_COUNT = 0
REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
DISTRIBUTE_STRATEGY = 'single'

# NOTE(review): this rebinds REGION, which was set to 'us-central1' near the top
# of the notebook. REMOTE_IMAGE_NAME and the first vertex_ai.init(...) were
# evaluated with the earlier value, while the TensorBoard instance and the later
# vertex_ai.init(...) use this one. Re-running earlier cells after this cell
# changes their results — confirm the split is intended.
REGION = "asia-southeast1" # "us-central1" | europe-west4 | asia-southeast1

In [20]:
# WORKER_MACHINE_TYPE = 'a2-ultragpu-1g'
# REPLICA_COUNT = 1

# ACCELERATOR_TYPE = 'NVIDIA_A100_80GB'
# # ACCELERATOR_TYPE = vertex_ai.gapic.AcceleratorType.NVIDIA_A100_80GB

# PER_MACHINE_ACCELERATOR_COUNT = 1
# REDUCTION_SERVER_COUNT = 0                                                      
# REDUCTION_SERVER_MACHINE_TYPE = "n1-highcpu-16"
# DISTRIBUTE_STRATEGY = 'single'

### Vertex Experiments
Create an experiment and an experiment run.

In [21]:
# Experiment name is "<prefix>-<version>"; the run name carries a timestamp
# taken when this cell executes.
EXPERIMENT_PREFIX = 'audio-ranker-opt'
EXPERIMENT_NAME = f'{EXPERIMENT_PREFIX}-' + f'{VERSION}'
RUN_NAME = 'run-' + time.strftime("%Y%m%d-%H%M%S")

print("EXPERIMENT_NAME: " + EXPERIMENT_NAME)
print("RUN_NAME: " + RUN_NAME)

EXPERIMENT_NAME: audio-ranker-opt-v9
RUN_NAME: run-20230629-221807


### create Managed TensorBoard instance

In [22]:
# Three random lowercase/digit characters appended to the TensorBoard display name.
SESSION_id = "".join(
    random.choices(string.ascii_lowercase + string.digits, k=3)
)

TENSORBOARD_DISPLAY_NAME = f"{EXPERIMENT_NAME}-{SESSION_id}"

# create new TB instance
tensorboard = vertex_ai.Tensorboard.create(
    display_name=TENSORBOARD_DISPLAY_NAME,
    project=PROJECT_ID,
    location=REGION,
)

# Resource name is forwarded to the trainer and to job.run(tensorboard=...).
TB_RESOURCE_NAME = tensorboard.resource_name

# use existing TB instance
# TB_RESOURCE_NAME = 'projects/934903580331/locations/us-central1/tensorboards/6924469145035603968'

print(f"TB_RESOURCE_NAME: {TB_RESOURCE_NAME}")
print(f"TB display name: {tensorboard.display_name}")

TB_RESOURCE_NAME: projects/934903580331/locations/asia-southeast1/tensorboards/4272227196514336768
TB display name: audio-ranker-opt-v9-0uh


### training config

In [23]:
# Training configuration. Each value below is forwarded to the trainer through
# WORKER_ARGS as a command-line flag.
SEED = 1234

# ---- GPU -----------------------------------------------------------------
TF_GPU_THREAD_COUNT = '8'      # e.g. '1' | '4' | '8'

# ---- data input pipeline -------------------------------------------------
BLOCK_LENGTH = 64              # e.g. 1, 8, 16, 32, 64
NUM_DATA_SHARDS = 4            # e.g. 2, 4, 8, 16, 32, 64
# TRAIN_PREFETCH=3

# ---- training hyperparameters --------------------------------------------
NUM_EPOCHS = 20
LEARNING_RATE = 0.01
BATCH_SIZE = 8192              # e.g. 4096, 2048, 1024, 512
DROPOUT_RATE = 0.33

# ---- model size ----------------------------------------------------------
EMBEDDING_DIM = 256
PROJECTION_DIM = 50
LAYER_SIZES = '[512,256,128]'  # kept as a string; e.g. '[512,256,128]' '[256,128]'
MAX_TOKENS = 20_000            # vocab

# ---- tensorboard ---------------------------------------------------------
EMBED_FREQUENCY = 0
HISTOGRAM_FREQUENCY = 1
CHECKPOINT_FREQ = 'epoch'

In [24]:
# ---- train & valid steps ---------------------------------------------------
# Example counts of the train and validation splits.
train_sample_cnt = 8_205_265 # 8_205_265
valid_samples_cnt = 82_959

# validation & evaluation
VALID_FREQUENCY = 5

# Whole batches per pass; floor division drops any trailing partial batch.
EPOCH_STEPS = train_sample_cnt // BATCH_SIZE
VALID_STEPS = valid_samples_cnt // BATCH_SIZE # 100

print("VALID_STEPS: {}".format(VALID_STEPS))
print("EPOCH_STEPS: {}".format(EPOCH_STEPS))

VALID_STEPS: 10
EPOCH_STEPS: 1001


### data sources

In [25]:
# ---- GCS output location -----------------------------------------------------
# Bucket that receives the training outputs, plus its gs:// URI.
OUTPUT_BUCKET = 'jt-tfrs-central-v4' # TODO: change this
OUTPUT_GCS_URI = 'gs://' + OUTPUT_BUCKET

# ! gsutil mb -l $REGION $OUTPUT_GCS_URI

In [26]:
# ---- data sources --------------------------------------------------------------
# Alternative data regime, kept for reference:
# BUCKET_DATA_DIR = 'spotify-data-regimes' 
# # data strategy: 08m
# CANDIDATE_PREFIX = 'jtv15-8m/candidates'
# TRAIN_DIR_PREFIX = 'jtv15-8m/train'     # train | train_v14
# VALID_DIR_PREFIX = 'jtv15-8m/valid'     # valid_v14

# Active data bucket and the versioned prefixes underneath it.
BUCKET_DATA_DIR = 'matching-engine-content'
DATA_VERSION = 'v2-0-0'

TRAIN_DIR_PREFIX = DATA_VERSION + '/train'       # subset: valid_v9 | train_v9
VALID_DIR_PREFIX = DATA_VERSION + '/valid'       # valid_v9 | train_v9
CANDIDATE_PREFIX = DATA_VERSION + '/candidates'

### training args

In [27]:
# Command run inside the training container (alternatives kept for reference).
# WORKER_CMD = ["python", "./task.py"]
# WORKER_CMD = ["python", "-m", "task"]
WORKER_CMD = ["python", "task.py"]

# Command-line arguments appended to WORKER_CMD. The order is unchanged; the
# inline headings only group the flags by concern.
WORKER_ARGS = [
    # -- project and data locations
    f'--project={PROJECT_ID}',
    f'--train_output_gcs_bucket={OUTPUT_BUCKET}',
    f'--train_dir={BUCKET_DATA_DIR}',
    f'--train_dir_prefix={TRAIN_DIR_PREFIX}',
    f'--valid_dir={BUCKET_DATA_DIR}',
    f'--valid_dir_prefix={VALID_DIR_PREFIX}',
    f'--candidate_file_dir={BUCKET_DATA_DIR}',
    f'--candidate_files_prefix={CANDIDATE_PREFIX}',
    # -- experiment tracking
    f'--experiment_name={EXPERIMENT_NAME}',
    f'--experiment_run={RUN_NAME}',
    # -- hyperparameters and model size
    f'--num_epochs={NUM_EPOCHS}',
    f'--batch_size={BATCH_SIZE}',
    f'--embedding_dim={EMBEDDING_DIM}',
    f'--projection_dim={PROJECTION_DIM}',
    f'--layer_sizes={LAYER_SIZES}',
    f'--learning_rate={LEARNING_RATE}',
    # -- validation schedule
    f'--valid_frequency={VALID_FREQUENCY}',
    f'--valid_steps={VALID_STEPS}',
    f'--epoch_steps={EPOCH_STEPS}',
    # -- distribution and versioning
    f'--distribute={DISTRIBUTE_STRATEGY}',
    f'--model_version={VERSION}',
    f'--pipeline_version={VERSION}',
    f'--seed={SEED}',
    f'--max_tokens={MAX_TOKENS}',
    # -- tensorboard, input pipeline, checkpointing
    f'--tb_resource_name={TB_RESOURCE_NAME}',
    f'--embed_frequency={EMBED_FREQUENCY}',
    f'--hist_frequency={HISTOGRAM_FREQUENCY}',
    f'--tf_gpu_thread_count={TF_GPU_THREAD_COUNT}',
    f'--block_length={BLOCK_LENGTH}',
    f'--num_data_shards={NUM_DATA_SHARDS}',
    f'--chkpt_freq={CHECKPOINT_FREQ}',
    f'--dropout_rate={DROPOUT_RATE}',
    # -- boolean switches: a listed flag is passed (value True); comment a
    #    line out to leave that switch off
    '--cache_train',                                 # caches train_dataset
    '--evaluate_model',                              # runs model.eval()
    '--profiler',                                    # runs TB profiler
    # '--set_jit',                                   # enables XLA
    # '--compute_batch_metrics',
    # '--use_cross_layer',
    '--use_dropout',
]

In [28]:
from util import workerpool_specs

# Assemble the worker pool specs from the image, trainer command/arguments and
# the accelerator settings defined in the earlier cells.
WORKER_POOL_SPECS = workerpool_specs.prepare_worker_pool_specs(
    image_uri=f"{REMOTE_IMAGE_NAME}:latest",  # IMAGE_URI
    args=WORKER_ARGS,
    cmd=WORKER_CMD,
    replica_count=REPLICA_COUNT,
    machine_type=WORKER_MACHINE_TYPE,
    accelerator_count=PER_MACHINE_ACCELERATOR_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

# Pretty-print the resulting spec for inspection.
from pprint import pprint
pprint(WORKER_POOL_SPECS)

[{'container_spec': {'args': ['--project=hybrid-vertex',
                              '--train_output_gcs_bucket=jt-tfrs-central-v4',
                              '--train_dir=matching-engine-content',
                              '--train_dir_prefix=v2-0-0/train',
                              '--valid_dir=matching-engine-content',
                              '--valid_dir_prefix=v2-0-0/valid',
                              '--candidate_file_dir=matching-engine-content',
                              '--candidate_files_prefix=v2-0-0/candidates',
                              '--experiment_name=audio-ranker-opt-v9',
                              '--experiment_run=run-20230629-221807',
                              '--num_epochs=20',
                              '--batch_size=8192',
                              '--embedding_dim=256',
                              '--projection_dim=50',
                              '--layer_sizes=[512,256,128]',
                              '--le

## copy training package to GCS

In [29]:
# Run-scoped GCS folder (bucket / experiment / run); it is also passed to the
# CustomJob as base_output_dir.
BASE_OUTPUT_DIR = f'gs://{OUTPUT_BUCKET}/{EXPERIMENT_NAME}/{RUN_NAME}'

# Copy the build files (Cloud Build config and training Dockerfile) and the
# vocabulary pickle into the run folder.
! gsutil cp $REPO_DOCKER_PATH_PREFIX/cloudbuild.yaml $BASE_OUTPUT_DIR/cloudbuild.yaml
! gsutil cp $REPO_DOCKER_PATH_PREFIX/Dockerfile_rank $BASE_OUTPUT_DIR/Dockerfile_rank
! gsutil cp vocab_dict.pkl $BASE_OUTPUT_DIR/vocab_dict.pkl

# Copy the training application code.
# NOTE(review): the `ranking/*` glob also uploads `__pycache__/*.pyc` files, as
# the cell output shows — consider excluding them.
! gsutil -m cp -r $REPO_DOCKER_PATH_PREFIX/ranking/* $BASE_OUTPUT_DIR/ranking

print(f"\n Copied training package and Dockerfile to {BASE_OUTPUT_DIR}\n")

Copying file://src/cloudbuild.yaml [Content-Type=application/octet-stream]...
/ [1 files][  178.0 B/  178.0 B]                                                
Operation completed over 1 objects/178.0 B.                                      
Copying file://src/Dockerfile_rank [Content-Type=application/octet-stream]...
/ [1 files][  406.0 B/  406.0 B]                                                
Operation completed over 1 objects/406.0 B.                                      
Copying file://vocab_dict.pkl [Content-Type=application/octet-stream]...
/ [1 files][ 18.5 MiB/ 18.5 MiB]                                                
Operation completed over 1 objects/18.5 MiB.                                     
Copying file://src/ranking/__init__.py [Content-Type=text/x-python]...
Copying file://src/ranking/__pycache__/__init__.cpython-37.pyc [Content-Type=application/x-python-code]...
Copying file://src/ranking/__pycache__/tf_ranking_model.cpython-37.pyc [Content-Type=application/x-pytho

## submit training job to Vertex

In [30]:
# Re-initialise the SDK with the experiment name in addition to project and region.
vertex_ai.init(
    project=PROJECT_ID,
    location=REGION,
    experiment=EXPERIMENT_NAME,
)

# Display name for the job: "train-<model root name>-<timestamp>".
JOB_NAME = "-".join(("train", f"{MODEL_ROOT_NAME}", time.strftime("%Y%m%d-%H%M%S")))

print("JOB_NAME: " + JOB_NAME)

JOB_NAME: train-sp-rank-tfrs-v9-20230629-221834


In [31]:
# Define the custom training job; outputs and staging files live under the
# run-scoped BASE_OUTPUT_DIR.
job = vertex_ai.CustomJob(
    display_name=JOB_NAME,
    worker_pool_specs=WORKER_POOL_SPECS,
    base_output_dir=BASE_OUTPUT_DIR,
    staging_bucket=BASE_OUTPUT_DIR + "/staging",
    # location="us-east4",
)

In [32]:
# Submit the training job with the TensorBoard instance created earlier and the
# service account derived from the project number.
# NOTE(review): sync=False presumably returns without waiting for the job to
# finish — confirm against the Vertex AI SDK docs before relying on results
# in the cells that follow.
job.run(
    tensorboard=TB_RESOURCE_NAME,
    service_account=VERTEX_SA,
    restart_job_on_worker_restart=False,
    enable_web_access=True,
    sync=False,
)

### TODO: 

> see [create_custom_job with experiments autologging](https://cloud.google.com/vertex-ai/docs/training/create-custom-job#create_custom_job-python)

In [None]:
# def create_custom_job_with_experiment_autologging_sample(
#     project: str,
#     location: str,
#     staging_bucket: str,
#     display_name: str,
#     script_path: str,
#     container_uri: str,
#     service_account: str,
#     experiment: str,
#     experiment_run: Optional[str] = None,
# ) -> None:
#     aiplatform.init(project=project, location=location, staging_bucket=staging_bucket)

#     # Ignore the next two lines of code if the experiment you are using already
#     # has backing tensorboard instance.
#     tb_instance = aiplatform.Tensorboard.create()
#     aiplatform.init(experiment=experiment, experiment_tensorboard=tb_instance)

#     job = aiplatform.CustomJob.from_local_script(
#         display_name=display_name,
#         script_path=script_path,
#         container_uri=container_uri,
#         enable_autolog=True,
#     )

#     job.run(
#         service_account=service_account,
#         experiment=experiment,
#         experiment_run=experiment_run,
#     )

## TensorBoard Profiler

In [70]:
# Set before TensorFlow is imported in the next statement.
# NOTE(review): presumably intended to quiet TensorFlow's C++ logging — confirm the level.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf

# Log folder under the run's base output directory; passed to %tensorboard below.
TB_LOGS_PATH = f'{BASE_OUTPUT_DIR}/logs' # 
print(f"TB_LOGS_PATH: {TB_LOGS_PATH}")

TB_LOGS_PATH: gs://jt-tfrs-central-v4/test-rank-src-v5/run-20230629-083212/logs


In [71]:
# %load_ext tensorboard
%reload_ext tensorboard

In [72]:
%tensorboard --logdir=$TB_LOGS_PATH