
# Build baseline tfrs model 

* source code for model can be found in `src/`

This notebook constructs the two tower model and saves the model to GCS


In [1]:
# Optional one-time setup: uncomment the next line to install the required packages.
# !pip install tensorflow-recommenders google-cloud-aiplatform --user
# Print the installed Vertex AI SDK version from a separate python3 process.
! python3 -c "import google.cloud.aiplatform; print('aiplatform SDK version: {}'.format(google.cloud.aiplatform.__version__))"

aiplatform SDK version: 1.26.1


## Load env config

In [2]:
# Naming convention shared by all cloud resources: every name starts with PREFIX.
VERSION = "v1"  # TODO
PREFIX = f"ndr-{VERSION}"  # TODO

print(f"PREFIX = {PREFIX}")

PREFIX = ndr-v1


In [3]:
# staging GCS
# Capture the active gcloud project; the first captured output line is used as the id.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the shared environment config from the bucket and execute it in this namespace,
# which (re)defines the variables echoed by the print below.
# NOTE(review): `exec` runs whatever text is stored at this GCS path; make sure only
# trusted principals can write to {BUCKET_URI}/config/notebook_env.py.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
# `config.n` is the captured output joined into a single newline-separated string.
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "ndr-v1"
VERSION                  = "v1"

APP                      = "sp"
MODEL_TYPE               = "2tower"
FRAMEWORK                = "tfrs"
DATA_VERSION             = "v1"
TRACK_HISTORY            = "5"

BUCKET_NAME              = "ndr-v1-hybrid-vertex-bucket"
BUCKET_URI               = "gs://ndr-v1-hybrid-vertex-bucket"
SOURCE_BUCKET            = "spotify-million-playlist-dataset"

DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://ndr-v1-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

CANDIDATE_PREFIX         = "candidates"
TRAIN_DIR_PREFIX      

In [4]:
# =============================== #
# included in env-setup.ipynb     #
# =============================== #

# REGION           = 'us-central1' 
# BUCKET_NAME      = 'matching-engine-content'    # location to store output
# BUCKET_URI       = f'gs://{BUCKET_NAME}'
# DATA_VERSION     = "v1-0-0"                     # version tag for dataflow pipeline
# TRAIN_DIR_PREFIX = f'valid'                       # subset: valid_v9 | train_v9
# VALID_DIR_PREFIX = f'valid'                       # valid_v9 | train_v9
# CANDIDATE_PREFIX = f'candidates' 

In [5]:
import os

# Environment flags are set in this cell, ahead of the cell that imports TensorFlow.
os.environ.update(
    {
        'TF_CPP_MIN_LOG_LEVEL': '2',
        'TF_GPU_THREAD_MODE': 'gpu_private',
        'TF_GPU_ALLOCATOR': 'cuda_malloc_async',
        "CLOUD_ML_PROJECT_ID": PROJECT_ID,
    }
)

In [6]:
import json
import numpy as np
import pickle as pkl

import logging
import time
from pprint import pprint

# tensorflow
import tensorflow as tf
import tensorflow_recommenders as tfrs

from google.cloud import aiplatform as vertex_ai
from google.cloud import storage
storage_client = storage.Client(project=PROJECT_ID)

from src.two_tower_jt import two_tower as tt
from src.two_tower_jt import train_utils as train_utils
# from src.two_tower_jt import feature_sets

from util import feature_sets

In [7]:
# Turn on memory growth for each GPU that TensorFlow reports.
gpus = tf.config.experimental.list_physical_devices('GPU')
for physical_gpu in gpus:
    tf.config.experimental.set_memory_growth(physical_gpu, True)

## Create Dataset for local training and testing

Inspect the contents of the directory - you can change parameters in the header of the `two_tower.py` script

In [8]:
# List the files of the training package (requires the `tree` command on this machine).
!tree src/two_tower_jt

[01;34msrc/two_tower_jt[00m
├── __init__.py
├── [01;34m__pycache__[00m
│   ├── __init__.cpython-37.pyc
│   ├── feature_sets.cpython-37.pyc
│   ├── test_instances.cpython-37.pyc
│   ├── train_config.cpython-37.pyc
│   ├── train_utils.cpython-37.pyc
│   ├── train_utils_v1.cpython-37.pyc
│   └── two_tower.cpython-37.pyc
├── feature_sets.py
├── interactive_train.py
├── requirements.txt
├── task.py
├── test_instances.py
├── train_config.py
├── train_utils.py
└── two_tower.py

1 directory, 16 files


## Create Dataset objects

In [9]:
# Batch size shared by the train and validation pipelines below.
batch_size = 1024 #*16

# tf.data options applied to every dataset: automatic shard policy.
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = (
    tf.data.experimental.AutoShardPolicy.AUTO
)

### Create Train dataset

### data input pipeline 

> interleave --> map --> batch

In [10]:
# Collect the gs:// URI of every training TFRecord shard under the data prefix.
# The URI is built from the raw object name: `blob.public_url` percent-encodes the
# name, so rewriting its host prefix can produce a path that does not exist in GCS.
train_files = [
    f"gs://{BUCKET_NAME}/{blob.name}"
    for blob in storage_client.list_blobs(
        BUCKET_NAME, prefix=f'data/{DATA_VERSION}/{TRAIN_DIR_PREFIX}/'
    )
    if '.tfrecords' in blob.name
]

# Fail early with a readable message instead of an obscure error inside the pipeline.
if not train_files:
    raise FileNotFoundError(
        f"No .tfrecords files found under gs://{BUCKET_NAME}/data/{DATA_VERSION}/{TRAIN_DIR_PREFIX}/"
    )

# train_files
train_dataset = tf.data.Dataset.from_tensor_slices(train_files).prefetch(
    tf.data.AUTOTUNE,
)

# Input pipeline: interleave shard readers --> parse each record --> batch --> prefetch.
# deterministic=False lets records arrive in any order.
train_dataset = train_dataset.interleave(
    train_utils.full_parse,
    cycle_length=tf.data.AUTOTUNE,
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False,
).map(
    feature_sets.parse_tfrecord,
    num_parallel_calls=tf.data.AUTOTUNE
).batch(
    batch_size
).prefetch(
    tf.data.AUTOTUNE,
).with_options(
    options
)

train_dataset

<_OptionsDataset element_spec={'album_name_can': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'album_name_pl': TensorSpec(shape=(None, 5), dtype=tf.string, name=None), 'album_uri_can': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'album_uri_pl': TensorSpec(shape=(None, 5), dtype=tf.string, name=None), 'artist_followers_can': TensorSpec(shape=(None,), dtype=tf.float32, name=None), 'artist_genres_can': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'artist_genres_pl': TensorSpec(shape=(None, 5), dtype=tf.string, name=None), 'artist_name_can': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'artist_name_pl': TensorSpec(shape=(None, 5), dtype=tf.string, name=None), 'artist_pop_can': TensorSpec(shape=(None,), dtype=tf.float32, name=None), 'artist_pop_pl': TensorSpec(shape=(None, 5), dtype=tf.float32, name=None), 'artist_uri_can': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'artist_uri_pl': TensorSpec(shape=(None, 5), dtype=tf.string, name=None),

### Create Validation dataset

In [11]:
# Collect the gs:// URI of every validation TFRecord shard under the data prefix.
# The URI is built from the raw object name: `blob.public_url` percent-encodes the
# name, so rewriting its host prefix can produce a path that does not exist in GCS.
valid_files = [
    f"gs://{BUCKET_NAME}/{blob.name}"
    for blob in storage_client.list_blobs(
        BUCKET_NAME, prefix=f'data/{DATA_VERSION}/{VALID_DIR_PREFIX}/'
    )
    if '.tfrecords' in blob.name
]

# Fail early with a readable message instead of an obscure error inside the pipeline.
if not valid_files:
    raise FileNotFoundError(
        f"No .tfrecords files found under gs://{BUCKET_NAME}/data/{DATA_VERSION}/{VALID_DIR_PREFIX}/"
    )

valid_dataset = tf.data.Dataset.from_tensor_slices(valid_files).prefetch(
    tf.data.AUTOTUNE,
)

# Same pipeline shape as the training data: interleave --> map --> batch --> prefetch.
valid_dataset = valid_dataset.interleave(
    train_utils.full_parse,
    num_parallel_calls=tf.data.AUTOTUNE,
    cycle_length=tf.data.AUTOTUNE,
    deterministic=False,
).map(
    feature_sets.parse_tfrecord,
    num_parallel_calls=tf.data.AUTOTUNE
).batch(
    batch_size
).prefetch(
    tf.data.AUTOTUNE,
).with_options(
    options
)

# valid_dataset = valid_dataset.cache() #1gb machine mem + 400 MB in candidate ds (src/two-tower.py)

# valid_dataset

### Create Candidates dataset

In [12]:
# Collect the gs:// URI of every candidate TFRecord shard under the data prefix.
# The URI is built from the raw object name: `blob.public_url` percent-encodes the
# name, so rewriting its host prefix can produce a path that does not exist in GCS.
candidate_files = [
    f"gs://{BUCKET_NAME}/{blob.name}"
    for blob in storage_client.list_blobs(
        BUCKET_NAME, prefix=f'data/{DATA_VERSION}/{CANDIDATE_PREFIX}/'
    )
    if '.tfrecords' in blob.name
]

# Fail early with a readable message instead of an obscure error inside the pipeline.
if not candidate_files:
    raise FileNotFoundError(
        f"No .tfrecords files found under gs://{BUCKET_NAME}/data/{DATA_VERSION}/{CANDIDATE_PREFIX}/"
    )

candidate_dataset = tf.data.Dataset.from_tensor_slices(candidate_files)

# Candidates are parsed but left unbatched here; later cells choose their own batch size.
parsed_candidate_dataset = candidate_dataset.interleave(
    train_utils.full_parse,
    cycle_length=tf.data.AUTOTUNE,
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False
).map(
    feature_sets.parse_candidate_tfrecord_fn,
    num_parallel_calls=tf.data.AUTOTUNE
).with_options(
    options
)

parsed_candidate_dataset = parsed_candidate_dataset.cache() #400 MB on machine mem
# parsed_candidate_dataset

In [13]:
# Sanity check: pretty-print the first parsed candidate as a batch of one.
for candidate_example in parsed_candidate_dataset.batch(1).take(1):
    pprint(candidate_example)

{'album_name_can': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Thanatophobia'], dtype=object)>,
 'album_uri_can': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'spotify:album:5GBUYg5EqeDI0CuszAvDzj'], dtype=object)>,
 'artist_followers_can': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([27438.], dtype=float32)>,
 'artist_genres_can': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b"'indie garage rock'"], dtype=object)>,
 'artist_name_can': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Worn-Tin'], dtype=object)>,
 'artist_pop_can': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([40.], dtype=float32)>,
 'artist_uri_can': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'spotify:artist:7j8ds7BnqaEKuz1a1GN0J9'], dtype=object)>,
 'duration_ms_can': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([216923.], dtype=float32)>,
 'track_acousticness_can': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.655], dtype=float32)>,
 'track_danceability_can

# Local Training

Compile the model
Review the details of the model layers

## Adapt Layers

#### Adapt the text vectorizers - copy/paste to run one time

We access the `TextVectorization` layers in the model by index, using the layer print-outs in the "inspect layers" section below

```python
# adapt the text vectorizers

MAX_PLAYLIST_LENGTH = 5 # this is set upstream by the BigQuery max length
    
model.query_tower.layers[0].layers[0].adapt(
    train_dataset.unbatch().batch(10000).map(lambda x: x['pl_name_src']))
print('pl_name_src adapts complete')
model.query_tower.layers[7].layers[0].adapt(
    train_dataset.unbatch().batch(10000).map(lambda x: tf.reshape(x['track_name_pl'], [-1, MAX_PLAYLIST_LENGTH, 1])))
print('track_name_pl adapts complete')
model.query_tower.layers[9].layers[0].adapt(
    train_dataset.unbatch().batch(10000).map(lambda x: tf.reshape(x['artist_name_pl'], [-1, MAX_PLAYLIST_LENGTH, 1]))) 
print('artist_name_pl adapts complete')
model.query_tower.layers[11].layers[0].adapt(
    train_dataset.unbatch().batch(10000).map(lambda x: tf.reshape(x['album_name_pl'], [-1, MAX_PLAYLIST_LENGTH, 1])))
print('album_name_pl adapts complete')
model.query_tower.layers[12].layers[0].adapt(
    train_dataset.unbatch().batch(10000).map(lambda x: tf.reshape(x['artist_genres_pl'], [-1, MAX_PLAYLIST_LENGTH, 1])))
print('artist_genres_pl adapts complete')
# model.query_tower.layers[13].layers[0].adapt(
#     train_dataset.unbatch().batch(10000).map(lambda x: tf.reshape(x['tracks_playlist_titles_pl'], [-1, MAX_PLAYLIST_LENGTH, 1])))
# print('tracks_playlist_titles_pl adapts complete')

model.candidate_tower.layers[1].layers[0].adapt(
    train_dataset.unbatch().batch(10000).map(lambda x: x['track_name_can'])) 
print('track_name_can adapts complete')
model.candidate_tower.layers[3].layers[0].adapt(
    train_dataset.unbatch().batch(10000).map(lambda x: x['artist_name_can'])) 
print('artist_name_can adapts complete')
model.candidate_tower.layers[5].layers[0].adapt(
    train_dataset.unbatch().batch(10000).map(lambda x: x['album_name_can'])) 
print('album_name_can adapts complete')
model.candidate_tower.layers[9].layers[0].adapt(
    train_dataset.unbatch().batch(10000).map(lambda x: x['artist_genres_can'])) 
print('artist_genres_can adapts complete')
# model.candidate_tower.layers[11].layers[0].adapt(
#     train_dataset.unbatch().batch(10000).map(lambda x: x['track_pl_titles_can'])) 
# print('track_pl_titles_can adapts complete')
```

### save vocab dict 

Save the vocab dictionary for later so you will not have to adapt

```python
vocab_dict = {
    'pl_name_src' : model.query_tower.layers[0].layers[0].get_vocabulary(),
    'track_name_pl' : model.query_tower.layers[7].layers[0].get_vocabulary(),
    'artist_name_pl' : model.query_tower.layers[9].layers[0].get_vocabulary(),
    'album_name_pl' : model.query_tower.layers[11].layers[0].get_vocabulary(),
    'artist_genres_pl' : model.query_tower.layers[12].layers[0].get_vocabulary(),
    # 'tracks_playlist_titles_pl' : model.query_tower.layers[13].layers[0].get_vocabulary(),

    'track_name_can' : model.candidate_tower.layers[1].layers[0].get_vocabulary(),
    'artist_name_can' : model.candidate_tower.layers[3].layers[0].get_vocabulary(),
    'album_name_can' : model.candidate_tower.layers[5].layers[0].get_vocabulary(),
    'artist_genres_can' : model.candidate_tower.layers[9].layers[0].get_vocabulary(),
    # 'track_pl_titles_can' : model.candidate_tower.layers[11].layers[0].get_vocabulary(),
}
```

```python
import pickle as pkl

filehandler = open('vocab_dict.pkl', 'wb')
pkl.dump(vocab_dict, filehandler)

filehandler.close()

tt.upload_blob('two-tower-models', 'vocab_dict.pkl', 'vocabs/vocab_dict.pkl', PROJECT_ID)
```

### load saved vocab dict

In [14]:
# os.system('gsutil cp gs://two-tower-models/vocabs/vocab_dict.pkl .')  # TODO - parameterize

# Load the saved vocabulary dictionary from the local working directory.
# The `with` block closes the file even if unpickling raises.
# NOTE(review): unpickling can execute arbitrary code; only load a vocab file you trust.
with open('vocab_dict.pkl', 'rb') as filehandler:
    VOCAB_DICT = pkl.load(filehandler)

# VOCAB_DICT

## Build and Compile model

### TODO: config file

In [15]:
# Model hyperparameters passed to the two-tower constructor in the next cell.
USE_CROSS_LAYER = True
USE_DROPOUT = True
SEED = 1234  # was assigned twice in this cell; kept once
# NOTE(review): the playlist features in the dataset spec printed earlier have width 5,
# while this is 15. The value is not passed to the model constructor (that argument is
# commented out) - confirm which length is intended before enabling it.
MAX_PLAYLIST_LENGTH = 15
EMBEDDING_DIM = 128
PROJECTION_DIM = 25
DROPOUT_RATE = 0.33
MAX_TOKENS = 20000
LAYER_SIZES=[256,128]

# Optimizer used by model.compile
LR = .1
opt = tf.keras.optimizers.Adagrad(LR)

In [16]:
# Build the two-tower model from the constants defined in the previous cell.
model = tt.TheTwoTowers(
    # data-dependent inputs
    vocab_dict=VOCAB_DICT,
    parsed_candidate_dataset=parsed_candidate_dataset,
    max_tokens=MAX_TOKENS,
    # max_playlist_length=MAX_PLAYLIST_LENGTH,
    # architecture
    layer_sizes=LAYER_SIZES,
    embedding_dim=EMBEDDING_DIM,
    projection_dim=PROJECTION_DIM,
    use_cross_layer=USE_CROSS_LAYER,
    # regularization / reproducibility
    use_dropout=USE_DROPOUT,
    dropout_rate=DROPOUT_RATE,
    seed=SEED,
)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [17]:
# Compile with the Adagrad optimizer instance (`opt`) created in the hyperparameter cell.
model.compile(optimizer=opt)

In [18]:
# Display the model object's repr as the cell output.
model

<src.two_tower_jt.two_tower.TheTwoTowers at 0x7f582c0bf150>

### inspect layers

In [19]:
# Index/name listing of the query tower's layers
print("Playlist (query) Tower:")
for layer_idx, layer in enumerate(model.query_tower.layers):
    print(layer_idx, layer.name)

Playlist (query) Tower:
0 pl_name_src_text_embedding
1 pl_collaborative_emb_model
2 pl_duration_ms_new_emb_model
3 num_pl_songs_new_emb_model
4 num_pl_artists_new_emb_model
5 num_pl_albums_new_emb_model
6 track_uri_pl_emb_model
7 track_name_pl_emb_model
8 artist_uri_pl_emb_model
9 artist_name_pl_emb_model
10 album_uri_pl_emb_model
11 album_name_pl_emb_model
12 artist_genres_pl_emb_model
13 duration_ms_songs_pl_emb_model
14 track_pop_pl_emb_model
15 artist_pop_pl_emb_model
16 artists_followers_pl_emb_model
17 track_danceability_pl_emb_model
18 track_energy_pl_emb_model
19 track_key_pl_emb_model
20 track_loudness_pl_emb_model
21 track_mode_pl_emb_model
22 track_speechiness_pl_emb_model
23 track_acousticness_pl_emb_model
24 track_instrumentalness_pl_emb_model
25 track_liveness_pl_emb_model
26 track_valence_pl_emb_model
27 track_tempo_pl_emb_model
28 time_signature_pl_emb_model
29 pl_cross_layer
30 pl_dense_layers


In [21]:
# Index/name listing of the candidate tower's layers
print("Track (candidate) Tower:")
for layer_idx, layer in enumerate(model.candidate_tower.layers):
    print(layer_idx, layer.name)

Track (candidate) Tower:
0 track_uri_can_emb_model
1 track_name_can_emb_model
2 artist_uri_can_emb_model
3 artist_name_can_emb_model
4 album_uri_can_emb_model
5 album_name_can_emb_model
6 duration_ms_can_emb_model
7 track_pop_can_emb_model
8 artist_pop_can_emb_model
9 artist_genres_can_emb_model
10 artists_followers_can_emb_model
11 track_danceability_can_emb_model
12 track_energy_can_emb_model
13 track_key_can_emb_model
14 track_loudness_can_emb_model
15 track_mode_can_emb_model
16 track_speechiness_can_emb_model
17 track_acousticness_can_emb_model
18 track_instrumentalness_can_emb_model
19 track_liveness_can_emb_model
20 track_valence_can_emb_model
21 track_tempo_can_emb_model
22 time_signature_can_emb_model
23 can_cross_layer
24 candidate_dense_layers


### setup Vertex Experiment

In [22]:
# Experiment name plus a timestamped run name, so repeated runs do not collide.
EXPERIMENT_NAME = 'local-train-v1'
invoke_time = time.strftime("%Y%m%d-%H%M%S")
RUN_NAME = f'run-{invoke_time}'

# TensorBoard event files for this run are written under this GCS path.
LOG_DIR = f"gs://{BUCKET_NAME}/{EXPERIMENT_NAME}/{RUN_NAME}/tb-logs"

vertex_ai.init(project=PROJECT_ID, location=REGION, experiment=EXPERIMENT_NAME)

print(f"RUN_NAME: {RUN_NAME}")
print(f"LOG_DIR: {LOG_DIR}")

RUN_NAME: run-20230919-150451
LOG_DIR: gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-150451/tb-logs


### setup Tensorboard callbacks

Setup tensorboard below so training is visible and we can inspect the graph

**TODO:** clean up notebook section 

> *Note:* While profiling does not work for managed Tensorboard at this time, you can inspect the profiler with an [inline Tensorboard in another notebook](https://www.tensorflow.org/tensorboard/tensorboard_in_notebooks). You may be prompted to install the tensorflow profiler library

#### Managed Tensorboard Resource

In [23]:
# Option A - reuse an existing TensorBoard instance:
# TB_RESOURCE_NAME = 'projects/934903580331/locations/us-central1/tensorboards/472921941339013120'

# Option B - create a new TensorBoard instance.
# `create` is called unconditionally, so every execution of this cell makes a new instance.
TENSORBOARD_DISPLAY_NAME = f"{EXPERIMENT_NAME}"
tensorboard = vertex_ai.Tensorboard.create(
    display_name=TENSORBOARD_DISPLAY_NAME,
)
TB_RESOURCE_NAME = tensorboard.resource_name

print(f"TB_RESOURCE_NAME: {TB_RESOURCE_NAME}")

Creating Tensorboard
Create Tensorboard backing LRO: projects/934903580331/locations/us-central1/tensorboards/7680176680945582080/operations/1954146279286112256
Tensorboard created. Resource name: projects/934903580331/locations/us-central1/tensorboards/7680176680945582080
To use this Tensorboard in another session:
tb = aiplatform.Tensorboard('projects/934903580331/locations/us-central1/tensorboards/7680176680945582080')
TB_RESOURCE_NAME: projects/934903580331/locations/us-central1/tensorboards/7680176680945582080


In [24]:
def get_upload_logs_to_manged_tb_command(ttl_hrs, oneshot="false"):
    """
    Build the `tb-gcp-uploader` shell command that uploads the TensorBoard logs
    from this machine to the managed TensorBoard instance.

    The command reads the module-level TB_RESOURCE_NAME, LOG_DIR and EXPERIMENT_NAME.
    LOG_DIR is scoped to a single run, which helps pick the right timestamped run
    in TensorBoard. Copy/paste the returned string into a terminal, or run it in
    one-shot mode after training to upload all TensorBoard objects at once.

    Args:
        ttl_hrs: number of hours, converted to seconds for --event_file_inactive_secs.
        oneshot: text placed after --one_shot= ("true" or "false").

    Returns:
        The complete command as a single-line string.
    """
    inactive_secs = 60 * 60 * ttl_hrs
    # Seven spaces between flags keeps the exact text of the original continued string.
    flag_gap = " " * 7
    flags = [
        f"--tensorboard_resource_name={TB_RESOURCE_NAME}",
        f"--logdir={LOG_DIR}",
        f"--experiment_name={EXPERIMENT_NAME}",
        f"--one_shot={oneshot}",
        f"--event_file_inactive_secs={inactive_secs}",
    ]
    return "tb-gcp-uploader " + flag_gap.join(flags)

# tensorboard callback
class UploadTBLogsBatchEnd(tf.keras.callbacks.Callback):
    '''
    Encapsulates the one-shot TensorBoard log uploader in a custom Keras callback.

    Despite the class name, the upload is triggered from `on_epoch_end`,
    i.e. once per epoch rather than once per batch.
    '''
    def on_epoch_end(self, epoch, logs=None):
        """Run the one-shot uploader command; `epoch` and `logs` are not used.

        The upload stays best-effort: a failing command does not interrupt training,
        but its non-zero exit status is logged instead of being silently dropped.
        """
        command = get_upload_logs_to_manged_tb_command(
            ttl_hrs = 5, oneshot="true"
        )
        exit_status = os.system(command)
        if exit_status != 0:
            logging.warning(
                "TensorBoard log upload exited with status %s", exit_status
            )

#### train config

* consider experiment and experiment-run naming convention so names don't collide

In [25]:
# Values consumed by the training cell (model.fit and the TensorBoard callback).
NUM_EPOCHS = 2
# NOTE(review): this is larger than NUM_EPOCHS; presumably no validation pass runs
# during fit with these values - confirm that is intended.
VALID_FREQUENCY = 5
# Histogram and embedding logging frequencies for the TensorBoard callback.
HIST_FREQ = EMBED_FREQ = 0

### Train model

* train model in-notebook
* write metrics to Vertex AI Experiment

In [26]:
# TensorBoard callback writing event files for this run to LOG_DIR.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=LOG_DIR,
    write_graph=True,
    histogram_freq=HIST_FREQ,
    embeddings_freq=EMBED_FREQ,
    # profile_batch=(20,50) #run profiler on steps 20-40 - enable this line if you want to run profiler from the utils/ notebook
)

# Time the whole fit call.
start_time = time.time()

layer_history = model.fit(
    train_dataset,
    epochs=NUM_EPOCHS,
    steps_per_epoch=10,
    validation_data=valid_dataset,
    validation_freq=VALID_FREQUENCY,
    validation_steps=100,
    callbacks=[tensorboard_callback, UploadTBLogsBatchEnd()],
    verbose=1,
)

end_time = time.time()

# Whole minutes of wall-clock training time (fraction truncated by int()).
runtime_mins = int((end_time - start_time) / 60)
# Names of every metric recorded in the fit history.
val_keys = list(layer_history.history.keys())

Epoch 1/2
[1m[2023-09-19T15:06:06][0m Started scanning logdir.
[1m[2023-09-19T15:06:07][0m Total uploaded: 8 scalars, 0 tensors, 1 binary objects (1.1 MB)
Epoch 2/2

W0919 15:06:38.495153 140220551669568 uploader.py:389] Please consider uploading to a new experiment instead of an existing one, as the former allows for better upload performance.


View your Tensorboard at https://us-central1.tensorboard.googleusercontent.com/experiment/projects+934903580331+locations+us-central1+tensorboards+7680176680945582080+experiments+local-train-v1
[1m[2023-09-19T15:06:38][0m Started scanning logdir.
[1m[2023-09-19T15:06:42][0m Total uploaded: 16 scalars, 0 tensors, 1 binary objects (1.1 MB)


### log Vertex Experiment

In [27]:
vertex_ai.start_run(RUN_NAME, tensorboard=TB_RESOURCE_NAME)

# try/finally makes sure the run is ended even if logging raises.
try:
    vertex_ai.log_params(
        {
            "layers": str(LAYER_SIZES),
            "learning_rate": LR,
            "num_epochs": NUM_EPOCHS,
            "batch_size": batch_size,
            "valid_freq": VALID_FREQUENCY,
        }
    )

    # gather the metrics for the last epoch to be saved in metrics
    metrics_dict = {"train-time-minutes": runtime_mins}
    metrics_dict.update(
        {key: layer_history.history[key][-1] for key in val_keys}
    )

    vertex_ai.log_metrics(metrics_dict)
finally:
    vertex_ai.end_run()

Associating projects/934903580331/locations/us-central1/metadataStores/default/contexts/local-train-v1-run-20230919-150451 to Experiment: local-train-v1


In [28]:
# `runtime_mins` was truncated to whole minutes with int() when it was computed.
print(f"Total runtime: {runtime_mins} minutes")

Total runtime: 1 minutes


In [29]:
# Show the metrics dictionary (training time plus last-epoch values) that was logged
# to the Vertex Experiment run.
metrics_dict

{'train-time-minutes': 1,
 'batch_categorical_accuracy_at_10': 0.01035156287252903,
 'batch_categorical_accuracy_at_50': 0.04902343824505806,
 'factorized_top_k/top_10_categorical_accuracy': 0.0,
 'factorized_top_k/top_50_categorical_accuracy': 0.0,
 'factorized_top_k/top_100_categorical_accuracy': 0.0,
 'loss': 7107.9453125,
 'regularization_loss': 0,
 'total_loss': 7107.9453125}

### Save each tower

In [30]:
# Save each tower as its own SavedModel under this run's model-dir in GCS.
for tower, tower_subdir in (
    (model.query_tower, "query_model"),
    (model.candidate_tower, "candidate_model"),
):
    tf.saved_model.save(
        tower,
        export_dir=f"gs://{BUCKET_NAME}/{EXPERIMENT_NAME}/{RUN_NAME}/model-dir/{tower_subdir}",
    )



INFO:tensorflow:Assets written to: gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-150451/model-dir/query_model/assets


INFO:tensorflow:Assets written to: gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-150451/model-dir/query_model/assets


INFO:tensorflow:Assets written to: gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-150451/model-dir/candidate_model/assets


INFO:tensorflow:Assets written to: gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-150451/model-dir/candidate_model/assets


## Evaluate Model

In [23]:
# Evaluate on the full validation dataset and time it.
start_time = time.time()
eval_dict_v1 = model.evaluate(
    valid_dataset,
    return_dict=True,
)
end_time = time.time()

# Whole minutes elapsed (fraction truncated by int()).
elapsed_mins = int((end_time - start_time) / 60)
print(f"elapsed_mins: {elapsed_mins}")

elapsed_mins: 549


In [24]:
# Display the metric dictionary returned by model.evaluate on the validation dataset.
eval_dict_v1

{'batch_categorical_accuracy_at_10': 0.011282681487500668,
 'batch_categorical_accuracy_at_50': 0.049578707665205,
 'factorized_top_k/top_10_categorical_accuracy': 0.0,
 'factorized_top_k/top_50_categorical_accuracy': 1.2054147191520315e-05,
 'factorized_top_k/top_100_categorical_accuracy': 3.616244066506624e-05,
 'loss': 40.9329719543457,
 'regularization_loss': 0,
 'total_loss': 40.9329719543457}

### Efficient eval

* approximate with scann

In [29]:
# Build a ScaNN index over the candidate tower outputs and time it.
start_time = time.time()

scann = tfrs.layers.factorized_top_k.ScaNN(
    num_leaves_to_search=30,
    num_reordering_candidates=500,
)

# Each indexed element is a (track URI, candidate tower output) pair.
scann.index_from_dataset(
    candidates=parsed_candidate_dataset.batch(128).cache().map(
        lambda features: (features['track_uri_can'], model.candidate_tower(features))
    )
)

end_time = time.time()

elapsed_scann_mins = int((end_time - start_time) / 60)
print(f"elapsed_scann_mins: {elapsed_scann_mins}")

In [24]:
# Re-evaluate with the ScaNN index as the candidate source for the top-k metrics.
start_time = time.time()

model.task.factorized_metrics = tfrs.metrics.FactorizedTopK(candidates=scann)
# Recompile after swapping the metrics object.
model.compile()

scann_result = model.evaluate(
    valid_dataset,
    verbose=1,
    return_dict=True,
)

end_time = time.time()

elapsed_scann_eval_mins = int((end_time - start_time) / 60)
print(f"elapsed_scann_eval_mins: {elapsed_scann_eval_mins}")

### Evaluating the train job

#### You can access the experiment from the console via the experiment name you just declared:

<img src="./img/experiment-console.png" 
     align="center" 
     width="750"
     height="750"/>

#### After opening the TensorBoard instance:

<img src="./img/tensorboard.png" 
     align="center" 
     width="750"
     height="750"/>

<img src="./img/tb-metrics.png" 
     align="center" 
     width="750"
     height="750"/>

<img src="./img/tb-loss.png" 
     align="center" 
     width="750"
     height="750"/>

#### Also, while this is running - check out the Tensorboard profiler in `utils`

<img src="./img/tb-profiler.png" 
     align="center" 
     width="750"
     height="750"/>

#### Run `nvtop` - check out the installation script in `utils` - `install_nvtop.sh`

<img src="./img/nvtop-optimized.png" 
     align="center" 
     width="750"
     height="750"/>

# Save the candidate embeddings to GCS for use in Matching Engine later
These will be the files we use for the index

This does the following
1) Create a tf pipeline to convert embeddings to numpy
2) Serialize the candidate song embeddings with the song_uri index and save to gcs

In [31]:
# Map candidates, in batches of up to 10,000, to [track URIs, candidate tower output].
# The tower output is passed through train_utils.tf_if_null_return_zero first
# (presumably replacing null/NaN values with zeros - confirm in train_utils).
# NOTE(review): the lambda returns a two-element list; the write loop further down
# unpacks each dataset element as a (songs, embeddings) pair.
candidate_embeddings = parsed_candidate_dataset.batch(10000).map(
    lambda x: [
        x['track_uri_can'],
        train_utils.tf_if_null_return_zero(
            model.candidate_tower(x)
        )
    ]
)

In [46]:
# candidate_embeddings
# len(list(candidate_embeddings))

In [42]:
CANDIDATE_EMB_JSON = 'candidate_embeddings.json'

# Save to the required format
# Make sure we start out with a clean empty file for the append write below.
# Opening in 'w' mode creates the file if it is missing and truncates it if it exists,
# so re-running the notebook does not append a second copy of every embedding.
with open(CANDIDATE_EMB_JSON, 'w'):
    pass

In [43]:
start_time = time.time()

# Write one JSON object per line: {"id": <track uri>, "embedding": [...]}.
# The file is opened once in 'w' mode, so re-running this cell replaces the file
# instead of appending duplicate rows, and it is not reopened for every batch.
with open(CANDIDATE_EMB_JSON, 'w') as f:
    for batch in candidate_embeddings:
        songs, embeddings = batch
        for song, emb in zip(songs.numpy(), embeddings.numpy()):
            # `song` is a bytes object; decode it so the id is the plain URI rather
            # than the "b'...'" text that str(bytes) produces. json.dumps adds the
            # quotes and escapes any special characters.
            song_id = json.dumps(song.decode("utf-8"))
            emb_values = ",".join(str(x) for x in emb)
            f.write('{"id":' + song_id + ',"embedding":[' + emb_values + ']}')
            f.write("\n")

end_time = time.time()

elapsed_mins = int((end_time - start_time) / 60)
print(f"elapsed_mins: {elapsed_mins}")

elapsed_mins: 3


In [47]:
# `embeddings` is the variable left over from the write loop: the last batch processed.
embeddings

<tf.Tensor: shape=(3885, 128), dtype=float32, numpy=
array([[ 0.72976977, -0.6685658 , -1.3039492 , ...,  1.0261438 ,
         2.1801417 , -0.1081075 ],
       [ 0.72705394, -0.66507214, -1.3133432 , ...,  1.0277908 ,
         2.1814442 , -0.11300553],
       [ 0.73046064, -0.66539216, -1.3096614 , ...,  1.0261513 ,
         2.179594  , -0.1100015 ],
       ...,
       [ 0.7235094 , -0.66666913, -1.3144187 , ...,  1.0275027 ,
         2.1822722 , -0.11019386],
       [ 0.7158847 , -0.67372656, -1.3102845 , ...,  1.0365474 ,
         2.180481  , -0.10880204],
       [ 0.7246471 , -0.66796744, -1.3110669 , ...,  1.0285532 ,
         2.1808298 , -0.11085889]], dtype=float32)>

In [48]:
# Upload the local embeddings file to this run's folder in the project bucket.
destination_blob = f'{EXPERIMENT_NAME}/{RUN_NAME}/candidates/candidate_embeddings.json'
train_utils.upload_blob(
    project_id=PROJECT_ID,
    bucket_name=BUCKET_NAME,
    source_file_name=CANDIDATE_EMB_JSON,
    destination_blob_name=destination_blob,
)

File candidate_embeddings.json uploaded to local-train-v1/run-20230919-150451/candidates/candidate_embeddings.json.


Do a quick line count from terminal - should look like this:

```
 wc -l candidate_embeddings.json 
 
 2249561 candidate_embeddings.json
```

<img src="./img/embeddings.png" 
     align="center" 
     width="750"
     height="750"/>

### Getting test instances

In [40]:
# len(list(valid_dataset.unbatch()))

In [41]:
# len(list(train_dataset.unbatch()))

In [49]:
# Take one validation example (the one after the first 1000) and convert every
# feature tensor to its numpy value.
for tensor_dict in valid_dataset.unbatch().skip(1000).take(1):
    td_keys = tensor_dict.keys()
    list_dict = {k: tensor_dict[k].numpy() for k in td_keys}
    print(list_dict)

{'album_name_can': b'Brett Young', 'album_name_pl': array([b'The Fighters', b'Love Story', b'The First Time',
       b'Now That I Know Your Name', b'Brett Young'], dtype=object), 'album_uri_can': b'spotify:album:6MTPPsqX8KnmDWmnQmbxJ5', 'album_uri_pl': array([b'spotify:album:7dr4GfexRAIruQMG4GanB2',
       b'spotify:album:04n1d6ioeumQRlAj0JBID2',
       b'spotify:album:2H6xesu7JV1YEO1FvwxKZG',
       b'spotify:album:7yzqRhliyDR9Xl2looejao',
       b'spotify:album:6MTPPsqX8KnmDWmnQmbxJ5'], dtype=object), 'artist_followers_can': 973783.0, 'artist_genres_can': b"'contemporary country', 'country', 'country road'", 'artist_genres_pl': array([b"'contemporary country', 'country', 'country pop', 'country rap', 'country road', 'modern country rock'",
       b"'contemporary country', 'country', 'country road'",
       b"'contemporary country', 'country', 'pop'",
       b"'contemporary country', 'country pop', 'deep talent show'",
       b"'contemporary country', 'country', 'country road'"], dtyp

In [50]:
# Display the test instance built in the previous cell (feature name -> numpy value).
list_dict

{'album_name_can': b'Brett Young',
 'album_name_pl': array([b'The Fighters', b'Love Story', b'The First Time',
        b'Now That I Know Your Name', b'Brett Young'], dtype=object),
 'album_uri_can': b'spotify:album:6MTPPsqX8KnmDWmnQmbxJ5',
 'album_uri_pl': array([b'spotify:album:7dr4GfexRAIruQMG4GanB2',
        b'spotify:album:04n1d6ioeumQRlAj0JBID2',
        b'spotify:album:2H6xesu7JV1YEO1FvwxKZG',
        b'spotify:album:7yzqRhliyDR9Xl2looejao',
        b'spotify:album:6MTPPsqX8KnmDWmnQmbxJ5'], dtype=object),
 'artist_followers_can': 973783.0,
 'artist_genres_can': b"'contemporary country', 'country', 'country road'",
 'artist_genres_pl': array([b"'contemporary country', 'country', 'country pop', 'country rap', 'country road', 'modern country rock'",
        b"'contemporary country', 'country', 'country road'",
        b"'contemporary country', 'country', 'pop'",
        b"'contemporary country', 'country pop', 'deep talent show'",
        b"'contemporary country', 'country', 'countr

**Finished**