
# Build baseline tfrs model 

Look inside of `two_tower_src/` for the source code and model code

This notebook constructs the two tower model and saves the model to GCS

We will use managed Tensorboard for training. Before beginning, create a new tensorboard instance by going to Vertex -> Experiments -> Tensorboard Instances -> Create


In [1]:
# !pip install tensorflow-recommenders google-cloud-aiplatform --user

#### Restart kernel after installation

In [1]:
PROJECT_ID = 'hybrid-vertex' 
LOCATION = 'us-central1' 
path = 'gs://jt-tfrs-central' 

In [2]:
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
os.environ['TF_GPU_THREAD_MODE']='gpu_private'
os.environ['TF_GPU_ALLOCATOR']='cuda_malloc_async'

In [5]:
import json
import numpy as np
import pickle as pkl
from pprint import pprint

import tensorflow as tf
import logging
import time

import tensorflow_recommenders as tfrs

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)


from google.cloud import storage

from src.two_tower_jt import two_tower_lite as tt
#inside this tt module the data parsing functions, candidate dataset and model classes are found

## Create Dataset for local training and testing

Inspect the contents of the directory - you can change parameters in the header of the `two_tower.py` script

In [7]:
!tree src/two_tower_jt

[01;34msrc/two_tower_jt[00m
├── __init__.py
├── [01;34m__pycache__[00m
│   ├── __init__.cpython-37.pyc
│   ├── train_config.cpython-37.pyc
│   ├── two_tower.cpython-37.pyc
│   └── two_tower_lite.cpython-37.pyc
├── data-pipeline.py
├── interactive_train.py
├── requirements.txt
├── task.py
├── train_config.py
├── two_tower.py
└── two_tower_lite.py

1 directory, 12 files


## Create Dataset objects

In [8]:
storage_client = storage.Client()

from google.cloud import aiplatform as vertex_ai


options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO

def full_parse(data):
    # used for interleave - takes tensors and returns a tf.dataset
    data = tf.data.TFRecordDataset(data)
    return data

batch_size = 1024 #*16

### Create Train dataset

### Option 1: outer paralellism

* parallelize transforms
* run multiple copies of the input pipeline over sharded inputs and combine the results

In [6]:
# file_path='gs://spotify-data-regimes/jtv10/valid_v9/*.tfrecords'

# filenames = tf.data.Dataset.list_files(file_path, shuffle=None)
# filenames.cache()

# NUM_SHARDS=2

# def make_dataset(shard_index):
#     files = filenames.shard(NUM_SHARDS, shard_index)
#     dataset = tf.data.TFRecordDataset(files)
#     return dataset.batch(batch_size)

# indices = tf.data.Dataset.range(NUM_SHARDS)

# train_dataset = indices.interleave(
#     make_dataset,
#     num_parallel_calls=tf.data.AUTOTUNE
# ).map(
#     tt.parse_tfrecord, 
#     num_parallel_calls=tf.data.AUTOTUNE
# ).repeat(
#     args.num_epochs
# ).prefetch(
#     tf.data.AUTOTUNE
# ).with_options(
#     options
# )
# # train_dataset

In [47]:
# for file in filenames:
#     print(file)

tf.Tensor(b'gs://spotify-data-regimes/jtv10/valid_v9/-00000-of-00006.tfrecords', shape=(), dtype=string)
tf.Tensor(b'gs://spotify-data-regimes/jtv10/valid_v9/-00005-of-00006.tfrecords', shape=(), dtype=string)
tf.Tensor(b'gs://spotify-data-regimes/jtv10/valid_v9/-00004-of-00006.tfrecords', shape=(), dtype=string)
tf.Tensor(b'gs://spotify-data-regimes/jtv10/valid_v9/-00002-of-00006.tfrecords', shape=(), dtype=string)
tf.Tensor(b'gs://spotify-data-regimes/jtv10/valid_v9/-00003-of-00006.tfrecords', shape=(), dtype=string)
tf.Tensor(b'gs://spotify-data-regimes/jtv10/valid_v9/-00001-of-00006.tfrecords', shape=(), dtype=string)


### Option 2: interleave --> map --> batch

In [9]:
train_dir = 'spotify-data-regimes'
train_dir_prefix = 'jtv10/valid_v9' # valid_v9 | train_v9

train_files = []
for blob in storage_client.list_blobs(f'{train_dir}', prefix=f'{train_dir_prefix}'):
    if '.tfrecords' in blob.name:
        train_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))
    
train_dataset = tf.data.Dataset.from_tensor_slices(train_files).prefetch(
    tf.data.AUTOTUNE,
)

train_dataset = train_dataset.interleave(
    full_parse,
    cycle_length=tf.data.AUTOTUNE, 
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False,
).map(
    tt.parse_tfrecord, 
    num_parallel_calls=tf.data.AUTOTUNE
).batch(
    batch_size 
).prefetch(
    tf.data.AUTOTUNE,
).with_options(
    options
)

# train_dataset

In [13]:
for x in train_dataset.batch(1).take(1):
    pprint(x)

{'album_uri_can': <tf.Tensor: shape=(1, 1024), dtype=string, numpy=
array([[b'spotify:album:0Jy2FoyFXxrHi6XIa8cHzU',
        b'spotify:album:0jAuXoXXgM4EjiRBEMHFav',
        b'spotify:album:1lAMkHFW0e51taMt34LUQ2', ...,
        b'spotify:album:6rg15XPgPhunxqpsUxmbfM',
        b'spotify:album:2E5QEP74QC8pJachhCViSZ',
        b'spotify:album:5jkFiJdMrdDxbhxCZ9hM5m']], dtype=object)>,
 'album_uri_pl': <tf.Tensor: shape=(1, 1024, 5), dtype=string, numpy=
array([[[b'spotify:album:39Mc1rLpJeiG5BYuSOwGet',
         b'spotify:album:2Z9WUERfMjOgQ6ze9TcGbF',
         b'spotify:album:50wolXldayJCEtNKyzJERs',
         b'spotify:album:0IuHVgAvbNDJnJepuSZ8Oz',
         b'spotify:album:2dqn5yOQWdyGwOpOIi9O4x'],
        [b'spotify:album:3IFuRY2wRpJa9FTXj4aTTB',
         b'spotify:album:2oPdRL0fDdnW9e1zoMnrDk',
         b'spotify:album:0jAuXoXXgM4EjiRBEMHFav',
         b'spotify:album:0I8K07F4nXcRexkF6BrDN5',
         b'spotify:album:0I8K07F4nXcRexkF6BrDN5'],
        [b'spotify:album:0IuHVgAvbNDJnJepuS

### Create Validation dataset

In [14]:
valid_dir = 'spotify-data-regimes'
valid_dir_prefix = 'jtv10/valid_v9'

valid_files = []
for blob in storage_client.list_blobs(f'{valid_dir}', prefix=f'{valid_dir_prefix}'):
    if '.tfrecords' in blob.name:
        valid_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))


valid_dataset = tf.data.Dataset.from_tensor_slices(valid_files).prefetch(
    tf.data.AUTOTUNE,
)

valid_dataset = valid_dataset.interleave(
    full_parse,
    num_parallel_calls=tf.data.AUTOTUNE,
    cycle_length=tf.data.AUTOTUNE, 
    deterministic=False,
).map(
    tt.parse_tfrecord, 
    num_parallel_calls=tf.data.AUTOTUNE
).batch(
    batch_size
).prefetch(
    tf.data.AUTOTUNE,
).with_options(
    options
)

# valid_dataset = valid_dataset.cache() #1gb machine mem + 400 MB in candidate ds (src/two-tower.py)

# valid_dataset

In [15]:
for x in valid_dataset.batch(1).take(1):
    pprint(x)

{'album_uri_can': <tf.Tensor: shape=(1, 1024), dtype=string, numpy=
array([[b'spotify:album:0Jy2FoyFXxrHi6XIa8cHzU',
        b'spotify:album:0jAuXoXXgM4EjiRBEMHFav',
        b'spotify:album:1lAMkHFW0e51taMt34LUQ2', ...,
        b'spotify:album:3j1hDhDSQSdlVcZf2kfecY',
        b'spotify:album:2K0iHDNNIzhuclKtsugZxt',
        b'spotify:album:7j7q1pRH9QoQPkEXOfmFsz']], dtype=object)>,
 'album_uri_pl': <tf.Tensor: shape=(1, 1024, 5), dtype=string, numpy=
array([[[b'spotify:album:39Mc1rLpJeiG5BYuSOwGet',
         b'spotify:album:2Z9WUERfMjOgQ6ze9TcGbF',
         b'spotify:album:50wolXldayJCEtNKyzJERs',
         b'spotify:album:0IuHVgAvbNDJnJepuSZ8Oz',
         b'spotify:album:2dqn5yOQWdyGwOpOIi9O4x'],
        [b'spotify:album:3IFuRY2wRpJa9FTXj4aTTB',
         b'spotify:album:2oPdRL0fDdnW9e1zoMnrDk',
         b'spotify:album:0jAuXoXXgM4EjiRBEMHFav',
         b'spotify:album:0I8K07F4nXcRexkF6BrDN5',
         b'spotify:album:0I8K07F4nXcRexkF6BrDN5'],
        [b'spotify:album:0IuHVgAvbNDJnJepuS

### Create Candidates dataset

In [17]:
CANDIDATE_FILE_DIR = 'spotify-data-regimes'
CANDIDATE_PREFIX = 'jtv10/candidates' 

candidate_files = []
for blob in storage_client.list_blobs(f"{CANDIDATE_FILE_DIR}", prefix=f'{CANDIDATE_PREFIX}'):
    if '.tfrecords' in blob.name:
        candidate_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))

candidate_dataset = tf.data.Dataset.from_tensor_slices(candidate_files)

parsed_candidate_dataset = candidate_dataset.interleave(
    full_parse,
    cycle_length=tf.data.AUTOTUNE, 
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False
).map(
    tt.parse_candidate_tfrecord_fn, 
    num_parallel_calls=tf.data.AUTOTUNE
).with_options(
    options
)

parsed_candidate_dataset = parsed_candidate_dataset.cache() #400 MB on machine mem

# parsed_candidate_dataset

In [18]:
for x in parsed_candidate_dataset.batch(1).take(1):
    pprint(x)

{'album_uri_can': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'spotify:album:4U91uJBtdsedXEuRMjgZRP'], dtype=object)>,
 'artist_followers_can': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([28450.], dtype=float32)>,
 'artist_genres_can': <tf.Tensor: shape=(1,), dtype=string, numpy=
array([b"'edm', 'pop dance', 'progressive house', 'progressive trance', 'trance', 'uplifting trance'"],
      dtype=object)>,
 'artist_pop_can': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([49.], dtype=float32)>,
 'artist_uri_can': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'spotify:artist:7iVuXpgNEl87BwdwV1L6he'], dtype=object)>,
 'duration_ms_can': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([420000.], dtype=float32)>,
 'time_signature_can': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'4'], dtype=object)>,
 'track_acousticness_can': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.000864], dtype=float32)>,
 'track_danceability_can': <tf.Tensor: shape=(1,), 

# Local Training

Compile the model
Review the details of the model layers

## Adapt Layers

#### adapt normalization layers

In [22]:
# min_duration_ms_can = train_dataset.map(lambda x: x["duration_ms_can"]).reduce(np.Inf, tf.math.minimum)
# max_duration_ms_can = train_dataset.map(lambda x: x["duration_ms_can"]).reduce(-np.Inf, tf.math.maximum)


# print(f"max_duration_ms_can: {max_duration_ms_can}")
# print(f"min_duration_ms_can: {min_duration_ms_can}")

In [23]:
# duration_ms_can

# max_duration_ms_can = train_dataset.map(lambda x: x["duration_ms_can"]).reduce(
#     tf.cast(0.0, tf.float32), tf.maximum).numpy().max()

# min_duration_ms_can = train_dataset.map(lambda x: x["duration_ms_can"]).reduce(
#     tf.cast(0.0, tf.float32), tf.minimum).numpy().min()

# avg_duration_ms_can = train_dataset.map(lambda x: x["duration_ms_can"]).reduce(
#     tf.cast(0.0, tf.float32), tf.minimum).numpy().mean()

# var_duration_ms_can = train_dataset.map(lambda x: x["duration_ms_can"]).reduce(
#     tf.cast(0.0, tf.float32), tf.minimum).numpy().var()

# print(f"max_duration_ms_can: {max_duration_ms_can}")
# print(f"min_duration_ms_can: {min_duration_ms_can}")
# print(f"avg_duration_ms_can: {avg_duration_ms_can}")
# print(f"var_duration_ms_can: {var_duration_ms_can}")


# track_pop_can
# artist_pop_can
# artist_followers_can
# track_danceability_can
# track_energy_can
# track_loudness_can
# track_speechiness_can
# track_acousticness_can
# track_instrumentalness_can
# track_liveness_can
# track_valence_can
# track_tempo_can

# pl_duration_ms_new
# num_pl_songs_new     # | n_songs_pl_new
# num_pl_artists_new
# num_pl_albums_new



# duration_ms_songs_pl
# track_pop_pl
# artist_pop_pl
# artists_followers_pl
# track_danceability_pl
# track_energy_pl
# track_loudness_pl
# track_speechiness_pl
# track_acousticness_pl
# track_instrumentalness_pl
# track_liveness_pl
# track_valence_pl
# track_tempo_pl

In [None]:
# mean & variance --> normalization

# np.mean(a)
# np.var(a)

# min & max --> descritized buckets

# track_pop_pl_buckets = np.linspace(
#     vocab_dict['min_track_pop'], 
#     vocab_dict['max_track_pop'], 
#     num=10
# )


max_timestamp = train_dataset.map(lambda x: x["timestamp"]).reduce(
    tf.cast(0, tf.int64), tf.maximum).numpy().max()
min_timestamp = ratings.map(lambda x: x["timestamp"]).reduce(
    np.int64(1e9), tf.minimum).numpy().min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000)

print(f"Buckets: {timestamp_buckets[:3]}")

#### adapt text vectorization layers

In [None]:
track_name_pl
pl_name_src
artist_genres_pl

track_name_can
artist_genres_can







### load saved vocab dict

In [10]:
# os.system('gsutil cp gs://two-tower-models/vocabs/vocab_dict.pkl .')  # TODO - paramterize

filehandler = open('vocab_dict.pkl', 'rb')
vocab_dict = pkl.load(filehandler)
filehandler.close()

# vocab_dict

## Build and Compile model

In [11]:
layer_sizes=[512,256]

model = tt.TheTwoTowers(layer_sizes, vocab_dict, parsed_candidate_dataset)

LR = .1
opt = tf.keras.optimizers.Adagrad(LR)
model.compile(optimizer=opt)

In [12]:
model

<src.two_tower_jt.two_tower.TheTwoTowers at 0x7f3daa1a3b10>

### inspect layers

In [13]:
## Quick look at the layers
print("Playlist (query) Tower:")

for i, l in enumerate(model.query_tower.layers):
    print(i, l.name)

Playlist (query) Tower:
0 pl_name_src_text_embedding
1 pl_collaborative_emb_model
2 pl_duration_ms_new_emb_model
3 num_pl_songs_new_emb_model
4 num_pl_artists_new_emb_model
5 num_pl_albums_new_emb_model
6 track_uri_pl_emb_model
7 track_name_pl_emb_model
8 artist_uri_pl_emb_model
9 artist_name_pl_emb_model
10 album_uri_pl_emb_model
11 album_name_pl_emb_model
12 artist_genres_pl_emb_model
13 duration_ms_songs_pl_emb_model
14 track_pop_pl_emb_model
15 artist_pop_pl_emb_model
16 artists_followers_pl_emb_model
17 track_danceability_pl_emb_model
18 track_energy_pl_emb_model
19 track_key_pl_emb_model
20 track_loudness_pl_emb_model
21 track_mode_pl_emb_model
22 track_speechiness_pl_emb_model
23 track_acousticness_pl_emb_model
24 track_instrumentalness_pl_emb_model
25 track_liveness_pl_emb_model
26 track_valence_pl_emb_model
27 track_tempo_pl_emb_model
28 time_signature_pl_emb_model
29 pl_dense_layers


In [14]:
print("Track (candidate) Tower:")
for i, l in enumerate(model.candidate_tower.layers):
    print(i, l.name)

Track (candidate) Tower:
0 track_uri_can_emb_model
1 track_name_can_emb_model
2 artist_uri_can_emb_model
3 artist_name_can_emb_model
4 album_uri_can_emb_model
5 album_name_can_emb_model
6 duration_ms_can_emb_model
7 track_pop_can_emb_model
8 artist_pop_can_emb_model
9 artist_genres_can_emb_model
10 artists_followers_can_emb_model
11 track_danceability_can_emb_model
12 track_energy_can_emb_model
13 track_key_can_emb_model
14 track_loudness_can_emb_model
15 track_mode_can_emb_model
16 track_speechiness_can_emb_model
17 track_acousticness_can_emb_model
18 track_instrumentalness_can_emb_model
19 track_liveness_can_emb_model
20 track_valence_can_emb_model
21 track_tempo_can_emb_model
22 time_signature_can_emb_model
23 candidate_dense_layers


### setup Vertex Exeperiment

In [16]:
EXPERIMENT_NAME = f'build-local-v2'

vertex_ai.init(
    project=PROJECT_ID,
    location='us-central1',
    # experiment=EXPERIMENT_NAME
)

### setup Tensorboard callbacks

Setup tensorboard below so training is visible and we can inspect the graph

**TODO:** clean up notebook section 

> *Note:* While profiling does not work for managed Tensorboard at this time, you can inspect the profiler with an [inline Tensorboard in another notebook](https://www.tensorflow.org/tensorboard/tensorboard_in_notebooks). You may be prompted to install the tensorflow profiler library

#### Managed Tensorboard Resource

In [21]:
# use existing TB instance
# TB_RESOURCE_NAME = 'projects/934903580331/locations/us-central1/tensorboards/472921941339013120'

# # create new TB instance
TENSORBOARD_DISPLAY_NAME=f"{EXPERIMENT_NAME}"
tensorboard = vertex_ai.Tensorboard.create(display_name=TENSORBOARD_DISPLAY_NAME)
TB_RESOURCE_NAME = tensorboard.resource_name


print(f"TB_RESOURCE_NAME: {TB_RESOURCE_NAME}")
# projects/934903580331/locations/us-central1/tensorboards/472921941339013120

TB_RESOURCE_NAME: projects/934903580331/locations/us-central1/tensorboards/472921941339013120


In [22]:
def get_upload_logs_to_manged_tb_command(ttl_hrs, oneshot="false"):
    """
    Run this and copy/paste the command into terminal to have 
    upload the tensorboard logs from this machine to the managed tb instance
    Note that the log dir is at the granularity of the run to help select the proper
    timestamped run in Tensorboard
    You can also run this in one-shot mode after training is done 
    to upload all tb objects at once
    """
    return(f"""tb-gcp-uploader --tensorboard_resource_name={TB_RESOURCE_NAME} \
      --logdir={LOG_DIR} \
      --experiment_name={EXPERIMENT_NAME} \
      --one_shot={oneshot} \
      --event_file_inactive_secs={60*60*ttl_hrs}""")

#### train config

* consider experiment and experiment-run naming convention so names don't collide

In [17]:
invoke_time = time.strftime("%Y%m%d-%H%M%S")
RUN_NAME = f'run-{invoke_time}'

LOG_DIR = f"{path}/{EXPERIMENT_NAME}/{RUN_NAME}/tb-logs"

NUM_EPOCHS = 1
VALID_FREQUENCY=5

HIST_FREQ = 0
EMBED_FREQ = 0

print(f"RUN_NAME: {RUN_NAME}")
print(f"LOG_DIR: {LOG_DIR}")

RUN_NAME: run-20230104-153318
LOG_DIR: gs://jt-tfrs-central/build-local-v2/run-20230104-153318/tb-logs


### Train model

* train model in-notebook
* write metrics to Vertex AI Experiment

In [18]:
# tensorboard callback
class UploadTBLogsBatchEnd(tf.keras.callbacks.Callback):
    '''
    ecapsulates one-shot log uploader via a custom callback

    '''
    def on_epoch_end(self, epoch, logs=None):
        os.system(get_upload_logs_to_manged_tb_command(ttl_hrs = 5, oneshot="true"))
        
tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=LOG_DIR,
        histogram_freq=HIST_FREQ, 
        write_graph=True,
        embeddings_freq=EMBED_FREQ,
        # profile_batch=(20,50) #run profiler on steps 20-40 - enable this line if you want to run profiler from the utils/ notebook
    )

#start the timer and training
start_time = time.time()

layer_history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    validation_freq=VALID_FREQUENCY,
    epochs=NUM_EPOCHS,
    steps_per_epoch=10,
    validation_steps=100,
    callbacks=[
        tensorboard_callback,
        UploadTBLogsBatchEnd()
    ], 
    verbose=2
)

end_time = time.time()
val_keys = [v for v in layer_history.history.keys()]
runtime_mins = int((end_time - start_time) / 60)


vertex_ai.start_run(RUN_NAME, tensorboard=TB_RESOURCE_NAME)

vertex_ai.log_params(
    {
        "layers": str(layer_sizes), 
        "learning_rate": LR,
        "num_epochs": NUM_EPOCHS,
        "batch_size": batch_size,
        "valid_freq": VALID_FREQUENCY,
    }
)

gather the metrics for the last epoch to be saved in metrics
metrics_dict = {"train-time-minutes": runtime_mins}

_ = [metrics_dict.update({key: layer_history.history[key][-1]}) for key in val_keys]

vertex_ai.log_metrics(metrics_dict) # JT TODO removed for 'total_loss' getting nan

vertex_ai.end_run()



In [20]:
layer_history

<keras.callbacks.History at 0x7fdcdc23e750>

In [19]:
model

<src.two_tower_jt.two_tower.TheTwoTowers at 0x7f3daa1a3b10>

## Evaluate Model

In [None]:
start_time = time.time()

eval_dict_v1 = model.evaluate(valid_dataset, return_dict=True)

end_time = time.time()

elapsed_mins = int((end_time - start_time) / 60)
print(f"elapsed_mins: {elapsed_mins}")

In [None]:
eval_dict_v1

### Efficient eval

* approximate with scann

In [23]:
start_time = time.time()

scann = tfrs.layers.factorized_top_k.ScaNN(
    num_reordering_candidates=500,
    num_leaves_to_search=30
)
scann.index_from_dataset(candidates=parsed_candidate_dataset.batch(128).cache().map(lambda x: (x['track_uri_can'], model.candidate_tower(x))))

end_time = time.time()

elapsed_scann_mins = int((end_time - start_time) / 60)
print(f"elapsed_scann_mins: {elapsed_scann_mins}")

In [24]:
start_time = time.time()

model.task.factorized_metrics = tfrs.metrics.FactorizedTopK(
    candidates=scann
)
model.compile()

scann_result = model.evaluate(valid_dataset, return_dict=True, verbose=1)

end_time = time.time()

elapsed_scann_eval_mins = int((end_time - start_time) / 60)
print(f"elapsed_scann_eval_mins: {elapsed_scann_eval_mins}")

#### You can access the experiment from the console via the experiment name you just declared:

![](img/experiment-console.png)

![](img/tensorboard.png)

### Also, while this is running - check out the Tensorboard profiler in `utils`.

![](img/tb-profiler.png)

### Run `nvtop` - check out the installation script in `utils` - `install_nvtop.sh`

![](img/nvtop-optimized.png)

In [14]:
print(f"Total runtime: {runtime_mins} minutes")

Total runtime: 105 minutes


### When complete you get a decent model with around 30-40 hit rate for top 1

![](img/tb-metrics.png)
![](img/tb-loss.png)

In [13]:
#get metrics for the Vertex Experiment
metrics_dict

{'train-time-minutes': 222,
 'factorized_top_k/top_1_categorical_accuracy': 0.0,
 'factorized_top_k/top_5_categorical_accuracy': 0.0,
 'factorized_top_k/top_10_categorical_accuracy': 0.0,
 'factorized_top_k/top_50_categorical_accuracy': 0.0,
 'factorized_top_k/top_100_categorical_accuracy': 0.0,
 'loss': 100245.734375,
 'regularization_loss': 0,
 'total_loss': 100245.734375}

### Now, save the model

In [None]:
# first, create the bucket to store the tensorflow models
# ! gsutil mb -l us-central1 $path

In [14]:
#save the models

tf.saved_model.save(model.query_tower, export_dir=path + f"/{RUN_NAME}/query_model")
tf.saved_model.save(model.candidate_tower, export_dir=path + f"/{RUN_NAME}/candidate_model")

INFO:tensorflow:Assets written to: gs://two-tower-models/run-spotify-nb-train-full-jt-20221227-214932/query_model/assets
INFO:tensorflow:Assets written to: gs://two-tower-models/run-spotify-nb-train-full-jt-20221227-214932/candidate_model/assets


## Save the candidate embeddings to GCS for use in Matching Engine later
These will be the files we use for the index

This does the following
1) Create a tf pipeline to convert embeddings to numpy
2) Serialize the candidate song emgeddings with the song_uri index and save to gcs

In [15]:
# create a tf function to convert any bad null values
def tf_if_null_return_zero(val):
    """
    this function fills in nans to zeros - sometimes happens in embedding calcs.
    this will clean the embedding inputs downstream
    """
    return(tf.clip_by_value(val, -1e12, 1e12)) # a trick to remove NANs post tf2.0

In [16]:
candidate_embeddings = tt.parsed_candidate_dataset.batch(10000).map(lambda x: [x['track_uri_can'], tf_if_null_return_zero(model.candidate_tower(x))])

In [17]:
# Save to the required format
# make sure you start out with a clean empty file for the append write
!rm candidate_embeddings.json > /dev/null 
!touch candidate_embeddings.json
for batch in candidate_embeddings:
    songs, embeddings = batch
    with open("candidate_embeddings.json", 'a') as f:
        for song, emb in zip(songs.numpy(), embeddings.numpy()):
            f.write('{"id":"' + str(song) + '","embedding":[' + ",".join(str(x) for x in list(emb)) + ']}')
            f.write("\n")

rm: cannot remove 'candidate_embeddings.json': No such file or directory


In [19]:
tt.upload_blob(
    'two-tower-models', 
    'candidate_embeddings.json', 
    f'{RUN_NAME}/candidates/candidate_embeddings.json'
)

File candidate_embeddings.json uploaded to run-spotify-nb-train-full-jt-20221227-214932/candidates/candidate_embeddings.json.


Do a quick line count from terminal - should look like this:

```
(base) jupyter@tf28-jsw-sep-a100:~/spotify_mpd_two_tower$ wc -l candidate_embeddings.json 
2249561 candidate_embeddings.json
```

### Finished

Go on to the [03 notebook](03-matching-engine.ipynb)

You should see results similar to the screenshot below
![](img/embeddings.png)