# Build baseline tfrs model 

Look inside of `two_tower_src/` for the source code and model code

This notebook constructs the two tower model and saves the model to GCS

We will use managed Tensorboard for training. Before beginning, create a new Tensorboard instance by going to Vertex -> Experiments -> Tensorboard Instances -> Create

![](img/create-a-tb.png)

In [1]:
# !pip install tensorflow-recommenders==0.6.0 --user

#### Restart kernel after installation

In [2]:
# Notebook-wide settings -- edit the TODO values for your own environment.
PROJECT_ID = 'hybrid-vertex'    # TODO: change this to your project
LOCATION = 'us-central1'
# Base GCS location used below for the TensorBoard logs and the saved towers.
path = 'gs://two-tower-models'  # TODO: change to your model directory

In [3]:
import os

# TensorFlow GPU runtime settings; exported here so they are already in the
# environment when TensorFlow is imported in the next cell.
os.environ.update({
    'TF_GPU_THREAD_MODE': 'gpu_private',
    'TF_GPU_ALLOCATOR': 'cuda_malloc_async',
})

In [4]:
import json
import logging
import pickle as pkl
import time

import tensorflow as tf
import tensorflow_recommenders as tfrs
from google.cloud import aiplatform as vertex_ai
from google.cloud import storage

# The tt module holds the data parsing functions, the candidate dataset and the model classes.
from two_tower_src import two_tower as tt

2022-10-19 01:06:12.527428: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-19 01:06:13.762299: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:214] Using CUDA malloc Async allocator for GPU: 0
2022-10-19 01:06:13.762667: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38238 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0
2022-10-19 01:06:13.765160: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:214] Using CUDA malloc Async allocator for GPU: 1
2022-10-19 01:06:13.765271: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/re

## Create Dataset for local training and testing

Inspect the contents of the directory - you can change parameters in the header of the `two_tower.py` script

In [5]:
# Show the layout of the model source package (requires the `tree` CLI).
!tree two_tower_src

[01;34mtwo_tower_src[00m
├── __init__.py
├── [01;34m__pycache__[00m
│   ├── __init__.cpython-37.pyc
│   └── two_tower.cpython-37.pyc
└── two_tower.py

1 directory, 4 files


### Playlist dataset

In [6]:
# --- Data locations and batching ------------------------------------------
batch_size = 1024*8
train_dir = 'spotify-beam-v3'            # GCS bucket holding the training TFRecords
train_dir_prefix = 'v6/train_last_5_v2/'

valid_dir = 'spotify-beam-v3'            # GCS bucket holding the validation TFRecords
valid_dir_prefix = 'v6/valid_last_5/'

client = storage.Client()

# Let tf.data pick the sharding policy when the dataset is distributed.
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO


def list_gcs_files(bucket_name, prefix):
    """Return the gs:// URIs of the blobs directly under `prefix` in `bucket_name`.

    The URI is assembled from the bucket and blob names rather than by rewriting
    `blob.public_url`, because the public URL percent-encodes the object name.
    """
    return [
        f"gs://{blob.bucket.name}/{blob.name}"
        for blob in client.list_blobs(bucket_name, prefix=prefix, delimiter="/")
    ]


def full_parse(data):
    """Used for interleave - takes a filename tensor and returns a TFRecordDataset."""
    return tf.data.TFRecordDataset(data)


def build_dataset(files):
    """Build the batched, parsed and prefetched input pipeline for a list of TFRecord URIs.

    Raises:
        ValueError: if `files` is empty, which would otherwise surface later as an
            obscure dtype error inside the tf.data pipeline.
    """
    if not files:
        raise ValueError("No TFRecord files found - check the bucket name and prefix.")
    file_names = tf.data.Dataset.from_tensor_slices(files).prefetch(tf.data.AUTOTUNE)
    return (
        file_names.interleave(
            full_parse,
            cycle_length=tf.data.AUTOTUNE,
            num_parallel_calls=tf.data.AUTOTUNE,
            deterministic=False,  # record order is not preserved across files
        )
        .map(tt.parse_tfrecord, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
        .with_options(options)
    )


train_files = list_gcs_files(train_dir, train_dir_prefix)
train_dataset = build_dataset(train_files)

valid_files = list_gcs_files(valid_dir, valid_dir_prefix)
valid_dataset = build_dataset(valid_files)

# Local Training

Compile the model
Review the details of the model layers

In [7]:
# Hyper-parameters (also logged to the Vertex experiment after training).
layer_sizes = [256]  # passed to TheTwoTowers -- presumably the dense layer widths; see two_tower.py
LR = .001

# Build and compile the model under a mirrored strategy spanning both local GPUs.
strategy = tf.distribute.MirroredStrategy(["GPU:0", "GPU:1"])
with strategy.scope():
    model = tt.TheTwoTowers(layer_sizes)
    model.compile(optimizer=tf.keras.optimizers.Adam(LR))

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensor

In [8]:
# Quick look at the layers: the index printed here is what the adapt /
# vocabulary cells further down use to address each sub-model.
print("Playlist (query) Tower:")

for idx, layer in enumerate(model.query_tower.layers):
    print(idx, layer.name)

Playlist (query) Tower:
0 pl_name_emb_model
1 pl_collaborative_emb_model
2 pl_track_uri_emb_model
3 artist_name_pl_emb_model
4 track_uri_pl_emb_model
5 track_name_pl_emb_model
6 duration_ms_songs_pl_emb_model
7 album_name_pl_emb_model
8 artist_pop_pl_emb_model
9 artists_followers_pl_emb_model
10 track_pop_pl_emb_model
11 artist_genres_pl_emb_model
12 pl_dense_layers


In [9]:
# Same listing for the candidate tower.
print("Track (candidate) Tower:")
for idx, layer in enumerate(model.candidate_tower.layers):
    print(idx, layer.name)

Track (candidate) Tower:
0 artist_name_can_emb_model
1 track_name_can_emb_model
2 album_name_can_emb_model
3 artist_uri_can_emb_model
4 track_uri_can_emb_model
5 album_uri_can_emb_model
6 duration_ms_can_emb_model
7 artist_pop_can_emb_model
8 artists_followers_can_emb_model
9 track_pop_can_emb_model
10 artist_genres_can_emb_model
11 candidate_dense_layers


In [None]:
# Adapt the text vectorizers.
# The layer indices refer to the tower listings printed above; layers[0] of each
# sub-model is the layer that exposes adapt()/get_vocabulary().
ADAPT_BATCH_SIZE = 40000

# Re-batch once and reuse the same pipeline for every adapt() call.
adapt_dataset = train_dataset.unbatch().batch(ADAPT_BATCH_SIZE)

# Playlist-side features are reshaped to [-1, 5, 1] before adapting.
model.query_tower.layers[3].layers[0].adapt(
    adapt_dataset.map(lambda x: tf.reshape(x['artist_name_pl'], [-1, 5, 1]))) #artist name pl
model.query_tower.layers[5].layers[0].adapt(
    adapt_dataset.map(lambda x: tf.reshape(x['track_name_pl'], [-1, 5, 1]))) #track name pl
model.query_tower.layers[7].layers[0].adapt(
    adapt_dataset.map(lambda x: tf.reshape(x['album_name_pl'], [-1, 5, 1]))) #album name pl
model.query_tower.layers[11].layers[0].adapt(
    adapt_dataset.map(lambda x: tf.reshape(x['artist_genres_pl'], [-1, 5, 1]))) #artist genres pl

# Candidate-side features are used as-is.
model.candidate_tower.layers[1].layers[0].adapt(
    adapt_dataset.map(lambda x: x['track_name_can'])) #track name can
model.candidate_tower.layers[2].layers[0].adapt(
    adapt_dataset.map(lambda x: x['album_name_can'])) #album name can
# Key fixed from the misspelled 'albun_grenres_can'; layer 10 is artist_genres_can_emb_model.
model.candidate_tower.layers[10].layers[0].adapt(
    adapt_dataset.map(lambda x: x['artist_genres_can'])) #artist genres can

In [None]:
## Save the vocab dictionary for later so you will not have to adapt

In [None]:
# Collect the vocabulary of every adapted text vectorizer, keyed by feature name.
# Values are the sub-model positions shown in the tower listings above.
query_vocab_layers = {
    'artist_name_pl': 3,     # artist name pl
    'track_name_pl': 5,      # track name pl
    'album_name_pl': 7,      # album name pl
    'artist_genres_pl': 11,  # artist genres pl
}
candidate_vocab_layers = {
    'track_name_can': 1,      # track name can
    'album_name_can': 2,      # album name can
    'artist_genres_can': 10,  # artist genres can
}

vocab_dict = {
    feature: model.query_tower.layers[layer_idx].layers[0].get_vocabulary()
    for feature, layer_idx in query_vocab_layers.items()
}
vocab_dict.update({
    feature: model.candidate_tower.layers[layer_idx].layers[0].get_vocabulary()
    for feature, layer_idx in candidate_vocab_layers.items()
})

In [None]:
# Persist the vocabularies so the vectorizers do not have to be re-adapted later.
# The context manager closes the file even if pickling raises.
with open('vocab_dict.pkl', 'wb') as filehandler:
    pkl.dump(vocab_dict, filehandler)

### Local Training

Setup tensorboard below so training is visible and we can inspect the graph

In [None]:
# Fully-qualified name of the managed TensorBoard instance: project number, then tensorboard id.
TB_RESOURCE_NAME = 'projects/934903580331/locations/us-central1/tensorboards/7336372589079560192'
EXPERIMENT_NAME = 'spotify-singe-node-train-full-data-v3-01'
invoke_time = time.strftime("%Y%m%d-%H%M%S")
# Provisional run name; the training cell assigns RUN_NAME again right before fitting.
RUN_NAME = f'{EXPERIMENT_NAME}run{time.strftime("%Y%m%d-%H%M%S")}'
# TensorBoard event files for this experiment are written under the model bucket.
LOG_DIR = f'{path}/tb-logs/{EXPERIMENT_NAME}'


def get_upload_logs_to_manged_tb_command(ttl_hrs=5, oneshot="false"):
    """
    Build the `tb-gcp-uploader` shell command that uploads the TensorBoard logs
    in LOG_DIR to the managed TensorBoard instance TB_RESOURCE_NAME.

    Run this and copy/paste the command into a terminal to keep uploading logs
    from this machine, or use one-shot mode after training is done to upload
    all TensorBoard objects at once.

    Args:
        ttl_hrs: hours used to compute `--event_file_inactive_secs`
            (passed as 60*60*ttl_hrs seconds). Defaults to 5, the value the
            per-epoch upload callback in this notebook uses.
        oneshot: "true" or "false"; forwarded verbatim as `--one_shot`.

    Returns:
        The command as a single-line string (the trailing backslashes inside
        the literal are line continuations, so no newlines are emitted).
    """
    # Fixed: the original referenced an undefined `logs_dir`; the log directory
    # configured in this notebook is LOG_DIR.
    return(f"""tb-gcp-uploader --tensorboard_resource_name={TB_RESOURCE_NAME} \
      --logdir={LOG_DIR} \
      --experiment_name={EXPERIMENT_NAME} \
      --one_shot={oneshot} \
      --event_file_inactive_secs={60*60*ttl_hrs}""")

# Initialise the Vertex AI SDK with the project/region configured at the top of
# the notebook instead of relying on whatever the environment defaults to.
vertex_ai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)
    

# We encapsulate the one-shot log uploader in a custom callback:

class UploadTBLogsBatchEnd(tf.keras.callbacks.Callback):
    """Keras callback that runs a one-shot TensorBoard log upload after every epoch.

    Despite the class name, the upload is triggered in `on_epoch_end`, not per batch.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Shell out to the uploader; warn (but keep training) if it fails."""
        exit_status = os.system(get_upload_logs_to_manged_tb_command(ttl_hrs = 5, oneshot="true"))
        if exit_status != 0:
            # Best-effort: a failed upload should not abort the training run.
            logging.warning(
                "TensorBoard log upload exited with status %s after epoch %s",
                exit_status, epoch)

In [None]:
# Keras TensorBoard callback: writes event files (including the graph) to LOG_DIR.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=LOG_DIR,
    write_graph=True,
    histogram_freq=0,
    # profile_batch=(20,50) # run profiler on steps 20-50 - enable this line if you want to run profiler from the utils/ notebook
)

### Training using tensorboard callback

While profiling does not work for managed Tensorboard at this time, you can inspect the profiler with an [inline Tensorboard in another notebook](https://www.tensorflow.org/tensorboard/tensorboard_in_notebooks). You may be prompted to install the tensorflow profiler library

In [None]:
NUM_EPOCHS = 20
RUN_NAME = f'run-{EXPERIMENT_NAME}-{time.strftime("%Y%m%d-%H%M%S")}'#be sure to think about run and experiment naming strategies so names don't collide

#start the run to collect metrics - note `.log_parameters()` is available but not used

#start the timer and training
start_time = time.time()
with strategy.scope():
    layer_history = model.fit(
        train_dataset,
        validation_data=valid_dataset,
        validation_freq=5,
        epochs=NUM_EPOCHS,
        # steps_per_epoch=2, #use this for development to run just a few steps
        callbacks=[tensorboard_callback,
                   UploadTBLogsBatchEnd()], #the tensorboard will be automatically associated with the experiment and log subsequent runs with this callback
        verbose=1
    )

end_time = time.time()
val_keys = [v for v in layer_history.history.keys()]
runtime_mins = int((end_time - start_time) / 60)

one_shot_cmd = get_upload_logs_to_manged_tb_command(oneshot="true")
! $one_shot_cmd

vertex_ai.start_run(RUN_NAME, tensorboard=TB_RESOURCE_NAME)

vertex_ai.log_params({"layers": str(layer_sizes), 
                      "learning_rate": LR,
                        "num_epochs": epochs,
                        "batch_size": batch_size,
                     })

#gather the metrics for the last epoch to be saved in metrics
metrics_dict = {"train-time-minutes": runtime_mins}
_ = [metrics_dict.update({key: layer_history.history[key][-1]}) for key in val_keys]
vertex_ai.log_metrics(metrics_dict)
vertex_ai.end_run()

#### You can access the experiment from the console via the experiment name you just declared:

![](img/experiment-console.png)

![](img/tensorboard.png)

### Also, while this is running - check out the Tensorboard profiler in `utils`.

![](img/tb-profiler.png)

### Run `nvtop` - check out the installation script in `utils` - `install_nvtop.sh`

![](img/nvtop-optimized.png)

In [None]:
# Report the wall-clock time measured around model.fit(), in whole minutes.
print(f"Total runtime: {runtime_mins} minutes")

### When complete you get a decent model with around 30-40 hit rate for top 1

![](img/tb-metrics.png)
![](img/tb-loss.png)

In [None]:
# Display the metrics that were logged to the Vertex Experiment:
# the training time plus the last recorded value of each Keras history metric.
metrics_dict

### Now, save the model

In [None]:
# first, create the bucket to store the tensorflow models
# ! gsutil mb -l us-central1 $path

In [None]:
# Export each tower as its own SavedModel under the model bucket.
query_export_dir = path + "/query_model"
candidate_export_dir = path + "/candidate_model"

tf.saved_model.save(model.query_tower, export_dir=query_export_dir)
tf.saved_model.save(model.candidate_tower, export_dir=candidate_export_dir)

## Save the candidate embeddings to GCS for use in Matching Engine later
These will be the files we use for the index

This does the following:
1) Create a tf pipeline to convert embeddings to numpy
2) Serialize the candidate song embeddings with the song_uri index and save to GCS

In [None]:
# create a tf function to convert any bad null values
def tf_if_null_return_zero(val):
    """
    Replace NaNs with zeros and clamp every value to [-1e12, 1e12].

    NaNs sometimes show up in embedding calculations; this cleans the
    embeddings before they are serialized downstream.

    Args:
        val: a floating-point tensor (tf.math.is_nan requires a float dtype).

    Returns:
        A tensor with the same shape and dtype as `val`.
    """
    # Zero out NaNs explicitly: clipping alone is not documented to remove them.
    without_nans = tf.where(tf.math.is_nan(val), tf.zeros_like(val), val)
    # Keep the original clamp so infinities / huge magnitudes are bounded too.
    return tf.clip_by_value(without_nans, -1e12, 1e12)

In [None]:
def _uri_and_embedding(x):
    """Pair a batch's track URIs with their cleaned candidate-tower embeddings."""
    return [x['track_uri_can'], tf_if_null_return_zero(model.candidate_tower(x))]

# Embed the parsed candidate dataset 10000 candidates at a time.
candidate_embeddings = tt.parsed_candidate_dataset.batch(10000).map(_uri_and_embedding)

In [None]:
# Save to the required format: one JSON object per line,
#   {"id":"<track uri>","embedding":[<comma-separated floats>]}
# Opening once in 'w' mode truncates any previous file, so every run starts from
# a clean file without shelling out to rm/touch or reopening the file per batch.
with open("candidate_embeddings.json", 'w') as f:
    for songs, embeddings in candidate_embeddings:
        for song, emb in zip(songs.numpy(), embeddings.numpy()):
            # NOTE(review): `song` is presumably a bytes object, so str(song) would
            # yield "b'...'" as the id; kept unchanged because downstream consumers
            # may rely on it - confirm before decoding.
            f.write('{"id":"' + str(song) + '","embedding":[' + ",".join(str(x) for x in emb) + ']}')
            f.write("\n")

In [None]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket.

    Args:
        bucket_name: the ID of your GCS bucket, e.g. "your-bucket-name".
        source_file_name: the path to your file to upload, e.g. "local/path/to/file".
        destination_blob_name: the ID of your GCS object, e.g. "storage-object-name".
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    print(
        f"File {source_file_name} uploaded to {destination_blob_name}."
    )
# Upload to the bucket configured in `path` (top of the notebook) rather than a
# hard-coded bucket, so changing `path` is enough to retarget the notebook.
embeddings_bucket = path.replace("gs://", "", 1).split("/", 1)[0]
upload_blob(embeddings_bucket, 'candidate_embeddings.json', 'candidates/candidate_embeddings.json')

Do a quick line count from terminal - should look like this:

```
(base) jupyter@tf28-jsw-sep-a100:~/spotify_mpd_two_tower$ wc -l candidate_embeddings.json 
2249561 candidate_embeddings.json
```

### Finished

Go on to the [03 notebook](03-matching-engine.ipynb)

You should see results similar to the screenshot below
![](img/embeddings.png)