# Build baseline tfrs model 

Look inside of `./two_tower_src/` for the source code and model code

This notebook constructs the two tower model and saves the model to GCS

In [2]:
# !pip install tensorflow-recommenders==0.6.0 --user

In [1]:
PROJECT_ID = 'hybrid-vertex'  # <--- TODO: CHANGE THIS
LOCATION = 'us-central1' 

In [3]:
import os

os.environ['TF_GPU_THREAD_MODE']='gpu_private'
os.environ['TF_GPU_ALLOCATOR']='cuda_malloc_async'

In [5]:
import json

import tensorflow as tf
import logging

import tensorflow_recommenders as tfrs


from google.cloud import storage

from two_tower_src import two_tower as tt
#inside this tt module the data parsing functions, candidate dataset and model classes are found

## Create Dataset for local training and testing

### Playlist dataset

In [6]:
batch_size = 45000
train_dir = 'spotify-beam-v3'
train_dir_prefix = 'v6/train_last_5_v2/'

valid_dir = 'spotify-beam-v3'
valid_dir_prefix = 'v6/valid_last_5/'

client = storage.Client()

options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO
 

train_files = []
for blob in client.list_blobs(f'{train_dir}', prefix=f'{train_dir_prefix}', delimiter="/"):
    train_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))

def full_parse(data):
    # used for interleave - takes tensors and returns a tf.dataset
    data = tf.data.TFRecordDataset(data)
    return data
    
train_dataset = tf.data.Dataset.from_tensor_slices(train_files).prefetch(
    tf.data.AUTOTUNE,
)

train_dataset = train_dataset.interleave(
    full_parse,
    cycle_length=tf.data.AUTOTUNE, 
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False,
).map(tt.parse_tfrecord, num_parallel_calls=tf.data.AUTOTUNE,).batch(
    batch_size 
).prefetch(
    tf.data.AUTOTUNE,
).with_options(options)


valid_files = []
for blob in client.list_blobs(f'{valid_dir}', prefix=f'{valid_dir_prefix}', delimiter="/"):
    valid_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))


valid_dataset = tf.data.Dataset.from_tensor_slices(valid_files).prefetch(
    tf.data.AUTOTUNE,
)

valid_dataset = valid_dataset.interleave(
    full_parse,
    num_parallel_calls=tf.data.AUTOTUNE,
    cycle_length=tf.data.AUTOTUNE, 
    deterministic=False,
).map(tt.parse_tfrecord, num_parallel_calls=tf.data.AUTOTUNE).batch(
    batch_size
).prefetch(
    tf.data.AUTOTUNE,
).with_options(options)

# Local Training

In [7]:
layer_sizes=[256,128]
with tf.device('/GPU:0'):
    model = tt.TheTwoTowers(layer_sizes)

    model.compile(optimizer=tf.keras.optimizers.Adagrad(0.01))

In [8]:
## Quick look at the layers
print("Playlist (query) Tower:")

for i, l in enumerate(model.query_tower.layers):
    print(i, l.name)

Playlist (query) Tower:
0 pl_name_emb_model
1 pl_collaborative_emb_model
2 pl_track_uri_emb_model
3 n_songs_pl_emb_model
4 n_artists_pl_emb_model
5 n_albums_pl_emb_model
6 artist_name_pl_emb_model
7 track_uri_pl_emb_model
8 track_name_pl_emb_model
9 duration_ms_songs_pl_emb_model
10 album_name_pl_emb_model
11 artist_pop_pl_emb_model
12 artists_followers_pl_emb_model
13 track_pop_pl_emb_model
14 artist_genres_pl_emb_model
15 pl_cross_layer
16 pl_dense_layers


In [9]:
print("Track (candidate) Tower:")
for i, l in enumerate(model.candidate_tower.layers):
    print(i, l.name)

Track (candidate) Tower:
0 artist_name_can_emb_model
1 track_name_can_emb_model
2 album_name_can_emb_model
3 artist_uri_can_emb_model
4 track_uri_can_emb_model
5 album_uri_can_emb_model
6 duration_ms_can_normalized
7 track_pop_can_normalized
8 artist_pop_can_normalized
9 artist_followers_can_normalized
10 artist_genres_can_emb_model
11 can_cross_layer
12 candidate_dense_layers


### Local training for ten Epochs

In [10]:
import time

In [11]:
NUM_EPOCHS = 1
start_time = time.time()
with tf.device('/GPU:0'):
    layer_history = model.fit(
        train_dataset,
        validation_data=valid_dataset,
        validation_freq=5,
        epochs=NUM_EPOCHS,
        # steps_per_epoch=2, #use this for devlopment to run just a few steps
        # callbacks=tensorboard_cb,
        # verbose=0
    )
end_time = time.time()

2022-10-10 18:24:41.422616: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Training for 1 epoch


KeyError: 'val_factorized_top_k/top_1_categorical_accuracy'

In [12]:
runtime_mins = int((end_time - start_time) / 60)
print(f"Total runtime: {runtime_mins} minutes")

Total runtime: 21 minutes


In [None]:
val_keys = [v for v in layer_history.history.keys() if 'val' in v]
print([(key, layer_history.history[key]) for key in val_keys])
                      #'val_factorized_top_k/top_1_categorical_accuracy']]

### Now, save the model

In [13]:
path = 'gs://two-tower-models' #TODO change to your model directory

In [None]:
# first, create the bucket to store the tensorflow models
# ! gsutil mb -l us-central1 $path

In [14]:
# first save the query 

#save the models
tf.saved_model.save(model.query_tower, export_dir=path + "/query_model")
tf.saved_model.save(model.candidate_tower, export_dir=path + "/candidate_model")

2022-10-10 18:52:22.326126: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: gs://two-tower-models/query_model/assets


INFO:tensorflow:Assets written to: gs://two-tower-models/query_model/assets


INFO:tensorflow:Assets written to: gs://two-tower-models/candidate_model/assets


INFO:tensorflow:Assets written to: gs://two-tower-models/candidate_model/assets


## Save the candidate embeddings to GCS for use in Matching Engine later
These will be the files we use for the index

In [26]:
# create a tf function to convert any bad null values
def tf_if_null_return_zero(val):
    """
    this function fills in nans to zeros - sometimes happens in embedding calcs.
    this will clean the embedding inputs downstream
    """
    return(tf.clip_by_value(val, -1e12, 1e12)) # a trick to remove NANs post tf2.0

In [46]:
candidate_embeddings = tt.parsed_candidate_dataset.batch(10000).map(lambda x: [x['track_uri_can'], tf_if_null_return_zero(model.candidate_tower(x))])

In [47]:
candidate_embeddings

<MapDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 128), dtype=tf.float32, name=None))>

In [51]:
# Save to the required format

for batch in candidate_embeddings:
    songs, embeddings = batch
    with open("candidate_embeddings.json", 'w') as f:
        for song, emb in zip(songs.numpy(), embeddings.numpy()):
            f.write('{"id":"' + str(song) + '","embedding":[' + ",".join(str(x) for x in list(emb)) + ']}')
            f.write("\n")

In [54]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    # The ID of your GCS bucket
    # bucket_name = "your-bucket-name"
    # The path to your file to upload
    # source_file_name = "local/path/to/file"
    # The ID of your GCS object
    # destination_blob_name = "storage-object-name"
    # bucket_name = bucket_name.strip("gs://")
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print(
        f"File {source_file_name} uploaded to {destination_blob_name}."
    )
upload_blob('two-tower-models', 'candidate_embeddings.json', 'candidates/candidate_embeddings.json')

File candidate_embeddings.json uploaded to candidate_embeddings.json.


### Finished

Go on to the 04 notebook

You should see results similar to the screenshot below
![](img/embeddings.png)