# Candidate Generation

After Two-Tower training, the `candidate_tower` is used to convert all candidate items into embeddings.

The embeddings are indexed and deployed to an index endpoint for serving.

Steps performed in this notebook:

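1. Load the trained candidate tower (`SavedModel`) from GCS
2. Build a `tf.data` dataset from the candidate records and compute an embedding for every candidate track
3. Inspect the embeddings and track URIs and check for data quality (NaN values, malformed IDs)
4. Write the (id, embedding) pairs to a JSON file and upload it to GCS for index creation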


## Load env config

In [1]:
# Naming inputs shared by every cloud resource created or read in this notebook.
# TODO: adjust VERSION / PREFIX to match the provisioned environment.
VERSION = "v1"
PREFIX = "ndr-{}".format(VERSION)

print(f"PREFIX = {PREFIX}")

PREFIX = ndr-v1


In [2]:
# staging GCS
# IPython shell capture: the active gcloud project is read from the CLI output (first line).
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Fetch the shared environment file from the bucket; `config.n` is the captured output
# joined with newlines.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
# NOTE(review): exec() runs whatever Python the bucket object contains inside this kernel,
# and it rebinds names defined above (e.g. PROJECT_ID, PREFIX, BUCKET_NAME).
# Only safe while write access to the bucket is trusted.
exec(config.n)


PROJECT_ID               = "cpg-cdp"
PROJECT_NUM              = "939655404703"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "genai-haystack-vpc"

VERTEX_SA                = "939655404703-compute@developer.gserviceaccount.com"

PREFIX                   = "ndr-v1"
VERSION                  = "v1"

APP                      = "sp"
MODEL_TYPE               = "2tower"
FRAMEWORK                = "tfrs"
DATA_VERSION             = "v1"
TRACK_HISTORY            = "5"

BUCKET_NAME              = "ndr-v1-cpg-cdp-bucket"
BUCKET_URI               = "gs://ndr-v1-cpg-cdp-bucket"
SOURCE_BUCKET            = "spotify-million-playlist-dataset"

DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://ndr-v1-cpg-cdp-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

CANDIDATE_PREFIX         = "candidates"
TRAIN_DIR_PREFIX         = "train"
VALID_DIR_PREFIX   

### imports

In [3]:
import json
import numpy as np
import pickle as pkl
from pprint import pprint
import time

import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# GPU
import gc
from numba import cuda

import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_io as tfio

from google.cloud import storage
from google.cloud.storage.bucket import Bucket
from google.cloud.storage.blob import Blob

import google.cloud.aiplatform as vertex_ai

from src.two_tower_jt import two_tower as tt
from src.two_tower_jt import train_utils
from src.two_tower_jt import feature_sets

In [4]:
# Optional (disabled): turn on memory growth for each visible GPU.
# for gpu in tf.config.experimental.list_physical_devices('GPU'):
#     tf.config.experimental.set_memory_growth(gpu, True)

visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

Num GPUs Available:  1


In [5]:
# Reset the current CUDA device through numba, then run a garbage-collection pass.
# NOTE(review): presumably this releases GPU memory left over from earlier runs — confirm.
device = cuda.get_current_device()
device.reset()
# The cell output is gc.collect()'s return value.
gc.collect()

22

## Embedding index path

In [6]:
# TODO - read these from the saved candidate_embeddings.json URI instead of hard-coding, e.g.
# local-train-v1/run-20230919-135222/candidates/candidate_embeddings.json

EXPERIMENT_NAME = "local-train-v2"       # TODO
RUN_NAME = "run-20230925-234505"         # TODO

# Bucket-relative folder of the training run: <experiment>/<run>
PATH_TO_INDEX_DIR = "/".join((EXPERIMENT_NAME, RUN_NAME))
print(f"PATH_TO_INDEX_DIR: {PATH_TO_INDEX_DIR}")

PATH_TO_INDEX_DIR: local-train-v2/run-20230925-234505


In [7]:
# List the contents of the selected training run's folder in the bucket.
! gsutil ls $BUCKET_URI/$EXPERIMENT_NAME/$RUN_NAME

gs://ndr-v1-cpg-cdp-bucket/local-train-v2/run-20230925-234505/
gs://ndr-v1-cpg-cdp-bucket/local-train-v2/run-20230925-234505/model-dir/


In [8]:
# Bucket-relative location of the exported candidate tower
CANDIDATE_MODEL_GCS_PATH = "/".join((PATH_TO_INDEX_DIR, 'model-dir', 'candidate_model'))
# CANDIDATE_MODEL_GCS_PATH = f'{PATH_TO_INDEX_DIR}/candidate_model' # tmp - TODO

# Full gs:// URI of the candidate SavedModel directory
CANDIDATE_MODEL_DIR = "/".join((BUCKET_URI, CANDIDATE_MODEL_GCS_PATH))
print(f"CANDIDATE_MODEL_DIR: {CANDIDATE_MODEL_DIR}")

CANDIDATE_MODEL_DIR: gs://ndr-v1-cpg-cdp-bucket/local-train-v2/run-20230925-234505/model-dir/candidate_model


In [9]:
# List the candidate SavedModel directory to confirm the export is present.
! gsutil ls $CANDIDATE_MODEL_DIR

gs://ndr-v1-cpg-cdp-bucket/local-train-v2/run-20230925-234505/model-dir/candidate_model/
gs://ndr-v1-cpg-cdp-bucket/local-train-v2/run-20230925-234505/model-dir/candidate_model/fingerprint.pb
gs://ndr-v1-cpg-cdp-bucket/local-train-v2/run-20230925-234505/model-dir/candidate_model/saved_model.pb
gs://ndr-v1-cpg-cdp-bucket/local-train-v2/run-20230925-234505/model-dir/candidate_model/assets/
gs://ndr-v1-cpg-cdp-bucket/local-train-v2/run-20230925-234505/model-dir/candidate_model/variables/


## Load Candidate `SavedModel`

In [10]:
# Load the candidate tower SavedModel straight from its GCS directory.
candidate_tower_uri = str(CANDIDATE_MODEL_DIR)  # vertex trained
loaded_candidate_model = tf.saved_model.load(candidate_tower_uri)

# Display the serving signatures exposed by the loaded model.
loaded_candidate_model.signatures

_SignatureMap({'serving_default': <ConcreteFunction signature_wrapper(*, track_speechiness_can, track_valence_can, album_name_can, track_mode_can, album_uri_can, track_time_signature_can, track_uri_can, track_name_can, artist_pop_can, track_danceability_can, artist_genres_can, track_instrumentalness_can, track_key_can, artist_name_can, artist_uri_can, track_liveness_can, track_energy_can, duration_ms_can, artist_followers_can, track_acousticness_can, track_loudness_can, track_pop_can, track_tempo_can) at 0x7F67E80B8A00>})

In [11]:
# Names of the signatures available on the loaded model
print([*loaded_candidate_model.signatures.keys()])

['serving_default']


In [12]:
# Grab the default serving signature and print its output structure.
candidate_predictor = loaded_candidate_model.signatures["serving_default"]
print(candidate_predictor.structured_outputs)

{'output_1': TensorSpec(shape=(None, 128), dtype=tf.float32, name='output_1')}


In [13]:
# Output shapes of the serving signature
candidate_predictor.output_shapes

{'output_1': TensorShape([None, 128])}

## Candidate Dataset

### helper functions

In [14]:
# GCS client bound to the configured project; reused for listing blobs and uploading.
storage_client = storage.Client(project=PROJECT_ID)

# tf.data options applied to the candidate dataset: auto-shard policy set to DATA.
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

In [15]:
# Feature spec for the candidate records, as defined in the project's feature_sets module.
candidate_features = feature_sets.get_candidate_features()
candidate_features

{'track_uri_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'track_name_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'artist_uri_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'artist_name_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'album_uri_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'album_name_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'duration_ms_can': FixedLenFeature(shape=(), dtype=tf.float32, default_value=None),
 'track_pop_can': FixedLenFeature(shape=(), dtype=tf.float32, default_value=None),
 'artist_pop_can': FixedLenFeature(shape=(), dtype=tf.float32, default_value=None),
 'artist_genres_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'artist_followers_can': FixedLenFeature(shape=(), dtype=tf.float32, default_value=None),
 'track_danceability_can': FixedLenFeature(shape=(), dtype=tf.float32, defa

## Candidate Records

In [16]:
# Sub-folder (under data/<DATA_VERSION>/) that holds the candidate records.
CANDIDATE_PREFIX = 'candidates'

In [17]:
# Collect the gs:// URI of every object stored under the candidate prefix.
# The URI is assembled from bucket name + object name rather than by rewriting
# `blob.public_url`: that URL is percent-encoded and depends on the API host name,
# so string-replacing it can yield a path that does not match the real object.
candidate_files = [
    f"gs://{blob.bucket.name}/{blob.name}"
    for blob in storage_client.list_blobs(
        BUCKET_NAME, prefix=f'data/{DATA_VERSION}/{CANDIDATE_PREFIX}'
    )
]

candidate_dataset = tf.data.Dataset.from_tensor_slices(candidate_files)

# Read the files in parallel, then parse each record into a dict of candidate features.
# deterministic=False lets records arrive in any order; ids and embeddings are paired
# per element downstream, so ordering is not relied upon here.
parsed_candidate_dataset = candidate_dataset.interleave(
    # lambda x: tf.data.TFRecordDataset(x),
    train_utils.full_parse,
    cycle_length=tf.data.AUTOTUNE,
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False
).map(
    feature_sets.parse_candidate_tfrecord_fn,
    num_parallel_calls=tf.data.AUTOTUNE
).with_options(
    options
)

# parsed_candidate_dataset = parsed_candidate_dataset.cache() #400 MB on machine mem

In [18]:
# Pretty-print a single parsed record as a sanity check on the feature dict.
for features in parsed_candidate_dataset.take(1):
    pprint(features)
    print("_______________")

{'album_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'Umrika (Original Motion Picture Soundtrack)'>,
 'album_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:album:1hu9GSsmHdixMdQMcmg59m'>,
 'artist_followers_can': <tf.Tensor: shape=(), dtype=float32, numpy=165156.0>,
 'artist_genres_can': <tf.Tensor: shape=(), dtype=string, numpy=b"'compositional ambient', 'neo-classical'">,
 'artist_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b"Dustin O'Halloran">,
 'artist_pop_can': <tf.Tensor: shape=(), dtype=float32, numpy=62.0>,
 'artist_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:artist:6UEYawMcp2M4JFoXVOtZEq'>,
 'duration_ms_can': <tf.Tensor: shape=(), dtype=float32, numpy=126640.0>,
 'track_acousticness_can': <tf.Tensor: shape=(), dtype=float32, numpy=0.958>,
 'track_danceability_can': <tf.Tensor: shape=(), dtype=float32, numpy=0.107>,
 'track_energy_can': <tf.Tensor: shape=(), dtype=float32, numpy=0.00108>,
 'track_instrumentalness_can': <tf.Te

In [19]:
# Display the dataset repr (shows the element_spec of the parsed records).
parsed_candidate_dataset

<_OptionsDataset element_spec={'album_name_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'album_uri_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'artist_followers_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'artist_genres_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'artist_name_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'artist_pop_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'artist_uri_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'duration_ms_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_acousticness_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_danceability_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_energy_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_instrumentalness_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_key_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'track_liveness_can': TensorSpec(shape=

## Generate Embeddings

* use candidate_predictor to produce embeddings for each candidate item
* store embeddings in list
* zip candidate embeddings and candidate IDs together
* write json or csv file for ANN Index

In [26]:
# previously created embedding output
# !gsutil cp gs://jt-tfrs-central/pipe-dev-2tower-tfrs-jtv10/run-20221228-210041/candidates/candidate_embeddings.json candidate_embs_20221228_210041.json

### candidate embedding vectors

In [64]:
start_time = time.time()

# Number of candidate records sent through the candidate tower per call.
EMB_BATCH_SIZE = 10000

# Each element is a (track_uri batch, embedding batch) tuple, so ids and vectors stay
# aligned row-for-row within a batch.
embs_iter = parsed_candidate_dataset.batch(EMB_BATCH_SIZE).map(
    lambda data: (data["track_uri_can"], loaded_candidate_model(data))
)

# Materialize every batch in memory.
embs = list(embs_iter)

end_time = time.time()
# Elapsed wall time in whole minutes (truncated).
elapsed_time = int((end_time - start_time) / 60)
print(f"elapsed_time: {elapsed_time}")

print(f"Length of embs: {len(embs)}")

elapsed_time: 4
Length of embs: 225


In [65]:
# Number of (ids, embeddings) batches produced
len(embs)

225

Clean embedding output...

In [67]:
# Unpack the first batch: x = track URIs, y = the matching embedding matrix.
x,y = embs[0]

In [68]:
# Embedding tensor of the first batch
y

<tf.Tensor: shape=(10000, 128), dtype=float32, numpy=
array([[-8.7547451e-03,  4.2965940e-01, -1.0579184e+00, ...,
         1.6157944e+00,  4.3617558e-01, -2.7654117e-01],
       [-1.9227499e-02,  4.5463973e-01, -1.0684842e+00, ...,
         1.6337469e+00,  4.2655101e-01, -2.8603643e-01],
       [-9.2109451e-03,  4.4887713e-01, -1.0705423e+00, ...,
         1.6187896e+00,  4.5349970e-01, -2.6657170e-01],
       ...,
       [-1.3473387e-02,  4.3278840e-01, -1.0706969e+00, ...,
         1.6102661e+00,  4.4447958e-01, -2.7042493e-01],
       [-4.3208394e-03,  4.6292099e-01, -1.0493214e+00, ...,
         1.6673964e+00,  4.1343227e-01, -2.9339111e-01],
       [ 9.2331978e-04,  4.6530294e-01, -1.0548621e+00, ...,
         1.6409998e+00,  4.2484501e-01, -2.6843005e-01]], dtype=float32)>

In [79]:
start_time = time.time()

# Flatten the per-batch (ids, embeddings) tuples into two parallel, row-aligned lists:
# one numpy vector per candidate and one raw (bytes) track URI per candidate.
cleaned_embs, track_uris = [], []
for ids, embedding in embs:
    track_uris.extend(ids.numpy())
    cleaned_embs.extend(embedding.numpy())

end_time = time.time()
# Elapsed wall time in whole minutes (truncated).
elapsed_time = int((end_time - start_time) / 60)
print(f"elapsed_time: {elapsed_time}")

elapsed_time: 0


In [82]:
# Total number of flattened embeddings, followed by the first vector for inspection.
print(f"Length of cleaned_embs: {len(cleaned_embs)}")
cleaned_embs[0]

Length of cleaned_embs: 2243885


array([-0.00875475,  0.4296594 , -1.0579184 ,  0.33055162,  0.75509065,
       -1.2682275 ,  0.40074062,  1.2643234 ,  1.4407821 , -0.81765157,
        1.0556922 , -0.38197348, -0.50490004,  0.16666569, -0.8488423 ,
        0.6206088 ,  1.0948801 , -1.4514679 ,  1.6594619 ,  1.393434  ,
        1.3103865 ,  0.5871945 , -0.77169776, -0.77867335,  0.30685037,
       -1.091643  , -0.02446085, -0.44908383,  0.5722308 ,  1.1759523 ,
       -0.8426078 ,  0.14493963,  0.90350306, -0.5260758 , -0.45274788,
       -0.93923473,  0.75078434, -1.1178484 ,  0.5522108 ,  0.16904452,
       -0.47102752, -1.2308649 , -0.39195207,  1.0723114 , -0.15880573,
        0.36832523, -1.0887698 , -1.3357592 ,  0.8934939 , -0.79218334,
       -0.21843508,  0.17024872,  1.3688272 ,  0.8668576 , -0.1617419 ,
       -0.48149663,  0.46772346,  0.99565965, -0.7618725 ,  0.1475042 ,
       -0.6342002 ,  0.33067462,  0.15164517, -1.3497765 , -1.3240782 ,
        0.76521456, -1.2219937 , -0.7931432 ,  0.97855496,  0.74

### candidate IDs

In [83]:
# clean product IDs
# track_uris = [ids.numpy() for ids , _ in embs]

# print(f"Length of track_uris: {len(track_uris)}")

In [84]:
# The URIs come out of the dataset as bytes; decode each one to a UTF-8 string.
# (Replaces the earlier approach of stripping b'...' from the repr.)
track_uris_decoded = [raw_uri.decode("utf-8") for raw_uri in track_uris]

print(f"Length of track_uris_decoded: {len(track_uris_decoded)}")

# Show the first decoded URI
track_uris_decoded[0]

Length of track_uris_decoded: 2243885


'spotify:track:4yHc5LZQBLi0H3PpvcM0S8'

In [85]:
# Raw and decoded URI lists should have the same length (one entry per candidate).
print(f"Length of track_uris: {len(track_uris)}")
# NOTE(review): the label below says `track_uris_cleaned` but the value is
# len(track_uris_decoded) — rename the label when this cell is next re-run.
print(f"Length of track_uris_cleaned: {len(track_uris_decoded)}")

Length of track_uris: 2243885
Length of track_uris_cleaned: 2243885


In [86]:
# Length of track_uris_cleaned: 2243885


### Check for bad records

In [87]:
# First embedding vector, shown again before the NaN scan
cleaned_embs[0]

array([-0.00875475,  0.4296594 , -1.0579184 ,  0.33055162,  0.75509065,
       -1.2682275 ,  0.40074062,  1.2643234 ,  1.4407821 , -0.81765157,
        1.0556922 , -0.38197348, -0.50490004,  0.16666569, -0.8488423 ,
        0.6206088 ,  1.0948801 , -1.4514679 ,  1.6594619 ,  1.393434  ,
        1.3103865 ,  0.5871945 , -0.77169776, -0.77867335,  0.30685037,
       -1.091643  , -0.02446085, -0.44908383,  0.5722308 ,  1.1759523 ,
       -0.8426078 ,  0.14493963,  0.90350306, -0.5260758 , -0.45274788,
       -0.93923473,  0.75078434, -1.1178484 ,  0.5522108 ,  0.16904452,
       -0.47102752, -1.2308649 , -0.39195207,  1.0723114 , -0.15880573,
        0.36832523, -1.0887698 , -1.3357592 ,  0.8934939 , -0.79218334,
       -0.21843508,  0.17024872,  1.3688272 ,  0.8668576 , -0.1617419 ,
       -0.48149663,  0.46772346,  0.99565965, -0.7618725 ,  0.1475042 ,
       -0.6342002 ,  0.33067462,  0.15164517, -1.3497765 , -1.3240782 ,
        0.76521456, -1.2219937 , -0.7931432 ,  0.97855496,  0.74

In [88]:
# Scan every embedding for NaN components.
#   bad_records       - row index repeated once per NaN component found in that row
#   bad_record_filter - sorted, de-duplicated row indices that contain at least one NaN
bad_records = []

for i, emb in enumerate(cleaned_embs):
    # One vectorized NaN count per row instead of a Python loop over every component.
    nan_count = int(np.count_nonzero(np.isnan(emb)))
    if nan_count:
        bad_records.extend([i] * nan_count)

bad_record_filter = np.unique(bad_records)

print(f"bad_records: {len(bad_records)}")
print(f"bad_record_filter: {len(bad_record_filter)}")

bad_records: 0
bad_record_filter: 0


In [89]:
# bad_record_filter[0]

In [90]:
# Keep only the (uri, embedding) pairs whose row index was not flagged as bad.
# The flagged indices are copied into a set once so each membership test is O(1);
# testing `i in <ndarray>` rescans the whole array for every row.
bad_row_ids = set(np.asarray(bad_record_filter).tolist())

track_uris_valid = []
emb_valid = []

for i, (t_uri, embed) in enumerate(zip(track_uris_decoded, cleaned_embs)):
    if i in bad_row_ids:
        continue
    track_uris_valid.append(t_uri)
    emb_valid.append(embed)

In [91]:
# First embedding that survived the bad-record filter
emb_valid[0]

array([-0.00875475,  0.4296594 , -1.0579184 ,  0.33055162,  0.75509065,
       -1.2682275 ,  0.40074062,  1.2643234 ,  1.4407821 , -0.81765157,
        1.0556922 , -0.38197348, -0.50490004,  0.16666569, -0.8488423 ,
        0.6206088 ,  1.0948801 , -1.4514679 ,  1.6594619 ,  1.393434  ,
        1.3103865 ,  0.5871945 , -0.77169776, -0.77867335,  0.30685037,
       -1.091643  , -0.02446085, -0.44908383,  0.5722308 ,  1.1759523 ,
       -0.8426078 ,  0.14493963,  0.90350306, -0.5260758 , -0.45274788,
       -0.93923473,  0.75078434, -1.1178484 ,  0.5522108 ,  0.16904452,
       -0.47102752, -1.2308649 , -0.39195207,  1.0723114 , -0.15880573,
        0.36832523, -1.0887698 , -1.3357592 ,  0.8934939 , -0.79218334,
       -0.21843508,  0.17024872,  1.3688272 ,  0.8668576 , -0.1617419 ,
       -0.48149663,  0.46772346,  0.99565965, -0.7618725 ,  0.1475042 ,
       -0.6342002 ,  0.33067462,  0.15164517, -1.3497765 , -1.3240782 ,
        0.76521456, -1.2219937 , -0.7931432 ,  0.97855496,  0.74

In [92]:
# Number of embeddings kept after filtering
len(emb_valid)

2243885

In [93]:
# First track URI that survived the bad-record filter
track_uris_valid[0]

'spotify:track:4yHc5LZQBLi0H3PpvcM0S8'

In [94]:
# Number of track URIs kept after filtering (should equal len(emb_valid))
len(track_uris_valid)

2243885

### tmp - dealing with bad track uris

## Write embedding vectors to json file

In [95]:
# NOTE: this rebinds VERSION (previously the resource-naming version) to tag the
# local export; cells below use the new value in the GCS path.
VERSION = 'local'
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")

embeddings_index_filename = f'candidate_embs_{VERSION}_{TIMESTAMP}.json'

# Write one JSON object per line: {"id": <track uri>, "embedding": [<floats>]}.
with open(embeddings_index_filename, 'w') as f:
    for prod, emb in zip(track_uris_valid, emb_valid):
        # json.dumps quotes and escapes the id, so a quote or backslash in it cannot
        # produce an invalid record; plain ASCII ids are written exactly as before.
        id_json = json.dumps(str(prod))
        # str() of each component keeps the short float32 text form.
        vector_json = ",".join(str(x) for x in emb)
        f.write('{"id":' + id_json + ',"embedding":[' + vector_json + "]}\n")

## Upload json to GCS

In [96]:
# Destination folder in GCS for the exported embeddings of this run
INDEX_GCS_URI = f'gs://{BUCKET_NAME}/{PATH_TO_INDEX_DIR}/candidates-index-{VERSION}'
print(f"INDEX_GCS_URI: {INDEX_GCS_URI}")

# The uploaded blob keeps the name of the local file.
SOURCE_FILE_NAME = DESTINATION_BLOB_NAME = embeddings_index_filename

print(f"DESTINATION_BLOB_NAME: {DESTINATION_BLOB_NAME}")
print(f"SOURCE_FILE_NAME: {SOURCE_FILE_NAME}")

INDEX_GCS_URI: gs://ndr-v1-cpg-cdp-bucket/local-train-v2/run-20230925-234505/candidates-index-local
DESTINATION_BLOB_NAME: candidate_embs_local_20230926-021921.json
SOURCE_FILE_NAME: candidate_embs_local_20230926-021921.json


In [97]:
# Upload the local JSON file to <INDEX_GCS_URI>/<DESTINATION_BLOB_NAME>.
# GCS object paths always use "/", so the URI is joined explicitly rather than with
# os.path.join (which follows the local OS convention). The client is supplied through
# the `client` argument rather than by assigning the private `bucket._client` attribute.
blob = Blob.from_string(f"{INDEX_GCS_URI}/{DESTINATION_BLOB_NAME}", client=storage_client)
blob.upload_from_filename(SOURCE_FILE_NAME)

# Inspect track_uris

* id in the track_uri should be 22 characters (total of 36 characters including `spotify:track:`)
* some track_uris have an id that is 21 characters long
* these are not present in the source data (BigQuery)

In [98]:
# Summary of the exported URIs: how many there are, an example, and the example's length.
# (The bare `len(...)` expression that used to open this cell displayed nothing, since it
# was not the last expression, and has been dropped.)
print(f"count of track_uris_valid: {len(track_uris_valid)}\n")
print(f"ex: track_uris_valid[0]: {track_uris_valid[0]}\n")
print(f"length of a track_uris_valid: {len(track_uris_valid[0])}\n")

count of track_uris_valid: 2243885

ex: track_uris_valid[0]: spotify:track:4yHc5LZQBLi0H3PpvcM0S8

length of a track_uris_valid: 36



In [99]:
# Bucket every URI by its length relative to the expected 36 characters
# ("spotify:track:" plus a 22-character id).
short, normal, long = [], [], []

for track_id in track_uris_valid:
    uri_len = len(track_id)
    if uri_len < 36:
        short.append(track_id)
    elif uri_len > 36:
        long.append(track_id)
    else:
        normal.append(track_id)

print(f"short: {len(short)}")
print(f"normal: {len(normal)}")
print(f"long: {len(long)}")

short: 0
normal: 2243885
long: 0


**Finished**