# Candidate Generation

After Two-Tower training, the `candidate_tower` is used to convert all candidate items into embeddings.

The embeddings are indexed and deployed to an endpoint for serving.

Steps performed in this notebook:

* `TODO`

In [2]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1


In [4]:
import json
import numpy as np
import pickle as pkl
from pprint import pprint
import time

import os

import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_io as tfio

from google.cloud import storage
from google.cloud.storage.bucket import Bucket
from google.cloud.storage.blob import Blob

import google.cloud.aiplatform as vertex_ai

In [5]:
# gs://jt-tfrs-central/pipe-dev-2tower-tfrs-jtv10/run-20221228-210041/model-dir/candidate_model/saved_model.pb

BUCKET = 'jt-tfrs-central' # -v2
BUCKET_URI = f'gs://{BUCKET}'

# stable
# CANDIDATE_MODEL_GCS_PATH = 'pipe-dev-2tower-tfrs-jtv10/run-20221228-210041/model-dir/candidate_model'
# CANDIDATE_MODEL_GCS_PATH = 'a50-epoch/run-20221230-160518/model-dir/candidate_model'

# experimental paths
# PATH_TO_INDEX_DIR = 'eval-jtv14-full-v6-jtv14/run-20230113-192032' # 30e-8m
# EXPERIMENT_TAG = '30e-jtv14-v6-8m'

PATH_TO_INDEX_DIR = 'new-50e-full-jtv12/run-20230110-150417' # 50e-65m
EXPERIMENT_TAG = 'demo-50e-jtv12-65m'


# full gcs path
CANDIDATE_MODEL_GCS_PATH = f'{PATH_TO_INDEX_DIR}/model-dir/candidate_model'
CANDIDATE_MODEL_DIR = f'{BUCKET_URI}/{CANDIDATE_MODEL_GCS_PATH}'

print(f"CANDIDATE_MODEL_DIR: {CANDIDATE_MODEL_DIR}")

CANDIDATE_MODEL_DIR: gs://jt-tfrs-central/new-50e-full-jtv12/run-20230110-150417/model-dir/candidate_model


In [6]:
! gsutil ls $CANDIDATE_MODEL_DIR

gs://jt-tfrs-central/new-50e-full-jtv12/run-20230110-150417/model-dir/candidate_model/
gs://jt-tfrs-central/new-50e-full-jtv12/run-20230110-150417/model-dir/candidate_model/saved_model.pb
gs://jt-tfrs-central/new-50e-full-jtv12/run-20230110-150417/model-dir/candidate_model/assets/
gs://jt-tfrs-central/new-50e-full-jtv12/run-20230110-150417/model-dir/candidate_model/variables/


## Load Candidate `SavedModel`

In [7]:
candidate_tower_uri = f'{CANDIDATE_MODEL_DIR}' # vertex trained

loaded_candidate_model = tf.saved_model.load(candidate_tower_uri)

loaded_candidate_model.signatures

2023-01-25 19:01:16.126361: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-25 19:01:16.789738: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38224 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0


_SignatureMap({'serving_default': <ConcreteFunction signature_wrapper(*, artist_followers_can, duration_ms_can, track_mode_can, track_key_can, album_name_can, track_acousticness_can, track_danceability_can, track_name_can, track_uri_can, artist_pop_can, track_valence_can, track_tempo_can, time_signature_can, artist_genres_can, album_uri_can, track_speechiness_can, track_liveness_can, track_pop_can, track_energy_can, artist_uri_can, artist_name_can, track_instrumentalness_can, track_loudness_can) at 0x7FA10C291F90>})

In [8]:
print(list(loaded_candidate_model.signatures.keys()))

['serving_default']


In [9]:
candidate_predictor = loaded_candidate_model.signatures["serving_default"]
print(candidate_predictor.structured_outputs)

{'output_1': TensorSpec(shape=(None, 32), dtype=tf.float32, name='output_1')}


In [10]:
candidate_predictor.output_shapes

{'output_1': TensorShape([None, 32])}

## Candidate Dataset

### helper functions

In [11]:
storage_client = storage.Client(project=PROJECT_ID)

options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

In [12]:
candidate_features = {
    "track_uri_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),            
    "track_name_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    "artist_uri_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    "artist_name_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    "album_uri_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),           
    "album_name_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()), 
    "duration_ms_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),      
    "track_pop_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),      
    "artist_pop_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    "artist_genres_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    "artist_followers_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    # new
    # "track_pl_titles_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    "track_danceability_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    "track_energy_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    "track_key_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    "track_loudness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    "track_mode_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
    "track_speechiness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    "track_acousticness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    "track_instrumentalness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    "track_liveness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    "track_valence_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    "track_tempo_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
    "time_signature_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
}

In [13]:
def parse_candidate_tfrecord_fn(example):
    """
    Reads candidate serialized examples from gcs and converts to tfrecord
    """
    # example = tf.io.parse_single_example(
    example = tf.io.parse_example(
        example, 
        features=candidate_features
    )
    return example

def full_parse(data):
    # used for interleave - takes tensors and returns a tf.dataset
    data = tf.data.TFRecordDataset(data)
    return data

## Candidate Records

In [14]:
CANDIDATE_FILE_DIR = 'spotify-data-regimes'
CANDIDATE_PREFIX = 'jtv14-8m/candidates' # jtv10 | jtv14-8m

# SAMPLE_FILES = [
#     "gs://spotify-tfrecords-blog/tfrecords_v1/train/output-00000-of-00796.tfrecord",
#     "gs://spotify-tfrecords-blog/tfrecords_v1/train/output-00002-of-00796.tfrecord"
# ]

In [15]:
candidate_files = []
for blob in storage_client.list_blobs(f"{CANDIDATE_FILE_DIR}", prefix=f'{CANDIDATE_PREFIX}'):
    if '.tfrecords' in blob.name:
        candidate_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))

candidate_dataset = tf.data.Dataset.from_tensor_slices(candidate_files)

parsed_candidate_dataset = candidate_dataset.interleave(
    # lambda x: tf.data.TFRecordDataset(x),
    full_parse,
    cycle_length=tf.data.AUTOTUNE, 
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False
).map(parse_candidate_tfrecord_fn, num_parallel_calls=tf.data.AUTOTUNE).with_options(options)

parsed_candidate_dataset = parsed_candidate_dataset.cache() #400 MB on machine mem

In [16]:
for features in parsed_candidate_dataset.take(1):
    pprint(features)
    print("_______________")

{'album_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'4U'>,
 'album_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:album:4U91uJBtdsedXEuRMjgZRP'>,
 'artist_followers_can': <tf.Tensor: shape=(), dtype=float32, numpy=28450.0>,
 'artist_genres_can': <tf.Tensor: shape=(), dtype=string, numpy=b"'edm', 'pop dance', 'progressive house', 'progressive trance', 'trance', 'uplifting trance'">,
 'artist_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'Maor Levi'>,
 'artist_pop_can': <tf.Tensor: shape=(), dtype=float32, numpy=49.0>,
 'artist_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:artist:7iVuXpgNEl87BwdwV1L6he'>,
 'duration_ms_can': <tf.Tensor: shape=(), dtype=float32, numpy=420000.0>,
 'time_signature_can': <tf.Tensor: shape=(), dtype=string, numpy=b'4'>,
 'track_acousticness_can': <tf.Tensor: shape=(), dtype=float32, numpy=0.000864>,
 'track_danceability_can': <tf.Tensor: shape=(), dtype=float32, numpy=0.615>,
 'track_energy_can': <tf.Tensor: sh

2023-01-25 19:01:49.277958: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [17]:
parsed_candidate_dataset

<CacheDataset element_spec={'album_name_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'album_uri_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'artist_followers_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'artist_genres_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'artist_name_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'artist_pop_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'artist_uri_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'duration_ms_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'time_signature_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'track_acousticness_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_danceability_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_energy_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_instrumentalness_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_key_can': TensorSpec(shape=(),

In [32]:
# raw_dataset = tf.data.TFRecordDataset(candidate_files)

In [35]:
# parsed_candidate_dataset_v1 = raw_dataset.map(parse_candidate_tfrecord_fn)

# for features in parsed_candidate_dataset_v1.take(1):
#     pprint(features)
#     print("_______________")

{'album_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'4U'>,
 'album_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:album:4U91uJBtdsedXEuRMjgZRP'>,
 'artist_followers_can': <tf.Tensor: shape=(), dtype=float32, numpy=28450.0>,
 'artist_genres_can': <tf.Tensor: shape=(), dtype=string, numpy=b"'edm', 'pop dance', 'progressive house', 'progressive trance', 'trance', 'uplifting trance'">,
 'artist_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'Maor Levi'>,
 'artist_pop_can': <tf.Tensor: shape=(), dtype=float32, numpy=49.0>,
 'artist_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:artist:7iVuXpgNEl87BwdwV1L6he'>,
 'duration_ms_can': <tf.Tensor: shape=(), dtype=float32, numpy=420000.0>,
 'time_signature_can': <tf.Tensor: shape=(), dtype=string, numpy=b'4'>,
 'track_acousticness_can': <tf.Tensor: shape=(), dtype=float32, numpy=0.000864>,
 'track_danceability_can': <tf.Tensor: shape=(), dtype=float32, numpy=0.615>,
 'track_energy_can': <tf.Tensor: sh

## Generate Embeddings

* use candidate_predictor to produce embeddings for each candidate item
* store embeddings in list
* zip candidate embeddings and candidate IDs together
* write json or csv file for ANN Index

In [20]:
# previously created embedding output
# !gsutil cp gs://jt-tfrs-central/pipe-dev-2tower-tfrs-jtv10/run-20221228-210041/candidates/candidate_embeddings.json candidate_embs_20221228_210041.json

Copying gs://jt-tfrs-central/pipe-dev-2tower-tfrs-jtv10/run-20221228-210041/candidates/candidate_embeddings.json...
- [1 files][882.4 MiB/882.4 MiB]   68.6 MiB/s                                   
Operation completed over 1 objects/882.4 MiB.                                    


### candidate embedding vectors

In [36]:
# start_time = time.time()

# embs_iter = parsed_candidate_dataset_v1.batch(1000).map(
#     lambda data: candidate_predictor(
#         track_uri_can = data["track_uri_can"],
#         track_name_can = data['track_name_can'],
#         artist_uri_can = data['artist_uri_can'],
#         artist_name_can = data['artist_name_can'],
#         album_uri_can = data['album_uri_can'],
#         album_name_can = data['album_name_can'],
#         duration_ms_can = data['duration_ms_can'],
#         track_pop_can = data['track_pop_can'],
#         artist_pop_can = data['artist_pop_can'],
#         artist_genres_can = data['artist_genres_can'],
#         artist_followers_can = data['artist_followers_can'],
#         track_danceability_can = data['track_danceability_can'],
#         track_energy_can = data['track_energy_can'],
#         track_key_can = data['track_key_can'],
#         track_loudness_can = data['track_loudness_can'],
#         track_mode_can = data['track_mode_can'],
#         track_speechiness_can = data['track_speechiness_can'],
#         track_acousticness_can = data['track_acousticness_can'],
#         track_instrumentalness_can = data['track_instrumentalness_can'],
#         track_liveness_can = data['track_liveness_can'],
#         track_valence_can = data['track_valence_can'],
#         track_tempo_can = data['track_tempo_can'],
#         time_signature_can = data['time_signature_can']
#     )
# )

# embs = []
# for emb in embs_iter:
#     embs.append(emb)

# end_time = time.time()
# elapsed_time = int((end_time - start_time) / 60)
# print(f"elapsed_time: {elapsed_time}")

# print(f"Length of embs: {len(embs)}")
# embs[0]

elapsed_time: 1
Length of embs: 2263


{'output_1': <tf.Tensor: shape=(1000, 32), dtype=float32, numpy=
 array([[-2.1645777 , -0.02569133,  1.1920751 , ...,  0.86502624,
          0.8500546 , -0.95461535],
        [-2.4586706 ,  0.18307656,  1.631067  , ...,  0.8345346 ,
          1.6621102 ,  0.6864418 ],
        [-0.40462023,  0.20384306,  1.3977935 , ...,  0.48818517,
          1.202213  , -0.55006284],
        ...,
        [ 1.4562918 ,  0.09062123, -1.6182501 , ..., -0.15559816,
         -1.3291026 ,  0.05066007],
        [ 0.8157986 , -0.8581199 , -1.3720565 , ...,  0.43869933,
          0.01423812,  0.78625923],
        [ 1.8851609 , -0.46664304, -0.84363604, ...,  0.15309703,
         -0.84201324, -0.55973303]], dtype=float32)>}

In [18]:
start_time = time.time()

embs_iter = parsed_candidate_dataset.batch(1).map(
    lambda data: candidate_predictor(
        track_uri_can = data["track_uri_can"],
        track_name_can = data['track_name_can'],
        artist_uri_can = data['artist_uri_can'],
        artist_name_can = data['artist_name_can'],
        album_uri_can = data['album_uri_can'],
        album_name_can = data['album_name_can'],
        duration_ms_can = data['duration_ms_can'],
        track_pop_can = data['track_pop_can'],
        artist_pop_can = data['artist_pop_can'],
        artist_genres_can = data['artist_genres_can'],
        artist_followers_can = data['artist_followers_can'],
        track_danceability_can = data['track_danceability_can'],
        track_energy_can = data['track_energy_can'],
        track_key_can = data['track_key_can'],
        track_loudness_can = data['track_loudness_can'],
        track_mode_can = data['track_mode_can'],
        track_speechiness_can = data['track_speechiness_can'],
        track_acousticness_can = data['track_acousticness_can'],
        track_instrumentalness_can = data['track_instrumentalness_can'],
        track_liveness_can = data['track_liveness_can'],
        track_valence_can = data['track_valence_can'],
        track_tempo_can = data['track_tempo_can'],
        time_signature_can = data['time_signature_can']
    )
)

embs = []
for emb in embs_iter:
    embs.append(emb)

end_time = time.time()
elapsed_time = int((end_time - start_time) / 60)
print(f"elapsed_time: {elapsed_time}")

print(f"Length of embs: {len(embs)}")
embs[0]

elapsed_time: 2
Length of embs: 2263


{'output_1': <tf.Tensor: shape=(1000, 32), dtype=float32, numpy=
 array([[-18.286968,  17.23571 , -19.978462, ...,  18.750835, -15.788128,
         -15.501454],
        [-18.57132 ,  14.503975, -20.06953 , ...,  20.857546, -17.366013,
         -13.32492 ],
        [-19.951595,  18.043613, -19.234167, ...,  20.239983, -16.306307,
         -13.712457],
        ...,
        [-21.273186,  17.025822, -20.385431, ...,  20.012783, -15.652888,
         -15.942813],
        [-20.724955,  16.600971, -20.99828 , ...,  18.18766 , -15.127804,
         -15.904375],
        [-21.274338,  17.190042, -20.05886 , ...,  18.788134, -16.438744,
         -16.168468]], dtype=float32)>}

In [22]:
len(embs[0])

{'output_1': <tf.Tensor: shape=(1000, 32), dtype=float32, numpy=
 array([[-22.484255,  16.76609 , -20.048857, ...,  18.939014, -15.826862,
         -14.891063],
        [-21.822483,  16.105349, -21.608675, ...,  18.199999, -15.427704,
         -18.101727],
        [-22.031239,  16.899841, -19.461222, ...,  20.720133, -15.775556,
         -15.618858],
        ...,
        [-22.87466 ,  16.337912, -19.301922, ...,  22.696274, -15.440932,
         -15.426448],
        [-23.492395,  15.230312, -20.434055, ...,  21.689314, -14.461754,
         -15.414692],
        [-23.185263,  15.206906, -21.171255, ...,  22.503517, -14.496807,
         -15.805237]], dtype=float32)>}

Clean embedding output...

In [50]:
start_time = time.time()

cleaned_embs = [x['output_1'].numpy()[0] for x in embs] #clean up the output

end_time = time.time()
elapsed_time = int((end_time - start_time) / 60)
print(f"elapsed_time: {elapsed_time}")

elapsed_time: 0


In [51]:
print(f"Length of cleaned_embs: {len(cleaned_embs)}")
cleaned_embs[0]

Length of cleaned_embs: 2262292


array([-18.544695 ,  17.253334 , -20.327787 ,  22.050714 , -18.613064 ,
       -19.057636 , -22.897966 , -18.084929 ,  19.14662  ,  18.770685 ,
        14.877865 , -18.66579  , -15.141766 , -16.15653  ,  21.366915 ,
       -18.90796  ,  16.964298 ,  22.686796 , -20.013926 ,  15.816182 ,
       -18.804203 , -22.604815 ,  19.63804  ,   0.8306754, -19.153149 ,
        19.5806   ,  18.624025 ,  18.50726  ,  19.357174 ,  18.676226 ,
       -16.975777 , -13.846929 ], dtype=float32)

### candidate IDs

In [52]:
# clean product IDs
track_uris = [x['track_uri_can'].numpy() for x in parsed_candidate_dataset]

print(f"Length of track_uris: {len(track_uris)}")

track_uris[0]

Length of track_uris: 2262292


b'spotify:track:3EK4rJ1JAcqpbNN2xG5hhR'

In [53]:
# track_uris_cleaned = [str(z).replace("b'","").replace("'","") for z in track_uris]
track_uris_decoded = [z.decode("utf-8") for z in track_uris]

print(f"Length of track_uris_decoded: {len(track_uris_decoded)}")

track_uris_decoded[0]

Length of track_uris_decoded: 2262292


'spotify:track:3EK4rJ1JAcqpbNN2xG5hhR'

In [54]:
print(f"Length of track_uris: {len(track_uris)}")
print(f"Length of track_uris_cleaned: {len(track_uris_decoded)}")

Length of track_uris: 2262292
Length of track_uris_cleaned: 2262292


### Check for bad records

In [55]:
cleaned_embs[0]

array([-18.544695 ,  17.253334 , -20.327787 ,  22.050714 , -18.613064 ,
       -19.057636 , -22.897966 , -18.084929 ,  19.14662  ,  18.770685 ,
        14.877865 , -18.66579  , -15.141766 , -16.15653  ,  21.366915 ,
       -18.90796  ,  16.964298 ,  22.686796 , -20.013926 ,  15.816182 ,
       -18.804203 , -22.604815 ,  19.63804  ,   0.8306754, -19.153149 ,
        19.5806   ,  18.624025 ,  18.50726  ,  19.357174 ,  18.676226 ,
       -16.975777 , -13.846929 ], dtype=float32)

In [56]:
bad_records = []

for i, emb in enumerate(cleaned_embs):
    bool_emb = np.isnan(emb)
    for val in bool_emb:
        if val:
            bad_records.append(i)
            
bad_record_filter = np.unique(bad_records)

print(f"bad_records: {len(bad_records)}")
print(f"bad_record_filter: {len(bad_record_filter)}")

bad_records: 0
bad_record_filter: 0


In [57]:
# bad_record_filter[0]

In [58]:
track_uris_valid = []
emb_valid = []

for i, pair in enumerate(zip(track_uris_decoded, cleaned_embs)):
    if i in bad_record_filter:
        pass
    else:
        t_uri, embed = pair
        track_uris_valid.append(t_uri)
        emb_valid.append(embed)

In [59]:
emb_valid[0]

array([-18.544695 ,  17.253334 , -20.327787 ,  22.050714 , -18.613064 ,
       -19.057636 , -22.897966 , -18.084929 ,  19.14662  ,  18.770685 ,
        14.877865 , -18.66579  , -15.141766 , -16.15653  ,  21.366915 ,
       -18.90796  ,  16.964298 ,  22.686796 , -20.013926 ,  15.816182 ,
       -18.804203 , -22.604815 ,  19.63804  ,   0.8306754, -19.153149 ,
        19.5806   ,  18.624025 ,  18.50726  ,  19.357174 ,  18.676226 ,
       -16.975777 , -13.846929 ], dtype=float32)

In [60]:
len(emb_valid)

2262292

In [61]:
track_uris_valid[0]

'spotify:track:3EK4rJ1JAcqpbNN2xG5hhR'

In [62]:
len(track_uris_valid)

2262292

### tmp - dealing with bad track uris

## Write embedding vectors to json file

In [63]:
VERSION = 'local_50e_small'
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")

embeddings_index_filename = f'candidate_embs_{VERSION}_{TIMESTAMP}.json'

with open(f'{embeddings_index_filename}', 'w') as f:
    for prod, emb in zip(track_uris_valid, emb_valid):
        f.write('{"id":"' + str(prod) + '",')
        f.write('"embedding":[' + ",".join(str(x) for x in list(emb)) + "]}")
        f.write("\n")

## Upload json to GCS

In [64]:
# jt-tfrs-central/pipe-dev-2tower-tfrs-jtv10/run-20221228-210041

# BUCKET = 'jt-tfrs-central'
# PATH_TO_INDEX_DIR = 'a50-epoch/run-20221230-160518'
INDEX_GCS_URI = f'gs://{BUCKET}/{PATH_TO_INDEX_DIR}/candidates-index-small-demo'

print(f"INDEX_GCS_URI: {INDEX_GCS_URI}")

DESTINATION_BLOB_NAME = embeddings_index_filename
SOURCE_FILE_NAME = embeddings_index_filename

print(f"DESTINATION_BLOB_NAME: {DESTINATION_BLOB_NAME}")
print(f"SOURCE_FILE_NAME: {SOURCE_FILE_NAME}")

INDEX_GCS_URI: gs://jt-tfrs-central/new-50e-full-jtv12/run-20230110-150417/candidates-index-local
DESTINATION_BLOB_NAME: candidate_embs_local_v1_50e_20230116-154747.json
SOURCE_FILE_NAME: candidate_embs_local_v1_50e_20230116-154747.json


In [65]:
blob = Blob.from_string(os.path.join(INDEX_GCS_URI, DESTINATION_BLOB_NAME))
blob.bucket._client = storage_client
blob.upload_from_filename(SOURCE_FILE_NAME)

# Inspect track_uris

* id in the track_uri should be 22 characters (total of 36 characters including `spotify:track:`)
* some track_uris have an id that is 21 characters long
* these are not present in the source data (BigQuery)

In [124]:
len(track_uris_valid)

print(f"count of track_uris_valid: {len(track_uris_valid)}\n")
print(f"ex: track_uris_valid[0]: {track_uris_valid[0]}\n")
print(f"length of a track_uris_valid: {len(track_uris_valid[0])}\n")

count of track_uris_valid: 2262292

ex: track_uris_valid[0]: spotify:track:3EK4rJ1JAcqpbNN2xG5hhR

length of a track_uris_valid: 36



In [125]:
short = []
normal = []
long = []

for track_id in track_uris_valid:
    if len(track_id)==36:
        normal.append(track_id)
    if len(track_id)<36:
        short.append(track_id)
    if len(track_id)>36:
        long.append(track_id)
        
print(f"short: {len(short)}")
print(f"normal: {len(normal)}")
print(f"long: {len(long)}")

short: 0
normal: 2262292
long: 0
