# Candidate Generation

After Two-Tower training, the `candidate_tower` is used to convert all candidate items into embeddings.

The embeddings are indexed and deployed to an index endpoint for serving.

Steps performed in this notebook:

1. Create a test dataset to send to the query endpoint 
2. Submit the `endpoint.predict()` calls to get the query vector representation
3. Inspect the records and familiarize, check for data quality


## Load env config

In [1]:
# naming convention for all cloud resources
VERSION        = "v1"                  # TODO
PREFIX         = f'ndr-{VERSION}'      # TODO

print(f"PREFIX = {PREFIX}")

PREFIX = ndr-v1


In [2]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "ndr-v1"
VERSION                  = "v1"

APP                      = "sp"
MODEL_TYPE               = "2tower"
FRAMEWORK                = "tfrs"
DATA_VERSION             = "v1"
TRACK_HISTORY            = "5"

BUCKET_NAME              = "ndr-v1-hybrid-vertex-bucket"
BUCKET_URI               = "gs://ndr-v1-hybrid-vertex-bucket"
SOURCE_BUCKET            = "spotify-million-playlist-dataset"

DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://ndr-v1-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

CANDIDATE_PREFIX         = "candidates"
TRAIN_DIR_PREFIX      

### imports

In [3]:
import json
import numpy as np
import pickle as pkl
from pprint import pprint
import time

import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# GPU
import gc
from numba import cuda

import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_io as tfio

from google.cloud import storage
from google.cloud.storage.bucket import Bucket
from google.cloud.storage.blob import Blob

import google.cloud.aiplatform as vertex_ai

from src.two_tower_jt import two_tower as tt
from src.two_tower_jt import train_utils
from src.two_tower_jt import feature_sets

In [4]:
# gpus = tf.config.experimental.list_physical_devices('GPU')
# for gpu in gpus:
#     tf.config.experimental.set_memory_growth(gpu, True)
    
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

In [None]:
device = cuda.get_current_device()
device.reset()
gc.collect()

## Embedding index path

In [6]:
# TODO - grab from saved candiate_embedding.json URI
# local-train-v1/run-20230919-135222/candidates/candidate_embeddings.json

EXPERIMENT_NAME   = "local-train-v1"      # TODO
RUN_NAME          = "run-20230919-135222" # TODO

PATH_TO_INDEX_DIR = f'{EXPERIMENT_NAME}/{RUN_NAME}'
print(f"PATH_TO_INDEX_DIR: {PATH_TO_INDEX_DIR}")

PATH_TO_INDEX_DIR: local-train-v1/run-20230919-135222


In [14]:
! gsutil ls $BUCKET_URI/$EXPERIMENT_NAME/$RUN_NAME

gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-135222/
gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-135222/candidate_model/
gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-135222/candidates/
gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-135222/query_model/
gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-135222/tb-logs/


In [15]:
# full gcs path
CANDIDATE_MODEL_GCS_PATH = f'{PATH_TO_INDEX_DIR}/model-dir/candidate_model'
# CANDIDATE_MODEL_GCS_PATH = f'{PATH_TO_INDEX_DIR}/candidate_model' # tmp - TODO

CANDIDATE_MODEL_DIR = f'{BUCKET_URI}/{CANDIDATE_MODEL_GCS_PATH}'
print(f"CANDIDATE_MODEL_DIR: {CANDIDATE_MODEL_DIR}")

CANDIDATE_MODEL_DIR: gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-135222/candidate_model


In [16]:
! gsutil ls $CANDIDATE_MODEL_DIR

gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-135222/candidate_model/
gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-135222/candidate_model/fingerprint.pb
gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-135222/candidate_model/saved_model.pb
gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-135222/candidate_model/assets/
gs://ndr-v1-hybrid-vertex-bucket/local-train-v1/run-20230919-135222/candidate_model/variables/


## Load Candidate `SavedModel`

In [17]:
candidate_tower_uri = f'{CANDIDATE_MODEL_DIR}' # vertex trained
loaded_candidate_model = tf.saved_model.load(candidate_tower_uri)

loaded_candidate_model.signatures

_SignatureMap({'serving_default': <ConcreteFunction signature_wrapper(*, artist_genres_can, track_uri_can, track_time_signature_can, artist_name_can, track_valence_can, track_loudness_can, track_tempo_can, track_liveness_can, album_uri_can, track_name_can, track_instrumentalness_can, duration_ms_can, artist_followers_can, album_name_can, track_mode_can, track_speechiness_can, artist_uri_can, track_key_can, track_pop_can, track_acousticness_can, track_energy_can, track_danceability_can, artist_pop_can) at 0x7FD987B08510>})

In [18]:
print(list(loaded_candidate_model.signatures.keys()))

['serving_default']


In [19]:
candidate_predictor = loaded_candidate_model.signatures["serving_default"]
print(candidate_predictor.structured_outputs)

{'output_1': TensorSpec(shape=(None, 128), dtype=tf.float32, name='output_1')}


In [20]:
candidate_predictor.output_shapes

{'output_1': TensorShape([None, 128])}

## Candidate Dataset

### helper functions

In [21]:
storage_client = storage.Client(project=PROJECT_ID)

options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

In [22]:
candidate_features = feature_sets.get_candidate_features()
candidate_features

{'track_uri_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'track_name_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'artist_uri_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'artist_name_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'album_uri_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'album_name_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'duration_ms_can': FixedLenFeature(shape=(), dtype=tf.float32, default_value=None),
 'track_pop_can': FixedLenFeature(shape=(), dtype=tf.float32, default_value=None),
 'artist_pop_can': FixedLenFeature(shape=(), dtype=tf.float32, default_value=None),
 'artist_genres_can': FixedLenFeature(shape=(), dtype=tf.string, default_value=None),
 'artist_followers_can': FixedLenFeature(shape=(), dtype=tf.float32, default_value=None),
 'track_danceability_can': FixedLenFeature(shape=(), dtype=tf.float32, defa

## Candidate Records

In [23]:
CANDIDATE_PREFIX = f'data/{DATA_VERSION}/candidates' 

In [24]:
candidate_files = []

for blob in storage_client.list_blobs(f"{BUCKET_NAME}", prefix=f'data/{DATA_VERSION}/{CANDIDATE_PREFIX}'):
    candidate_files.append(blob.public_url.replace("https://storage.googleapis.com/", "gs://"))

candidate_dataset = tf.data.Dataset.from_tensor_slices(candidate_files)

parsed_candidate_dataset = candidate_dataset.interleave(
    # lambda x: tf.data.TFRecordDataset(x),
    train_utils.full_parse,
    cycle_length=tf.data.AUTOTUNE, 
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False
).map(
    feature_sets.parse_candidate_tfrecord_fn, 
    num_parallel_calls=tf.data.AUTOTUNE
).with_options(
    options
)

parsed_candidate_dataset = parsed_candidate_dataset.cache() #400 MB on machine mem

In [25]:
for features in parsed_candidate_dataset.take(1):
    pprint(features)
    print("_______________")

{'album_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'Thanatophobia'>,
 'album_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:album:5GBUYg5EqeDI0CuszAvDzj'>,
 'artist_followers_can': <tf.Tensor: shape=(), dtype=float32, numpy=27438.0>,
 'artist_genres_can': <tf.Tensor: shape=(), dtype=string, numpy=b"'indie garage rock'">,
 'artist_name_can': <tf.Tensor: shape=(), dtype=string, numpy=b'Worn-Tin'>,
 'artist_pop_can': <tf.Tensor: shape=(), dtype=float32, numpy=40.0>,
 'artist_uri_can': <tf.Tensor: shape=(), dtype=string, numpy=b'spotify:artist:7j8ds7BnqaEKuz1a1GN0J9'>,
 'duration_ms_can': <tf.Tensor: shape=(), dtype=float32, numpy=216923.0>,
 'track_acousticness_can': <tf.Tensor: shape=(), dtype=float32, numpy=0.655>,
 'track_danceability_can': <tf.Tensor: shape=(), dtype=float32, numpy=0.321>,
 'track_energy_can': <tf.Tensor: shape=(), dtype=float32, numpy=0.555>,
 'track_instrumentalness_can': <tf.Tensor: shape=(), dtype=float32, numpy=0.752>,
 'track_key_can': 

In [26]:
parsed_candidate_dataset

<CacheDataset element_spec={'album_name_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'album_uri_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'artist_followers_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'artist_genres_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'artist_name_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'artist_pop_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'artist_uri_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'duration_ms_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_acousticness_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_danceability_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_energy_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_instrumentalness_can': TensorSpec(shape=(), dtype=tf.float32, name=None), 'track_key_can': TensorSpec(shape=(), dtype=tf.string, name=None), 'track_liveness_can': TensorSpec(shape=(),

## Generate Embeddings

* use candidate_predictor to produce embeddings for each candidate item
* store embeddings in list
* zip candidate embeddings and candidate IDs together
* write json or csv file for ANN Index

In [20]:
# previously created embedding output
# !gsutil cp gs://jt-tfrs-central/pipe-dev-2tower-tfrs-jtv10/run-20221228-210041/candidates/candidate_embeddings.json candidate_embs_20221228_210041.json

Copying gs://jt-tfrs-central/pipe-dev-2tower-tfrs-jtv10/run-20221228-210041/candidates/candidate_embeddings.json...
- [1 files][882.4 MiB/882.4 MiB]   68.6 MiB/s                                   
Operation completed over 1 objects/882.4 MiB.                                    


### candidate embedding vectors

In [27]:
start_time = time.time()

embs_iter = parsed_candidate_dataset.batch(1).map(
    lambda data: candidate_predictor(
        track_uri_can = data["track_uri_can"],
        track_name_can = data['track_name_can'],
        artist_uri_can = data['artist_uri_can'],
        artist_name_can = data['artist_name_can'],
        album_uri_can = data['album_uri_can'],
        album_name_can = data['album_name_can'],
        duration_ms_can = data['duration_ms_can'],
        track_pop_can = data['track_pop_can'],
        artist_pop_can = data['artist_pop_can'],
        artist_genres_can = data['artist_genres_can'],
        artist_followers_can = data['artist_followers_can'],
        track_danceability_can = data['track_danceability_can'],
        track_energy_can = data['track_energy_can'],
        track_key_can = data['track_key_can'],
        track_loudness_can = data['track_loudness_can'],
        track_mode_can = data['track_mode_can'],
        track_speechiness_can = data['track_speechiness_can'],
        track_acousticness_can = data['track_acousticness_can'],
        track_instrumentalness_can = data['track_instrumentalness_can'],
        track_liveness_can = data['track_liveness_can'],
        track_valence_can = data['track_valence_can'],
        track_tempo_can = data['track_tempo_can'],
        track_time_signature_can = data['track_time_signature_can']
    )
)

embs = []
for emb in embs_iter:
    embs.append(emb)

end_time = time.time()
elapsed_time = int((end_time - start_time) / 60)
print(f"elapsed_time: {elapsed_time}")

print(f"Length of embs: {len(embs)}")
embs[0]

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
elapsed_time: 141
Length of embs: 2243885


{'output_1': <tf.Tensor: shape=(1, 128), dtype=float32, numpy=
 array([[ 1.4558885 ,  0.6574247 , -1.6407717 , -1.5887606 , -0.8956078 ,
         -0.99810946, -0.40141183,  0.24247266,  0.6559755 , -0.0870877 ,
          0.4873863 , -1.192624  ,  0.05810613,  0.65198994, -1.1014075 ,
          0.4844147 ,  1.9804701 , -0.15204096,  1.7292233 ,  0.60755455,
          0.52051014,  0.4429888 ,  0.29365444, -0.4539186 ,  0.42860448,
         -0.54497916, -0.4762391 , -0.6574926 , -0.365402  ,  0.8710664 ,
         -0.6119961 , -0.8461934 , -0.4577797 , -0.01408611, -0.78435844,
          0.28907436,  0.43767878, -0.13469195,  0.28730276, -0.57131904,
         -0.9392587 , -0.44394577, -0.53406644,  0.8327801 ,  0.51560533,
          1.273033  ,  0.32302907, -0.7242595 ,  1.4502484 , -1.918178  ,
          0.18540913,  0.02203418,  1.2570223 ,  0.779727  ,  0.64376956,
         -0.98304224,  1.9677243 ,  0.3062334 ,  1.4187527 ,  0.7121415 ,
         -0.24123847,  0.06877614, -0.9489165 , -

In [36]:
len(embs)

2243885

Clean embedding output...

In [37]:
start_time = time.time()

cleaned_embs = [x['output_1'].numpy()[0] for x in embs] #clean up the output

end_time = time.time()
elapsed_time = int((end_time - start_time) / 60)
print(f"elapsed_time: {elapsed_time}")

elapsed_time: 0


In [38]:
print(f"Length of cleaned_embs: {len(cleaned_embs)}")
cleaned_embs[0]

Length of cleaned_embs: 2243885


array([ 1.4558885 ,  0.6574247 , -1.6407717 , -1.5887606 , -0.8956078 ,
       -0.99810946, -0.40141183,  0.24247266,  0.6559755 , -0.0870877 ,
        0.4873863 , -1.192624  ,  0.05810613,  0.65198994, -1.1014075 ,
        0.4844147 ,  1.9804701 , -0.15204096,  1.7292233 ,  0.60755455,
        0.52051014,  0.4429888 ,  0.29365444, -0.4539186 ,  0.42860448,
       -0.54497916, -0.4762391 , -0.6574926 , -0.365402  ,  0.8710664 ,
       -0.6119961 , -0.8461934 , -0.4577797 , -0.01408611, -0.78435844,
        0.28907436,  0.43767878, -0.13469195,  0.28730276, -0.57131904,
       -0.9392587 , -0.44394577, -0.53406644,  0.8327801 ,  0.51560533,
        1.273033  ,  0.32302907, -0.7242595 ,  1.4502484 , -1.918178  ,
        0.18540913,  0.02203418,  1.2570223 ,  0.779727  ,  0.64376956,
       -0.98304224,  1.9677243 ,  0.3062334 ,  1.4187527 ,  0.7121415 ,
       -0.24123847,  0.06877614, -0.9489165 , -1.7127383 , -0.68183535,
        1.1871426 , -0.05245163, -0.889681  , -0.10291553, -0.48

### candidate IDs

In [39]:
# clean product IDs
track_uris = [x['track_uri_can'].numpy() for x in parsed_candidate_dataset]

print(f"Length of track_uris: {len(track_uris)}")

track_uris[0]

Length of track_uris: 2243885


b'spotify:track:2XZ3bL3ROk605SPpy6Dn9C'

In [40]:
# track_uris_cleaned = [str(z).replace("b'","").replace("'","") for z in track_uris]
track_uris_decoded = [z.decode("utf-8") for z in track_uris]

print(f"Length of track_uris_decoded: {len(track_uris_decoded)}")

track_uris_decoded[0]

Length of track_uris_decoded: 2243885


'spotify:track:2XZ3bL3ROk605SPpy6Dn9C'

In [41]:
print(f"Length of track_uris: {len(track_uris)}")
print(f"Length of track_uris_cleaned: {len(track_uris_decoded)}")

Length of track_uris: 2243885
Length of track_uris_cleaned: 2243885


### Check for bad records

In [42]:
cleaned_embs[0]

array([ 1.4558885 ,  0.6574247 , -1.6407717 , -1.5887606 , -0.8956078 ,
       -0.99810946, -0.40141183,  0.24247266,  0.6559755 , -0.0870877 ,
        0.4873863 , -1.192624  ,  0.05810613,  0.65198994, -1.1014075 ,
        0.4844147 ,  1.9804701 , -0.15204096,  1.7292233 ,  0.60755455,
        0.52051014,  0.4429888 ,  0.29365444, -0.4539186 ,  0.42860448,
       -0.54497916, -0.4762391 , -0.6574926 , -0.365402  ,  0.8710664 ,
       -0.6119961 , -0.8461934 , -0.4577797 , -0.01408611, -0.78435844,
        0.28907436,  0.43767878, -0.13469195,  0.28730276, -0.57131904,
       -0.9392587 , -0.44394577, -0.53406644,  0.8327801 ,  0.51560533,
        1.273033  ,  0.32302907, -0.7242595 ,  1.4502484 , -1.918178  ,
        0.18540913,  0.02203418,  1.2570223 ,  0.779727  ,  0.64376956,
       -0.98304224,  1.9677243 ,  0.3062334 ,  1.4187527 ,  0.7121415 ,
       -0.24123847,  0.06877614, -0.9489165 , -1.7127383 , -0.68183535,
        1.1871426 , -0.05245163, -0.889681  , -0.10291553, -0.48

In [43]:
bad_records = []

for i, emb in enumerate(cleaned_embs):
    bool_emb = np.isnan(emb)
    for val in bool_emb:
        if val:
            bad_records.append(i)
            
bad_record_filter = np.unique(bad_records)

print(f"bad_records: {len(bad_records)}")
print(f"bad_record_filter: {len(bad_record_filter)}")

bad_records: 56192
bad_record_filter: 439


In [44]:
# bad_record_filter[0]

In [45]:
track_uris_valid = []
emb_valid = []

for i, pair in enumerate(zip(track_uris_decoded, cleaned_embs)):
    if i in bad_record_filter:
        pass
    else:
        t_uri, embed = pair
        track_uris_valid.append(t_uri)
        emb_valid.append(embed)

In [46]:
emb_valid[0]

array([ 1.4558885 ,  0.6574247 , -1.6407717 , -1.5887606 , -0.8956078 ,
       -0.99810946, -0.40141183,  0.24247266,  0.6559755 , -0.0870877 ,
        0.4873863 , -1.192624  ,  0.05810613,  0.65198994, -1.1014075 ,
        0.4844147 ,  1.9804701 , -0.15204096,  1.7292233 ,  0.60755455,
        0.52051014,  0.4429888 ,  0.29365444, -0.4539186 ,  0.42860448,
       -0.54497916, -0.4762391 , -0.6574926 , -0.365402  ,  0.8710664 ,
       -0.6119961 , -0.8461934 , -0.4577797 , -0.01408611, -0.78435844,
        0.28907436,  0.43767878, -0.13469195,  0.28730276, -0.57131904,
       -0.9392587 , -0.44394577, -0.53406644,  0.8327801 ,  0.51560533,
        1.273033  ,  0.32302907, -0.7242595 ,  1.4502484 , -1.918178  ,
        0.18540913,  0.02203418,  1.2570223 ,  0.779727  ,  0.64376956,
       -0.98304224,  1.9677243 ,  0.3062334 ,  1.4187527 ,  0.7121415 ,
       -0.24123847,  0.06877614, -0.9489165 , -1.7127383 , -0.68183535,
        1.1871426 , -0.05245163, -0.889681  , -0.10291553, -0.48

In [47]:
len(emb_valid)

2243446

In [48]:
track_uris_valid[0]

'spotify:track:2XZ3bL3ROk605SPpy6Dn9C'

In [49]:
len(track_uris_valid)

2243446

### tmp - dealing with bad track uris

## Write embedding vectors to json file

In [33]:
VERSION = 'local'
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")

embeddings_index_filename = f'candidate_embs_{VERSION}_{TIMESTAMP}.json'

with open(f'{embeddings_index_filename}', 'w') as f:
    for prod, emb in zip(track_uris_valid, emb_valid):
        f.write('{"id":"' + str(prod) + '",')
        f.write('"embedding":[' + ",".join(str(x) for x in list(emb)) + "]}")
        f.write("\n")

## Upload json to GCS

In [34]:
INDEX_GCS_URI = f'gs://{BUCKET_NAME}/{PATH_TO_INDEX_DIR}/candidates-index-{VERSION}'
print(f"INDEX_GCS_URI: {INDEX_GCS_URI}")

DESTINATION_BLOB_NAME = embeddings_index_filename
SOURCE_FILE_NAME = embeddings_index_filename

print(f"DESTINATION_BLOB_NAME: {DESTINATION_BLOB_NAME}")
print(f"SOURCE_FILE_NAME: {SOURCE_FILE_NAME}")

INDEX_GCS_URI: gs://jt-tfrs-central-v2/8m-tfrs-v100-jtv15/run-20230125-205451/candidates-index-local
DESTINATION_BLOB_NAME: candidate_embs_local_20230130-180710.json
SOURCE_FILE_NAME: candidate_embs_local_20230130-180710.json


In [35]:
blob = Blob.from_string(os.path.join(INDEX_GCS_URI, DESTINATION_BLOB_NAME))
blob.bucket._client = storage_client
blob.upload_from_filename(SOURCE_FILE_NAME)

# Inspect track_uris

* id in the track_uri should be 22 characters (total of 36 characters including `spotify:track:`)
* some track_uris have an id that is 21 characters long
* these are not present in the source data (BigQuery)

In [36]:
len(track_uris_valid)

print(f"count of track_uris_valid: {len(track_uris_valid)}\n")
print(f"ex: track_uris_valid[0]: {track_uris_valid[0]}\n")
print(f"length of a track_uris_valid: {len(track_uris_valid[0])}\n")

count of track_uris_valid: 2262292

ex: track_uris_valid[0]: spotify:track:6Nx4UYbpHuU4x5mozUDaQQ

length of a track_uris_valid: 36



In [39]:
short = []
normal = []
long = []

for track_id in track_uris_valid:
    if len(track_id)==36:
        normal.append(track_id)
    if len(track_id)<36:
        short.append(track_id)
    if len(track_id)>36:
        long.append(track_id)
        
print(f"short: {len(short)}")
print(f"normal: {len(normal)}")
print(f"long: {len(long)}")

short: 0
normal: 2262292
long: 0


**Finished**