# Inspect Layer Vocabs

### pip if needed

In [1]:
import os

# # The Vertex AI Workbench Notebook product has specific requirements
# IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME") and not os.getenv("VIRTUAL_ENV")
# IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
#     "/opt/deeplearning/metadata/env_version"
# )

# # Vertex AI Notebook requires dependencies to be installed with '--user'
# USER_FLAG = ""
# if IS_WORKBENCH_NOTEBOOK:
#     USER_FLAG = "--user"

# ! pip3 install --upgrade {USER_FLAG} -q google-cloud-storage {USER_FLAG} \
#                                         kfp \
#                                         google-cloud-pipeline-components

# if not os.getenv("IS_TESTING"):
#     # Automatically restart kernel after installs
#     import IPython

#     app = IPython.Application.instance()
#     app.kernel.do_shutdown(True)

In [2]:
# !pip install kfp --user --q
# !pip install google-cloud-pipeline-components==1.0.8 --user --q

In [3]:
# ! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
# ! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"
# ! python3 -c "import google.cloud.aiplatform; print('aiplatform SDK version: {}'.format(google.cloud.aiplatform.__version__))"

### Set env vars

In [17]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'
BUCKET = 'spotify-data-regimes'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")

VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'

PROJECT_ID: hybrid-vertex
PROJECT_NUM: 934903580331
LOCATION: us-central1


### import packages

In [5]:
import os
import json
import pickle as pkl
import logging
import time

import tensorflow as tf

from google.cloud import storage
from google.cloud import aiplatform as vertex_ai

# Pipelines
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.types import artifact_types

# Kubeflow SDK
# TODO: fix these
from kfp.v2 import dsl
import kfp
import kfp.v2.dsl
from kfp.v2.google import client as pipelines_client
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component)


# GCS client, constructed with no explicit arguments
storage_client = storage.Client()

# Set the project and region used by subsequent Vertex AI SDK calls
vertex_ai.init(project=PROJECT_ID,location=LOCATION)

2022-12-28 05:47:33.949389: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-28 05:47:34.084115: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-28 05:47:34.132967: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-28 05:47:34.894657: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

## Load previously created vocab pickle

In [6]:
# !gsutil cp gs://spotify-data-regimes/test/jw_vocab_dict_1220.pkl .
# !gsutil cp gs://two-tower-models/vocabs/vocab_dict.pkl .

In [7]:
# VOCAB_LOCAL_FILE = 'vocab_dict.pkl'

# filehandler = open(f'{VOCAB_LOCAL_FILE}', 'rb')
# vocab_dict_load = pkl.load(filehandler)
# filehandler.close()
# vocab_dict_load

# Vocab pipeline

In [8]:
# Show the kernel's working directory; the cells below write files under the
# relative path `src/...`, so this is where they will land.
os.getcwd()
# !pwd

'/home/jupyter/jw-repo/spotify_mpd_two_tower'

In [9]:
# Directory (relative to the working directory) that holds the pipeline source
REPO_DOCKER_PATH_PREFIX = 'src'
# Sub-directory under the prefix where the `%%writefile` cells put component modules
PIPELINES_SUB_DIR = 'vocab_pipes'

In [10]:
! rm -rf {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}
! mkdir {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}

## Pipeline Components

### Adapt: `text` layer

In [11]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}/adapt_fixed_text_layer_vocab.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)

@kfp.v2.dsl.component(
    base_image="python:3.9",
    packages_to_install=[
        # 'google-cloud-aiplatform==1.18.1',
        'google-cloud-storage',
        'tensorflow==2.8.3',
    ],
)
def adapt_fixed_text_layer_vocab(
    project: str,
    location: str,
    version: str,
    data_dir_bucket_name: str,
    data_dir_path_prefix: str,
    vocab_path_prefix: str,
    max_playlist_length: int,
    max_tokens: int,
    ngrams: int,
    feature_name: str,
    # feat_type: str,
) -> NamedTuple('Outputs', [
    ('vocab_gcs_uri', str),
    # ('feature_name', str),
]):

    """
    Custom pipeline component that adapts a `TextVectorization` layer on one
    scalar (fixed-length) string feature and writes the learned vocabulary
    to GCS as a pickled dict `{feature_name: [tokens...]}`.

    The per-feature dicts are later combined into a master vocab dict.

    Args:
        project: not used inside this component; kept for a uniform signature.
        location: not used inside this component; kept for a uniform signature.
        version: not used inside this component; kept for a uniform signature.
        data_dir_bucket_name: bucket that holds the TFRecords; the vocab
            pickle is uploaded to this same bucket.
        data_dir_path_prefix: object prefix under which TFRecords are listed.
        vocab_path_prefix: destination object prefix for the vocab pickle.
        max_playlist_length: length of the playlist-level features in the
            TFRecord schema (needed to parse the records).
        max_tokens: passed to `TextVectorization(max_tokens=...)`.
        ngrams: passed to `TextVectorization(ngrams=...)`.
        feature_name: key of the parsed feature to adapt on.

    Returns:
        A 1-tuple `(vocab_gcs_uri,)` holding the `gs://` URI of the pickle.

    Raises:
        ValueError: if no object containing '.tfrecords' is found under the prefix.
    """

    # import packages (kept inside the function so the component is self-contained)
    import logging
    import pickle as pkl
    import time

    from google.cloud import storage

    import tensorflow as tf

    # setup clients
    storage_client = storage.Client()

    logging.info(f"feature_name: {feature_name}")

    MAX_PLAYLIST_LENGTH = max_playlist_length
    logging.info(f"MAX_PLAYLIST_LENGTH: {MAX_PLAYLIST_LENGTH}")

    # ===================================================
    # tfrecord parser
    # ===================================================
    feats = {
        # ===================================================
        # candidate track features
        # ===================================================
        "track_uri_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "track_name_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "artist_uri_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "artist_name_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "album_uri_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "album_name_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "duration_ms_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_pop_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "artist_pop_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "artist_genres_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "artist_followers_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        # "track_pl_titles_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "track_danceability_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_energy_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_key_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "track_loudness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_mode_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "track_speechiness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_acousticness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_instrumentalness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_liveness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_valence_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_tempo_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "time_signature_can": tf.io.FixedLenFeature(dtype=tf.string, shape=()), # track_time_signature_can

        # ===================================================
        # summary playlist features
        # ===================================================
        "pl_name_src" : tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'pl_collaborative_src' : tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        # 'num_pl_followers_src' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'pl_duration_ms_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'num_pl_songs_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()), # n_songs_pl_new | num_pl_songs_new
        'num_pl_artists_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'num_pl_albums_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        # 'avg_track_pop_pl_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        # 'avg_artist_pop_pl_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        # 'avg_art_followers_pl_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),

        # ===================================================
        # ragged playlist features
        # ===================================================
        # bytes / string
        "track_uri_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_name_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "artist_uri_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "artist_name_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "album_uri_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "album_name_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "artist_genres_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        # "tracks_playlist_titles_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_key_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_mode_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "time_signature_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),

        # Float List
        "duration_ms_songs_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_pop_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "artist_pop_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "artists_followers_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_danceability_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_energy_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_loudness_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_speechiness_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_acousticness_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_instrumentalness_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_liveness_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_valence_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_tempo_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
    }

    # parsing function
    def parse_tfrecord(example):
        """
        Parse a serialized `tf.train.Example` into a dict of tensors
        keyed by the feature names declared in `feats`.
        """
        return tf.io.parse_example(example, feats)

    # list blobs (tfrecords)
    # Build the gs:// URI from the raw object name: `blob.public_url`
    # percent-encodes the name, which would corrupt names that contain
    # characters needing URL quoting.
    train_files = [
        f"gs://{data_dir_bucket_name}/{blob.name}"
        for blob in storage_client.list_blobs(
            f'{data_dir_bucket_name}', prefix=f'{data_dir_path_prefix}'
        )
        if '.tfrecords' in blob.name
    ]
    logging.info(f"TFRecord file count: {len(train_files)}")

    # Fail fast with a clear message instead of adapting on an empty dataset.
    if not train_files:
        raise ValueError(
            f"No '.tfrecords' objects found under "
            f"gs://{data_dir_bucket_name}/{data_dir_path_prefix}"
        )

    # ===================================================
    # create TF dataset
    # ===================================================
    logging.info("Creating TFRecordDataset...")
    train_dataset = tf.data.TFRecordDataset(train_files)
    train_parsed = train_dataset.map(parse_tfrecord)

    # ===================================================
    # adapt layer for feature
    # ===================================================
    start = time.time()

    text_layer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens,
        ngrams=ngrams
    )

    # Only the requested feature is fed to the layer.
    text_layer.adapt(train_parsed.map(lambda x: x[f'{feature_name}']))
    end = time.time()

    logging.info(f'Layer adapt elapsed time: {round((end - start), 2)} seconds')

    # ===================================================
    # write vocab to pickled dict --> gcs
    # ===================================================
    logging.info("Writting pickled dict to GCS...")

    VOCAB_LOCAL_FILE = f'{feature_name}_vocab_dict.pkl'
    VOCAB_GCS_OBJ = f'{vocab_path_prefix}/{VOCAB_LOCAL_FILE}' # destination folder prefix and blob name
    VOCAB_DICT = {f'{feature_name}' : text_layer.get_vocabulary(),}

    logging.info(f"VOCAB_LOCAL_FILE: {VOCAB_LOCAL_FILE}")
    logging.info(f"VOCAB_GCS_OBJ: {VOCAB_GCS_OBJ}")

    # pickle (context manager guarantees the file is flushed and closed
    # before upload, even if dumping raises)
    with open(VOCAB_LOCAL_FILE, 'wb') as filehandler:
        pkl.dump(VOCAB_DICT, filehandler)

    # upload to GCS
    bucket_client = storage_client.bucket(data_dir_bucket_name)
    blob = bucket_client.blob(VOCAB_GCS_OBJ)
    blob.upload_from_filename(VOCAB_LOCAL_FILE)

    vocab_uri = f'gs://{data_dir_bucket_name}/{VOCAB_GCS_OBJ}'

    logging.info(f"File {VOCAB_LOCAL_FILE} uploaded to {vocab_uri}")

    return(
        vocab_uri,
        # feature_name,
    )

Writing src/vocab_pipes/adapt_fixed_text_layer_vocab.py


### ragged adapts

In [12]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}/adapt_ragged_text_layer_vocab.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)

@kfp.v2.dsl.component(
    base_image="python:3.9",
    packages_to_install=[
        # 'google-cloud-aiplatform==1.18.1',
        'google-cloud-storage',
        'tensorflow==2.8.3',
    ],
)
def adapt_ragged_text_layer_vocab(
    project: str,
    location: str,
    version: str,
    data_dir_bucket_name: str,
    data_dir_path_prefix: str,
    vocab_path_prefix: str,
    max_playlist_length: int,
    max_tokens: int,
    ngrams: int,
    feature_name: str,
    # feat_type: str,
) -> NamedTuple('Outputs', [
    ('vocab_gcs_uri', str),
    # ('feature_name', str),
]):

    """
    Custom pipeline component that adapts a `TextVectorization` layer on one
    playlist-level (sequence) string feature and writes the learned
    vocabulary to GCS as a pickled dict `{feature_name: [tokens...]}`.

    Unlike the fixed-feature component, the feature is reshaped to
    `[-1, max_playlist_length, 1]` before it is passed to `adapt`.
    The per-feature dicts are later combined into a master vocab dict.

    Args:
        project: not used inside this component; kept for a uniform signature.
        location: not used inside this component; kept for a uniform signature.
        version: not used inside this component; kept for a uniform signature.
        data_dir_bucket_name: bucket that holds the TFRecords; the vocab
            pickle is uploaded to this same bucket.
        data_dir_path_prefix: object prefix under which TFRecords are listed.
        vocab_path_prefix: destination object prefix for the vocab pickle.
        max_playlist_length: length of the playlist-level features in the
            TFRecord schema; also used for the reshape before `adapt`.
        max_tokens: passed to `TextVectorization(max_tokens=...)`.
        ngrams: passed to `TextVectorization(ngrams=...)`.
        feature_name: key of the parsed feature to adapt on.

    Returns:
        A 1-tuple `(vocab_gcs_uri,)` holding the `gs://` URI of the pickle.

    Raises:
        ValueError: if no object containing '.tfrecords' is found under the prefix.
    """

    # import packages (kept inside the function so the component is self-contained)
    import logging
    import pickle as pkl
    import time

    from google.cloud import storage

    import tensorflow as tf

    # setup clients
    storage_client = storage.Client()

    logging.info(f"feature_name: {feature_name}")

    MAX_PLAYLIST_LENGTH = max_playlist_length
    logging.info(f"MAX_PLAYLIST_LENGTH: {MAX_PLAYLIST_LENGTH}")

    # ===================================================
    # tfrecord parser
    # ===================================================
    feats = {
        # ===================================================
        # candidate track features
        # ===================================================
        "track_uri_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "track_name_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "artist_uri_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "artist_name_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "album_uri_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "album_name_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "duration_ms_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_pop_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "artist_pop_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "artist_genres_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "artist_followers_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        # "track_pl_titles_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "track_danceability_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_energy_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_key_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "track_loudness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_mode_can":tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        "track_speechiness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_acousticness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_instrumentalness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_liveness_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_valence_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "track_tempo_can":tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        "time_signature_can": tf.io.FixedLenFeature(dtype=tf.string, shape=()), # track_time_signature_can

        # ===================================================
        # summary playlist features
        # ===================================================
        "pl_name_src" : tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        'pl_collaborative_src' : tf.io.FixedLenFeature(dtype=tf.string, shape=()),
        # 'num_pl_followers_src' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'pl_duration_ms_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'num_pl_songs_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()), # n_songs_pl_new | num_pl_songs_new
        'num_pl_artists_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        'num_pl_albums_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        # 'avg_track_pop_pl_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        # 'avg_artist_pop_pl_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),
        # 'avg_art_followers_pl_new' : tf.io.FixedLenFeature(dtype=tf.float32, shape=()),

        # ===================================================
        # ragged playlist features
        # ===================================================
        # bytes / string
        "track_uri_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_name_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "artist_uri_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "artist_name_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "album_uri_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "album_name_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "artist_genres_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        # "tracks_playlist_titles_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_key_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_mode_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),
        "time_signature_pl": tf.io.FixedLenFeature(dtype=tf.string, shape=(MAX_PLAYLIST_LENGTH,)),

        # Float List
        "duration_ms_songs_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_pop_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "artist_pop_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "artists_followers_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_danceability_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_energy_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_loudness_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_speechiness_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_acousticness_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_instrumentalness_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_liveness_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_valence_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
        "track_tempo_pl": tf.io.FixedLenFeature(dtype=tf.float32, shape=(MAX_PLAYLIST_LENGTH,)),
    }

    # parsing function
    def parse_tfrecord(example):
        """
        Parse a serialized `tf.train.Example` into a dict of tensors
        keyed by the feature names declared in `feats`.
        """
        return tf.io.parse_example(example, feats)

    # list blobs (tfrecords)
    # Build the gs:// URI from the raw object name: `blob.public_url`
    # percent-encodes the name, which would corrupt names that contain
    # characters needing URL quoting.
    train_files = [
        f"gs://{data_dir_bucket_name}/{blob.name}"
        for blob in storage_client.list_blobs(
            f'{data_dir_bucket_name}', prefix=f'{data_dir_path_prefix}'
        )
        if '.tfrecords' in blob.name
    ]
    logging.info(f"TFRecord file count: {len(train_files)}")

    # Fail fast with a clear message instead of adapting on an empty dataset.
    if not train_files:
        raise ValueError(
            f"No '.tfrecords' objects found under "
            f"gs://{data_dir_bucket_name}/{data_dir_path_prefix}"
        )

    # ===================================================
    # create TF dataset
    # ===================================================
    logging.info("Creating TFRecordDataset...")
    train_dataset = tf.data.TFRecordDataset(train_files)
    train_parsed = train_dataset.map(parse_tfrecord)

    # ===================================================
    # adapt layer for feature
    # ===================================================
    start = time.time()

    text_layer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens,
        ngrams=ngrams
    )

    # Reshape the playlist-length feature to [-1, MAX_PLAYLIST_LENGTH, 1]
    # before handing it to `adapt`.
    text_layer.adapt(
        train_parsed.map(
            lambda x: tf.reshape(x[f'{feature_name}'], [-1, MAX_PLAYLIST_LENGTH, 1])
        )
    )
    end = time.time()

    logging.info(f'Layer adapt elapsed time: {round((end - start), 2)} seconds')

    # ===================================================
    # write vocab to pickled dict --> gcs
    # ===================================================
    logging.info("Writting pickled dict to GCS...")

    VOCAB_LOCAL_FILE = f'{feature_name}_vocab_dict.pkl'
    VOCAB_GCS_OBJ = f'{vocab_path_prefix}/{VOCAB_LOCAL_FILE}' # destination folder prefix and blob name
    VOCAB_DICT = {f'{feature_name}' : text_layer.get_vocabulary(),}

    logging.info(f"VOCAB_LOCAL_FILE: {VOCAB_LOCAL_FILE}")
    logging.info(f"VOCAB_GCS_OBJ: {VOCAB_GCS_OBJ}")

    # pickle (context manager guarantees the file is flushed and closed
    # before upload, even if dumping raises)
    with open(VOCAB_LOCAL_FILE, 'wb') as filehandler:
        pkl.dump(VOCAB_DICT, filehandler)

    # upload to GCS
    bucket_client = storage_client.bucket(data_dir_bucket_name)
    blob = bucket_client.blob(VOCAB_GCS_OBJ)
    blob.upload_from_filename(VOCAB_LOCAL_FILE)

    vocab_uri = f'gs://{data_dir_bucket_name}/{VOCAB_GCS_OBJ}'

    logging.info(f"File {VOCAB_LOCAL_FILE} uploaded to {vocab_uri}")

    return(
        vocab_uri,
        # feature_name,
    )

Writing src/vocab_pipes/adapt_ragged_text_layer_vocab.py


### create master vocab

In [13]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}/create_master_vocab.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)

@kfp.v2.dsl.component(
    base_image="python:3.9",
    packages_to_install=[
        # 'google-cloud-aiplatform==1.18.1',
        'google-cloud-storage',
        'numpy',
        # 'tensorflow==2.8.3',
    ],
)
def create_master_vocab(
    project: str,
    location: str,
    version: str,
    data_dir_bucket_name: str,
    data_dir_path_prefix: str,
    vocab_path_prefix: str,
    master_dict_path_prefix: str,
    vocab_uri_1: str,
    vocab_uri_2: str,
    vocab_uri_3: str,
    vocab_uri_4: str,
    vocab_uri_5: str,
    vocab_uri_6: str,
    vocab_uri_7: str,
    vocab_uri_8: str,
    vocab_uri_9: str,
    # vocab_uri_10: str,
    # vocab_uri_11: str,
) -> NamedTuple('Outputs', [
    ('master_vocab_gcs_uri', str),
    # ('feature_name': str),
]):

    """
    Combine the per-layer vocab dictionaries into one master dictionary
    and upload it to GCS as `{version}_master_vocab_dict.pkl`.

    Args:
        project: not used inside this component; kept for a uniform signature.
        location: not used inside this component; kept for a uniform signature.
        version: prefix of the master pickle's file name.
        data_dir_bucket_name: bucket the master pickle is uploaded to.
        data_dir_path_prefix: not used inside this component.
        vocab_path_prefix: not used inside this component.
        master_dict_path_prefix: destination object prefix for the master pickle.
        vocab_uri_1 .. vocab_uri_9: `gs://` URIs of pickled per-feature dicts.

    Returns:
        A 1-tuple `(master_vocab_gcs_uri,)` with the `gs://` URI of the upload.
    """

    # import packages (kept inside the function so the component is self-contained)
    import logging
    import pickle as pkl

    from google.cloud import storage

    # setup clients
    storage_client = storage.Client()

    # ===================================================
    # Create list of all layer vocab dict uris
    # ===================================================

    vocab_dict_uris = [
        vocab_uri_1, vocab_uri_2,
        vocab_uri_3, vocab_uri_4,
        vocab_uri_5, vocab_uri_6,
        vocab_uri_7, vocab_uri_8,
        vocab_uri_9,
        # vocab_uri_10,
        # vocab_uri_11,
    ]

    logging.info(f"count of vocab_dict_uris: {len(vocab_dict_uris)}")
    logging.info(f"vocab_dict_uris: {vocab_dict_uris}")

    # ===================================================
    # load pickled dicts
    # ===================================================

    loaded_pickle_list = []
    for i, pickled_dict in enumerate(vocab_dict_uris):

        local_path = f"v{i}_vocab_pre_load"

        with open(local_path, 'wb') as local_vocab_file:
            storage_client.download_blob_to_file(pickled_dict, local_vocab_file)

        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file; only pass URIs written by trusted upstream components.
        with open(local_path, 'rb') as pickle_file:
            loaded_vocab_dict = pkl.load(pickle_file)

        loaded_pickle_list.append(loaded_vocab_dict)

    # ===================================================
    # create master vocab dict
    # ===================================================
    master_dict = {}
    for loaded_dict in loaded_pickle_list:
        # `update` keeps the last value for a repeated key; surface that
        # instead of dropping an earlier vocabulary silently.
        duplicate_keys = sorted(set(master_dict) & set(loaded_dict))
        if duplicate_keys:
            logging.warning(
                f"Duplicate vocab keys overwritten by a later dict: {duplicate_keys}"
            )
        master_dict.update(loaded_dict)

    # ===================================================
    # Upload master to GCS
    # ===================================================
    MASTER_VOCAB_LOCAL_FILE = f'{version}_master_vocab_dict.pkl'
    MASTER_VOCAB_GCS_OBJ = f'{master_dict_path_prefix}/{MASTER_VOCAB_LOCAL_FILE}' # destination folder prefix and blob name

    # pickle (context manager guarantees the file is flushed and closed
    # before upload, even if dumping raises)
    with open(MASTER_VOCAB_LOCAL_FILE, 'wb') as filehandler:
        pkl.dump(master_dict, filehandler)

    # upload to GCS
    bucket_client = storage_client.bucket(data_dir_bucket_name)
    blob = bucket_client.blob(MASTER_VOCAB_GCS_OBJ)
    blob.upload_from_filename(MASTER_VOCAB_LOCAL_FILE)

    master_vocab_uri = f'gs://{data_dir_bucket_name}/{MASTER_VOCAB_GCS_OBJ}'

    logging.info(f"File {MASTER_VOCAB_LOCAL_FILE} uploaded to {master_vocab_uri}")

    return(
        master_vocab_uri,
    )

Writing src/vocab_pipes/create_master_vocab.py


### config

In [14]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}/config.py

"""Vertex pipeline configurations."""

import os


# --- Project / storage -------------------------------------------------
# Each setting is read from the environment once, at import time, and is
# always a string; the second argument is the fallback when unset.
PROJECT_ID = os.environ.get("PROJECT_ID", "")
LOCATION = os.environ.get("LOCATION", "us-central1")
BUCKET = os.environ.get("BUCKET", "")

# --- Instance resource limits -------------------------------------------
INSTANCE_TYPE = os.environ.get("INSTANCE_TYPE", "n1-highmem-64")
CPU_LIMIT = os.environ.get("CPU_LIMIT", "64")
MEMORY_LIMIT = os.environ.get("MEMORY_LIMIT", "416")
GPU_LIMIT = os.environ.get("GPU_LIMIT", "4")
GPU_TYPE = os.environ.get("GPU_TYPE", "NVIDIA_TESLA_T4")

# --- Machine / accelerator settings ---------------------------------------
MACHINE_TYPE = os.environ.get("MACHINE_TYPE", "a2-highgpu-4g")
REPLICA_COUNT = os.environ.get("REPLICA_COUNT", "1")
ACCELERATOR_TYPE = os.environ.get("ACCELERATOR_TYPE", "NVIDIA_TESLA_A100")
ACCELERATOR_NUM = os.environ.get("ACCELERATOR_NUM", "4")
NUM_WORKERS = os.environ.get("NUM_WORKERS", "4")

Writing src/vocab_pipes/config.py


## Build & Compile Pipeline

`parallelFor` [docs](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html#kfp.dsl.ParallelFor)

```
with dsl.ParallelFor([{'a': 1, 'b': 10}, {'a': 2, 'b': 20}]) as item:
  op1 = ContainerOp(..., args=['echo {}'.format(item.a)])
  op2 = ContainerOp(..., args=['echo {}'.format(item.b)])
```

> In this case `op1` would be executed twice, once with case `args=['echo 1']` and once with case `args=['echo 2']`

aggregating output 
* [stack overflow](https://stackoverflow.com/questions/73052584/set-the-name-for-each-parallelfor-iteration-in-kfp-v2-on-vertex-ai)
* [github issue](https://github.com/kubeflow/pipelines/issues/3412)
```
@func_to_container_op
def print_results(results: ForLoopResults()):
    print(results["process_parameters"])
```

### pipe configs

* see [example](https://github.com/GoogleCloudPlatform/nvidia-merlin-on-vertex-ai)

In [18]:
# Version label for the pipeline code; it is also embedded in the pipeline tag.
PIPELINE_VERSION = 'jtv10' # pipeline code
PIPELINE_TAG = f'tfrs-vocab-pipe-{PIPELINE_VERSION}'

# Instance configuration
GPU_LIMIT = '8'
GPU_TYPE = 'NVIDIA_TESLA_A100'
CPU_LIMIT = '96'
# NOTE(review): this value has no unit suffix and is later handed to
# set_memory_limit via config.MEMORY_LIMIT; confirm how a bare number is interpreted.
MEMORY_LIMIT = '624' # '680'

# Export everything through the environment: src/vocab_pipes/config.py reads
# these variables with os.getenv when it is imported.
os.environ.update(
    {
        'PROJECT_ID': PROJECT_ID,
        'LOCATION': LOCATION,
        'BUCKET': BUCKET,
        'PIPELINE_VERSION': PIPELINE_VERSION,
        'PIPELINE_TAG': PIPELINE_TAG,
        'GPU_LIMIT': GPU_LIMIT,
        'GPU_TYPE': GPU_TYPE,
        'CPU_LIMIT': CPU_LIMIT,
        'MEMORY_LIMIT': MEMORY_LIMIT,
    }
)

print("PIPELINE_TAG:", PIPELINE_TAG)

PIPELINE_TAG: tfrs-vocab-pipe-jtv10


In [19]:
from src.vocab_pipes import adapt_ragged_text_layer_vocab, adapt_fixed_text_layer_vocab, create_master_vocab, config

@kfp.v2.dsl.pipeline(
    name=f'{PIPELINE_VERSION}-{PIPELINE_TAG}'.replace('_', '-')
)
def pipeline(
    project: str,
    location: str,
    pipeline_version: str,
    data_version: str,
    data_dir_bucket_name: str,
    data_dir_path_prefix: str,
    vocab_path_prefix: str,
    master_dict_path_prefix: str,
    max_playlist_length: int,
    max_tokens: int,
    ngrams: int,
    # feature_name_list: str,
    fixed_features_list: list,
    ragged_features_list: list,
):
    """Adapt one text-layer vocabulary per feature, then aggregate them.

    Nine adapt tasks are created (five fixed-length features, four ragged
    features). The `vocab_gcs_uri` output of each task is passed to a final
    `create_master_vocab` task.

    Args:
        project: Forwarded to every component.
        location: Forwarded to every component.
        pipeline_version: Forwarded to every component as `version`.
        data_version: Accepted for interface compatibility; not used in the body.
        data_dir_bucket_name: Forwarded to every component.
        data_dir_path_prefix: Forwarded to every component.
        vocab_path_prefix: Forwarded to every component.
        master_dict_path_prefix: Forwarded to `create_master_vocab` only.
        max_playlist_length: Forwarded to every adapt component.
        max_tokens: Forwarded to every adapt component.
        ngrams: Forwarded to every adapt component.
        fixed_features_list: Accepted for interface compatibility; not used in the body.
        ragged_features_list: Accepted for interface compatibility; not used in the body.
    """
    # The features are enumerated here rather than taken from the
    # `fixed_features_list` / `ragged_features_list` parameters because each
    # adapt task's output is wired individually into `create_master_vocab`.
    # An earlier `kfp.dsl.ParallelFor` variant over those parameters was dropped,
    # presumably because its per-iteration outputs could not be aggregated
    # (see the "aggregating output" links in the notebook).
    #
    # Order matters: position N in the combined list feeds `vocab_uri_N`.
    fixed_feature_names = (
        'pl_name_src',
        'track_name_can',
        'artist_name_can',
        'album_name_can',
        'artist_genres_can',
    )
    ragged_feature_names = (
        'track_name_pl',
        'artist_name_pl',
        'album_name_pl',
        'artist_genres_pl',
    )

    def _adapt_vocab_op(component, feature_name):
        """Create one adapt task for `feature_name` with the shared arguments and limits."""
        return (
            component(
                project=project,
                location=location,
                version=pipeline_version,
                data_dir_bucket_name=data_dir_bucket_name,
                data_dir_path_prefix=data_dir_path_prefix,
                vocab_path_prefix=vocab_path_prefix,
                max_playlist_length=max_playlist_length,
                max_tokens=max_tokens,
                ngrams=ngrams,
                feature_name=feature_name,
            )
            .set_display_name(f"adapt: {feature_name}")
            .set_cpu_limit(config.CPU_LIMIT)
            .set_memory_limit(config.MEMORY_LIMIT)
            # .set_gpu_limit(config.GPU_LIMIT)
            # .set_caching_options(True)
        )

    # ================================================================================
    # adapt components: fixed-length features first, then ragged features
    # ================================================================================
    fixed_adapt_ops = [
        _adapt_vocab_op(
            adapt_fixed_text_layer_vocab.adapt_fixed_text_layer_vocab, feature_name
        )
        for feature_name in fixed_feature_names
    ]
    ragged_adapt_ops = [
        _adapt_vocab_op(
            adapt_ragged_text_layer_vocab.adapt_ragged_text_layer_vocab, feature_name
        )
        for feature_name in ragged_feature_names
    ]

    # vocab_uri_1 .. vocab_uri_9, in the order the adapt tasks were created.
    vocab_uri_kwargs = {
        f'vocab_uri_{position}': adapt_op.outputs['vocab_gcs_uri']
        for position, adapt_op in enumerate(fixed_adapt_ops + ragged_adapt_ops, start=1)
    }

    # ====================================================
    # Aggregate all Dicts
    # ====================================================

    create_master_vocab_op = (
        create_master_vocab.create_master_vocab(
            project=project,
            location=location,
            version=pipeline_version,
            data_dir_bucket_name=data_dir_bucket_name,
            data_dir_path_prefix=data_dir_path_prefix,
            vocab_path_prefix=vocab_path_prefix,
            master_dict_path_prefix=master_dict_path_prefix,
            **vocab_uri_kwargs,
        )
        .set_display_name("create master vocab")
    )

## Compile pipeline

In [20]:
# GCS locations used for the compiled pipeline spec and as the pipeline root.
BUCKET = 'spotify-data-regimes'
BUCKET_URI = f'gs://{BUCKET}'
PIPELINE_ROOT = 'vocab-pipelines-root'
DATA_VERSION = 'jtv10'
# Resolves to gs://<bucket>/<data version>/<pipeline root>/<pipeline version>.
PIPE_GCS_DIR = f'{BUCKET_URI}/{DATA_VERSION}/{PIPELINE_ROOT}/{PIPELINE_VERSION}'
print(f"PIPE_GCS_DIR: {PIPE_GCS_DIR}")

# NOTE(review): these two names are interpolated by the `%%writefile` cells earlier
# in the notebook, so on a fresh kernel they must be defined before those cells run.
REPO_DOCKER_PATH_PREFIX = 'src'
PIPELINES_SUB_DIR = 'vocab_pipes'

PIPE_GCS_DIR: gs://spotify-data-regimes/jtv10/vocab-pipelines-root/jtv10


In [22]:
# Compile the `pipeline` function into a local JSON pipeline spec; the same file
# is copied to GCS and used as the job template in the cells below.
kfp.v2.compiler.Compiler().compile(
  pipeline_func=pipeline, 
  package_path='custom_container_pipeline_spec.json',
)

In [23]:
!gsutil cp custom_container_pipeline_spec.json $PIPE_GCS_DIR/pipeline_spec.json

Copying file://custom_container_pipeline_spec.json [Content-Type=application/json]...
/ [1 files][143.5 KiB/143.5 KiB]                                                
Operation completed over 1 objects/143.5 KiB.                                    


### pipeline args

In [25]:
# Values handed to the pipeline job as `parameter_values` in the submit cell.
project = PROJECT_ID
location = LOCATION

# Input data location: gs://<data_dir_bucket_name>/<data_dir_path_prefix>
data_dir_bucket_name = 'spotify-data-regimes'
data_dir_path_prefix = f'{DATA_VERSION}/train_v9' # train_minimum | 'train_subset | train_flat_last_5_v8
# vocab_path_prefix = "jtv8/test_vocabs"
# Path prefix for the per-feature vocab dicts, namespaced by pipeline version.
vocab_path_prefix = f"{DATA_VERSION}/vocabs/layer-dicts-{PIPELINE_VERSION}"
# Path prefix for the aggregated (master) vocab dict.
master_dict_path_prefix = f'{DATA_VERSION}/vocabs'

max_playlist_length = 5
# Feature names grouped by kind; both lists are passed to the job as parameters.
fixed_features_list = [
    'pl_name_src',
    'track_name_can',
    'artist_name_can',
    'album_name_can',
    'artist_genres_can',
    # 'track_pl_titles_can',
]

ragged_features_list = [
    'track_name_pl',
    'artist_name_pl',
    'album_name_pl',
    'artist_genres_pl',
    # 'tracks_playlist_titles_pl',
]

print(f"vocab_path_prefix: {vocab_path_prefix}")

vocab_path_prefix: jtv10/vocabs/layer-dicts-jtv10


## submit pipeline

In [26]:
# NOTE(review): `overwrite` is not referenced anywhere in this cell; kept in case
# another cell reads it.
overwrite = True

# Submit the compiled spec as a Vertex AI pipeline job. Parameter values come from
# the "pipeline args" cell so the two cannot drift apart.
job = vertex_ai.PipelineJob(
    display_name=f'{PIPELINE_VERSION}-{PIPELINE_TAG}'.replace('_', '-'),
    template_path='custom_container_pipeline_spec.json',
    pipeline_root=PIPE_GCS_DIR,
    # enable_caching=False,
    parameter_values={
        'project': PROJECT_ID,
        'location': LOCATION,
        'pipeline_version': PIPELINE_VERSION,
        'data_version': DATA_VERSION,
        'data_dir_bucket_name': data_dir_bucket_name,
        'data_dir_path_prefix': data_dir_path_prefix,
        'vocab_path_prefix': vocab_path_prefix,
        'master_dict_path_prefix': master_dict_path_prefix,
        # Previously hard-coded as 5, duplicating the variable from the args cell.
        'max_playlist_length': max_playlist_length,
        'max_tokens': 20000,
        'ngrams': 2,
        # 'feature_name_list': feature_name_list,  # regular list
        'fixed_features_list': fixed_features_list,
        'ragged_features_list': ragged_features_list,
    },
)
# sync=False returns without blocking the notebook on the pipeline run.
job.run(sync=False)

Creating PipelineJob
PipelineJob created. Resource name: projects/934903580331/locations/us-central1/pipelineJobs/jtv10-tfrs-vocab-pipe-jtv10-20221228055018
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/934903580331/locations/us-central1/pipelineJobs/jtv10-tfrs-vocab-pipe-jtv10-20221228055018')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/jtv10-tfrs-vocab-pipe-jtv10-20221228055018?project=934903580331
PipelineJob projects/934903580331/locations/us-central1/pipelineJobs/jtv10-tfrs-vocab-pipe-jtv10-20221228055018 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/934903580331/locations/us-central1/pipelineJobs/jtv10-tfrs-vocab-pipe-jtv10-20221228055018 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/934903580331/locations/us-central1/pipelineJobs/jtv10-tfrs-vocab-pipe-jtv10-20221228055018 current state:
PipelineState.PIPELINE_STATE_RUNNING

# Local Testing (wip)

In [12]:
# data
train_dir = 'spotify-data-regimes'
train_dir_prefix = 'jtv8/train_minimum' # train_minimum | train_subset
# Not used by the listing below; pass `delimiter=delimiter` to list_blobs to
# restrict the listing to a single directory level.
delimiter = '/'

# Build gs:// URIs straight from the bucket and object names. The previous
# approach rewrote `blob.public_url`, which percent-encodes the object name and
# therefore produces wrong URIs for names containing characters such as spaces.
train_files = [
    f"gs://{train_dir}/{blob.name}"
    for blob in storage_client.list_blobs(train_dir, prefix=train_dir_prefix)
    if '.tfrecords' in blob.name
]

len(train_files)

2

In [14]:
train_files

['gs://spotify-data-regimes/jtv8/train_minimum/-00000-of-02375.tfrecords',
 'gs://spotify-data-regimes/jtv8/train_minimum/-00001-of-02375.tfrecords']

In [15]:
from two_tower_jt import two_tower as tt

def full_parse(data):
    """Wrap `data` in a `tf.data.TFRecordDataset` and return it.

    Intended for use with interleave: it takes tensors and returns a dataset.
    """
    return tf.data.TFRecordDataset(data)

# Read the raw records from every listed TFRecord file.
train_dataset = tf.data.TFRecordDataset(train_files)
# train_dataset = tf.data.Dataset.from_tensor_slices(train_files_n).prefetch(
#     tf.data.AUTOTUNE,
# )

# Parse each record with the project's parser; the adapt tests below index the
# parsed elements by feature name.
train_parsed = train_dataset.map(tt.parse_tfrecord)

2022-12-22 19:56:36.479899: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-22 19:56:37.179742: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 37629 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0


In [17]:
# train_parsed

## test layer adapts

In [20]:
MAX_PLAYLIST_LENGTH=5

### adapt test `track_name_pl`

In [18]:
start = time.time()

# track_name_pl: adapt on the feature as parsed (no reshape)
track_name_pl_text_layer = tf.keras.layers.TextVectorization()
track_name_pl_text_layer.adapt(train_parsed.map(lambda x: x['track_name_pl']))

end = time.time()

print(f'Elapsed time: {round((end - start), 2)} seconds')

Elapsed time: 101.56 seconds


In [23]:
track_name_pl_text_layer.get_vocabulary()[0:5]

['', '[UNK]', 'the', 'you', 'feat']

In [22]:
start = time.time()

# track_name_pl: same adapt, but with the feature reshaped to
# [-1, MAX_PLAYLIST_LENGTH, 1] first, to compare timing and vocabulary.
track_name_pl_text_layer_2 = tf.keras.layers.TextVectorization()
track_name_pl_text_layer_2.adapt(train_parsed.map(lambda x: tf.reshape(x['track_name_pl'], [-1, MAX_PLAYLIST_LENGTH, 1])))

end = time.time()

print(f'Elapsed time: {round((end - start), 2)} seconds')

Elapsed time: 95.15 seconds


In [24]:
track_name_pl_text_layer_2.get_vocabulary()[0:5]

['', '[UNK]', 'the', 'you', 'feat']

### adapt test `artist_name_pl`

### adapt test `album_name_pl`

### adapt test `artist_genres_pl`

In [25]:
start = time.time()

# artist_genres_pl: adapt on the feature as parsed (no reshape)
artist_genres_pl_text_layer = tf.keras.layers.TextVectorization()
artist_genres_pl_text_layer.adapt(train_parsed.map(lambda x: x['artist_genres_pl']))

end = time.time()

print(f'Elapsed time: {round((end - start), 2)} seconds')

Elapsed time: 98.09 seconds


In [26]:
artist_genres_pl_text_layer.get_vocabulary()[0:5]

['', '[UNK]', 'pop', 'rock', 'rap']

In [None]:
start = time.time()

# artist_genres_pl: same adapt, but with the feature reshaped to
# [-1, MAX_PLAYLIST_LENGTH, 1] first. This cell has no recorded output in the
# saved notebook.
artist_genres_pl_text_layer_2 = tf.keras.layers.TextVectorization()
artist_genres_pl_text_layer_2.adapt(train_parsed.map(lambda x: tf.reshape(x['artist_genres_pl'], [-1, MAX_PLAYLIST_LENGTH, 1])))

end = time.time()

print(f'Elapsed time: {round((end - start), 2)} seconds')

In [None]:
artist_genres_pl_text_layer_2.get_vocabulary()[0:5]

### adapt test `pl_name_src`

In [48]:
start = time.time()

# pl_name_src: adapt with the layer's default settings; the max_tokens / ngrams
# options noted at the end of the next line are not applied in this test.
pl_name_src_text_layer = tf.keras.layers.TextVectorization() # max_tokens=MAX_TOKENS,ngrams=2,
pl_name_src_text_layer.adapt(train_parsed.map(lambda x: x['pl_name_src']))

end = time.time()

print(f'Elapsed time: {round((end - start), 2)} seconds')

Elapsed time: 695.44 seconds


In [49]:
# Sanity check: run the adapted layer on the first `pl_name_src` value (as a
# batch of one) and print the resulting token ids.
for row in train_parsed.batch(1).map(lambda x: x["pl_name_src"]).take(1):
  print(pl_name_src_text_layer(row))

tf.Tensor([[723]], shape=(1, 1), dtype=int64)


In [51]:
pl_name_src_text_layer.get_vocabulary()[0:5]

['', '[UNK]', 'country', 'music', 'rock']

In [81]:
# Local file name and GCS object path for the test vocab pickle.
BUCKET = 'spotify-data-regimes'
VOCAB_LOCAL_FILE = 'v3_test_pl_dict.pkl'
VOCAB_GCS_OBJ = f'jtv8/test_vocabs/{VOCAB_LOCAL_FILE}'

print(f"VOCAB_LOCAL_FILE {VOCAB_LOCAL_FILE}")
print(f"VOCAB_GCS_OBJ {VOCAB_GCS_OBJ}")

VOCAB_LOCAL_FILE v3_test_pl_dict.pkl
VOCAB_GCS_OBJ jtv8/test_vocabs/v3_test_pl_dict.pkl


In [82]:
# Only the first five vocabulary entries are kept: this is a small test payload.
test_pl_dict = {
    'v3_pl_name_src' : pl_name_src_text_layer.get_vocabulary()[0:5],
}

# `with` closes the file even if pickling fails, and guarantees the data is
# flushed to disk before the upload below reads it.
with open(VOCAB_LOCAL_FILE, 'wb') as filehandler:
    pkl.dump(test_pl_dict, filehandler)

tt.upload_blob(BUCKET, VOCAB_LOCAL_FILE, VOCAB_GCS_OBJ)

File v3_test_pl_dict.pkl uploaded to jtv8/test_vocabs/v3_test_pl_dict.pkl.


In [83]:
# filehandler = open(f'{VOCAB_LOCAL_FILE}', 'rb')
# vocab_dict_load = pkl.load(filehandler)
# filehandler.close()
# vocab_dict_load

# https://storage.cloud.google.com/spotify-data-regimes/jtv8/test_vocabs/test_pl_dict.pkl

# Download the uploaded pickle into a local file so it can be re-loaded below.
v3_vocab_gcs_uri = f"gs://{BUCKET}/{VOCAB_GCS_OBJ}"
with open("v3_new_vocab_pre_load", 'wb') as local_vocab_jt3:
    storage_client.download_blob_to_file(v3_vocab_gcs_uri, local_vocab_jt3)
# local_vocab_jt1 --> <_io.BufferedWriter name='new_vocab_pre_load'>

In [84]:
# Load the vocab dict back from the local copy written by the download cell.
# NOTE: unpickling can execute arbitrary code; only load pickles from trusted buckets.
with open("v3_new_vocab_pre_load", 'rb') as pickle_file_v3:
    loaded_vocab_dict_v3 = pkl.load(pickle_file_v3)
    
# loaded_vocab_dict
# loaded_vocab_dict_v2

In [89]:
loaded_vocab_dict_v1

{'pl_name_src': ['', '[UNK]', 'country', 'music', 'rock']}

In [91]:
def merge_dicts(dict_list):
    """Merge a list of dicts into its first element, in place.

    Each later dict is applied to the first with `dict.update`, so on duplicate
    keys the value from the dict that appears last in `dict_list` wins.

    Args:
        dict_list: Sequence of dicts. The first element is mutated; the others
            are left untouched. An empty sequence is a no-op.

    Returns:
        None. The merged result is the (mutated) first element of `dict_list`.
    """
    # Guard the empty case, which previously raised IndexError on dict_list[0].
    if not dict_list:
        print("no dicts to merge")
        return None

    merged = dict_list[0]
    for vocab_dict in dict_list[1:]:
        merged.update(vocab_dict)

    print("dicts updated")
    return None

merge_dicts(dict_list=[loaded_vocab_dict_v1, loaded_vocab_dict_v2, loaded_vocab_dict_v3])

loaded_vocab_dict_v1

dicts updated


{'pl_name_src': ['', '[UNK]', 'country', 'music', 'rock'],
 'v2_pl_name_src': ['', '[UNK]', 'country', 'music', 'rock'],
 'v3_pl_name_src': ['', '[UNK]', 'country', 'music', 'rock']}

In [94]:
# NOTE: this rebinds `vocab_path_prefix`, which the pipeline-args cell also sets.
vocab_path_prefix = 'jtv8/test_vocabs/'

# Build gs:// URIs from the bucket and object names. Rewriting `blob.public_url`
# would yield percent-encoded object names.
vocab_dict_uris = [
    f"gs://{BUCKET}/{blob.name}"
    for blob in storage_client.list_blobs(BUCKET, prefix=vocab_path_prefix, delimiter="/")
]

print(f"vocab_dict_uris: {vocab_dict_uris}")
print(f"count of vocab_dict_uris: {len(vocab_dict_uris)}")

# skip folder path prefix
# Filter on the trailing "/" instead of dropping the first entry by position:
# when no folder placeholder object is listed, `[1:]` would drop a real vocab file.
vocab_dict_uris_removed = [uri for uri in vocab_dict_uris if not uri.endswith('/')]
print(f"count of vocab_dict_uris_removed: {len(vocab_dict_uris_removed)}")
print(f"vocab_dict_uris_removed: {vocab_dict_uris_removed}")

loaded_pickle_list = []
for i, pickled_dict in enumerate(vocab_dict_uris_removed):
    local_path = f"v{i}_vocab_pre_load"

    # Download to a local file first, then unpickle from that file.
    with open(local_path, 'wb') as local_vocab_file:
        storage_client.download_blob_to_file(pickled_dict, local_vocab_file)

    # NOTE: unpickling can execute arbitrary code; only load from trusted buckets.
    with open(local_path, 'rb') as pickle_file:
        loaded_vocab_dict = pkl.load(pickle_file)

    loaded_pickle_list.append(loaded_vocab_dict)

loaded_pickle_list

vocab_dict_uris: ['gs://spotify-data-regimes/jtv8/test_vocabs/', 'gs://spotify-data-regimes/jtv8/test_vocabs/test_pl_dict.pkl', 'gs://spotify-data-regimes/jtv8/test_vocabs/v2_test_pl_dict.pkl', 'gs://spotify-data-regimes/jtv8/test_vocabs/v3_test_pl_dict.pkl']
count of vocab_dict_uris: 4
count of vocab_dict_uris_removed: 3
vocab_dict_uris_removed: ['gs://spotify-data-regimes/jtv8/test_vocabs/test_pl_dict.pkl', 'gs://spotify-data-regimes/jtv8/test_vocabs/v2_test_pl_dict.pkl', 'gs://spotify-data-regimes/jtv8/test_vocabs/v3_test_pl_dict.pkl']


[{'pl_name_src': ['', '[UNK]', 'country', 'music', 'rock']},
 {'v2_pl_name_src': ['', '[UNK]', 'country', 'music', 'rock']},
 {'v3_pl_name_src': ['', '[UNK]', 'country', 'music', 'rock']}]

In [96]:
# Flatten the per-file vocab dicts into one mapping; when a feature name occurs
# in more than one dict, the entry from the later dict replaces the earlier one.
master_dict = {
    feature_name: vocab
    for vocab_dict in loaded_pickle_list
    for feature_name, vocab in vocab_dict.items()
}

master_dict

{'pl_name_src': ['', '[UNK]', 'country', 'music', 'rock'],
 'v2_pl_name_src': ['', '[UNK]', 'country', 'music', 'rock'],
 'v3_pl_name_src': ['', '[UNK]', 'country', 'music', 'rock']}

In [97]:
master_dict['pl_name_src']

['', '[UNK]', 'country', 'music', 'rock']

## feature data format

In [None]:
from pprint import pprint

# Single definition of the feature list. Previously the same literal was written
# out twice, and a trailing comma after the closing bracket turned
# `feature_list_` into a one-element tuple wrapping the list.
feature_list_ = [
    {"name": "pl_name_src", "feat_type": "fixed_length"},
    {"name": "track_name_pl", "feat_type": "ragged"},
    {"name": "artist_name_pl", "feat_type": "ragged"},
    {"name": "album_name_pl", "feat_type": "ragged"},
    {"name": "artist_genres_pl", "feat_type": "ragged"},
    {"name": "track_name_can", "feat_type": "fixed_length"},
    {"name": "artist_name_can", "feat_type": "fixed_length"},
    {"name": "album_name_can", "feat_type": "fixed_length"},
    {"name": "artist_genres_can", "feat_type": "fixed_length"},
]

# JSON-encoded form of the same list (e.g. for passing it as one string value).
feature_list_string = json.dumps(feature_list_)

pprint(feature_list_)

### jw code

In [None]:
MAX_PLAYLIST_LENGTH = 5 # this is set upstream by the BigQuery max length


def _feature_selector(feature_name, per_playlist):
    """Return the map function that extracts `feature_name` from a dataset element.

    When `per_playlist` is true the feature is also reshaped to
    [-1, MAX_PLAYLIST_LENGTH, 1] before being handed to the layer.
    """
    if per_playlist:
        return lambda x: tf.reshape(x[feature_name], [-1, MAX_PLAYLIST_LENGTH, 1])
    return lambda x: x[feature_name]


# (tower attribute on `model`, layer index in that tower, feature name, reshape?)
adapt_plan = [
    ('query_tower', 0, 'pl_name_src', False),
    ('query_tower', 7, 'track_name_pl', True),
    ('query_tower', 9, 'artist_name_pl', True),
    ('query_tower', 11, 'album_name_pl', True),
    ('query_tower', 12, 'artist_genres_pl', True),
    # ('query_tower', 13, 'tracks_playlist_titles_pl', True),
    ('candidate_tower', 1, 'track_name_can', False),
    ('candidate_tower', 3, 'artist_name_can', False),
    ('candidate_tower', 5, 'album_name_can', False),
    ('candidate_tower', 9, 'artist_genres_can', False),
    # ('candidate_tower', 11, 'track_pl_titles_can', False),
]

for tower_name, layer_index, feature_name, per_playlist in adapt_plan:
    # The first sub-layer of each indexed tower layer is the one that gets adapted.
    text_layer = getattr(model, tower_name).layers[layer_index].layers[0]
    text_layer.adapt(
        train_dataset.unbatch().batch(10000).map(
            _feature_selector(feature_name, per_playlist)
        )
    )
    print(f'{feature_name} adapts complete')

In [None]:
# Collect the vocabulary of each adapted text layer, keyed by feature name.
# NOTE(review): the adapt calls for `tracks_playlist_titles_pl` (query layer 13)
# and `track_pl_titles_can` (candidate layer 11) are commented out in the cell
# above; confirm those two layers are adapted elsewhere before relying on
# their entries here.
vocab_dict = {
    'pl_name_src' : model.query_tower.layers[0].layers[0].get_vocabulary(),
    'track_name_pl' : model.query_tower.layers[7].layers[0].get_vocabulary(),
    'artist_name_pl' : model.query_tower.layers[9].layers[0].get_vocabulary(),
    'album_name_pl' : model.query_tower.layers[11].layers[0].get_vocabulary(),
    'artist_genres_pl' : model.query_tower.layers[12].layers[0].get_vocabulary(),
    'tracks_playlist_titles_pl' : model.query_tower.layers[13].layers[0].get_vocabulary(),

    'track_name_can' : model.candidate_tower.layers[1].layers[0].get_vocabulary(),
    'artist_name_can' : model.candidate_tower.layers[3].layers[0].get_vocabulary(),
    'album_name_can' : model.candidate_tower.layers[5].layers[0].get_vocabulary(),
    'artist_genres_can' : model.candidate_tower.layers[9].layers[0].get_vocabulary(),
    'track_pl_titles_can' : model.candidate_tower.layers[11].layers[0].get_vocabulary(),
}

# `pkl` is already imported in the notebook's import cell, so it is not
# re-imported here. `with` closes the file even if pickling fails and ensures
# the data is on disk before the upload below reads it.
with open('vocab_dict.pkl', 'wb') as filehandler:
    pkl.dump(vocab_dict, filehandler)

tt.upload_blob('two-tower-models', 'vocab_dict.pkl', 'vocabs/vocab_dict.pkl')