Component: adapt vocab for fixed-length text features (`adapt_fixed_text_layer_vocab`)

In [None]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}/adapt_fixed_text_layer_vocab.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)

@kfp.v2.dsl.component(
    base_image="python:3.9",
    packages_to_install=[
        # 'google-cloud-aiplatform==1.18.1',
        'google-cloud-storage',
        'tensorflow==2.10.1',
    ],
)
def adapt_fixed_text_layer_vocab(
    project: str,
    location: str,
    version: str,
    data_dir_bucket_name: str,
    data_dir_path_prefix: str,
    train_output_gcs_bucket: str,
    experiment_name: str,
    experiment_run: str,
    max_playlist_length: int,
    max_tokens: int,
    ngrams: int,
    feature_name: str,
    generate_new_vocab: bool,
    # feat_type: str,
) -> NamedTuple('Outputs', [
    ('vocab_gcs_uri', str),
    # ('feature_name', str),
]):

    """
    Custom pipeline component that adapts a `TextVectorization` layer for a
    single text feature (`feature_name`) and stages its vocabulary in GCS.

    The candidate and query feature specs are loaded from
    `gs://{train_output_gcs_bucket}/{experiment_name}/{experiment_run}/features/`
    and used to parse the TFRecords found under
    `gs://{data_dir_bucket_name}/{data_dir_path_prefix}`.

    When `generate_new_vocab` is true, the vocabulary is written as a pickled
    `{feature_name: [tokens...]}` dict to
    `{experiment_name}/{experiment_run}/vocab-staging/{feature_name}_vocab_dict.pkl`
    in `train_output_gcs_bucket`; otherwise a fixed, pre-existing vocab URI is
    returned and no adaptation is performed.

    `location`, `version` are accepted for interface parity but not used here;
    `max_playlist_length` is only logged.

    Returns:
        A 1-tuple `(vocab_gcs_uri,)`.

    Raises:
        ValueError: if `generate_new_vocab` is true and no `.tfrecords`
            objects are found under the data prefix.
    """

    # Imports live inside the function body because a KFP lightweight
    # component must be self-contained.
    import logging
    import pickle as pkl
    import time

    from google.cloud import storage

    import tensorflow as tf

    storage_client = storage.Client(project=project)

    logging.info(f"feature_name: {feature_name}")

    # ===================================================
    # helper function
    # ===================================================

    def download_pickled_dict(bucket_name, source_gcs_obj, local_filename):
        """Download a pickled object from GCS to `local_filename` and unpickle it."""
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(source_gcs_obj)
        blob.download_to_filename(local_filename)

        # NOTE(security): pickle can execute arbitrary code on load; this
        # must only ever read objects from a trusted bucket.
        with open(local_filename, 'rb') as filehandler:
            loaded_dict = pkl.load(filehandler)

        logging.info(f"File {local_filename} downloaded from gs://{bucket_name}/{source_gcs_obj}")

        return loaded_dict

    # ===================================================
    # set feature vars
    # ===================================================
    logging.info(f"MAX_PLAYLIST_LENGTH: {max_playlist_length}")

    FEATURES_PREFIX = f'{experiment_name}/{experiment_run}/features'
    logging.info(f"FEATURES_PREFIX: {FEATURES_PREFIX}")

    all_features_dict = {}

    # ===================================================
    # load pickled Candidate features
    # ===================================================
    CAND_FEAT_FILENAME = 'candidate_feats_dict.pkl'
    loaded_candidate_features_dict = download_pickled_dict(
        train_output_gcs_bucket,
        f'{FEATURES_PREFIX}/{CAND_FEAT_FILENAME}',
        f'loaded_{CAND_FEAT_FILENAME}',
    )

    all_features_dict.update(loaded_candidate_features_dict)
    logging.info(f"all_features_dict: {all_features_dict}")

    # ===================================================
    # load pickled Query features
    # ===================================================
    QUERY_FEAT_FILENAME = 'query_feats_dict.pkl'
    loaded_query_features_dict = download_pickled_dict(
        train_output_gcs_bucket,
        f'{FEATURES_PREFIX}/{QUERY_FEAT_FILENAME}',
        f'loaded_{QUERY_FEAT_FILENAME}',
    )

    all_features_dict.update(loaded_query_features_dict)
    logging.info(f"all_features_dict: {all_features_dict}")

    # ===================================================
    # tfrecord parser
    # ===================================================

    def parse_tfrecord(example):
        """Parse a serialized `tf.train.Example` using the combined feature spec."""
        return tf.io.parse_example(
            example,
            features=all_features_dict
        )

    if generate_new_vocab:
        logging.info("Generating new vocab file...")

        # List TFRecord objects. The gs:// URI is built from the raw object
        # name: `blob.public_url` percent-encodes the name, which yields a
        # wrong path for names containing characters such as spaces or '='.
        train_files = [
            f'gs://{data_dir_bucket_name}/{blob.name}'
            for blob in storage_client.list_blobs(
                data_dir_bucket_name, prefix=data_dir_path_prefix
            )
            if '.tfrecords' in blob.name
        ]

        logging.info(f"TFRecord file count: {len(train_files)}")

        # Fail fast: adapting on zero records would stage a meaningless vocab
        # that downstream (cached) steps would then pick up.
        if not train_files:
            raise ValueError(
                f"No '.tfrecords' objects found under "
                f"gs://{data_dir_bucket_name}/{data_dir_path_prefix}"
            )

        # ===================================================
        # create TF dataset
        # ===================================================
        logging.info("Creating TFRecordDataset...")
        train_dataset = tf.data.TFRecordDataset(train_files)
        train_parsed = train_dataset.map(parse_tfrecord)

        # ===================================================
        # adapt layer for feature
        # ===================================================
        start = time.time()
        text_layer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens,
            ngrams=ngrams
        )
        text_layer.adapt(train_parsed.map(lambda x: x[feature_name]))
        end = time.time()

        logging.info(f'Layer adapt elapsed time: {round((end - start), 2)} seconds')

        # ===================================================
        # write vocab to pickled dict --> gcs
        # ===================================================
        logging.info("Writting pickled dict to GCS...")

        VOCAB_LOCAL_FILE = f'{feature_name}_vocab_dict.pkl'
        # destination folder prefix and blob name
        VOCAB_GCS_OBJ = f'{experiment_name}/{experiment_run}/vocab-staging/{VOCAB_LOCAL_FILE}'
        VOCAB_DICT = {feature_name: text_layer.get_vocabulary()}

        logging.info(f"VOCAB_LOCAL_FILE: {VOCAB_LOCAL_FILE}")
        logging.info(f"VOCAB_GCS_OBJ: {VOCAB_GCS_OBJ}")

        # pickle (context manager guarantees the file is flushed and closed
        # before the upload reads it)
        with open(VOCAB_LOCAL_FILE, 'wb') as filehandler:
            pkl.dump(VOCAB_DICT, filehandler)

        # upload to GCS
        bucket_client = storage_client.bucket(train_output_gcs_bucket)
        blob = bucket_client.blob(VOCAB_GCS_OBJ)
        blob.upload_from_filename(VOCAB_LOCAL_FILE)

        vocab_uri = f'gs://{train_output_gcs_bucket}/{VOCAB_GCS_OBJ}'

        logging.info(f"File {VOCAB_LOCAL_FILE} uploaded to {vocab_uri}")

    else:
        logging.info("Using existing vocab file...")

        # Fixed location of a previously generated master vocab.
        vocab_uri = 'gs://two-tower-models/vocabs/vocab_dict.pkl'
        logging.info(f"Using vocab file: {vocab_uri}")

    return (
        vocab_uri,
        # feature_name,
    )

Component: adapt vocab for ragged (playlist-sequence) text features (`adapt_ragged_text_layer_vocab`)


In [None]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}/adapt_ragged_text_layer_vocab.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)

@kfp.v2.dsl.component(
    base_image="python:3.9",
    packages_to_install=[
        # 'google-cloud-aiplatform==1.18.1',
        'google-cloud-storage',
        'tensorflow==2.10.1',
    ],
)
def adapt_ragged_text_layer_vocab(
    project: str,
    location: str,
    version: str,
    data_dir_bucket_name: str,
    data_dir_path_prefix: str,
    train_output_gcs_bucket: str,
    experiment_name: str,
    experiment_run: str,
    max_playlist_length: int,
    max_tokens: int,
    ngrams: int,
    feature_name: str,
    generate_new_vocab: bool,
    # feat_type: str,
) -> NamedTuple('Outputs', [
    ('vocab_gcs_uri', str),
    # ('feature_name', str),
]):

    """
    Custom pipeline component that adapts a `TextVectorization` layer for a
    single sequence text feature (`feature_name`) and stages its vocabulary
    in GCS.

    Identical in flow to the fixed-text component, except that each parsed
    record's feature tensor is reshaped to `[-1, max_playlist_length, 1]`
    before being fed to `adapt`.

    The candidate and query feature specs are loaded from
    `gs://{train_output_gcs_bucket}/{experiment_name}/{experiment_run}/features/`
    and used to parse the TFRecords found under
    `gs://{data_dir_bucket_name}/{data_dir_path_prefix}`.

    When `generate_new_vocab` is true, the vocabulary is written as a pickled
    `{feature_name: [tokens...]}` dict to
    `{experiment_name}/{experiment_run}/vocab-staging/{feature_name}_vocab_dict.pkl`
    in `train_output_gcs_bucket`; otherwise a fixed, pre-existing vocab URI is
    returned and no adaptation is performed.

    `location` and `version` are accepted for interface parity but not used.

    Returns:
        A 1-tuple `(vocab_gcs_uri,)`.

    Raises:
        ValueError: if `generate_new_vocab` is true and no `.tfrecords`
            objects are found under the data prefix.
    """

    # Imports live inside the function body because a KFP lightweight
    # component must be self-contained.
    import logging
    import pickle as pkl
    import time

    from google.cloud import storage

    import tensorflow as tf

    storage_client = storage.Client(project=project)

    logging.info(f"feature_name: {feature_name}")
    # logging.info(f"feat_type: {feat_type}")

    # ===================================================
    # helper function
    # ===================================================

    def download_pickled_dict(bucket_name, source_gcs_obj, local_filename):
        """Download a pickled object from GCS to `local_filename` and unpickle it."""
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(source_gcs_obj)
        blob.download_to_filename(local_filename)

        # NOTE(security): pickle can execute arbitrary code on load; this
        # must only ever read objects from a trusted bucket.
        with open(local_filename, 'rb') as filehandler:
            loaded_dict = pkl.load(filehandler)

        logging.info(f"File {local_filename} downloaded from gs://{bucket_name}/{source_gcs_obj}")

        return loaded_dict

    # ===================================================
    # set feature vars
    # ===================================================
    MAX_PLAYLIST_LENGTH = max_playlist_length
    logging.info(f"MAX_PLAYLIST_LENGTH: {MAX_PLAYLIST_LENGTH}")

    FEATURES_PREFIX = f'{experiment_name}/{experiment_run}/features'
    logging.info(f"FEATURES_PREFIX: {FEATURES_PREFIX}")

    all_features_dict = {}

    # ===================================================
    # load pickled Candidate features
    # ===================================================
    CAND_FEAT_FILENAME = 'candidate_feats_dict.pkl'
    CAND_FEAT_GCS_OBJ = f'{FEATURES_PREFIX}/{CAND_FEAT_FILENAME}'
    LOADED_CANDIDATE_DICT = f'loaded_{CAND_FEAT_FILENAME}'
    logging.info(f"CAND_FEAT_FILENAME: {CAND_FEAT_FILENAME}; CAND_FEAT_GCS_OBJ:{CAND_FEAT_GCS_OBJ}; LOADED_CANDIDATE_DICT: {LOADED_CANDIDATE_DICT}")

    loaded_candidate_features_dict = download_pickled_dict(
        train_output_gcs_bucket,
        CAND_FEAT_GCS_OBJ,
        LOADED_CANDIDATE_DICT,
    )
    logging.info(f"loaded_candidate_features_dict: {loaded_candidate_features_dict}")

    all_features_dict.update(loaded_candidate_features_dict)
    logging.info(f"all_features_dict: {all_features_dict}")

    # ===================================================
    # load pickled Query features
    # ===================================================
    QUERY_FEAT_FILENAME = 'query_feats_dict.pkl'
    QUERY_FEAT_GCS_OBJ = f'{FEATURES_PREFIX}/{QUERY_FEAT_FILENAME}'
    LOADED_QUERY_DICT = f'loaded_{QUERY_FEAT_FILENAME}'
    logging.info(f"QUERY_FEAT_FILENAME: {QUERY_FEAT_FILENAME}; QUERY_FEAT_GCS_OBJ:{QUERY_FEAT_GCS_OBJ}; LOADED_QUERY_DICT: {LOADED_QUERY_DICT}")

    loaded_query_features_dict = download_pickled_dict(
        train_output_gcs_bucket,
        QUERY_FEAT_GCS_OBJ,
        LOADED_QUERY_DICT,
    )
    logging.info(f"loaded_query_features_dict: {loaded_query_features_dict}")

    all_features_dict.update(loaded_query_features_dict)
    logging.info(f"all_features_dict: {all_features_dict}")

    # ===================================================
    # tfrecord parser
    # ===================================================

    def parse_tfrecord(example):
        """Parse a serialized `tf.train.Example` using the combined feature spec."""
        return tf.io.parse_example(
            example,
            features=all_features_dict
        )

    if generate_new_vocab:
        logging.info("Generating new vocab file...")

        # List TFRecord objects. The gs:// URI is built from the raw object
        # name: `blob.public_url` percent-encodes the name, which yields a
        # wrong path for names containing characters such as spaces or '='.
        train_files = [
            f'gs://{data_dir_bucket_name}/{blob.name}'
            for blob in storage_client.list_blobs(
                data_dir_bucket_name, prefix=data_dir_path_prefix
            )
            if '.tfrecords' in blob.name
        ]

        logging.info(f"TFRecord file count: {len(train_files)}")

        # Fail fast: adapting on zero records would stage a meaningless vocab
        # that downstream (cached) steps would then pick up.
        if not train_files:
            raise ValueError(
                f"No '.tfrecords' objects found under "
                f"gs://{data_dir_bucket_name}/{data_dir_path_prefix}"
            )

        # ===================================================
        # create TF dataset
        # ===================================================
        logging.info("Creating TFRecordDataset...")
        train_dataset = tf.data.TFRecordDataset(train_files)
        train_parsed = train_dataset.map(parse_tfrecord)

        # ===================================================
        # adapt layer for feature
        # ===================================================
        start = time.time()
        text_layer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens,
            ngrams=ngrams
        )
        # Reshape each record's feature to [-1, MAX_PLAYLIST_LENGTH, 1]
        # before handing it to adapt().
        text_layer.adapt(
            train_parsed.map(
                lambda x: tf.reshape(x[feature_name], [-1, MAX_PLAYLIST_LENGTH, 1])
            )
        )
        end = time.time()

        logging.info(f'Layer adapt elapsed time: {round((end - start), 2)} seconds')

        # ===================================================
        # write vocab to pickled dict --> gcs
        # ===================================================
        logging.info("Writting pickled dict to GCS...")

        VOCAB_LOCAL_FILE = f'{feature_name}_vocab_dict.pkl'
        # destination folder prefix and blob name
        VOCAB_GCS_OBJ = f'{experiment_name}/{experiment_run}/vocab-staging/{VOCAB_LOCAL_FILE}'
        VOCAB_DICT = {feature_name: text_layer.get_vocabulary()}

        logging.info(f"VOCAB_LOCAL_FILE: {VOCAB_LOCAL_FILE}")
        logging.info(f"VOCAB_GCS_OBJ: {VOCAB_GCS_OBJ}")

        # pickle (context manager guarantees the file is flushed and closed
        # before the upload reads it)
        with open(VOCAB_LOCAL_FILE, 'wb') as filehandler:
            pkl.dump(VOCAB_DICT, filehandler)

        # upload to GCS
        bucket_client = storage_client.bucket(train_output_gcs_bucket)
        blob = bucket_client.blob(VOCAB_GCS_OBJ)
        blob.upload_from_filename(VOCAB_LOCAL_FILE)

        vocab_uri = f'gs://{train_output_gcs_bucket}/{VOCAB_GCS_OBJ}'

        logging.info(f"File {VOCAB_LOCAL_FILE} uploaded to {vocab_uri}")

    else:
        logging.info("Using existing vocab files...")
        # Fixed location of a previously generated master vocab.
        vocab_uri = 'gs://two-tower-models/vocabs/vocab_dict.pkl'
        logging.info(f"Using vocab file: {vocab_uri}")

    return (
        vocab_uri,
        # feature_name,
    )

In [None]:
Component: create master vocab (merge the per-feature vocab dicts into one) (`create_master_vocab`)


In [None]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/{PIPELINES_SUB_DIR}/create_master_vocab.py

import kfp
from typing import Any, Callable, Dict, NamedTuple, Optional, List
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component, Metrics)

@kfp.v2.dsl.component(
    base_image="python:3.9",
    packages_to_install=[
        # 'google-cloud-aiplatform==1.18.1',
        'google-cloud-storage',
        'numpy',
        # 'tensorflow==2.8.3',
    ],
)
def create_master_vocab(
    project: str,
    location: str,
    version: str,
    train_output_gcs_bucket: str,
    experiment_name: str,
    experiment_run: str,
    vocab_uri_1: str,
    vocab_uri_2: str,
    vocab_uri_3: str,
    vocab_uri_4: str,
    vocab_uri_5: str,
    vocab_uri_6: str,
    vocab_uri_7: str,
    vocab_uri_8: str,
    vocab_uri_9: str,
    generate_new_vocab: bool,
) -> NamedTuple('Outputs', [
    ('master_vocab_gcs_uri', str),
    ('experiment_name', str),
    ('experiment_run', str),
]):

    """
    Combine the per-layer vocabulary dictionaries into one master dictionary.

    When `generate_new_vocab` is true, each of the nine `vocab_uri_*` pickles
    is downloaded and unpickled, the dicts are merged in argument order
    (later entries win on duplicate keys, with a warning), and the result is
    uploaded to
    `gs://{train_output_gcs_bucket}/{experiment_name}/{experiment_run}/vocab_dict.pkl`.
    Otherwise a fixed, pre-existing master vocab URI is returned and nothing
    is downloaded or uploaded.

    `location` and `version` are accepted for interface parity but not used.

    Returns:
        A tuple `(master_vocab_gcs_uri, experiment_name, experiment_run)`;
        the last two are the inputs passed through unchanged.
    """

    # Imports live inside the function body because a KFP lightweight
    # component must be self-contained.
    import logging
    import pickle as pkl

    from google.cloud import storage

    # Use the explicitly supplied project, as the sibling vocab components do,
    # instead of relying on the environment's default project.
    storage_client = storage.Client(project=project)

    if generate_new_vocab:

        logging.info("Generating new vocab master file...")
        # ===================================================
        # Create list of all layer vocab dict uris
        # ===================================================

        vocab_dict_uris = [
            vocab_uri_1, vocab_uri_2,
            vocab_uri_3, vocab_uri_4,
            vocab_uri_5, vocab_uri_6,
            vocab_uri_7, vocab_uri_8,
            vocab_uri_9,
        ]
        logging.info(f"count of vocab_dict_uris: {len(vocab_dict_uris)}")
        logging.info(f"vocab_dict_uris: {vocab_dict_uris}")

        # ===================================================
        # load pickled dicts and merge into master vocab dict
        # ===================================================
        master_dict = {}
        for i, vocab_uri in enumerate(vocab_dict_uris):
            local_path = f"v{i}_vocab_pre_load"

            with open(local_path, 'wb') as local_vocab_file:
                storage_client.download_blob_to_file(vocab_uri, local_vocab_file)

            # NOTE(security): pickle can execute arbitrary code on load; the
            # URIs must only ever point at trusted objects.
            with open(local_path, 'rb') as pickle_file:
                loaded_vocab_dict = pkl.load(pickle_file)

            # dict.update silently overwrites; surface collisions so a
            # mis-wired pipeline (same feature adapted twice) is visible.
            duplicate_keys = sorted(set(master_dict) & set(loaded_vocab_dict))
            if duplicate_keys:
                logging.warning(
                    f"Vocab keys {duplicate_keys} from {vocab_uri} overwrite earlier entries"
                )

            master_dict.update(loaded_vocab_dict)

        # ===================================================
        # Upload master to GCS
        # ===================================================
        MASTER_VOCAB_LOCAL_FILE = 'vocab_dict.pkl'
        # destination folder prefix and blob name
        MASTER_VOCAB_GCS_OBJ = f'{experiment_name}/{experiment_run}/{MASTER_VOCAB_LOCAL_FILE}'

        # pickle (context manager guarantees the file is flushed and closed
        # before the upload reads it)
        with open(MASTER_VOCAB_LOCAL_FILE, 'wb') as filehandler:
            pkl.dump(master_dict, filehandler)

        # upload to GCS
        bucket_client = storage_client.bucket(train_output_gcs_bucket)
        blob = bucket_client.blob(MASTER_VOCAB_GCS_OBJ)
        blob.upload_from_filename(MASTER_VOCAB_LOCAL_FILE)

        master_vocab_uri = f'gs://{train_output_gcs_bucket}/{MASTER_VOCAB_GCS_OBJ}'

        logging.info(f"File {MASTER_VOCAB_LOCAL_FILE} uploaded to {master_vocab_uri}")

    else:
        logging.info("Using existing vocab file...")
        # Fixed location of a previously generated master vocab.
        master_vocab_uri = 'gs://two-tower-models/vocabs/vocab_dict.pkl'
        logging.info(f"Using vocab file: {master_vocab_uri}")

    return (
        master_vocab_uri,
        experiment_name,
        experiment_run
    )

## pipeline

In [None]:
from src.train_pipes import build_custom_image, train_custom_model, create_tensorboard, generate_candidates, \
                            create_ann_index, create_brute_force_index, create_ann_index_endpoint_vpc, \
                            create_brute_index_endpoint_vpc, deploy_ann_index, deploy_brute_index, \
                            adapt_ragged_text_layer_vocab, adapt_fixed_text_layer_vocab, create_master_vocab, \
                            test_model_index_endpoint_v5

from src.train_pipes import pipeline_config as cfg

@kfp.v2.dsl.pipeline(
    name=f'{PIPELINE_NAME}'.replace('_', '-')
)
def pipeline(
    project: str,
    project_number: str,
    location: str,
    service_account: str,
    model_version: str,
    pipeline_version: str,
    train_image_uri: str,
    train_output_gcs_bucket: str,
    gcs_train_script_path: str,
    model_display_name: str,
    train_dockerfile_name: str,
    train_dir: str,
    train_dir_prefix: str,
    valid_dir: str,
    valid_dir_prefix: str,
    candidate_file_dir: str,
    candidate_files_prefix: str,
    # tensorboard_resource_name: str,
    experiment_name: str,
    experiment_run: str,
    register_model_flag: str,
    vpc_network_name: str,
    generate_new_vocab: bool,
    max_playlist_length: int,
    max_tokens: int,
    ngrams: int,
):
    
    from kfp.v2.components import importer_node
    from google_cloud_pipeline_components.types import artifact_types
    
    # ========================================================================
    # Build Custom Train Image
    # ========================================================================
    
    # build_custom_train_image_op = (
    #     build_custom_train_image.build_custom_train_image(
    #         project=project,
    #         gcs_train_script_path=gcs_train_script_path,
    #         training_image_uri=train_image_uri,
    #         train_dockerfile_name=train_dockerfile_name,
    #     )
    #     .set_display_name("Build custom train image")
    #     .set_caching_options(False)
    # )
    
    # ========================================================================
    # Conditional: Upload models to Vertex model registry
    # ========================================================================
    # with kfp.v2.dsl.Condition(generate_new_vocab == 'True', name="Generate New Vocab"):
        
        # here
    # pl_name_src
    adapt_pl_name_src_op = (
        adapt_fixed_text_layer_vocab.adapt_fixed_text_layer_vocab(
            project=project,
            location=location,
            version=model_version,
            data_dir_bucket_name=train_dir,
            data_dir_path_prefix=train_dir_prefix,
            train_output_gcs_bucket=train_output_gcs_bucket,
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            max_playlist_length=max_playlist_length,
            max_tokens=max_tokens,
            ngrams=ngrams,
            feature_name='pl_name_src',
            generate_new_vocab=generate_new_vocab,
        )
        .set_display_name(f"adapt: pl_name_src")
        .set_caching_options(True)
        .set_cpu_limit('96')
        .set_memory_limit('624G')
    )
    # track_name_can
    adapt_track_name_can_op = (
        adapt_fixed_text_layer_vocab.adapt_fixed_text_layer_vocab(
            project=project,
            location=location,
            version=model_version,
            data_dir_bucket_name=train_dir,
            data_dir_path_prefix=train_dir_prefix,
            train_output_gcs_bucket=train_output_gcs_bucket,
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            max_playlist_length=max_playlist_length,
            max_tokens=max_tokens,
            ngrams=ngrams,
            feature_name='track_name_can',
            generate_new_vocab=generate_new_vocab,
        )
        .set_display_name(f"adapt: track_name_can")
        .set_caching_options(True)
        .set_cpu_limit(cfg.CPU_LIMIT)
        .set_memory_limit(cfg.MEMORY_LIMIT)
    )
    # artist_name_can
    adapt_artist_name_can_op = (
        adapt_fixed_text_layer_vocab.adapt_fixed_text_layer_vocab(
            project=project,
            location=location,
            version=model_version,
            data_dir_bucket_name=train_dir,
            data_dir_path_prefix=train_dir_prefix,
            train_output_gcs_bucket=train_output_gcs_bucket,
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            max_playlist_length=max_playlist_length,
            max_tokens=max_tokens,
            ngrams=ngrams,
            feature_name='artist_name_can',
            generate_new_vocab=generate_new_vocab,
        )
        .set_display_name(f"adapt: artist_name_can")
        .set_caching_options(True)
        .set_cpu_limit(cfg.CPU_LIMIT)
        .set_memory_limit(cfg.MEMORY_LIMIT)
    )

    # album_name_can
    adapt_album_name_can_op = (
        adapt_fixed_text_layer_vocab.adapt_fixed_text_layer_vocab(
            project=project,
            location=location,
            version=model_version,
            data_dir_bucket_name=train_dir,
            data_dir_path_prefix=train_dir_prefix,
            train_output_gcs_bucket=train_output_gcs_bucket,
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            max_playlist_length=max_playlist_length,
            max_tokens=max_tokens,
            ngrams=ngrams,
            feature_name='album_name_can',
            generate_new_vocab=generate_new_vocab,
        )
        .set_display_name(f"adapt: album_name_can")
        .set_caching_options(True)
        .set_cpu_limit(cfg.CPU_LIMIT)
        .set_memory_limit(cfg.MEMORY_LIMIT)
    )
    # artist_genres_can
    adapt_artist_genres_can_op = (
        adapt_fixed_text_layer_vocab.adapt_fixed_text_layer_vocab(
            project=project,
            location=location,
            version=model_version,
            data_dir_bucket_name=train_dir,
            data_dir_path_prefix=train_dir_prefix,
            train_output_gcs_bucket=train_output_gcs_bucket,
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            max_playlist_length=max_playlist_length,
            max_tokens=max_tokens,
            ngrams=ngrams,
            feature_name='artist_genres_can',
            generate_new_vocab=generate_new_vocab,
        )
        .set_display_name(f"adapt: artist_genres_can")
        .set_caching_options(True)
        .set_cpu_limit(cfg.CPU_LIMIT)
        .set_memory_limit(cfg.MEMORY_LIMIT)
    )
    # raggeds

    # track_name_pl
    adapt_track_name_pl_features_op = (
        adapt_ragged_text_layer_vocab.adapt_ragged_text_layer_vocab(
            project=project,
            location=location,
            version=model_version,
            data_dir_bucket_name=train_dir,
            data_dir_path_prefix=train_dir_prefix,
            train_output_gcs_bucket=train_output_gcs_bucket,
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            max_playlist_length=max_playlist_length,
            max_tokens=max_tokens,
            ngrams=ngrams,
            feature_name='track_name_pl',
            generate_new_vocab=generate_new_vocab,
        )
        .set_display_name(f"adapt: track_name_pl")
        .set_caching_options(True)
        .set_cpu_limit(cfg.CPU_LIMIT)
        .set_memory_limit(cfg.MEMORY_LIMIT)
    )
    # artist_name_pl
    adapt_artist_name_pl_op = (
        adapt_ragged_text_layer_vocab.adapt_ragged_text_layer_vocab(
            project=project,
            location=location,
            version=model_version,
            data_dir_bucket_name=train_dir,
            data_dir_path_prefix=train_dir_prefix,
            train_output_gcs_bucket=train_output_gcs_bucket,
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            max_playlist_length=max_playlist_length,
            max_tokens=max_tokens,
            ngrams=ngrams,
            feature_name='artist_name_pl',
            generate_new_vocab=generate_new_vocab,
        )
        .set_display_name(f"adapt: artist_name_pl")
        .set_caching_options(True)
        .set_cpu_limit(cfg.CPU_LIMIT)
        .set_memory_limit(cfg.MEMORY_LIMIT)
    )

    # album_name_pl
    adapt_album_name_pl_op = (
        adapt_ragged_text_layer_vocab.adapt_ragged_text_layer_vocab(
            project=project,
            location=location,
            version=model_version,
            data_dir_bucket_name=train_dir,
            data_dir_path_prefix=train_dir_prefix,
            train_output_gcs_bucket=train_output_gcs_bucket,
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            max_playlist_length=max_playlist_length,
            max_tokens=max_tokens,
            ngrams=ngrams,
            feature_name='album_name_pl',
            generate_new_vocab=generate_new_vocab,
        )
        .set_display_name(f"adapt: album_name_pl")
        .set_caching_options(True)
        .set_cpu_limit(cfg.CPU_LIMIT)
        .set_memory_limit(cfg.MEMORY_LIMIT)
    )

    # artist_genres_pl
    adapt_artist_genres_pl_op = (
        adapt_ragged_text_layer_vocab.adapt_ragged_text_layer_vocab(
            project=project,
            location=location,
            version=model_version,
            data_dir_bucket_name=train_dir,
            data_dir_path_prefix=train_dir_prefix,
            train_output_gcs_bucket=train_output_gcs_bucket,
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            max_playlist_length=max_playlist_length,
            max_tokens=max_tokens,
            ngrams=ngrams,
            feature_name='artist_genres_pl',
            generate_new_vocab=generate_new_vocab,
        )
        .set_display_name(f"adapt: artist_genres_pl")
        .set_caching_options(True)
        .set_cpu_limit(cfg.CPU_LIMIT)
        .set_memory_limit(cfg.MEMORY_LIMIT)
    )

    # ====================================================
    # Aggregate all Dicts
    # ====================================================

    create_master_vocab_op = (
        create_master_vocab.create_master_vocab(
            project=project,
            location=location,
            version=model_version,
            train_output_gcs_bucket=train_output_gcs_bucket,
            experiment_name=experiment_name,
            experiment_run=experiment_run,
            vocab_uri_1=adapt_pl_name_src_op.outputs['vocab_gcs_uri'], 
            vocab_uri_2=adapt_track_name_can_op.outputs['vocab_gcs_uri'], 
            vocab_uri_3=adapt_artist_name_can_op.outputs['vocab_gcs_uri'], 
            vocab_uri_4=adapt_album_name_can_op.outputs['vocab_gcs_uri'], 
            vocab_uri_5=adapt_artist_genres_can_op.outputs['vocab_gcs_uri'], 
            vocab_uri_6=adapt_track_name_pl_features_op.outputs['vocab_gcs_uri'], 
            vocab_uri_7=adapt_artist_name_pl_op.outputs['vocab_gcs_uri'], 
            vocab_uri_8=adapt_album_name_pl_op.outputs['vocab_gcs_uri'], 
            vocab_uri_9=adapt_artist_genres_pl_op.outputs['vocab_gcs_uri'],
            generate_new_vocab=generate_new_vocab,
        )
        # .after(fixed_for_loop_op).after(ragged_for_loop_op)
        .set_display_name("create master vocab")
        .set_caching_options(True)
        .set_cpu_limit(cfg.CPU_LIMIT)
        .set_memory_limit(cfg.MEMORY_LIMIT)
    )