# Developing Vertex AI pipelines with the KFP v2 SDK

![Vertex pipeline](../images/pipeline.png)

In [1]:
import os
import sys
import kfp
import kfp.v2.dsl as dsl
import tensorflow as tf

from datetime import datetime
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2 import compiler
from kfp.v2.google.client import AIPlatformClient
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, Metrics, ClassificationMetrics)
from typing import NamedTuple, List

## Configure lab settings

In [2]:
# Google Cloud project and region used by the pipeline and its components.
PROJECT = 'jk-vertexai-ws'
REGION = 'us-central1'
# Prefix from which the bucket, pipeline, BigQuery dataset, and model display
# names are derived further down in this notebook.
PREFIX = 'jkwst1'

# GCS bucket that holds the schema, the Vertex staging area, and pipeline runs.
STAGING_BUCKET = f'gs://{PREFIX}-bucket'
# Service account e-mails derived from the project id. PIPELINES_SA is passed
# when the pipeline run is submitted.
# NOTE(review): VERTEX_SA is not referenced in the visible cells - confirm
# whether the training job is expected to run under it.
VERTEX_SA = f'training-sa@{PROJECT}.iam.gserviceaccount.com'
PIPELINES_SA = f'pipelines-sa@{PROJECT}.iam.gserviceaccount.com'

### Copy training data schema to GCS

In [3]:
# GCS folder that holds the training data schema.
SCHEMA_LOCATION = f'{STAGING_BUCKET}/schema'

# Upload the local schema.pbtxt so the pipeline and the trainer can read it from GCS.
!gsutil cp schema.pbtxt {SCHEMA_LOCATION}/

Copying file://schema.pbtxt [Content-Type=application/octet-stream]...
/ [1 files][  3.1 KiB/  3.1 KiB]                                                
Operation completed over 1 objects/3.1 KiB.                                      


In [4]:
!gsutil ls {SCHEMA_LOCATION}

gs://jkwst1-bucket/schema/schema.pbtxt


## Prepare a training container

### Create a training script

In [5]:
# Local folder that receives the training script and, later, its Dockerfile.
SCRIPT_FOLDER = 'trainer'
# Recreate the folder from scratch so files left by earlier runs are removed.
if tf.io.gfile.exists(SCRIPT_FOLDER):
    tf.io.gfile.rmtree(SCRIPT_FOLDER)
tf.io.gfile.mkdir(SCRIPT_FOLDER)
# Destination of the training script written by the next (%%writefile) cell.
file_path = os.path.join(SCRIPT_FOLDER, 'train.py')

In [6]:
%%writefile {file_path}


# Copyright 2021 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import tensorflow as tf
import tensorflow_data_validation as tfdv

from absl import app
from absl import flags
from absl import logging

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow_io import bigquery as tfio_bq

FLAGS = flags.FLAGS

# Training hyperparameters (all optional, with defaults).
flags.DEFINE_integer('epochs', 3, 'Number of epochs')
flags.DEFINE_integer('units', 32, 'Number of units in a hidden layer')
flags.DEFINE_integer('per_replica_batch_size', 128, 'Per replica batch size')
flags.DEFINE_float('dropout_ratio', 0.5, 'Dropout ratio')

# Data locations. Tables are given as `project.dataset.table`; these flags
# have no usable default and are therefore marked as required below.
flags.DEFINE_string('training_table', None, 'Training table name')
flags.DEFINE_string('validation_table', None, 'Validation table name')
flags.DEFINE_string('schema_file', None, 'Location of the data schema file')
flags.mark_flag_as_required('training_table')
flags.mark_flag_as_required('validation_table')
flags.mark_flag_as_required('schema_file')

# Fallback output locations used when the corresponding AIP_* environment
# variables are not set (see set_job_dirs).
LOCAL_MODEL_DIR = '/tmp/saved_model'
LOCAL_TB_DIR = '/tmp/logs'
LOCAL_CHECKPOINT_DIR = '/tmp/checkpoints'
# Annotation tag that marks the label feature in the schema.
TARGET_TAG = 'target'


def schema_to_features(schema):
    """Builds a ``{feature name: (kind, dtype)}`` dictionary from a schema.

    ``kind`` is either ``'categorical'`` or ``'numeric'``. Features whose
    ``type`` is not 1, 2 or 3 are left out of the result.

    Args:
        schema: Object exposing a ``feature`` sequence; each entry provides
            ``name``, ``type`` and, for integer features, ``int_domain``.

    Returns:
        A dict mapping feature names to ``(kind, tf dtype)`` tuples.
    """

    # NOTE(review): the numeric codes 1/2/3 presumably correspond to the
    # BYTES/INT/FLOAT members of the schema's FeatureType enum - confirm.
    feature_spec = {}
    for field in schema.feature:
        type_code = field.type
        if type_code == 1:
            spec = ('categorical', tf.string)
        elif type_code == 3:
            spec = ('numeric', tf.double)
        elif type_code == 2:
            # Integers are categorical only when the schema says so.
            kind = 'categorical' if field.int_domain.is_categorical else 'numeric'
            spec = (kind, tf.int64)
        else:
            continue
        feature_spec[field.name] = spec

    return feature_spec


def get_target_feature(schema):
    """Finds the feature annotated with the target tag.

    Args:
        schema: Object exposing a ``feature`` sequence of schema features.

    Returns:
        The name of the feature whose annotation tags contain ``TARGET_TAG``,
        or ``None`` when no feature is tagged. If several features carry the
        tag, the last one in schema order is returned.
    """

    tagged_names = [
        field.name
        for field in schema.feature
        if field.HasField('annotation') and TARGET_TAG in field.annotation.tag
    ]
    return tagged_names[-1] if tagged_names else None


def set_job_dirs():
    """Resolves the model, TensorBoard and checkpoint directories.

    Each location is read from its ``AIP_*`` environment variable; when the
    variable is not set, the matching ``LOCAL_*`` constant is used instead.

    Returns:
        A ``(model_dir, tb_dir, checkpoint_dir)`` tuple.
    """

    env_with_fallback = (
        ('AIP_MODEL_DIR', LOCAL_MODEL_DIR),
        ('AIP_TENSORBOARD_LOG_DIR', LOCAL_TB_DIR),
        ('AIP_CHECKPOINT_DIR', LOCAL_CHECKPOINT_DIR),
    )
    return tuple(os.getenv(variable, fallback)
                 for variable, fallback in env_with_fallback)


def get_bq_dataset(table_name, features, target_feature, batch_size=32):
    """Builds a batched tf.data dataset that reads rows from a BigQuery table.

    Args:
        table_name: Table reference formatted as ``project.dataset.table``.
        features: Dict of ``{column name: (kind, dtype)}``; the dtype is used
            as the output type of the selected column.
        target_feature: Name of the column that is split off as the label.
        batch_size: Number of rows per batch.

    Returns:
        A dataset yielding ``(features dict, target)`` batches.

    Raises:
        ValueError: If ``table_name`` does not consist of exactly three
            dot-separated parts.
    """

    def _strip_if_string(tensor):
        # Only string columns are stripped of surrounding whitespace.
        return tf.strings.strip(tensor) if tensor.dtype == 'string' else tensor

    def _split_label(row):
        cleaned = {column: _strip_if_string(tensor)
                   for column, tensor in row.items()}
        label = cleaned.pop(target_feature)
        return (cleaned, label)

    project_id, dataset_id, table_id = table_name.split('.')
    column_types = {column: {'output_type': spec[1]}
                    for column, spec in features.items()}

    bq_client = tfio_bq.BigQueryClient()
    session = bq_client.read_session(
        parent=f'projects/{project_id}',
        project_id=project_id,
        table_id=table_id,
        dataset_id=dataset_id,
        selected_fields=column_types,
    )

    rows = session.parallel_read_rows()
    return rows.map(_split_label).batch(batch_size)


def get_category_encoding_layer(name, dataset, dtype):
    """Builds an encoder for one categorical feature.

    A lookup layer (``StringLookup`` for string features, ``IntegerLookup``
    otherwise) is adapted on the feature's values in ``dataset``; its
    vocabulary size then sizes a ``CategoryEncoding`` layer.

    Args:
        name: Feature name used to select the column from each element.
        dataset: Dataset of ``(features dict, target)`` elements.
        dtype: TensorFlow dtype of the feature.

    Returns:
        A callable that applies the lookup followed by the category encoding.
    """

    lookup_cls = (preprocessing.StringLookup if dtype == tf.string
                  else preprocessing.IntegerLookup)
    lookup = lookup_cls()

    # Learn the vocabulary from this feature's column only.
    lookup.adapt(dataset.map(lambda row, label: row[name]))
    category_encoder = preprocessing.CategoryEncoding(
        max_tokens=lookup.vocab_size())

    def _encode(feature):
        return category_encoder(lookup(feature))

    return _encode


def get_normalization_layer(name, dataset):
    """Builds a Normalization layer adapted to one feature of ``dataset``.

    Args:
        name: Feature name used to select the column from each element.
        dataset: Dataset of ``(features dict, target)`` elements.

    Returns:
        The adapted ``Normalization`` layer.
    """
    column_values = dataset.map(lambda row, label: row[name])

    layer = preprocessing.Normalization()
    layer.adapt(column_values)
    return layer


def create_model(dataset, input_features, units, dropout_ratio):
    """Builds the Keras binary classifier for the Chicago Taxi tip task.

    Every input feature gets its own ``Input``; categorical features are
    passed through a category encoding, all others through a normalization
    layer. The encoded features are concatenated and fed to one hidden
    dense layer with dropout. The final layer has a single unit and no
    activation, so the model outputs a logit.

    Args:
        dataset: Dataset used to adapt the preprocessing layers.
        input_features: Dict of ``{feature name: (kind, dtype)}``.
        units: Width of the hidden dense layer.
        dropout_ratio: Dropout rate applied after the hidden layer.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """

    model_inputs = []
    encoded_columns = []
    for name, spec in input_features.items():
        kind, dtype = spec[0], spec[1]
        column = tf.keras.Input(shape=(1,), name=name, dtype=dtype)
        if kind == 'categorical':
            encode = get_category_encoding_layer(name, dataset, dtype)
        else:
            encode = get_normalization_layer(name, dataset)
        encoded_columns.append(encode(column))
        model_inputs.append(column)

    merged = tf.keras.layers.concatenate(encoded_columns)

    hidden = tf.keras.layers.Dense(units, activation="relu")(merged)
    hidden = tf.keras.layers.Dropout(dropout_ratio)(hidden)
    logits = tf.keras.layers.Dense(1)(hidden)

    return tf.keras.Model(model_inputs, logits)


def main(argv):
    """Trains the classifier on BigQuery data and saves the trained model.

    Configuration comes from the absl flags defined at module level.

    Args:
        argv: Positional command-line arguments passed by ``app.run``; unused.

    Raises:
        RuntimeError: If the schema does not mark any feature as the target.
    """
    del argv

    # Mirror the model across the replicas reported by the strategy and
    # scale the batch size accordingly.
    strategy = tf.distribute.MirroredStrategy()
    global_batch_size = (FLAGS.per_replica_batch_size *
                         strategy.num_replicas_in_sync)

    # Derive the feature description and the label name from the schema.
    schema = tfdv.load_schema_text(FLAGS.schema_file)
    features = schema_to_features(schema)
    target_feature = get_target_feature(schema)
    if not target_feature:
        raise RuntimeError('Schema does not have a target feature')

    # Both splits are read with identical settings.
    reader_kwargs = dict(features=features,
                         target_feature=target_feature,
                         batch_size=global_batch_size)
    validation_ds = get_bq_dataset(FLAGS.validation_table, **reader_kwargs)
    training_ds = get_bq_dataset(FLAGS.training_table, **reader_kwargs)

    logging.info('Creating the model ...')
    # The label must not be fed to the model as an input.
    input_features = dict(features)
    input_features.pop(target_feature, None)

    with strategy.scope():
        model = create_model(training_ds,
                             input_features,
                             FLAGS.units,
                             FLAGS.dropout_ratio)
        model.compile(
            optimizer='adam',
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
            metrics=['accuracy'],
        )

    model_dir, tb_dir, checkpoint_dir = set_job_dirs()
    callbacks = [
        tf.keras.callbacks.experimental.BackupAndRestore(
            backup_dir=checkpoint_dir),
        tf.keras.callbacks.TensorBoard(log_dir=tb_dir, update_freq='batch'),
    ]

    logging.info('Starting training ...')
    model.fit(training_ds,
              validation_data=validation_ds,
              epochs=FLAGS.epochs,
              callbacks=callbacks)

    logging.info(f'Training completed. Saving the trained model to: {model_dir}')
    model.save(model_dir)
    
    
# Script entry point: enable INFO-level logging, then let absl parse the
# flags and invoke main().
if __name__ == '__main__':
    logging.set_verbosity(logging.INFO)
    app.run(main)

Writing trainer/train.py


### Create a Dockerfile

In [7]:
# Base image for the training container and the tag under which the built
# image is pushed (used by the `gcloud builds submit` cell below).
BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-cpu.2-4'
TRAIN_IMAGE = f'gcr.io/{PROJECT}/taxi_classifier_trainer_v2'

# Dockerfile text: copy train.py into /trainer and run it as the entrypoint,
# so arguments given to the container are passed to the script.
dockerfile = f'''
FROM {BASE_IMAGE}

WORKDIR /trainer

# Copies the trainer code to the docker image.
COPY train.py .

ENTRYPOINT ["python", "train.py"]
'''

# Write the Dockerfile next to train.py so both are part of the build context.
with open(os.path.join(SCRIPT_FOLDER, 'Dockerfile'), 'w') as f:
    f.write(dockerfile)

### Build a container image

In [None]:
!gcloud builds submit --tag {TRAIN_IMAGE} {SCRIPT_FOLDER}

Creating temporary tarball archive of 2 file(s) totalling 8.0 KiB before compression.
Uploading tarball of [trainer] to [gs://jk-wst1_cloudbuild/source/1624910431.434771-e3a22503cfff4662b529d66022f3525d.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/jk-wst1/locations/global/builds/75cf6e6f-ab4d-431a-874f-b6f49e9bbc02].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/75cf6e6f-ab4d-431a-874f-b6f49e9bbc02?project=630263135640].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "75cf6e6f-ab4d-431a-874f-b6f49e9bbc02"

FETCHSOURCE
Fetching storage object: gs://jk-wst1_cloudbuild/source/1624910431.434771-e3a22503cfff4662b529d66022f3525d.tgz#1624910431778319
Copying gs://jk-wst1_cloudbuild/source/1624910431.434771-e3a22503cfff4662b529d66022f3525d.tgz#1624910431778319...
/ [1 files][  2.9 KiB/  2.9 KiB]                                                
Operation completed over 1 objects/2.9 KiB.
BUILD
Already ha

## Define custom KFP components

### Data ingestion

In [9]:
@dsl.component(base_image='gcr.io/ml-pipeline/google-cloud-pipeline-components:0.1.1')
def ingest_data_op(
    project: str,
    bq_location: str,
    sample_size: int,
    year: int,
    dataset_name: str,
    train_split_name: str,
    valid_split_name: str,
    test_split_name: str,
    dataset: Output[Dataset]
):
    """Creates train/validation/test tables from the Chicago taxi public data.

    The BigQuery dataset ``project.dataset_name`` is created in
    ``bq_location`` if it does not exist yet. A SQL script then selects up to
    ``sample_size`` trips of the given ``year``, derives the features and the
    ``tip_bin`` label, assigns every row to a split based on a fingerprint of
    its trip date, and writes each split to its own table (replacing any
    existing table of that name).

    The fully qualified split table names are recorded in the metadata of the
    output ``dataset`` artifact under the keys ``train_split``,
    ``valid_split`` and ``test_split``.
    """

    import logging
    from google.cloud import bigquery
    from google.cloud import exceptions

    METADATA_TRAIN_SPLIT_KEY = 'train_split'
    METADATA_VALID_SPLIT_KEY = 'valid_split'
    METADATA_TEST_SPLIT_KEY = 'test_split'

    # The @NAME markers are placeholders that are substituted textually below.
    query_template = '''
    CREATE TEMP TABLE features 
    AS (
        WITH
        taxitrips AS (
        SELECT
            FORMAT_DATETIME('%Y-%d-%m', trip_start_timestamp) AS date,
            trip_start_timestamp,
            trip_seconds,
            trip_miles,
            payment_type,
            pickup_longitude,
            pickup_latitude,
            dropoff_longitude,
            dropoff_latitude,
            tips,
            fare
        FROM
            `bigquery-public-data.chicago_taxi_trips.taxi_trips`
        WHERE 1=1 
        AND pickup_longitude IS NOT NULL
        AND pickup_latitude IS NOT NULL
        AND dropoff_longitude IS NOT NULL
        AND dropoff_latitude IS NOT NULL
        AND trip_miles > 0
        AND trip_seconds > 0
        AND fare > 0
        AND EXTRACT(YEAR FROM trip_start_timestamp) = @YEAR
        )

        SELECT
        trip_start_timestamp,
        EXTRACT(MONTH from trip_start_timestamp) as trip_month,
        EXTRACT(DAY from trip_start_timestamp) as trip_day,
        EXTRACT(DAYOFWEEK from trip_start_timestamp) as trip_day_of_week,
        EXTRACT(HOUR from trip_start_timestamp) as trip_hour,
        trip_seconds,
        trip_miles,
        payment_type,
        ST_AsText(
            ST_SnapToGrid(ST_GeogPoint(pickup_longitude, pickup_latitude), 0.1)
        ) AS pickup_grid,
        ST_AsText(
            ST_SnapToGrid(ST_GeogPoint(dropoff_longitude, dropoff_latitude), 0.1)
        ) AS dropoff_grid,
        ST_Distance(
            ST_GeogPoint(pickup_longitude, pickup_latitude), 
            ST_GeogPoint(dropoff_longitude, dropoff_latitude)
        ) AS euclidean,
        IF((tips/fare >= 0.2), 1, 0) AS tip_bin,
        CASE (ABS(MOD(FARM_FINGERPRINT(date),10))) 
            WHEN 9 THEN 'TEST'
            WHEN 8 THEN 'VALIDATE'
            ELSE 'TRAIN' END AS data_split
        FROM
        taxitrips
        LIMIT @LIMIT
    );

    CREATE OR REPLACE TABLE `@PROJECT.@DATASET.@TRAIN_SPLIT`
    AS
    SELECT * EXCEPT (trip_start_timestamp, data_split)
    FROM features
    WHERE data_split='TRAIN';

    CREATE OR REPLACE TABLE `@PROJECT.@DATASET.@VALIDATE_SPLIT`
    AS
    SELECT * EXCEPT (trip_start_timestamp, data_split)
    FROM features
    WHERE data_split='VALIDATE';

    CREATE OR REPLACE TABLE `@PROJECT.@DATASET.@TEST_SPLIT`
    AS
    SELECT * EXCEPT (trip_start_timestamp, data_split)
    FROM features
    WHERE data_split='TEST';

    DROP TABLE features;
    '''

    dataset_id = f'{project}.{dataset_name}'

    # Create the destination dataset; an already existing one is reused.
    bq_client = bigquery.Client(project=project)
    bq_dataset = bigquery.Dataset(dataset_id)
    bq_dataset.location = bq_location
    try:
        bq_client.create_dataset(bq_dataset, timeout=30)
    except exceptions.Conflict:
        logging.info(f'Dataset {dataset_id} already exists')
    else:
        logging.info(f'Created dataset: {dataset_id}')

    # Placeholders are replaced one after another, in this order.
    substitutions = {
        '@PROJECT': project,
        '@DATASET': dataset_name,
        '@TRAIN_SPLIT': train_split_name,
        '@VALIDATE_SPLIT': valid_split_name,
        '@TEST_SPLIT': test_split_name,
        '@YEAR': str(year),
        '@LIMIT': str(sample_size),
    }
    sql_script = query_template
    for placeholder, value in substitutions.items():
        sql_script = sql_script.replace(placeholder, value)

    # Block until the whole script has finished.
    bq_client.query(sql_script).result()

    # Publish the table names for downstream components.
    split_tables = {
        METADATA_TRAIN_SPLIT_KEY: train_split_name,
        METADATA_VALID_SPLIT_KEY: valid_split_name,
        METADATA_TEST_SPLIT_KEY: test_split_name,
    }
    for metadata_key, table_name in split_tables.items():
        dataset.metadata[metadata_key] = f'{dataset_id}.{table_name}'

### Statistics generation

In [10]:
@dsl.component(base_image='tensorflow/tfx:latest',
               packages_to_install=['google-cloud-bigquery[bqstorage,pandas]'])
def generate_stats_op(
    project: str,
    sample_percentage: int,
    dataset: Input[Dataset],
    stats: Output[Artifact],
):
    """Generates statistics for the data splits
    referenced in the input Dataset artifact.

    For every split key present in ``dataset.metadata`` (``train_split``,
    ``valid_split``, ``test_split``) the referenced BigQuery table is sampled
    with ``TABLESAMPLE SYSTEM (sample_percentage PERCENT)``, loaded into a
    DataFrame, and summarized with TensorFlow Data Validation. The statistics
    are written to ``<stats.path>/<split key>/stats.pbtxt``.
    """

    import os
    import tensorflow_data_validation as tfdv
    from google.cloud import bigquery

    METADATA_TRAIN_SPLIT_KEY = 'train_split'
    METADATA_VALID_SPLIT_KEY = 'valid_split'
    METADATA_TEST_SPLIT_KEY = 'test_split'

    STATS_FILE_NAME = 'stats.pbtxt'

    # The table path is backtick-quoted, as in the ingestion component, so
    # that project ids containing dashes are always parsed as one identifier.
    sql_script_template = '''
    SELECT *
    FROM `@TABLE`
    TABLESAMPLE SYSTEM (@SAMPLE_PERC PERCENT)
    '''

    client = bigquery.Client(project=project)
    for key in [METADATA_TRAIN_SPLIT_KEY, METADATA_VALID_SPLIT_KEY, METADATA_TEST_SPLIT_KEY]:
        # Splits that the upstream component did not record are skipped.
        if key not in dataset.metadata:
            continue

        sql_script = sql_script_template.replace(
            '@TABLE', dataset.metadata[key]).replace(
            '@SAMPLE_PERC', str(sample_percentage))

        df = client.query(sql_script).result().to_dataframe()

        stats_proto = tfdv.generate_statistics_from_dataframe(
            dataframe=df,
            stats_options=tfdv.StatsOptions(
                num_top_values=50
            )
        )

        # exist_ok keeps a retried task from failing on a leftover folder.
        file_path = os.path.join(stats.path, key)
        os.makedirs(file_path, exist_ok=True)
        tfdv.write_stats_text(stats_proto,
                              os.path.join(file_path, STATS_FILE_NAME))
            
            
    

### Data validation

In [11]:
@dsl.component(base_image='tensorflow/tfx:latest')
def validate_stats_op(
    project: str,
    stats: Input[Artifact],
    schema: Input[Artifact],
    anomalies: Output[Artifact],
)-> NamedTuple(
    'ValidOutputs',
    [
        ('anomalies_detected', str)
    ]):
    """Validates statistics referenced in the input stats Artifact.

    Every sub-folder of ``stats.path`` is expected to contain a
    ``stats.pbtxt`` file. Each one is validated against the schema at
    ``schema.path`` and the result is written to
    ``<anomalies.path>/<folder>/anomalies.pbtxt``.

    Returns:
        A ``ValidOutputs`` tuple whose ``anomalies_detected`` field is the
        string ``'true'`` if any split has anomalies and ``'false'``
        otherwise. ``project`` is not used by the component body.
    """

    STATS_FILE_NAME = 'stats.pbtxt'
    ANOMALIES_FILE_NAME = 'anomalies.pbtxt'
    # The flag is returned as a string because the output is declared as str.
    TRUE = 'true'
    FALSE = 'false'

    import os
    import logging
    import tensorflow_data_validation as tfdv
    from collections import namedtuple

    schema_proto = tfdv.load_schema_text(
        input_path=schema.path
    )

    anomalies_detected = FALSE
    for folder in os.listdir(stats.path):
        stats_proto = tfdv.load_stats_text(
            input_path=os.path.join(stats.path, folder, STATS_FILE_NAME)
        )

        anomalies_proto = tfdv.validate_statistics(
            statistics=stats_proto,
            schema=schema_proto
        )

        # exist_ok keeps a retried task from failing on a leftover folder.
        file_path = os.path.join(anomalies.path, folder)
        os.makedirs(file_path, exist_ok=True)
        file_path = os.path.join(file_path, ANOMALIES_FILE_NAME)
        tfdv.write_anomalies_text(anomalies_proto, file_path)

        # One split with anomalies is enough to flag the whole run.
        if anomalies_proto.anomaly_info:
            anomalies_detected = TRUE
            logging.info('Anomalies detected: {}'.format(file_path))

    output = namedtuple('ValidOutputs', ['anomalies_detected'])

    return output(anomalies_detected)

## Define and compile the pipeline

In [12]:
# Compile time settings: these values are baked into the compiled pipeline
# spec, unlike the pipeline function's parameters, which are supplied per run.
PIPELINE_NAME = f'{PREFIX}-continuous-training-pipeline'

# BigQuery dataset and split tables created by the data ingestion component.
BQ_DATASET_NAME = f'{PREFIX}_dataset_pipeline'
BQ_LOCATION = 'US'
TRAINING_TABLE_NAME = 'training'
VALIDATION_TABLE_NAME = 'validation'
TESTING_TABLE_NAME = 'testing'
# GCS URI of the schema file uploaded earlier in this notebook.
SCHEMA = f'{SCHEMA_LOCATION}/schema.pbtxt'

# Settings for the custom training job.
# NOTE(review): TRAINING_MACHINE_TYPE is declared here - confirm it is
# actually passed to the training step of the pipeline.
TRAINING_CONTAINER_IMAGE = TRAIN_IMAGE
TRAINING_MACHINE_TYPE = 'n1-standard-4'
REPLICA_COUNT = 1

# Settings for serving the trained model.
SERVING_CONTAINER_IMAGE = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-4:latest'
SERVING_MACHINE_TYPE = 'n1-standard-4'


@dsl.pipeline(name=PIPELINE_NAME)
def taxi_tip_predictor_training(
    model_display_name: str,
    epochs: int,
    staging_location: str,
    per_replica_batch_size: int,
    sample_percentage: int = 100,
    year: int = 2020,
    sample_size: int = 1000000,
):
    """Continuous training pipeline for the taxi tip classifier.

    Steps: import the schema, ingest the data splits into BigQuery, generate
    and validate statistics, and - only when no anomalies were detected -
    train the model in a custom container, create an endpoint, and deploy
    the trained model to it.

    Args:
        model_display_name: Display name for the training job, the model,
            the endpoint, and the deployed model.
        epochs: Number of training epochs passed to the trainer.
        staging_location: GCS location used as the training staging bucket.
        per_replica_batch_size: Per-replica batch size passed to the trainer.
        sample_percentage: Percentage of each split sampled for statistics.
        year: Year of taxi trips to ingest.
        sample_size: Maximum number of rows to ingest.
    """

    import_schema = kfp.dsl.importer(
        artifact_uri=SCHEMA,
        artifact_class=Artifact,
        reimport=False,
    )

    prepare_data = ingest_data_op(
        project=PROJECT,
        bq_location=BQ_LOCATION,
        sample_size=sample_size,
        year=year,
        dataset_name=BQ_DATASET_NAME,
        train_split_name=TRAINING_TABLE_NAME,
        valid_split_name=VALIDATION_TABLE_NAME,
        test_split_name=TESTING_TABLE_NAME,
    )

    generate_stats = generate_stats_op(
        project=PROJECT,
        sample_percentage=sample_percentage,
        dataset=prepare_data.outputs['dataset'],
    )

    validate_stats = validate_stats_op(
        project=PROJECT,
        schema=import_schema.output,
        stats=generate_stats.outputs['stats'],
    )

    # Training and deployment run only on the 'false' branch, i.e. when the
    # validation step found no anomalies; the condition is named accordingly.
    with dsl.Condition(validate_stats.outputs['anomalies_detected'] == 'false',
                       name='No anomalies detected'):

        # Command-line flags for trainer/train.py. The table names are built
        # from the same compile-time constants the ingestion step uses.
        args = [
            '--epochs', str(epochs),
            '--per_replica_batch_size', str(per_replica_batch_size),
            '--training_table', f'{PROJECT}.{BQ_DATASET_NAME}.{TRAINING_TABLE_NAME}',
            '--validation_table',  f'{PROJECT}.{BQ_DATASET_NAME}.{VALIDATION_TABLE_NAME}',
            '--schema_file', SCHEMA,
        ]

        train = gcc_aip.CustomContainerTrainingJobRunOp(
            project=PROJECT,
            location=REGION,
            display_name=model_display_name,
            model_display_name=model_display_name,
            container_uri=TRAINING_CONTAINER_IMAGE,
            args=args,
            replica_count=REPLICA_COUNT,
            # Pass the declared machine type explicitly instead of relying
            # on the operator's default.
            machine_type=TRAINING_MACHINE_TYPE,
            staging_bucket=staging_location,
            model_serving_container_image_uri=SERVING_CONTAINER_IMAGE,
        )

        create_endpoint = gcc_aip.EndpointCreateOp(
            project=PROJECT,
            display_name=model_display_name
        )
        # The endpoint takes no input from training, so the ordering has to
        # be declared explicitly to avoid creating it before training ends.
        create_endpoint.after(train)

        deploy_model = gcc_aip.ModelDeployOp(
            project=PROJECT,
            endpoint=create_endpoint.outputs['endpoint'],
            model=train.outputs['model'],
            deployed_model_display_name=model_display_name,
            machine_type=SERVING_MACHINE_TYPE
        )

### Compile the pipeline

In [13]:
# Compile the pipeline function into a JSON job spec written to package_path;
# the same path is used when the run is submitted below.
package_path = 'taxi_tip_predictor_pipeline.json'
compiler.Compiler().compile(
    pipeline_func=taxi_tip_predictor_training,
    package_path=package_path
)

### Submit a pipeline run

In [14]:
# Client used to submit pipeline runs in the lab's project and region.
api_client = AIPlatformClient(
    project_id=PROJECT,
    region=REGION,
)

In [15]:
# Runtime values for the pipeline parameters that have no default
# (sample_percentage, year and sample_size keep their defaults).
parameter_values = {
    'model_display_name': f'{PREFIX} Taxi tip predictor',
    'epochs': 2,
    'per_replica_batch_size': 128,
    'staging_location': f'{STAGING_BUCKET}/vertex_staging',
}

# Submit a run of the compiled job spec under the pipelines service account,
# with caching disabled.
response = api_client.create_run_from_job_spec(
    package_path,
    pipeline_root=f'{STAGING_BUCKET}/pipeline_runs',
    parameter_values=parameter_values,
    enable_caching=False,
    service_account=PIPELINES_SA
)

## Extract pipeline run metadata

In [16]:
from google.cloud import aiplatform

In [17]:
pipeline_name = PIPELINE_NAME

# Fetch the recorded runs of this pipeline as a table (one row per run with
# its input parameters, as in the output below) and display it.
pipeline_df = aiplatform.get_pipeline_df(pipeline=pipeline_name)
pipeline_df

Unnamed: 0,pipeline_name,run_name,param.input:per_replica_batch_size,param.input:year,param.input:model_display_name,param.input:sample_percentage,param.input:staging_location,param.input:epochs,param.input:sample_size
0,jk-continuous-training-pipeline,jk-continuous-training-pipeline-20210616005908,128,2020,jk Taxi tip predictor,100,gs://jk-bucket/vertex_staging,2,1000000
1,jk-continuous-training-pipeline,jk-continuous-training-pipeline-20210615220234,128,2020,jk Taxi tip predictor,100,gs://jk-bucket/vertex_staging,2,1000000
