# Developing Vertex AI pipelines with the TFX SDK



In [1]:
import os
import sys
import kfp
import tensorflow as tf
import tfx

from kfp.v2.google.client import AIPlatformClient

from tfx.dsl.components.base import executor_spec
from tfx.components.trainer import executor as trainer_executor
from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor
from tfx.orchestration import data_types
from tfx.orchestration.kubeflow.v2 import kubeflow_v2_dag_runner
from tfx.orchestration.local.local_dag_runner import LocalDagRunner
from tfx.orchestration.metadata import sqlite_metadata_connection_config
from tfx.proto import trainer_pb2




In [2]:
# Display the installed TFX version as the cell output.
tfx.__version__

'0.30.0'

In [3]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to local modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

## Configure lab settings

In [21]:
# Lab settings: edit PROJECT, REGION and PREFIX to match your environment;
# the remaining values are derived from them.
PROJECT = 'jk-vertex-workshop'
REGION = 'us-central1'
PREFIX = 'jkvw'

# Bucket used below for the schema, the pipeline root and the serving model.
STAGING_BUCKET = f'gs://{PREFIX}-bucket'
# Service account addresses in PROJECT.
# NOTE(review): neither is referenced by the cells visible in this notebook.
VERTEX_SA = f'training-sa@{PROJECT}.iam.gserviceaccount.com'
PIPELINES_SA = f'pipelines-sa@{PROJECT}.iam.gserviceaccount.com'

# Default value of the 'data-root-uri' runtime parameter.
DATA_ROOT_URI = 'gs://workshop-datasets/covertype/small'
# Destination of the schema file copy and default of 'schema-folder-uri'.
SCHEMA_FOLDER = f'{STAGING_BUCKET}/schema'
# Image tag built with Cloud Build and used as the runner's default image.
TFX_IMAGE = f'gcr.io/{PROJECT}/tfxcovertype'

In [22]:
# Copy the local schema file into SCHEMA_FOLDER in Cloud Storage.
!gsutil cp pipeline/schema/schema.pbtxt {SCHEMA_FOLDER}/schema.pbtxt

Copying file://pipeline/schema/schema.pbtxt [Content-Type=application/octet-stream]...
/ [1 files][  2.9 KiB/  2.9 KiB]                                                
Operation completed over 1 objects/2.9 KiB.                                      


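## Review the pipeline design

The pipeline is assembled by the `create_pipeline` function in the `pipeline.pipeline` module, located in the local `pipeline` folder. Review it before compiling.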

## Compile the pipeline

In [23]:
from pipeline import pipeline

pipeline_name = 'covertype-continuous-training'

# Trainer settings: the generic Trainer executor with no custom config.
trainer_custom_executor_spec = executor_spec.ExecutorClassSpec(
    trainer_executor.GenericExecutor
)
trainer_custom_config = None

# Beam arguments passed through to create_pipeline.
beam_pipeline_args = [
    '--direct_running_mode=multi_processing',
    '--direct_num_workers=0',
]

# Runtime parameters, keyed by the create_pipeline argument each one feeds.
# The `name` of each parameter is the key under which it is declared.
runtime_parameters = {
    'data_root_uri': data_types.RuntimeParameter(
        name='data-root-uri', ptype=str, default=DATA_ROOT_URI),
    'schema_folder_uri': data_types.RuntimeParameter(
        name='schema-folder-uri', ptype=str, default=SCHEMA_FOLDER),
    'train_steps': data_types.RuntimeParameter(
        name='train-steps', ptype=int, default=1000),
    'eval_steps': data_types.RuntimeParameter(
        name='eval-steps', ptype=int, default=500),
}

pipeline_def = pipeline.create_pipeline(
    pipeline_name=pipeline_name,
    pipeline_root=f'{STAGING_BUCKET}/pipelines/{pipeline_name}',
    serving_model_uri=f'{STAGING_BUCKET}/models/Covertype',
    trainer_custom_executor_spec=trainer_custom_executor_spec,
    trainer_custom_config=trainer_custom_config,
    beam_pipeline_args=beam_pipeline_args,
    **runtime_parameters,
)



In [24]:
pipeline_definition_file = 'covertype.json'

# Runner configuration: TFX_IMAGE is set as the default image.
runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
    default_image=TFX_IMAGE,
)

# The runner is given pipeline_definition_file as its output filename.
runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
    config=runner_config,
    output_filename=pipeline_definition_file,
)
pipeline_json = runner.run(pipeline_def)

## Prepare a TFX container

In [None]:
# Build the container image from the local `pipeline` folder with Cloud Build
# and tag it as TFX_IMAGE.
!gcloud builds submit --tag {TFX_IMAGE} pipeline

Creating temporary tarball archive of 16 file(s) totalling 85.9 KiB before compression.
Uploading tarball of [pipeline] to [gs://jk-vertex-workshop_cloudbuild/source/1623788537.875119-f1776ee9a84a46b490506d6e4c5d1472.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/jk-vertex-workshop/locations/global/builds/41fc9971-3429-4faf-a02f-da1287834329].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/41fc9971-3429-4faf-a02f-da1287834329?project=910094146258].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "41fc9971-3429-4faf-a02f-da1287834329"

FETCHSOURCE
Fetching storage object: gs://jk-vertex-workshop_cloudbuild/source/1623788537.875119-f1776ee9a84a46b490506d6e4c5d1472.tgz#1623788538134801
Copying gs://jk-vertex-workshop_cloudbuild/source/1623788537.875119-f1776ee9a84a46b490506d6e4c5d1472.tgz#1623788538134801...
/ [1 files][ 19.0 KiB/ 19.0 KiB]                                                
Operation com

## Submit pipeline runs

In [20]:
# Client used to submit pipeline runs, bound to the lab's project and region.
pipeline_client = AIPlatformClient(project_id=PROJECT, region=REGION)

In [None]:
# Per-run overrides for the pipeline's runtime parameters. The keys must
# match the `name` given to each RuntimeParameter when the pipeline was
# defined (hyphenated: 'data-root-uri', 'train-steps', 'eval-steps'), not the
# create_pipeline argument names. 'schema-folder-uri' is not overridden here,
# so its declared default applies.
parameter_values = {
    'data-root-uri': DATA_ROOT_URI,
    'train-steps': 1000,
    'eval-steps': 500,
}

# Submit a run of the compiled pipeline definition. This is a function call,
# so the arguments go in parentheses (the braces used before were a
# SyntaxError).
pipeline_client.create_run_from_job_spec(
    job_spec_path=pipeline_definition_file,
    parameter_values=parameter_values,
)