# Developing Vertex AI pipelines with the TFX SDK



In [1]:
import os
import sys
import kfp
import tensorflow as tf
import tfx

from kfp.v2.google.client import AIPlatformClient

from tfx.dsl.components.base import executor_spec
from tfx.components.trainer import executor as trainer_executor
from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor
from tfx.orchestration import data_types
from tfx.orchestration.kubeflow.v2 import kubeflow_v2_dag_runner
from tfx.orchestration.local.local_dag_runner import LocalDagRunner
from tfx.orchestration.metadata import sqlite_metadata_connection_config
from tfx.proto import trainer_pb2




In [2]:
# Display the installed TFX version as the cell's output.
tfx.__version__

'0.30.0'

In [3]:
# Enable IPython's autoreload extension. Per the IPython docs, mode 2 reloads
# all modules before executing each cell, so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Configure lab settings

In [4]:
# --- Project-level identifiers -------------------------------------------
PROJECT = 'jk-vertexai-ws'
REGION = 'us-central1'
PREFIX = 'jk'

# --- Input data ------------------------------------------------------------
DATA_ROOT_URI = 'gs://workshop-datasets/covertype/small'

# --- Cloud Storage locations derived from PREFIX ---------------------------
STAGING_BUCKET = f'gs://{PREFIX}-bucket'
SCHEMA_FOLDER = f'{STAGING_BUCKET}/tfx/schema'

# --- Service accounts and container image derived from PROJECT -------------
VERTEX_SA = f'training-sa@{PROJECT}.iam.gserviceaccount.com'
PIPELINES_SA = f'pipelines-sa@{PROJECT}.iam.gserviceaccount.com'
TFX_IMAGE = f'gcr.io/{PROJECT}/tfxcovertype'

In [None]:
# Copy the local schema file into SCHEMA_FOLDER (under the staging bucket),
# which is later passed to the pipeline as its schema folder URI.
!gsutil cp pipeline/schema/schema.pbtxt {SCHEMA_FOLDER}/schema.pbtxt

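## Review the pipeline design

The pipeline is assembled by the `create_pipeline` function in the local `pipeline` package (imported below with `from pipeline import pipeline`); review that module before compiling.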

## Compile the pipeline

In [None]:
from pipeline import pipeline

# Pipeline name; also used as the last path segment of the pipeline root.
pipeline_name = 'covertype-continuous-training'

# Trainer configuration: run the trainer with TFX's GenericExecutor and pass
# no additional custom config.
trainer_custom_executor_spec = executor_spec.ExecutorClassSpec(
    trainer_executor.GenericExecutor
)
trainer_custom_config = None

# Apache Beam options forwarded to the pipeline.
# NOTE(review): per the Beam direct-runner docs, direct_num_workers=0 lets
# Beam choose the worker count from the available cores - confirm.
beam_pipeline_args = [
    '--direct_running_mode=multi_processing',
    '--direct_num_workers=0',
]

# The data location is a runtime parameter, so it can be overridden when a
# run is submitted; DATA_ROOT_URI is the default.
data_root_uri_param = data_types.RuntimeParameter(
    name='data_root_uri',
    ptype=str,
    default=DATA_ROOT_URI,
)

pipeline_def = pipeline.create_pipeline(
    pipeline_name=pipeline_name,
    pipeline_root=f'{STAGING_BUCKET}/pipelines/{pipeline_name}',
    serving_model_uri=f'{STAGING_BUCKET}/models/Covertype',
    data_root_uri=data_root_uri_param,
    schema_folder_uri=SCHEMA_FOLDER,
    train_steps=1000,
    eval_steps=500,
    trainer_custom_executor_spec=trainer_custom_executor_spec,
    trainer_custom_config=trainer_custom_config,
    beam_pipeline_args=beam_pipeline_args,
)

In [None]:
# Output file for the pipeline definition; the same path is later used as
# the job spec when submitting a run.
pipeline_definition_file = 'covertype.json'

# Components run in TFX_IMAGE unless they specify their own image.
runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
    default_image=TFX_IMAGE,
)
runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
    config=runner_config,
    output_filename=pipeline_definition_file,
)

# Run the Kubeflow V2 runner on the pipeline definition and keep its result.
pipeline_json = runner.run(pipeline_def)

## Prepare a TFX container

In [None]:
# Submit the local `pipeline` directory to Cloud Build and tag the resulting
# image as TFX_IMAGE.
# NOTE(review): presumably `pipeline/` contains the Dockerfile - confirm.
!gcloud builds submit --tag {TFX_IMAGE} pipeline

## Submit pipeline runs

In [None]:
# Client used to submit pipeline runs to the configured project and region.
pipeline_client = AIPlatformClient(project_id=PROJECT, region=REGION)

In [None]:
# Values for the pipeline's runtime parameters, keyed by parameter name.
parameter_values = {'data_root_uri': DATA_ROOT_URI}

# Submit a run from the pipeline definition file, with caching disabled,
# under the pipelines service account.
pipeline_client.create_run_from_job_spec(
    job_spec_path=pipeline_definition_file,
    parameter_values=parameter_values,
    service_account=PIPELINES_SA,
    enable_caching=False,
)