In [116]:
import os
import sys
import logging
import uuid
import kfp

import kfp.v2.dsl as dsl

from datetime import datetime
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2 import compiler
from kfp.v2.google import experimental
from kfp.v2.google.client import AIPlatformClient

sys.path.append('pipelines')
from pipelines.pipeline import taxi_tip_predictor_pipeline

In [2]:
kfp.__version__

'1.6.3'

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
# Environment-specific settings used by the cells below.
PROJECT = 'jk-mlops-dev'
REGION = 'us-central1'
STAGING_BUCKET = 'gs://jk-vertex-workshop-bucket'
# Service account the pipeline run is submitted with; it lives in PROJECT.
PIPELINES_SA = f'pipelines-sa@{PROJECT}.iam.gserviceaccount.com'

## Define custom components

In [None]:
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath)


### Data preparation component

In [None]:
@dsl.component(base_image='gcr.io/ml-pipeline/google-cloud-pipeline-components:0.1.1')
def prepare_data_splits_op(
    project: str,
    region: str,
    
    dataset: Output[Dataset]
):
    """Prepares training, validation, and testing data splits.

    NOTE(review): as written, this component does not run any query. The SQL
    template below is assigned to a local variable and never substituted or
    executed, and `project` / `region` are not referenced. The only effect is
    to record two hard-coded table names in `dataset.metadata`.

    Args:
        project: Not used by the current body.
        region: Not used by the current body.
        dataset: Output artifact whose metadata receives the
            'training_split' and 'validation_split' entries.
    """
    
    # Query template with @PROJECT, @DATASET, @TABLE, @YEAR and @LIMIT
    # placeholders. It labels each row as 'training', 'validation' or
    # 'testing' based on a hash of the formatted trip date.
    # NOTE(review): nothing in this function fills in the placeholders or
    # submits the query -- presumably still to be wired up; confirm.
    sql_script_template = '''
    CREATE OR REPLACE TABLE `@PROJECT.@DATASET.@TABLE` 
    AS (
        WITH
        taxitrips AS (
        SELECT
            FORMAT_DATETIME('%Y-%d-%m', trip_start_timestamp) AS date,
            trip_start_timestamp,
            trip_seconds,
            trip_miles,
            payment_type,
            pickup_longitude,
            pickup_latitude,
            dropoff_longitude,
            dropoff_latitude,
            tips,
            fare
        FROM
            `bigquery-public-data.chicago_taxi_trips.taxi_trips`
        WHERE 1=1 
        AND pickup_longitude IS NOT NULL
        AND pickup_latitude IS NOT NULL
        AND dropoff_longitude IS NOT NULL
        AND dropoff_latitude IS NOT NULL
        AND trip_miles > 0
        AND trip_seconds > 0
        AND fare > 0
        AND EXTRACT(YEAR FROM trip_start_timestamp) = @YEAR
        )

        SELECT
        trip_start_timestamp,
        EXTRACT(MONTH from trip_start_timestamp) as trip_month,
        EXTRACT(DAY from trip_start_timestamp) as trip_day,
        EXTRACT(DAYOFWEEK from trip_start_timestamp) as trip_day_of_week,
        EXTRACT(HOUR from trip_start_timestamp) as trip_hour,
        trip_seconds,
        trip_miles,
        payment_type,
        ST_AsText(
            ST_SnapToGrid(ST_GeogPoint(pickup_longitude, pickup_latitude), 0.1)
        ) AS pickup_grid,
        ST_AsText(
            ST_SnapToGrid(ST_GeogPoint(dropoff_longitude, dropoff_latitude), 0.1)
        ) AS dropoff_grid,
        ST_Distance(
            ST_GeogPoint(pickup_longitude, pickup_latitude), 
            ST_GeogPoint(dropoff_longitude, dropoff_latitude)
        ) AS euclidean,
        IF((tips/fare >= 0.2), 1, 0) AS tip_bin,
        CASE (ABS(MOD(FARM_FINGERPRINT(date),10))) 
            WHEN 9 THEN 'testing'
            WHEN 8 THEN 'validation'
            ELSE 'training' END AS data_split
        FROM
        taxitrips
        LIMIT @LIMIT
    )
    '''
    
    # Publish the table names for downstream components via artifact metadata.
    # The values are fixed strings; no 'testing' entry is recorded here.
    dataset.metadata['training_split'] = 'jk-mlops-dev.chicago_taxi_training.training_split'
    dataset.metadata['validation_split'] = 'jk-mlops-dev.chicago_taxi_training.validation_split'

### Trainer component

In [None]:
@dsl.component(base_image='gcr.io/ml-pipeline/google-cloud-pipeline-components:0.1.1')
def train_op(
    project: str,
    region: str,
    epochs: int,
    per_replica_batch_size: int,
    machine_type: str,
    accelerator_type: str,
    accelerator_count: int,
    dataset: Input[Dataset],
    model: Output[Model],
):
    """Prepares and submits Vertex AI Training custom container job.

    The job runs `python train.py` in a single replica of the trainer image
    and blocks until the job finishes.

    NOTE(review): this notebook defines `train_op` a second time in a later
    cell; on a top-to-bottom run that later definition replaces this one.

    Args:
        project: Project passed to `vertex_ai.init`.
        region: Location passed to `vertex_ai.init`.
        epochs: Forwarded to the trainer as `--epochs`.
        per_replica_batch_size: Forwarded as `--per_replica_batch_size`.
        machine_type: Machine type for the worker pool.
        accelerator_type: Accelerator type for the worker pool.
        accelerator_count: Number of accelerators for the worker pool.
        dataset: Input artifact; its metadata must contain 'training_split'
            and 'validation_split'.
        model: Output artifact; its `path` must start with '/gcs/'.

    Raises:
        RuntimeError: If `model.path` is not under '/gcs/'.
    """
    # Imports stay inside the function: the component runs from this
    # function's source alone, so module-level notebook imports are not
    # available to it.
    import time

    from google.cloud import aiplatform as vertex_ai

    CONTAINER_IMAGE_URI = 'gcr.io/jk-mlops-dev/taxi_classifier_trainer'
    GCS_MOUNT_PREFIX = '/gcs/'

    # Derive the gs:// staging location from the '/gcs/<bucket>/...' path.
    if not model.path.startswith(GCS_MOUNT_PREFIX):
        raise RuntimeError('Model dir must be a GCS location.')
    parent_dir, _, leaf_name = model.path.rpartition('/')
    # Drop a trailing 'model' segment; otherwise use the path unchanged.
    # (The previous code assigned the rsplit() list here, which failed on
    # the following string operation.)
    output_path = parent_dir if leaf_name == 'model' else model.path
    base_output_dir = 'gs://' + output_path[len(GCS_MOUNT_PREFIX):]

    # Prepare worker pool specification
    trainer_args = [
        '--epochs=' + str(epochs),
        '--per_replica_batch_size=' + str(per_replica_batch_size),
        '--training_table=' + dataset.metadata['training_split'],
        '--validation_table=' + dataset.metadata['validation_split'],
    ]
    worker_pool_specs = [
        {
            "machine_spec": {
                "machine_type": machine_type,
                "accelerator_type": accelerator_type,
                "accelerator_count": accelerator_count,
            },
            "replica_count": 1,
            "container_spec": {
                "image_uri": CONTAINER_IMAGE_URI,
                "command": ["python", "train.py"],
                "args": trainer_args,
            },
        }
    ]

    # Submit the job
    vertex_ai.init(
        project=project,
        location=region
    )

    job_name = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
    job = vertex_ai.CustomJob(
        display_name=job_name,
        worker_pool_specs=worker_pool_specs,
        staging_bucket=base_output_dir
    )

    # sync=True blocks until the training job completes.
    job.run(sync=True)


# Run timestamp (YYYYmmddHHMMSS), displayed as the cell output.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
TIMESTAMP

In [110]:

@dsl.component(base_image='gcr.io/ml-pipeline/google-cloud-pipeline-components:0.1.1')
def prepare_dataset_op(
    project: str,
    region: str,
    dataset: Output[Dataset]
):
    """Records the training and validation table names on the output dataset.

    No data is read or written; the table names are fixed strings.

    Args:
        project: Not used by the current body.
        region: Not used by the current body.
        dataset: Output artifact whose metadata receives the
            'training_split' and 'validation_split' entries.
    """
    dataset.metadata['training_split'] = 'jk-mlops-dev.chicago_taxi_training.training_split'
    dataset.metadata['validation_split'] = 'jk-mlops-dev.chicago_taxi_training.validation_split'


@dsl.component(base_image='gcr.io/ml-pipeline/google-cloud-pipeline-components:0.1.1')
def train_op(
    project: str,
    region: str,
    epochs: int,
    per_replica_batch_size: int,
    machine_type: str,
    accelerator_type: str,
    accelerator_count: int,
    dataset: Input[Dataset],
    model: Output[Model],
):
    """Submits a Vertex AI custom-container training job and waits for it.

    Same job as the earlier `train_op` cell, with extra `print` diagnostics
    of the model path, derived staging location and worker pool spec.

    NOTE(review): this redefines `train_op` from an earlier cell and replaces
    it on a top-to-bottom run; consider keeping only one definition.

    Args:
        project: Project passed to `vertex_ai.init`.
        region: Location passed to `vertex_ai.init`.
        epochs: Forwarded to the trainer as `--epochs`.
        per_replica_batch_size: Forwarded as `--per_replica_batch_size`.
        machine_type: Machine type for the worker pool.
        accelerator_type: Accelerator type for the worker pool.
        accelerator_count: Number of accelerators for the worker pool.
        dataset: Input artifact; its metadata must contain 'training_split'
            and 'validation_split'.
        model: Output artifact; its `path` must start with '/gcs/'.

    Raises:
        RuntimeError: If `model.path` is not under '/gcs/'.
    """
    # Imports stay inside the function: the component runs from this
    # function's source alone, so module-level notebook imports are not
    # available to it.
    import time

    from google.cloud import aiplatform as vertex_ai

    CONTAINER_IMAGE_URI = 'gcr.io/jk-mlops-dev/taxi_classifier_trainer'
    GCS_MOUNT_PREFIX = '/gcs/'

    vertex_ai.init(
        project=project,
        location=region
    )

    job_name = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))

    print('************************')
    print(model.path)

    # Derive the gs:// staging location from the '/gcs/<bucket>/...' path.
    if not model.path.startswith(GCS_MOUNT_PREFIX):
        raise RuntimeError('Model dir must be a GCS location.')
    parent_dir, _, leaf_name = model.path.rpartition('/')
    # Drop a trailing 'model' segment; otherwise use the path unchanged.
    # (The previous code assigned the rsplit() list here, which failed on
    # the following string operation.)
    output_path = parent_dir if leaf_name == 'model' else model.path
    base_output_dir = 'gs://' + output_path[len(GCS_MOUNT_PREFIX):]

    print('******************')
    print(base_output_dir)

    trainer_args = [
        '--epochs=' + str(epochs),
        '--per_replica_batch_size=' + str(per_replica_batch_size),
        '--training_table=' + dataset.metadata['training_split'],
        '--validation_table=' + dataset.metadata['validation_split'],
    ]
    worker_pool_specs = [
        {
            "machine_spec": {
                "machine_type": machine_type,
                "accelerator_type": accelerator_type,
                "accelerator_count": accelerator_count,
            },
            "replica_count": 1,
            "container_spec": {
                "image_uri": CONTAINER_IMAGE_URI,
                "command": ["python", "train.py"],
                "args": trainer_args,
            },
        }
    ]

    print('########################')
    print(worker_pool_specs)
    print(base_output_dir)
    print(machine_type)
    print('########################')

    job = vertex_ai.CustomJob(
        display_name=job_name,
        worker_pool_specs=worker_pool_specs,
        staging_bucket=base_output_dir
    )

    # sync=True blocks until the training job completes.
    job.run(sync=True)
    

@dsl.component(base_image='gcr.io/ml-pipeline/google-cloud-pipeline-components:0.1.1')
def test_op(
    input1: Input[Model]
):
    """Prints the received model artifact between two marker lines.

    Args:
        input1: Model artifact to print.
    """
    print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
    print(input1)
    print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')

In [111]:
VERTEX_TRAINING_JOB_NAME = 'taxi-tip-predictor-training-job'
PIPELINE_NAME = 'taxi-tip-predictor-continuous-training'


@kfp.dsl.pipeline(name=PIPELINE_NAME)
def taxi_tip_predictor_pipeline(
    project: str,
    region: str,
    staging_bucket: str,
    epochs: int,
    per_replica_batch_size: int,
    training_table: str,
    validation_table: str,
    machine_type: str = 'n1-standard-4',
    accelerator_type: str = 'NVIDIA_TESLA_T4',
    accelerator_count: int = 1,
):
    """Chains dataset preparation, training, and a model-printing step.

    Steps: `prepare_dataset_op` -> `train_op` -> `test_op`. The dataset
    output of the first step feeds the trainer, and the trainer's model
    output feeds the final step.

    NOTE(review): `staging_bucket`, `training_table` and `validation_table`
    are accepted but not referenced in the body.
    """
    # Step 1: produce the dataset artifact.
    dataset_task = prepare_dataset_op(project=project, region=region)

    # Step 2: train on the dataset produced by step 1.
    training_task = train_op(
        project=project,
        region=region,
        dataset=dataset_task.outputs['dataset'],
        epochs=epochs,
        per_replica_batch_size=per_replica_batch_size,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )

    # Step 3: consume the model artifact produced by step 2.
    test_op(input1=training_task.outputs['model'])

    
    

### Compile the pipeline

In [112]:
# Compile the pipeline function and write the result to `package_path`.
package_path = 'taxi_tip_predictor_pipeline.json'
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=taxi_tip_predictor_pipeline,
    package_path=package_path,
)

### Submit a pipeline run

In [113]:
# Client used below to submit the compiled pipeline.
api_client = AIPlatformClient(project_id=PROJECT, region=REGION)

In [114]:
# Run settings.
pipeline_root = STAGING_BUCKET + '/pipelines'
model_display_name = 'Taxi tip predictor'
training_container_image = 'gcr.io/jk-mlops-dev/taxi_classifier_trainer'
epochs = 3
per_replica_batch_size = 128
training_table = 'jk-mlops-dev.chicago_taxi_training.training_split'
validation_table = 'jk-mlops-dev.chicago_taxi_training.validation_split'

# Values for the pipeline's required parameters; machine and accelerator
# settings are left at the pipeline's defaults.
parameter_values = dict(
    project=PROJECT,
    region=REGION,
    staging_bucket=STAGING_BUCKET,
    epochs=epochs,
    per_replica_batch_size=per_replica_batch_size,
    training_table=training_table,
    validation_table=validation_table,
)

# Submit the compiled job spec as a run under the pipelines service account.
response = api_client.create_run_from_job_spec(
    package_path,
    pipeline_root=pipeline_root,
    parameter_values=parameter_values,
    service_account=PIPELINES_SA,
)