## Define Constants

In [50]:
# Google Cloud project ID; interpolated into the training image URI below.
PROJECT_ID = 'jchavezar-demo'
# Tag given to the custom training image by the Cloud Build cell and used as
# the container image in the worker pool spec.
TRAINING_IMAGE_URI = f'gcr.io/{PROJECT_ID}/demos-train-aws:latest'
# Image handed to the pipeline as its `serving_image_uri` parameter.
SERVING_CONTAINER_IMAGE_URI = 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-9:latest'
# Despite the name this points at a single CSV object on S3; it is exposed to
# the training container through the FILE_URI environment variable.
DATASET_DIR = 's3://gml-datasets/fraud_detection.csv'
# GCS prefix handed to the pipeline as its `model_dir` parameter.
MODEL_DIR = 'gs://vtx-models/aws/hpt'

## Create Folder Structure

In [51]:
import os

# Create the build-context directory for the training image. exist_ok keeps the
# cell idempotent and avoids the check-then-create race of a separate exists() test.
os.makedirs("1_custom_train_job", exist_ok=True)

## Create Training File

In [59]:
%%writefile 1_custom_train_job/main.py

# Extracting information from AWS S3

import os
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
import hypertune

# Number of training epochs; main() also reports this value as the
# hypertune global_step.
epochs = 20
# Mini-batch size that main() forwards to train_model().
batch_size = 100

def get_args():
    '''Read the tunable hyperparameters from the command line.

    Every hyperparameter that should be tuned needs a matching flag here;
    all three flags are mandatory.

    Returns:
        argparse.Namespace with `learning_rate` (float), `num_neurons_1`
        (int) and `num_neurons_2` (int).
    '''
    # (flag, value type, help text) for each hyperparameter, in declaration order.
    hyperparameter_flags = (
        ('--learning_rate', float, 'learning rate'),
        ('--num_neurons_1', int, 'number of units in first hidden layer'),
        ('--num_neurons_2', int, 'number of units in second hidden layer'),
    )

    cli = argparse.ArgumentParser()
    for flag, value_type, description in hyperparameter_flags:
        cli.add_argument(flag, required=True, type=value_type, help=description)
    return cli.parse_args()


def preprocess_data(dataset: str):
    '''Load the CSV, split it 80/20 and standardize the features.

    Args:
        dataset: Location of the CSV file. It is handed to `pd.read_csv`
            together with the credentials found in the AWS_ACCESS_KEY_ID and
            AWS_SECRET_ACCESS_KEY environment variables. The data must
            contain a `Class` column, which is used as the label.

    Returns:
        Tuple `(X_train_norm, y_train, X_test_norm, y_test)`. The feature
        frames are standardized with the training-set statistics; the labels
        are float32 Series.

    Raises:
        KeyError: If either AWS environment variable is missing or the data
            has no `Class` column.
    '''
    df = pd.read_csv(
        dataset,
        storage_options={
            "key": os.environ["AWS_ACCESS_KEY_ID"],
            "secret": os.environ["AWS_SECRET_ACCESS_KEY"],
        },
    )

    # Fixed seed so every run sees the same 80/20 split.
    train_df = df.sample(frac=0.8, random_state=1)
    test_df = df.drop(train_df.index)

    # Select the features by name rather than by position, so the label can
    # never end up among the inputs if `Class` is not the last column.
    X_train = train_df.drop(columns=['Class'])
    X_test = test_df.drop(columns=['Class'])
    y_train = train_df['Class'].astype(np.float32)
    y_test = test_df['Class'].astype(np.float32)

    # Standardization: the statistics come from the training split only and
    # are applied to both splits, so train and test share one scale and no
    # test-set information leaks into the transform.
    train_mean = X_train.mean()
    # A constant column has std 0; dividing by 1 instead maps it to 0, not NaN.
    train_std = X_train.std().replace(0, 1)

    X_train_norm = (X_train - train_mean) / train_std
    X_test_norm = (X_test - train_mean) / train_std

    return X_train_norm, y_train, X_test_norm, y_test
    
    # Model

def create_model(
    ds_length: int,
    my_learning_rate: float,
    nn_1: int,
    nn_2: int):
    '''Assemble and compile the binary classifier.

    Args:
        ds_length: Number of input features (width of the first layer's input).
        my_learning_rate: Learning rate for the Adam optimizer.
        nn_1: Units in the first hidden layer.
        nn_2: Units in the second hidden layer.

    Returns:
        A compiled Sequential model: two ReLU hidden layers followed by a
        single sigmoid output, trained with binary cross-entropy.
    '''
    from tensorflow.keras import layers

    keras_metrics = tf.keras.metrics
    tracked_metrics = [
        keras_metrics.TruePositives(name='tp'),
        keras_metrics.FalsePositives(name='fp'),
        keras_metrics.TrueNegatives(name='tn'),
        keras_metrics.FalseNegatives(name='fn'),
        keras_metrics.BinaryAccuracy(name='accuracy'),
        keras_metrics.Precision(name='precision'),
        keras_metrics.Recall(name='recall'),
        keras_metrics.AUC(name='auc'),
        # Area under the precision-recall curve.
        keras_metrics.AUC(name='prc', curve='PR'),
    ]

    # Stack the layers one by one instead of passing them as a list.
    network = tf.keras.models.Sequential()
    network.add(layers.Dense(nn_1, activation='relu', input_shape=[ds_length]))
    network.add(layers.Dense(nn_2, activation='relu'))
    network.add(layers.Dense(1, activation='sigmoid'))

    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=my_learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=tracked_metrics)

    return network

def train_model(
    x,
    y,
    model,
    epochs,
    batch_size=None,
    shuffle=True
):
    '''Fit `model` on (x, y) and return the recorded training history.

    Args:
        x: Training features, forwarded to `model.fit`.
        y: Training labels, forwarded to `model.fit`.
        model: Object exposing a Keras-style `fit` method.
        epochs: Number of epochs to train for.
        batch_size: Mini-batch size forwarded to `model.fit`.
        shuffle: Forwarded to `model.fit`.

    Returns:
        pandas.DataFrame built from `history.history`, i.e. one column per
        recorded quantity.
    '''
    history = model.fit(
        x=x,
        y=y,
        batch_size=batch_size,
        epochs=epochs,
        shuffle=shuffle)

    # The `epochs` argument is deliberately left untouched: nothing here needs
    # history.epoch, and rebinding the parameter to it only obscured its meaning.
    return pd.DataFrame(history.history)

def main():
    '''Train one model and report its accuracy for hyperparameter tuning.

    Reads the hyperparameters from the command line, the dataset location
    from the FILE_URI environment variable and the output location from
    AIP_MODEL_DIR. After training, the final-epoch accuracy is reported
    through hypertune and the model is saved.

    Raises:
        KeyError: If FILE_URI or AIP_MODEL_DIR is not set.
    '''
    import sys

    # The following variables are the hyperparameters.
    args = get_args()
    print(args.learning_rate)

    # NOTE(review): the held-out split (X_test_norm, y_test) is returned but
    # not used below, so the reported metric is training accuracy.
    X_train_norm, y_train, X_test_norm, y_test = preprocess_data(os.environ['FILE_URI'])
    ds_length = len(X_train_norm.keys())

    # Establish the model's topology.
    my_model = create_model(ds_length, args.learning_rate, args.num_neurons_1, args.num_neurons_2)

    # Train the model on the training set.
    hist = train_model(X_train_norm, y_train, my_model, epochs, batch_size)

    # Report the accuracy reached after the LAST epoch. Index 0 is the first
    # epoch, which says little about the fully trained model.
    hp_metric = float(hist['accuracy'].iloc[-1])

    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='accuracy',
        metric_value=hp_metric,
        global_step=epochs)

    # Echo the output location on both streams, then save the model there.
    model_dir = os.environ['AIP_MODEL_DIR']
    print(model_dir)
    print(model_dir, file=sys.stderr)

    my_model.save(model_dir)

# Run the training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Overwriting 1_custom_train_job/main.py


## Create Dockerfile

In [60]:
%%writefile 1_custom_train_job/Dockerfile

# Training image: TensorFlow base plus the S3 and hypertune client libraries.
# NOTE(review): the base image tag is not pinned, so a rebuild may pick up a
# different TensorFlow release than the TF 2.9 serving container. Consider
# pinning a tag.
FROM tensorflow/tensorflow

# The three installs are chained, in order, inside a single layer;
# --no-cache-dir keeps pip's download cache out of the image.
RUN pip install --no-cache-dir boto3 pandas s3fs \
    && pip install --no-cache-dir --upgrade protobuf==3.20.0 \
    && pip install --no-cache-dir cloudml-hypertune

COPY main.py /main.py

CMD ["python", "/main.py"]

Overwriting 1_custom_train_job/Dockerfile


## Create Docker Image with CloudBuild

In [61]:
# Build the contents of 1_custom_train_job/ with Cloud Build and tag the
# resulting image as TRAINING_IMAGE_URI.
!gcloud builds submit -t $TRAINING_IMAGE_URI 1_custom_train_job/.

Creating temporary tarball archive of 2 file(s) totalling 4.0 KiB before compression.
Uploading tarball of [1_custom_train_job/.] to [gs://jchavezar-demo_cloudbuild/source/1665083813.342888-8c64544d4ce64462b8f82fe08ded2489.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/jchavezar-demo/locations/global/builds/06916014-e298-4213-be9e-012dde90c1b4].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/06916014-e298-4213-be9e-012dde90c1b4?project=569083142710 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "06916014-e298-4213-be9e-012dde90c1b4"

FETCHSOURCE
Fetching storage object: gs://jchavezar-demo_cloudbuild/source/1665083813.342888-8c64544d4ce64462b8f82fe08ded2489.tgz#1665083813565708
Copying gs://jchavezar-demo_cloudbuild/source/1665083813.342888-8c64544d4ce64462b8f82fe08ded2489.tgz#1665083813565708...
/ [1 files][  1.7 KiB/  1.7 KiB]                                                
Operation complet

## Create the Pipeline

In [62]:
## Define Job Specs
import env
from google.cloud.aiplatform import hyperparameter_tuning as hpt
from google_cloud_pipeline_components.experimental import hyperparameter_tuning_job

# Environment variables injected into the training container.
# NOTE(review): the AWS credentials are placed in the job spec as plain
# values. Confirm whether they end up in the compiled pipeline file and
# consider a secret store instead.
_container_env = {
    'FILE_URI': DATASET_DIR,
    'AWS_ACCESS_KEY_ID': env.AWS_ACCESS_KEY_ID,
    'AWS_SECRET_ACCESS_KEY': env.AWS_SECRET_ACCESS_KEY,
}

# A single-replica worker pool running the custom training image.
worker_pool_specs = [
    {
        'machine_spec': {'machine_type': 'n1-standard-4'},
        'replica_count': 1,
        'container_spec': {
            'image_uri': TRAINING_IMAGE_URI,
            'env': [
                {'name': var_name, 'value': var_value}
                for var_name, var_value in _container_env.items()
            ],
        },
    }
]

# Objective of the study: maximize the metric reported under the tag 'accuracy'.
metric_spec = hyperparameter_tuning_job.serialize_metrics({'accuracy': 'maximize'})

# Both hidden layers are tuned over the same set of layer widths.
_NEURON_CHOICES = (16, 32, 64, 128, 256)
_search_space = {
    "learning_rate": hpt.DoubleParameterSpec(min=0.001, max=1, scale="log"),
    "num_neurons_1": hpt.DiscreteParameterSpec(values=list(_NEURON_CHOICES), scale=None),
    "num_neurons_2": hpt.DiscreteParameterSpec(values=list(_NEURON_CHOICES), scale=None),
}
parameter_spec = hyperparameter_tuning_job.serialize_parameters(_search_space)

In [63]:
import os
from kfp.v2 import dsl
from kfp.v2.dsl import pipeline
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp
from google_cloud_pipeline_components.experimental import hyperparameter_tuning_job
from google_cloud_pipeline_components.aiplatform import (
    EndpointCreateOp,
    ModelDeployOp,
    ModelUploadOp,
)

# End-to-end pipeline: run the tuning study, pick the best trial and, when its
# accuracy clears the threshold, retrain with the best hyperparameters, upload
# the model and deploy it to a new endpoint.
#
# Parameters:
#   project_id        - Google Cloud project the jobs run in.
#   model_dir         - GCS prefix used as the base output directory.
#   serving_image_uri - serving container image for the uploaded model.
#
# The decorator is referenced through `dsl` because the function defined
# below rebinds the bare name `pipeline`.
@dsl.pipeline(name='aws-gcp-test')
def pipeline(
    project_id: str,
    model_dir: str,
    serving_image_uri: str
):
    hp_tuning_task = hyperparameter_tuning_job.HyperparameterTuningJobRunOp(
        project=project_id,
        display_name='hpt_custom_train_task',
        worker_pool_specs=worker_pool_specs,
        study_spec_metrics=metric_spec,
        study_spec_parameters=parameter_spec,
        max_trial_count=15,
        parallel_trial_count=3,
        base_output_directory=model_dir
    )

    trials_task = hyperparameter_tuning_job.GetTrialsOp(
        gcp_resources=hp_tuning_task.outputs['gcp_resources'])

    best_trial_task = hyperparameter_tuning_job.GetBestTrialOp(
        trials=trials_task.output, study_spec_metrics=metric_spec)

    is_accuracy_beyond_threshold_task = hyperparameter_tuning_job.IsMetricBeyondThresholdOp(
        trial=best_trial_task.output, study_spec_metrics=metric_spec, threshold=0.7)

    # Everything below only runs when the best trial clears the threshold.
    with dsl.Condition(
        is_accuracy_beyond_threshold_task.output == "true",
        name="deploy_decision",
    ):

        best_hyperparameters_task = hyperparameter_tuning_job.GetHyperparametersOp(
            trial=best_trial_task.output)

        # Construct new worker_pool_specs based on best hyperparameters
        worker_pool_specs_task = hyperparameter_tuning_job.GetWorkerPoolSpecsOp(
            best_hyperparameters=best_hyperparameters_task.output,
            worker_pool_specs=worker_pool_specs
        )

        # Train new model based on new worker_pool_specs.
        # base_output_directory is required here: main.py saves the model to
        # AIP_MODEL_DIR, which Vertex AI derives from the base output
        # directory (<base>/model for a custom job). That is also the path
        # the upload step below reads from.
        training_task = CustomTrainingJobOp(
            project=project_id,
            display_name='training-job',
            worker_pool_specs=worker_pool_specs_task.output,
            base_output_directory=model_dir
        )

        model_upload_task = ModelUploadOp(
            project=project_id,
            display_name='tf-kfp-prebuilt-model-upload-job',
            artifact_uri=f'{model_dir}/model',
            serving_container_image_uri=serving_image_uri,
        ).after(training_task)

        create_endpoint_task = EndpointCreateOp(
            project=project_id,
            display_name='fraud_det'
        ).after(model_upload_task)

        model_deploy_task = ModelDeployOp(
            endpoint=create_endpoint_task.outputs["endpoint"],
            model=model_upload_task.outputs["model"],
            dedicated_resources_min_replica_count=1,
            dedicated_resources_max_replica_count=1,
            dedicated_resources_machine_type='n1-standard-4'
        )

In [66]:
import os
from kfp.v2 import dsl
from kfp.v2.dsl import pipeline
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp
from google_cloud_pipeline_components.experimental import hyperparameter_tuning_job
from google_cloud_pipeline_components.aiplatform import (
    EndpointCreateOp,
    ModelDeployOp,
    ModelUploadOp,
)

# NOTE(review): this defines `pipeline` a second time, replacing any earlier
# definition of that name in the notebook. The compile cell therefore picks up
# this version, which only runs the hyperparameter tuning job.
@pipeline(name='aws-gcp-test')
def pipeline(
    project_id: str,
    model_dir: str,
    serving_image_uri: str
):
    # serving_image_uri is accepted but not used by this tuning-only variant.
    # worker_pool_specs, metric_spec and parameter_spec are notebook-level
    # variables defined in the job-spec cell.
    hp_tuning_task = hyperparameter_tuning_job.HyperparameterTuningJobRunOp(
        project=project_id,
        display_name='hpt_custom_train_task',
        worker_pool_specs = worker_pool_specs,
        study_spec_metrics=metric_spec,
        study_spec_parameters=parameter_spec,
        max_trial_count=15,
        parallel_trial_count=3,
        base_output_directory=model_dir
    )

## Compile Pipeline

In [67]:
from kfp.v2 import compiler
# Compile the current `pipeline` function into the job spec file that the
# run cell loads as its template.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=pipeline,
    package_path='aws_gcp_test.json',
)

## Run Pipeline Job

In [68]:
import google.cloud.aiplatform as aip

# Create a pipeline run from the compiled spec and submit it. The project ID
# comes from the PROJECT_ID constant so it cannot drift from the project the
# training image was pushed to.
job = aip.PipelineJob(
    display_name='customjob-aws-gcp',
    template_path='aws_gcp_test.json',
    pipeline_root='gs://vtx-path-root',
    parameter_values={
        'project_id': PROJECT_ID,
        'model_dir': MODEL_DIR,
        'serving_image_uri': SERVING_CONTAINER_IMAGE_URI
    },
    # Always re-execute every step instead of reusing cached results.
    enable_caching=False
)

job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/569083142710/locations/us-central1/pipelineJobs/aws-gcp-test-20221006192405
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/569083142710/locations/us-central1/pipelineJobs/aws-gcp-test-20221006192405')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/aws-gcp-test-20221006192405?project=569083142710
