# Vertex Regression with .HyperparameterTuningJob() using Azure Container as source of Datasets

<center><img src="../images/03.png"/></center>

## Graphic Description

<center><img src="../images/azure-aws-hpt.png"/></center>

## Set Constants

In [1]:
PROJECT_ID = 'jchavezar-demo'
REGION = 'us-central1'
APPLICATION_DIR = 'hptune'
TRAINER_DIR = f"{APPLICATION_DIR}/trainer"
MODEL_URI = 'gs://vtx-models/mpg'
STAGING_URI = 'gs://vtx-staging/mpg/'
TRAIN_IMAGE_URI = f'gcr.io/{PROJECT_ID}/03cb-tf-train:latest'
PREDICTION_IMAGE_URI = 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-9:latest'

## Import Libraries

In [2]:
import env
import google.cloud.aiplatform as aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt

## Create Folder Structure

```
source
     └─── trainer
          |  train.py
          |

```

In [3]:
!rm -fr $APPLICATION_DIR
!mkdir -p $TRAINER_DIR

## Write Training Code

In [4]:
%%writefile {TRAINER_DIR}/task.py
import os
import argparse
import hypertune
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from azure.storage.blob import BlobServiceClient

def get_args():
    '''Parses args. Must include all hyperparameters you want to tune.'''
    
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--learning_rate',
        required=True,
        type=float,
        help='learning rate')
    parser.add_argument(
        '--dnn_hidden_units',
        required=True,
        type=str,
        help='Number of hidden layers and nueral networks')
    args = parser.parse_args()
    return args
args = get_args()

def load_data():
    '''Load data from Azure blob storage.'''
    
    # Create the BlobServiceClient object
    connect_str = os.environ['AZURE_STORAGE_CONNECTION_STRING']
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
    
    # Load features (training dataset) from azure to pandas df
    azure_file = 'train.csv'
    blob_client_instance = blob_service_client.get_blob_client('vertex', azure_file, snapshot=None)
    with open(azure_file, 'wb') as my_blob:
        blob_data = blob_client_instance.download_blob()
        blob_data.readinto(my_blob)
    train_df = pd.read_csv(azure_file)
    
    # Load target/labels (values to predict) from AWS to pandas df
    train_labels = pd.read_csv(
    "s3://gml-datasets/train_labels.csv",
    storage_options={
        "key": os.environ["AWS_ACCESS_KEY_ID"],
        "secret": os.environ["AWS_SECRET_ACCESS_KEY"],
    },)
    
    return train_df, train_labels 
        
def build_model(dnn_hidden_units, learning_rate):
    '''Defines and compiles model.'''
    
    model = keras.Sequential()
    
    for neurons in dnn_hidden_units.split(','):
        model.add(layers.Dense(neurons, activation='relu', input_shape=[9]))
        model.add(layers.Dropout(0.2))
    model.add(layers.Dense(1))
    optimizer = tf.keras.optimizers.RMSprop(learning_rate)

    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae', 'mse'])
    return model

def main():
    normed_train_data, train_labels = load_data()
    model = build_model(args.dnn_hidden_units, args.learning_rate)
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
    if os.getenv('HPTUNE') == 'yes':
        NUM_EPOCHS = 5
        history = model.fit(
            normed_train_data, train_labels, 
            epochs=NUM_EPOCHS, validation_split = 0.2, 
            callbacks=[early_stop])
        
        # DEFINE METRIC
        hp_metric = history.history['mae'][-1]

        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='mae',
            metric_value=hp_metric,
            global_step=NUM_EPOCHS)
    else:
        NUM_EPOCHS = 30
        history = model.fit(
            normed_train_data, train_labels, 
            epochs=NUM_EPOCHS, validation_split = 0.2, 
            callbacks=[early_stop])
        model.save(os.environ['AIP_MODEL_DIR'])

if __name__ == "__main__":
    main()

Writing hptune/trainer/task.py


## Write Dockerfile

In [5]:
%%writefile {APPLICATION_DIR}/Dockerfile

FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-8

WORKDIR /

# Installs hypertune library
RUN pip install azure-storage-blob
RUN pip install boto3 pandas s3fs
RUN pip install cloudml-hypertune

# Copies the trainer code to the Docker image.
COPY trainer /trainer

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python", "-m", "trainer.task"]

Writing hptune/Dockerfile


## Build and Push Docker Image into Google Container Repository

In [6]:
!gcloud builds submit -t $TRAIN_IMAGE_URI $APPLICATION_DIR/.

Creating temporary tarball archive of 2 file(s) totalling 3.4 KiB before compression.
Uploading tarball of [hptune/.] to [gs://jchavezar-demo_cloudbuild/source/1673283546.811167-6463c4335f1e4db5b51ca2852cabea92.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/jchavezar-demo/locations/global/builds/cdd016e2-b464-4fa5-97cb-d5736dd09a6c].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/cdd016e2-b464-4fa5-97cb-d5736dd09a6c?project=569083142710 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "cdd016e2-b464-4fa5-97cb-d5736dd09a6c"

FETCHSOURCE
Fetching storage object: gs://jchavezar-demo_cloudbuild/source/1673283546.811167-6463c4335f1e4db5b51ca2852cabea92.tgz#1673283547098549
Copying gs://jchavezar-demo_cloudbuild/source/1673283546.811167-6463c4335f1e4db5b51ca2852cabea92.tgz#1673283547098549...
/ [1 files][  1.7 KiB/  1.7 KiB]                                                
Operation completed over 1 ob

## Configure a Hyperparameter Tuning Job

In [7]:
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "n1-standard-4",
            "accelerator_type": "NVIDIA_TESLA_T4",
            "accelerator_count": 1,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": TRAIN_IMAGE_URI,
            "env": [
                {
                    "name": "AZURE_STORAGE_CONNECTION_STRING",
                    "value": env.AZURE_STORAGE_CONNECTION_STRING
                },
                {
                    'name': 'AWS_ACCESS_KEY_ID',
                    'value': env.AWS_ACCESS_KEY_ID
                },
                {
                    'name': 'AWS_SECRET_ACCESS_KEY',
                    'value': env.AWS_SECRET_ACCESS_KEY
                },
                {
                    "name": "HPTUNE",
                    "value": "yes"
                },
            ]
        },
    }
]

In [8]:
## Parameter specs for hyperparemeter tuning job
parameter_spec = {
    "learning_rate": hpt.DoubleParameterSpec(min=0.001, max=1, scale="log"),
    "dnn_hidden_units": hpt.CategoricalParameterSpec(values=["16", "32", "64", "64,64", "16,32", "64,16,4", "64,64,64,8", "16,32,16", "64,256", "256,64,16", "8,16"]),
}
metric_spec = {"mae": "minimize"}

In [9]:
my_custom_job = aiplatform.CustomJob(
    display_name="milespergallon-prediction-job",
    worker_pool_specs=worker_pool_specs,
    staging_bucket=STAGING_URI,
)

In [10]:
hp_job = aiplatform.HyperparameterTuningJob(
    display_name="milespergallon-prediction-job",
    custom_job=my_custom_job,
    metric_spec=metric_spec,
    parameter_spec=parameter_spec,
    max_trial_count=36,
    parallel_trial_count=6,
)

hp_job.run()

Creating HyperparameterTuningJob
HyperparameterTuningJob created. Resource name: projects/569083142710/locations/us-central1/hyperparameterTuningJobs/2861343908795777024
To use this HyperparameterTuningJob in another session:
hpt_job = aiplatform.HyperparameterTuningJob.get('projects/569083142710/locations/us-central1/hyperparameterTuningJobs/2861343908795777024')
View HyperparameterTuningJob:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/2861343908795777024?project=569083142710
HyperparameterTuningJob projects/569083142710/locations/us-central1/hyperparameterTuningJobs/2861343908795777024 current state:
JobState.JOB_STATE_RUNNING
HyperparameterTuningJob projects/569083142710/locations/us-central1/hyperparameterTuningJobs/2861343908795777024 current state:
JobState.JOB_STATE_RUNNING
HyperparameterTuningJob projects/569083142710/locations/us-central1/hyperparameterTuningJobs/2861343908795777024 current state:
JobState.JOB_STATE_RUNNING
HyperparameterTuningJ

## Getting Best Trial Parameters

In [11]:
metrics_values = [v.final_measurement.metrics[0].value for n,v in enumerate(hp_job.trials)]
lowest_value = min(metrics_values)

results = {}

for n,v in enumerate(hp_job.trials):
    if v.final_measurement.metrics[0].value == lowest_value:
        best_trial = n
        for i in v.parameters:
            results[i.parameter_id] = i.value
print(f'The best trial was: {best_trial} and their paramaters: \n {results}')

The best trial was: 20 and their paramaters: 
 {'dnn_hidden_units': '32', 'learning_rate': 0.027525819470277928}


In [12]:
results['learning_rate']

0.027525819470277928

## Configure Training Job with Best Metrics

In [13]:
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "n1-standard-4",
            "accelerator_type": "NVIDIA_TESLA_T4",
            "accelerator_count": 1,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": TRAIN_IMAGE_URI,
            "env": [
                {
                    "name": "AZURE_STORAGE_CONNECTION_STRING",
                    "value": env.AZURE_STORAGE_CONNECTION_STRING
                },
                {
                    'name': 'AWS_ACCESS_KEY_ID',
                    'value': env.AWS_ACCESS_KEY_ID
                },
                {
                    'name': 'AWS_SECRET_ACCESS_KEY',
                    'value': env.AWS_SECRET_ACCESS_KEY
                },
                {
                    "name": "HPTUNE",
                    "value": "no"
                },
            ],
            "args": [
                f"--learning_rate={results['learning_rate']}",
                f"--dnn_hidden_units={results['dnn_hidden_units']}"
            ]
        },
    }
]

In [14]:
my_custom_job = aiplatform.CustomJob(
    display_name = "milespergallon-prediction-job",
    worker_pool_specs = worker_pool_specs,
    base_output_dir = MODEL_URI,
    staging_bucket = STAGING_URI,
)

model = my_custom_job.run()

Creating CustomJob
CustomJob created. Resource name: projects/569083142710/locations/us-central1/customJobs/1445524775941177344
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/569083142710/locations/us-central1/customJobs/1445524775941177344')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/1445524775941177344?project=569083142710
CustomJob projects/569083142710/locations/us-central1/customJobs/1445524775941177344 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/1445524775941177344 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/1445524775941177344 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/1445524775941177344 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/1445524

## Upload Model

In [15]:
model = aiplatform.Model.upload(
    display_name = 'azure-03cc-mpg_tf_v2',
    serving_container_image_uri = PREDICTION_IMAGE_URI,
    artifact_uri = f'{MODEL_URI}/model',
)

Creating Model
Create Model backing LRO: projects/569083142710/locations/us-central1/models/73643089805180928/operations/5546236611018096640
Model created. Resource name: projects/569083142710/locations/us-central1/models/73643089805180928@1
To use this Model in another session:
model = aiplatform.Model('projects/569083142710/locations/us-central1/models/73643089805180928@1')


In [16]:
endpoint = model.deploy(
    deployed_model_display_name = '03cc-aa-mpg_tf_ep_dep',
    traffic_percentage = 100,
    machine_type = 'n1-standard-2',
    min_replica_count = 1,
    max_replica_count = 1,
)

Creating Endpoint
Create Endpoint backing LRO: projects/569083142710/locations/us-central1/endpoints/3689103403746590720/operations/401436986700726272
Endpoint created. Resource name: projects/569083142710/locations/us-central1/endpoints/3689103403746590720
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/569083142710/locations/us-central1/endpoints/3689103403746590720')
Deploying model to Endpoint : projects/569083142710/locations/us-central1/endpoints/3689103403746590720
Deploy Endpoint model backing LRO: projects/569083142710/locations/us-central1/endpoints/3689103403746590720/operations/7542457145850068992
Endpoint model deployed. Resource name: projects/569083142710/locations/us-central1/endpoints/3689103403746590720
