## Large Vertex AI jobs and HPT

In [None]:
import os
from pathlib import  Path
import numpy as np
import json
import tensorflow as tf
from google.cloud import aiplatform

In [2]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS']='./tf101.json'

In [3]:
from google.cloud import storage

In [4]:
def upload_directory(bucket, dirpath):
    dirpath = Path(dirpath)
    for filepath in dirpath.glob("**/*"):
        if filepath.is_file():
            blob = bucket.blob(filepath.relative_to(dirpath.parent).as_posix())
            blob.upload_from_filename(filepath)

In [5]:
project_id = 'just-aloe-414315'
bucket_name = 'tf101_bucket'
location = 'us-central1'

In [6]:
storage_client = storage.Client(project=project_id)

In [7]:
# Bucket creation, done only once
bucket = storage_client.create_bucket(bucket_name, location=location)

In [8]:
bucket

<Bucket: tf101_bucket>

In [None]:
bucket = storage_client.get_bucket(bucket_name)

In [9]:
aiplatform.init(project=project_id, location=location)

In [10]:
server_image = "gcr.io/cloud-aiplatform/prediction/tf2-gpu.2-8:latest"

In [11]:
#X_new = X_test[:3]

## Large Vertex AI jobs use similar MultiWorkerMirroredStrategy config with a few adjustments

## The Vertex AI training script
* To be saved in *my_vertex_ai_training_task.py*

In [None]:
import os
from pathlib import  Path
import tempfile
import tensorflow as tf

strategy = tf.distribute.MultiWorkerMirroredStrategy()
resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()

print(f"Starting task {resolver.task_type} #{resolver.task_id}")

if resolver.task_type == 'chief':
    model_dir = os.getenv("AIP_MODEL_DIR") # Provided by Vertex AI
    tensorboard_log_dir = os.getenv("AIP_TENSORBOARD_LOG_DIR")
    checkpoint_dir = os.getenv("AIP_CHECKPOINT_DIR")
else:
    tmp_dir = Path(tempfile.mkdtemp()) # tmpdirs for non-chief workers
    model_dir = tmp_dir / "model"
    tensorboard_log_dir = tmp_dir / "logs"
    checkpoint_dir = tmp_dir / "ckpt"

callbacks = [tf.keras.callbacks.TensorBoard(tensorboard_log_dir),
            tf.keras.callbacks.ModelCheckpoint(checkpoint_dir)]

with strategy.scope():

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=[28, 28]))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(300, activation='relu'))
    model.add(tf.keras.layers.Dense(100, activation='relu'))
    model.add(tf.keras.layers.Dense(10, activation='softmax'))

    model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
             optimizer=tf.keras.optimizers.SGD(),
             metrics=[tf.keras.metrics.sparse_categorical_accuracy])

fashion_mnist=tf.keras.datasets.fashion_mnist.load_data()

(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist

X_train, y_train = X_train_full[:-5000], y_train_full[:-5000]
X_valid, y_valid = X_train_full[-5000:], y_train_full[-5000:]

# Data normalization
X_train, X_valid, X_test = X_train/255., X_valid/255., X_test/255. 

model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10,
          callbacks=callbacks)

model.save(model_dir, save_format="tf")

## Can store data in GCS and directly create tf.data.TextLineDataset and tf.data.TFRecordDataset from it
## Need to pass GCS paths as filenames, e.g., gs://my_bucket/data/001.csv

## The Vertex AI custom training job

In [34]:
custom_training_job = aiplatform.CustomTrainingJob(
    display_name="my_custom_training_job",
    script_path="my_vertex_ai_training_task.py",
    # Training image containter - for GPU training
    #container_uri='gcr.io/cloud-aiplatform/training/tf-gpu.2-4:latest',
    # No quota for GPU, changing to CPU
    container_uri='gcr.io/cloud-aiplatform/training/tf-cpu.2-4:latest',
    # Inference image container
    model_serving_container_image_uri=server_image,
    # Library dependences example
    requirements=["gcsfs==2022.3.0"],
    # Training script storage and model saving bucket
    staging_bucket=f'gs://{bucket_name}/staging'
)

In [35]:
mnist_model2 = custom_training_job.run(
    machine_type="n1-standard-4",
    replica_count=4,
    # No GPU quota available, changing to CPU
    #accelerator_type="NVIDIA_TESLA_K80",
    #accelerator_count=0,
)

Training script copied to:
gs://tf101_bucket/staging/aiplatform-2024-02-17-18:18:50.954-aiplatform_custom_trainer_script-0.1.tar.gz.
Training Output directory:
gs://tf101_bucket/staging/aiplatform-custom-training-2024-02-17-18:18:51.684 
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/4683688018408636416?project=372043913167
View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6114495693773078528?project=372043913167
CustomTrainingJob projects/372043913167/locations/us-central1/trainingPipelines/4683688018408636416 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomTrainingJob projects/372043913167/locations/us-central1/trainingPipelines/4683688018408636416 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomTrainingJob projects/372043913167/locations/us-central1/trainingPipelines/4683688018408636416 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomTrainingJob projects/37204

## Logs are in GCP Console in Vertex AI Trainings

In [None]:
%%tensorboard --logdir gs://projects/just-aloe-414315/logs/ml.googleapis.com%2F5106815277148930048

## Successful run will return a model, which can be deployed with an endpoint and used for inference

## HPT on Vertex AI, based on Bayesian modeling

## HPT script must accept parameters, which will set values in model setup
* To be saved in *my_vertex_ai_trial.py*
* Reporting model performance to Vertex AI for parameter selection

In [None]:
import os
from pathlib import  Path
import tempfile
import argparse
import hypertune

parser = argparse.ArgumentParser()
parser.add_argument("--n_hidden", type=int, default=2)
parser.add_argument("--n_neurons", type=int, default=256)
parser.add_argument("--learning_rate", type=float, default=1e-2)
parser.add_argument("--optimizer", default="adam")

args = parser.parse_args()

import tensorflow as tf

def build_model(args):

    with tf.distribute.MirroredStrategy().scope():
    
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Flatten(input_shape=[28, 28], dtype=tf.uint8))
        for _ in range(args.n_hidden):
            model.add(tf.keras.layers.Dense(args.n_neurons, activation='relu'))
        model.add(tf.keras.layers.Dense(10, activation='softmax'))

        opt = tf.keras.optimizers.get(args.optimizer)
        opt.learning_rate = args.learning_rate
    
        model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
             optimizer=opt,
             #metrics=[tf.keras.metrics.sparse_categorical_accuracy])
             metrics=["accuracy"])

        return model
    
fashion_mnist=tf.keras.datasets.fashion_mnist.load_data()

(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist

X_train, y_train = X_train_full[:-5000], y_train_full[:-5000]
X_valid, y_valid = X_train_full[-5000:], y_train_full[-5000:]

# Data normalization
X_train, X_valid, X_test = X_train/255., X_valid/255., X_test/255. 

model = build_model(args)

history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10)

model_dir = os.getenv("AIP_MODEL_DIR")

model.save(model_dir, save_format="tf")

hypertune = hypertune.HyperTune()
hypertune.report_hyperparameter_tuning_metric(
    hyperparameter_metric_tag="accuracy",
    metric_value=max(history.history["val_accuracy"]),
    global_step=model.optimizer.iterations.numpy(),
)

## HPT job 

In [51]:
trial_job = aiplatform.CustomJob.from_local_script(
    display_name="my_search_trial_job",
    script_path='my_vertex_ai_trial.py',
    #container_uri="gcr.io/cloud-aiplatform/training/tf-gpu.2-4:latest",
    # Using CPU due to quota
    container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-4:latest",
    staging_bucket=f"gs://{bucket_name}/staging",
    #accelerator_type="NVIDIA_TESLA_K80",
    #accelerator_count=0, # If nonzero 'n' each trial would have n GPUs
)
    

Training script copied to:
gs://tf101_bucket/staging/aiplatform-2024-02-17-19:25:20.727-aiplatform_custom_trainer_script-0.1.tar.gz.


In [52]:
from google.cloud.aiplatform import hyperparameter_tuning as hpt

hp_job = aiplatform.HyperparameterTuningJob(
    display_name="my_hp_search_job",
    custom_job=trial_job,
    metric_spec={"accuracy": "maximize"},
    parameter_spec={
        "learning_rate": hpt.DoubleParameterSpec(min=1e-3, max=10, scale='log'),
        "n_neurons": hpt.IntegerParameterSpec(min=1, max=300, scale="linear"),
        "n_hidden": hpt.IntegerParameterSpec(min=1, max=10, scale="linear"),
        "optimizer": hpt.CategoricalParameterSpec(["sgd", "adam"]),
    },
    max_trial_count=2, # Limiting for fast task completion
    parallel_trial_count=2, # Limiting to 2 due to quota
)

In [53]:
hp_job.run()

Creating HyperparameterTuningJob
HyperparameterTuningJob created. Resource name: projects/372043913167/locations/us-central1/hyperparameterTuningJobs/7531757385883320320
To use this HyperparameterTuningJob in another session:
hpt_job = aiplatform.HyperparameterTuningJob.get('projects/372043913167/locations/us-central1/hyperparameterTuningJobs/7531757385883320320')
View HyperparameterTuningJob:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/7531757385883320320?project=372043913167
HyperparameterTuningJob projects/372043913167/locations/us-central1/hyperparameterTuningJobs/7531757385883320320 current state:
JobState.JOB_STATE_PENDING
HyperparameterTuningJob projects/372043913167/locations/us-central1/hyperparameterTuningJobs/7531757385883320320 current state:
JobState.JOB_STATE_RUNNING
HyperparameterTuningJob projects/372043913167/locations/us-central1/hyperparameterTuningJobs/7531757385883320320 current state:
JobState.JOB_STATE_RUNNING
HyperparameterTuningJ

## Fetching results

In [54]:
def get_final_metric(trial, metric_id):
    for metric in trial.final_measurement.metrics:
        if metric.metric_id == metric_id:
            return metric.value

In [55]:
trials = hp_job.trials
trial_accuracies = [get_final_metric(trial, 'accuracy') for trial in trials]
best_trial = trials[np.argmax(trial_accuracies)]

In [56]:
max(trial_accuracies)

0.44839999079704285

In [57]:
best_trial.id

'1'

In [58]:
best_trial.parameters

[parameter_id: "learning_rate"
value {
  number_value: 0.10000000000000005
}
, parameter_id: "n_hidden"
value {
  number_value: 6
}
, parameter_id: "n_neurons"
value {
  number_value: 151
}
, parameter_id: "optimizer"
value {
  string_value: "sgd"
}
]