# INTRO VERTEX AI EXPERIMENT - TENSORBOARD
Intro how to registry artifacts, metrics, params of training of models (neuronal networks of tensorflow) in vertex tensorboard.
How to registry in the tensorboard instance located in GCP Vertex

Documentation web page: https://cloud.google.com/vertex-ai/docs/experiments/configure-training-script?hl=es-419

Documentation example github - log time series metrics: https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/experiments/vertex_ai_model_experimentation.ipynb

In [1]:
# ml packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
import seaborn as sns
import pickle
import os
from dotenv import load_dotenv, find_dotenv

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.datasets import fetch_california_housing
import tensorflow as tf

# vertex gcp
from google.cloud import aiplatform as vertex_ai
from google.cloud import storage

### 1. load data

In [2]:
# load data
data_X, data_y = fetch_california_housing(return_X_y=True, as_frame=True)

### 2. split data

In [3]:
# split train and test
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size = 0.25, random_state = 0)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.2, random_state = 0)

In [4]:
# feature scaling X
scaler_x = StandardScaler()
X_train = scaler_x.fit_transform(X_train)
X_valid = scaler_x.transform(X_valid)
X_test = scaler_x.transform(X_test)

scaler_y = StandardScaler()
y_train = scaler_y.fit_transform(np.array(y_train).reshape(-1, 1))
y_valid = scaler_y.transform(np.array(y_valid).reshape(-1, 1))
y_test = scaler_y.transform(np.array(y_test).reshape(-1, 1))

In [5]:
# review shape
print('shapes datasets')
print('\nX_train:', X_train.shape)
print('X_valid:', X_valid.shape)
print('X_test:', X_test.shape)

print('\ny_train:', y_train.shape)
print('y_valid:', y_valid.shape)
print('y_test:', y_test.shape)

shapes datasets

X_train: (12384, 8)
X_valid: (3096, 8)
X_test: (5160, 8)

y_train: (12384, 1)
y_valid: (3096, 1)
y_test: (5160, 1)


### 3. Create tensorboard instance and experiment

In [6]:
def get_tensorboard_instance_or_create(experiment_name, experiment_description, project_gcp, location_gcp):
    """
    Search if exist a tensorboard instance and get it. If the instance doesn't exist, create it.
    The instance of tensorboard has its name with the idea to have the same name of the experiment of vertex ai that will use this instance
    of vertex.

    Args
        experiment_name (string)
        experiment_description (string)
        project_gcp (string)
        location_gcp (string)

    Return
        id_experiment_tensorboard (vertex ai tensorboard object)
    """
    
    ''' search tensorboard instance. if the list is empty the tensorboard instance doesn't exist and it will created '''
    # GET tensorboard instance created FILTERING by display name. return a list of the instance doesn't exist return a empty list
    list_tensorboard_vertex = vertex_ai.Tensorboard.list(
        filter = f'display_name="tensorboard-{experiment_name}"',
        project = project_gcp,
        location = location_gcp
    )

    # if vertex tensorboard instance doesn't exist, create it
    if len(list_tensorboard_vertex) == 0:
        print('--- creating vertex tensorboard instance ---')
        id_tensorboard_vertex = vertex_ai.Tensorboard.create(display_name = f'tensorboard-{experiment_name}',
                                                                 description = f'tensorboard-{experiment_description}',
                                                                 project = project_gcp,
                                                                 location = location_gcp
                                                                ) # return tensorboard instance created
    else:
        print('--- tensorboard instance already exists ---')
        id_tensorboard_vertex = list_tensorboard_vertex[0] # tensorboard instance exists, return it
    
    return id_tensorboard_vertex

In [7]:
""" PARAMETERS """
# get environment variables from .env - only necesary using a jupyter notebook
load_dotenv(find_dotenv())
PROJECT_GCP = os.environ.get("PROJECT_GCP", "")
LOCATION_GCP = os.environ.get("LOCATION_GCP", "")
BUCKET_NAME = os.environ.get("BUCKET_NAME", "")

# PARAMETERS TO CREATE AN EXPERIMENT IN VERTEX AI
EXPERIMENT_NAME = 'house-price-tensorflow'
EXPERIMENT_DESCRIPTION = 'Test to train tensorflow models and registry it in tensorboard full'



""" RUN """
# search tensorboard instance, if it doesn't exist -> created it
id_tensorboard_vertex = get_tensorboard_instance_or_create(experiment_name = EXPERIMENT_NAME,
                                                           experiment_description = EXPERIMENT_DESCRIPTION,
                                                           project_gcp = PROJECT_GCP,
                                                           location_gcp = LOCATION_GCP
                                                          )

# set experiment (or created if it doesn't exist - automatically)
print('\n--- setting experiment vertex ai ---')
vertex_ai.init(
    experiment = EXPERIMENT_NAME,
    experiment_description = EXPERIMENT_DESCRIPTION,
    experiment_tensorboard = id_tensorboard_vertex,
    project = PROJECT_GCP,
    location = LOCATION_GCP,
    )

--- tensorboard instance already exists ---

--- setting experiment vertex ai ---


In [8]:
id_tensorboard_vertex

<google.cloud.aiplatform.tensorboard.tensorboard_resource.Tensorboard object at 0x00000180B6F61C00> 
resource name: projects/724348686027/locations/us-east1/tensorboards/583919839186255872

### 4. Start run in vertex experiment to save results

In [23]:
""" registry run in experiment """
RUN_NAME = "run-6"

# create a run
vertex_ai.start_run(RUN_NAME)

Associating projects/724348686027/locations/us-east1/metadataStores/default/contexts/house-price-tensorflow-run-6 to Experiment: house-price-tensorflow


<google.cloud.aiplatform.metadata.experiment_run_resource.ExperimentRun at 0x180bccd9420>

### 5. Train model and registry it in run of vertex experiment and tensorboard instance in vertex

In [24]:
def create_model(seq_lengh):
        return tf.keras.models.Sequential(
            [
                tf.keras.layers.Dense(512, activation="relu", input_shape=(seq_lengh,)),
                tf.keras.layers.Dense(256, activation="relu"),
                tf.keras.layers.Dense(128, activation="relu"),
                tf.keras.layers.Dense(1, activation="relu"),
            ]
        )

In [25]:
# create nn
model = create_model(seq_lengh = X_train.shape[1])

In [26]:
# compile

model.compile(optimizer='adam', 
              loss='mean_squared_error', 
              metrics=[
                  tf.keras.metrics.MeanSquaredError(name="mse"),
                  tf.keras.metrics.RootMeanSquaredError(name="rmse"),
                  tf.keras.metrics.MeanAbsoluteError(name="mae")
                      ])

In [27]:
# summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 512)               4608      
                                                                 
 dense_5 (Dense)             (None, 256)               131328    
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 1)                 129       
                                                                 
Total params: 168961 (660.00 KB)
Trainable params: 168961 (660.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [33]:
# define callback TO SAVE MORE RESULTS IN TENSORBOARD vertex (more results than metrics)
tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir = f'gs://{id_tensorboard_vertex.gca_resource.blob_storage_path_prefix}',
        histogram_freq=1
    )

In [34]:
# train
history = model.fit(X_train, 
                    y_train, 
                    epochs=2, 
                    batch_size=32, # 16, 32, 64
                    validation_data=(X_valid, y_valid), 
                    verbose=1,
                    #callbacks=[tensorboard_callback],
                   )

InvalidArgumentError: {{function_node __wrapped__CreateSummaryFileWriter_device_/job:localhost/replica:0/task:0/device:CPU:0}} Error executing an HTTP request: HTTP response code 400 with body '{
  "error": {
    "code": 400,
    "message": "Invalid bucket name: 'cloud-ai-platform-842676af-776b-48aa-8d24-03041c5fafda\\train'",
    "errors": [
      {
        "message": "Invalid bucket name: 'cloud-ai-platform-842676af-776b-48aa-8d24-03041c5fafda\\train'",
        "domain": "global",
        "reason": "invalid"
      }
    ]
  }
}
' [Op:CreateSummaryFileWriter] name: 

In [None]:
""" save history - tensorboard vertex ai """

# run for each epoch of training and get the loss and metrics of each epoch and save in tensorboard vertex
for i in range(history.params["epochs"]): 
        vertex_ai.log_time_series_metrics(
            dict(
                train_loss = history.history["loss"][i],
                train_mse = history.history["mse"][i],
                train_rmse = history.history["rmse"][i],
                train_mae = history.history["mae"][i],

                val_loss = history.history["val_loss"][i],
                val_mse = history.history["val_mse"][i],
                val_rmse = history.history["val_rmse"][i],
                val_mae = history.history["val_mae"][i]
            )
        )

In [None]:
""" evaluate in test dataset and save metrics """

# evaluate 
metrics = model.evaluate(X_test, return_dict=True)

# save vertex metric
vertex_ai.log_metrics(
    dict(
        loss = metrics["loss"],
        mse = metrics["mse"],
        rmse = metrics["rmse"],
        mae = metrics["mae"]
    )
)

In [19]:
### terminar run
vertex_ai.end_run()