In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Running batch inference using `tfrecord` files

This notebook demonstrates how to run batch inference on data in `tfrecord` format.



In [1]:
# Reloads modules automatically before executing any code/script
%load_ext autoreload
%autoreload 2

### Import libraries

Import the required libraries.

In [2]:
import os
import time
from datetime import datetime
import pandas as pd

# Import the Vertex AI SDK for Python
from google.cloud import aiplatform as vertex_ai

Import [`utils.py`](utils.py), which is a
custom script that has utility functions to streamline configuration and
submission of a Vertex AI custom job and to track the generated artifacts and
metrics from the custom job.

In [3]:
import utils

2022-09-07 18:45:22.625602: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-07 18:45:22.625651: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
 This a JAX bug; please report an issue at https://github.com/google/jax/issues
  _warn(f"cloud_tpu_init failed: {repr(exc)}\n This a JAX bug; please report "
  from .autonotebook import tqdm as notebook_tqdm


### Configure environment settings

Based on the [environment setup](README.md) you did previously, configure the
following environment settings:

-  `PROJECT_ID`: Configure the Google Cloud project ID.
-  `REGION`: Configure the
    [region](https://cloud.google.com/vertex-ai/docs/general/locations) to use
    for Vertex AI operations throughout this notebook.
-  `BUCKET`: Configure the Google Cloud Storage bucket name that you created
    during environment setup. Vertex AI uses this bucket for operations such as
    staging the code and saving generated artifacts.
-  `TENSORBOARD_NAME`: Configure the managed TensorBoard instance name that
    you created during the environment setup.

In [4]:
# Project definitions
PROJECT_ID = '<YOUR PROJECT ID>' # Change to your project id.
REGION = '<YOUR REGION>'  # Change to your region.

# Bucket definitions
BUCKET = '<YOUR BUCKET NAME>' # Change to your bucket.

In [5]:
# Project definitions
PROJECT_ID = 'jk-mlops-dev' # Change to your project id.
REGION = 'us-central1'  # Change to your region.

# Bucket definitions
BUCKET = 'jk-t5x-staging' # Change to your bucket.

In [6]:
# Tensorboard definitions
TENSORBOARD_NAME = 't5x-experiments' # Change to your Tensorboard instance name

In [7]:
# Get Vertex AI TensorBoard ID based on name
TENSORBOARD_ID = ! gcloud ai tensorboards list --filter="displayName={TENSORBOARD_NAME}" --format="value(name)" --region={REGION} 2>/dev/null 
TENSORBOARD_ID = TENSORBOARD_ID[0]

print(f"TENSORBOARD_ID = {TENSORBOARD_ID}")

TENSORBOARD_ID = projects/895222332033/locations/us-central1/tensorboards/2937103421045473280


In [8]:
# Configure the custom container image name
IMAGE_NAME = 't5x-base' # Change to your image name
IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'

Verify that the image exists in Container Registry:

In [9]:
! gcloud container images describe $IMAGE_URI

image_summary:
  digest: sha256:e4d0c053d27a8d5398457a292eb14b43cb6c43ec79fc2e31d8bace48e0853082
  fully_qualified_digest: gcr.io/jk-mlops-dev/t5x-base@sha256:e4d0c053d27a8d5398457a292eb14b43cb6c43ec79fc2e31d8bace48e0853082
  registry: gcr.io
  repository: jk-mlops-dev/t5x-base


### Configure experiment settings


In [10]:
EXPERIMENT_NAME = 'tr-en-1' # Change to your experiment name

EXPERIMENT_WORKSPACE = f'gs://{BUCKET}/experiments/{EXPERIMENT_NAME}'
EXPERIMENT_RUNS = f'{EXPERIMENT_WORKSPACE}/runs'

### Initialize the Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project, bucket, and
experiment.

In [11]:
vertex_ai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=EXPERIMENT_WORKSPACE,
    experiment=EXPERIMENT_NAME
)

---

## Configure and run a batch inference job



### Configure a Gin file for the batch inference job


In [12]:
GIN_FILE = '../configs/infer_t511_base_tr_en.gin'
! cat {GIN_FILE}


from __gin__ import dynamic_registration


#from t5x import utils
#from t5x import infer

include "t5x/configs/runs/infer_from_tfexample_file.gin"
include "t5x/examples/t5/t5_1_1/base.gin"


TASK_FEATURE_LENGTHS = {"inputs": 256, "targets": 256}
TF_EXAMPLE_FILE_TYPE = 'tfrecord'

infer.create_task_from_tfexample_file.inputs_key = 'tr'






### Configure the Vertex AI custom job

In [13]:
INFER_RUN_NAME = f'<YOUR RUN NAME>' # Change to your run name for the custom job
INFER_RUN_ID = f'{EXPERIMENT_NAME}-{INFER_RUN_NAME}-{datetime.now().strftime("%Y%m%d%H%M")}'
INFER_RUN_DIR = f'{EXPERIMENT_RUNS}/{INFER_RUN_ID}'

In [40]:
INFER_RUN_NAME = f'infer-1' # Change to your run name for the custom job
INFER_RUN_ID = f'{EXPERIMENT_NAME}-{INFER_RUN_NAME}-{datetime.now().strftime("%Y%m%d%H%M")}'
INFER_RUN_DIR = f'{EXPERIMENT_RUNS}/{INFER_RUN_ID}'

Specify the run mode as `infer` to run the T5X launch script in inference
mode.

In [41]:
RUN_MODE = 'infer'

#### Select the checkpoint

In [42]:
# Get all experiment runs and run directories
utils.get_all_experiment_run_directories(EXPERIMENT_NAME)

Unnamed: 0,RUN_ID,RUN_DIR
0,tr-en-1-infer-1-202209071845,
1,tr-en-1-infer-1-202209071808,
2,tr-en-1-run-1-202209071535,gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535


In [43]:
! gsutil ls '<YOUR PREVIOUS RUN DIRECTORY>' # Change to the previous run directory

CommandException: "ls" command does not support "file://" URLs. Did you mean to use a gs:// URL?


In [44]:
! gsutil ls 'gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535' # Change to the previous run directory

gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/config.gin
gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/finetune_t511_base_tr_en.gin
gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/model-info.txt
gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/checkpoint_1000000.tmp-1662565322/
gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/checkpoint_1000000/
gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/checkpoint_1001000.tmp-1662565912/
gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/checkpoint_1001000/
gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/inference_eval/
gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/train/
gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/training_eval/


In [45]:
CHECKPOINT_PATH = '<YOUR MODEL CHECKPOINT PATH>' # Change to the path where model checkpoint exists

In [46]:
CHECKPOINT_PATH = 'gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/checkpoint_1001000' # Change to the path where model checkpoint exists

Configure runtime parameters for the inference.

In [47]:
GIN_FILES = [GIN_FILE]
GIN_OVERWRITES = [
    'USE_CACHED_TASKS=False',
    f'CHECKPOINT_PATH="{CHECKPOINT_PATH}"',
    f'INFER_OUTPUT_DIR="{INFER_RUN_DIR}"',
    f"TF_EXAMPLE_FILE_PATHS=['gs://jk-t5x-staging/datasets/turkish-english/eval.tfrecords']"
]

To help with troubleshooting, display the values of local variables.

In [48]:
for key in [
    "PROJECT_ID", "REGION", "BUCKET", "TENSORBOARD_NAME", "TENSORBOARD_ID", 
    "IMAGE_NAME", "IMAGE_URI", 
    "EXPERIMENT_NAME", "EXPERIMENT_WORKSPACE", "EXPERIMENT_RUNS", 
    "GIN_FILES", "GIN_OVERWRITES", 
    "INFER_RUN_NAME", "INFER_RUN_ID", "INFER_RUN_DIR", "RUN_MODE",
    "CHECKPOINT_PATH"
]:
    print(f"{key}={eval(key)}")

PROJECT_ID=jk-mlops-dev
REGION=us-central1
BUCKET=jk-t5x-staging
TENSORBOARD_NAME=t5x-experiments
TENSORBOARD_ID=projects/895222332033/locations/us-central1/tensorboards/2937103421045473280
IMAGE_NAME=t5x-base
IMAGE_URI=gcr.io/jk-mlops-dev/t5x-base
EXPERIMENT_NAME=tr-en-1
EXPERIMENT_WORKSPACE=gs://jk-t5x-staging/experiments/tr-en-1
EXPERIMENT_RUNS=gs://jk-t5x-staging/experiments/tr-en-1/runs
GIN_FILES=['../configs/infer_t511_base_tr_en.gin']
GIN_OVERWRITES=['USE_CACHED_TASKS=False', 'CHECKPOINT_PATH="gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/checkpoint_1001000"', 'INFER_OUTPUT_DIR="gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-infer-1-202209071909"', "TF_EXAMPLE_FILE_PATHS=['gs://jk-t5x-staging/datasets/turkish-english/eval.tfrecords']"]
INFER_RUN_NAME=infer-1
INFER_RUN_ID=tr-en-1-infer-1-202209071909
INFER_RUN_DIR=gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-infer-1-202209071909
RUN_MODE=infer
CHECKPOINT_PATH=gs://jk-t5x-staging/experiments

### Create the Vertex AI custom job

Configure the worker pool specification based on TPU availability in the region.
See
[Vertex AI locations](https://cloud.google.com/vertex-ai/docs/general/locations#accelerators)
for details.

In [49]:
# Define worker pool resource specification
# Machine to run the custom job on. For TPUs, use `cloud-tpu`.
MACHINE_TYPE = 'cloud-tpu'
# Accelerator type to attach to the machine. For TPUs, use `TPU_V2`, `TPU_V3`.
ACCELERATOR_TYPE = 'TPU_V2'
# Number of accelerators to attach to the machine.
# For TPUs, specify the number of cores to be provisioned.
ACCELERATOR_COUNT = 8

Create the custom job spec.

In [50]:
job = utils.create_t5x_custom_job(
    display_name=INFER_RUN_ID,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    image_uri=IMAGE_URI,
    run_mode=RUN_MODE,
    gin_files=GIN_FILES,
    model_dir=CHECKPOINT_PATH,
    gin_overwrites=GIN_OVERWRITES
)

In [51]:
job.job_spec

worker_pool_specs {
  machine_spec {
    machine_type: "cloud-tpu"
    accelerator_type: TPU_V2
    accelerator_count: 8
  }
  replica_count: 1
  container_spec {
    image_uri: "gcr.io/jk-mlops-dev/t5x-base"
    command: "python"
    command: "./t5x/t5x/infer.py"
    args: "--gin.MODEL_DIR=\"gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/checkpoint_1001000\""
    args: "--tfds_data_dir=None"
    args: "--gin_file=/gcs/jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/checkpoint_1001000/infer_t511_base_tr_en.gin"
    args: "--gin.USE_CACHED_TASKS=False"
    args: "--gin.CHECKPOINT_PATH=\"gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-run-1-202209071535/checkpoint_1001000\""
    args: "--gin.INFER_OUTPUT_DIR=\"gs://jk-t5x-staging/experiments/tr-en-1/runs/tr-en-1-infer-1-202209071909\""
    args: "--gin.TF_EXAMPLE_FILE_PATHS=[\'gs://jk-t5x-staging/datasets/turkish-english/eval.tfrecords\']"
  }
}
base_output_directory {
  output_uri_prefix:

### Submit the custom job to Vertex AI and track the experiment

The `submit_and_track_tx5_vertex_job` function launches the T5X script in
inference mode with the fine-tuned model checkpoint specification. The function
submits the job to Vertex AI, which generates inferences results and writes them
to the Cloud Storage bucket. To monitor progress of the job, click the URL in
the cell output. The URL goes to the Vertex AI Custom Job console for the job.

In [52]:
utils.submit_and_track_t5x_vertex_job(
    custom_job=job,
    job_display_name=INFER_RUN_ID,
    run_name=INFER_RUN_ID,
    experiment_name=EXPERIMENT_NAME,
    execution_name=INFER_RUN_ID,
    model_dir=INFER_RUN_DIR,
    vertex_ai=vertex_ai,
    run_mode=RUN_MODE
)

Creating CustomJob
CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/7809108987831386112
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/7809108987831386112')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/7809108987831386112?project=895222332033
Job still pending. Waiting additional 15 seconds.
CustomJob projects/895222332033/locations/us-central1/customJobs/7809108987831386112 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/7809108987831386112 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/7809108987831386112 current state:
JobState.JOB_STATE_PENDING
Job still pending. Waiting additional 15 seconds.
CustomJob projects/895222332033/locations/us-central1/customJobs/7809108987831386112 current state:

### Explore batch inference results

The batch inference job writes inference results to the run directory that you
configured. The output is written in JSON lines (`.jsonl`) format.

In [None]:
! gsutil ls $INFER_RUN_DIR

The following example is a snippet from the output of the batch inference job
for the translation task from a fine-tuned
[WMT English-to-German translation](https://www.tensorflow.org/datasets/catalog/wmt_t2t_translate)
model.

``` json
{
	"inputs": {
		"inputs_pretokenized": "translate English to German: As recently as last Tuesday, the Nasdaq indices were not calculated for one hour due to data transfer errors.",
		"targets_pretokenized": "Erst am Dienstag waren die Indizes der Nasdaq wegen Fehlern im Datentransport eine Stunde lang nicht berechnet worden."
	},
	"prediction": "Erst am vergangenen Dienstag wurden die Nasdaq-Indexe wegen Daten\u00fcbertragungen f\u00fcr eine Stunde nicht berechnet."
}
```

---

## What's next?

This notebook template covers how to run a T5X job for fine-tuning, evaluation,
and inference tasks on Vertex AI.

You can copy this notebook template for a specific task or dataset or
configuration and then make changes as needed to run the T5X job on Vertex AI.
Refer to other notebooks in the
[notebooks directory](/) of this repo as
inspiration.