In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Prototype: Turkish-English translation


This notebook demonstrates how to fine-tune the T5 1.1 Base model for Turkish-to-English translation.

## Imports and initialization

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so imported
# modules are reloaded automatically before each cell is executed. This lets
# edits to local helper modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Import libraries

Please refer to the [environment setup](../README.md) section in the README 
file to set up the development environment and install the required libraries 
before importing them.

In [2]:
import os
import time
from datetime import datetime
import pandas as pd

import utils

# import vertex ai sdk for python
from google.cloud import aiplatform as vertex_ai

2022-09-06 20:38:41.109164: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-06 20:38:41.109214: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
 This a JAX bug; please report an issue at https://github.com/google/jax/issues
  _warn(f"cloud_tpu_init failed: {repr(exc)}\n This a JAX bug; please report "
  from .autonotebook import tqdm as notebook_tqdm


### Configure environment settings


- **`PROJECT_ID`:** Configure the Google Cloud Project ID
- **`REGION`:** Configure the [region](https://cloud.google.com/vertex-ai/docs/general/locations) 
  to be used for Vertex AI operations throughout the rest of this notebook
- **`BUCKET`:** Google Cloud Storage bucket name to be used by Vertex AI for 
  operations such as staging the code and saving any generated artifacts.
- **`TENSORBOARD_NAME`:** Configure the managed TensorBoard instance name 
  created during the environment setup.

In [3]:
# Placeholders below must be replaced with real values before running the
# rest of the notebook; later cells interpolate them into gcloud commands,
# container image URIs and gs:// paths.

# Project definitions
PROJECT_ID = '<YOUR PROJECT ID>' # Change to your Google Cloud project ID.
REGION = '<YOUR REGION>'  # Change to the region used for Vertex AI operations.

# Bucket definitions
BUCKET = '<YOUR BUCKET NAME>' # Change to your bucket name (no gs:// prefix; it is added where needed).

# Tensorboard definitions
TENSORBOARD_NAME = '<YOUR TENSORBOARD NAME>' # Change to the display name of your TensorBoard instance.

Get Vertex AI TensorBoard ID based on name.

In [5]:
TENSORBOARD_ID = ! gcloud ai tensorboards list --filter="displayName={TENSORBOARD_NAME}" --format="value(name)" --region={REGION} 2>/dev/null 
TENSORBOARD_ID = TENSORBOARD_ID[0]

print(f"TENSORBOARD_ID = {TENSORBOARD_ID}")

TENSORBOARD_ID = projects/895222332033/locations/us-central1/tensorboards/2937103421045473280


### Configure custom container image

In this example, you use the base T5X custom training container.

In [6]:
# Name of the custom training container image and its full URI in the
# project's gcr.io registry. No tag is appended to the URI.
IMAGE_NAME = 't5x-base' 
IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'

Validate image exists in the Container Registry

In [7]:
# Ask gcloud to describe the image at IMAGE_URI; the digest and repository
# are printed when the image can be found.
! gcloud container images describe $IMAGE_URI

image_summary:
  digest: sha256:abe3086bb53678aa675fdbadb62fe8b17f1baa0d4fa4f95c744907caac1814d8
  fully_qualified_digest: gcr.io/jk-mlops-dev/t5x-base@sha256:abe3086bb53678aa675fdbadb62fe8b17f1baa0d4fa4f95c744907caac1814d8
  registry: gcr.io
  repository: jk-mlops-dev/t5x-base


To take a quick anonymous survey, run:
  $ gcloud survey



### Configure experiment settings


In [None]:
# Experiment name; it is reused as the Vertex AI experiment, as a path
# component in the bucket, and as a prefix of each run ID.
EXPERIMENT_NAME = '<YOUR EXPERIMENT>' # Change to your experiment name

# Root folder for this experiment in the bucket, and the sub-folder under
# which each run gets its own directory.
EXPERIMENT_WORKSPACE = f'gs://{BUCKET}/experiments/{EXPERIMENT_NAME}'
EXPERIMENT_RUNS = f'{EXPERIMENT_WORKSPACE}/runs'

### Initialize Vertex AI SDK for Python


In [9]:
# Set the SDK-wide defaults used by the remaining cells: where to run
# (project + region), where to stage files, and which experiment to log to.
vertex_ai.init(
    # Where jobs run.
    location=REGION,
    project=PROJECT_ID,
    # Where results are tracked and files are staged.
    experiment=EXPERIMENT_NAME,
    staging_bucket=EXPERIMENT_WORKSPACE,
)

### Configure dataset location


In [10]:
# Dataset root in the bucket; passed to the training job as `tfds_data_dir`.
TFDS_DATA_DIR = f'gs://{BUCKET}/datasets'

## Run fine-tuning job
### Define the job's gin file

This job is configured using the following Gin file.

In [12]:
# Gin configuration for the fine-tuning job, given relative to this notebook.
JOB_GIN_FILE = '../configs/finetune_t511_base_tr_en.gin'

# Print the file so the configuration is visible alongside the run.
!cat {JOB_GIN_FILE}

from __gin__ import dynamic_registration

import __main__ as train_script
from t5.data import mixtures
from t5x import models
from t5x import partitioning
from t5x import utils

import custom_tasks

include "t5x/examples/t5/t5_1_1/base.gin"
include "t5x/configs/runs/finetune.gin"


TASK_FEATURE_LENGTHS = {"inputs": 256, "targets": 256}
TRAIN_STEPS = 1_003_000  # 1000000 pre-trained steps + 3000 fine-tuning steps.
DROPOUT_RATE = 0.0
INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000"
LOSS_NORMALIZING_FACTOR = 233472
EVAL_PERIOD = 500

BATCH_SIZE = 128

TRAIN_PATH = %gin.REQUIRED
VALIDATION_PATH = %gin.REQUIRED
MIXTURE_OR_TASK_NAME = @custom_tasks.register_tr_en_task()
custom_tasks.register_tr_en_task:
    task_name = "translate_tr_en"
    train_path = %TRAIN_PATH
    valid_path = %VALIDATION_PATH



In [13]:
# TFRecord files for training and validation. They fill the TRAIN_PATH and
# VALIDATION_PATH bindings that the gin file marks as required.
TRAIN_PATH = 'gs://<YOUR_PATH>/train.tfrecords'
VALIDATION_PATH = 'gs://<YOUR_PATH>/eval.tfrecords'

# Gin files to load for the job.
GIN_FILES = [JOB_GIN_FILE]

# Gin bindings applied on top of the gin files. The path values are wrapped
# in double quotes so gin parses them as string literals.
GIN_OVERWRITES = ['USE_CACHED_TASKS=False']
GIN_OVERWRITES += [
    f'{binding}="{location}"'
    for binding, location in (
        ('TRAIN_PATH', TRAIN_PATH),
        ('VALIDATION_PATH', VALIDATION_PATH),
    )
]

### Configure Vertex AI CustomJob

In [None]:
# Human-readable run label (plain string: no f-prefix needed, it has no placeholders).
RUN_NAME = '<YOUR RUN NAME>' # Change to your run name for the custom job

# Run identifier: experiment name, run name and a timestamp (YYYYmmddHHMM).
# The timestamp has minute resolution, so re-running this cell within the same
# minute produces the same RUN_ID and therefore the same RUN_DIR.
RUN_ID = f'{EXPERIMENT_NAME}-{RUN_NAME}-{datetime.now().strftime("%Y%m%d%H%M")}'

# Directory of this run under the experiment's runs folder; used as the job's model_dir.
RUN_DIR = f'{EXPERIMENT_RUNS}/{RUN_ID}'

# Mode passed to the training container as --run_mode.
RUN_MODE = 'train'

Log the variables defined so far to help with troubleshooting.

In [33]:
# Print every configuration variable defined so far as NAME=value.
# Values are looked up by name in the notebook's global namespace; this avoids
# `eval`, which would execute arbitrary expressions if an entry were mistyped.
for key in [
    "PROJECT_ID", "REGION", "BUCKET", "TENSORBOARD_NAME", "TENSORBOARD_ID", 
    "IMAGE_NAME", "IMAGE_URI", 
    "EXPERIMENT_NAME", "EXPERIMENT_WORKSPACE", "EXPERIMENT_RUNS", 
    "TFDS_DATA_DIR", "GIN_FILES", "GIN_OVERWRITES", 
    "RUN_NAME", "RUN_ID", "RUN_DIR", "RUN_MODE"
]:
    print(f"{key}={globals()[key]}")

PROJECT_ID=jk-mlops-dev
REGION=us-central1
BUCKET=jk-t5x-staging
TENSORBOARD_NAME=t5x-experiments
TENSORBOARD_ID=projects/895222332033/locations/us-central1/tensorboards/2937103421045473280
IMAGE_NAME=t5x-base
IMAGE_URI=gcr.io/jk-mlops-dev/t5x-base
EXPERIMENT_NAME=tr-en-translation
EXPERIMENT_WORKSPACE=gs://jk-t5x-staging/experiments/tr-en-translation
EXPERIMENT_RUNS=gs://jk-t5x-staging/experiments/tr-en-translation/runs
TFDS_DATA_DIR=gs://jk-t5x-staging/datasets
GIN_FILES=['../configs/finetune_t511_base_tr_en.gin']
GIN_OVERWRITES=['USE_CACHED_TASKS=False', 'TRAIN_PATH="gs://jk-t5x-staging/datasets/turkish-english/train.tfrecords"', 'VALIDATION_PATH="gs://jk-t5x-staging/datasets/turkish-english/eval.tfrecords"']
RUN_NAME=run-1
RUN_ID=tr-en-translation-run-1-202209062307
RUN_DIR=gs://jk-t5x-staging/experiments/tr-en-translation/runs/tr-en-translation-run-1-202209062307
RUN_MODE=train


Configure a Cloud TPU slice for the job. Double check if your [region](https://cloud.google.com/vertex-ai/docs/general/locations#accelerators) supports the specified TPU topology.

In [34]:
# Machine spec for the job's worker pool: a Cloud TPU machine with a
# TPU_V2 accelerator and an accelerator count of 8.
MACHINE_TYPE = 'cloud-tpu'
ACCELERATOR_TYPE = 'TPU_V2'
ACCELERATOR_COUNT = 8

Create the custom job spec

In [35]:
# Build the CustomJob for this run from the settings defined above.
job = utils.create_t5x_custom_job(
    display_name=RUN_ID,
    # Container to run and the hardware to run it on.
    image_uri=IMAGE_URI,
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    # T5X run configuration.
    run_mode=RUN_MODE,
    gin_files=GIN_FILES,
    gin_overwrites=GIN_OVERWRITES,
    # Storage locations.
    model_dir=RUN_DIR,
    tfds_data_dir=TFDS_DATA_DIR,
)

# Show the generated job spec so it can be reviewed before submission.
job.job_spec

worker_pool_specs {
  machine_spec {
    machine_type: "cloud-tpu"
    accelerator_type: TPU_V2
    accelerator_count: 8
  }
  replica_count: 1
  container_spec {
    image_uri: "gcr.io/jk-mlops-dev/t5x-base"
    args: "--run_mode=train"
    args: "--gin.MODEL_DIR=\"gs://jk-t5x-staging/experiments/tr-en-translation/runs/tr-en-translation-run-1-202209062307\""
    args: "--tfds_data_dir=gs://jk-t5x-staging/datasets"
    args: "--gin_file=/gcs/jk-t5x-staging/experiments/tr-en-translation/runs/tr-en-translation-run-1-202209062307/finetune_t511_base_tr_en.gin"
    args: "--gin.USE_CACHED_TASKS=False"
    args: "--gin.TRAIN_PATH=\"gs://jk-t5x-staging/datasets/turkish-english/train.tfrecords\""
    args: "--gin.VALIDATION_PATH=\"gs://jk-t5x-staging/datasets/turkish-english/eval.tfrecords\""
  }
}
base_output_directory {
  output_uri_prefix: "gs://jk-t5x-staging/experiments/tr-en-translation/aiplatform-custom-job-2022-09-06-23:07:33.578"
}

### Submit the custom job to Vertex AI and track the experiment


In [36]:
# Submit the job and track it under the experiment. RUN_ID is used for the
# job display name, the experiment run name and the execution name alike.
utils.submit_and_track_t5x_vertex_job(
    vertex_ai=vertex_ai,
    custom_job=job,
    # Identifiers.
    experiment_name=EXPERIMENT_NAME,
    job_display_name=RUN_ID,
    run_name=RUN_ID,
    execution_name=RUN_ID,
    # Run configuration and storage locations.
    run_mode=RUN_MODE,
    model_dir=RUN_DIR,
    tfds_data_dir=TFDS_DATA_DIR,
)

Creating CustomJob
CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/299343415051550720
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/299343415051550720')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/299343415051550720?project=895222332033
Job still pending. Waiting additional 15 seconds.
CustomJob projects/895222332033/locations/us-central1/customJobs/299343415051550720 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/299343415051550720 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/299343415051550720 current state:
JobState.JOB_STATE_PENDING
Job still pending. Waiting additional 15 seconds.
CustomJob projects/895222332033/locations/us-central1/customJobs/299343415051550720 current state:
JobSta

### Monitor the job with Vertex AI TensorBoard

**Run the following cell to print the upload command, then execute that command 
from a terminal window to sync logs to Vertex AI TensorBoard**

In [None]:
# Build the tb-gcp-uploader command line for this experiment. The command is
# only printed here, not executed; copy it into a terminal to run it.
# Inside this (non-raw) string each trailing backslash followed by a newline is
# a line continuation, so the printed command is a single line.
cmd = f"""
tb-gcp-uploader --tensorboard_resource_name {TENSORBOARD_ID} \
--logdir {EXPERIMENT_RUNS} \
--experiment_name {EXPERIMENT_NAME}
"""

print(cmd)

To access the TensorBoard instance for the experiment, open the URL printed by the following cell.

In [None]:
# The experiment URL embeds the TensorBoard resource name with every '/'
# replaced by '+', followed by '+experiments+<experiment name>/'.
tensorboard_path = TENSORBOARD_ID.replace('/', '+')
TENSORBOARD_URL = (
    f"https://{REGION}.tensorboard.googleusercontent.com/experiment/"
    f"{tensorboard_path}+experiments+{EXPERIMENT_NAME}/"
)
print(f"TensorBoard URL for the experiment is located at {TENSORBOARD_URL}")

Alternatively, you can access the Vertex AI TensorBoard experiment from the [console](https://console.cloud.google.com/vertex-ai/experiments/).