In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Fine-tuning T5 1.1 XL on the WMT task

In [1]:
import os
import time
from datetime import datetime
import pandas as pd

from google.cloud import aiplatform as vertex_ai

## Configure environment settings

In [2]:
PROJECT_ID = 'jk-mlops-dev' # Change to your project id.
REGION = 'us-central1'  # Change to your region.
BUCKET = 'jk-t5x-staging' # Change to your bucket.
TENSORBOARD_NAME = 't5x-experiments' # Change to your bucket.
TENSORBOARD_ID = ! gcloud ai tensorboards list --filter="displayName={TENSORBOARD_NAME}" --format="value(name)" --region=us-central1 
TENSORBOARD_ID = TENSORBOARD_ID[1]
IMAGE_NAME = 't5x-base'
IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'

## Configure experiment settings

In [3]:
# Experiment name; it is also used as the folder name under the bucket and
# as the `experiment` passed to vertex_ai.init.
EXPERIMENT_NAME = 'fine-tune-t5-xl-3'

# GCS layout: gs://<bucket>/experiments/<experiment>/{dataset,runs}
EXPERIMENT_WORKSPACE = 'gs://' + BUCKET + '/experiments/' + EXPERIMENT_NAME
TFDS_DATA_DIR = EXPERIMENT_WORKSPACE + '/dataset'
EXPERIMENT_RUNS = EXPERIMENT_WORKSPACE + '/runs'

In [4]:
# Initialise the Vertex AI SDK with the project, region, staging location and
# experiment configured in the cells above.
vertex_ai.init(
    experiment=EXPERIMENT_NAME,
    staging_bucket=EXPERIMENT_WORKSPACE,
    location=REGION,
    project=PROJECT_ID,
)

### Pre-download the dataset

This is necessary in multi-host environments. To avoid potential conflicts we adopted it as a convention for all our samples. Another benefit is that you are not idling TPUs while the dataset is downloaded.

The default configuration for `wmt_t2t_translate` is German/English.

In [None]:
# Build the wmt_t2t_translate TFDS dataset into the experiment's dataset
# folder; $TFDS_DATA_DIR is filled in from the notebook variable of that name.
!tfds build wmt_t2t_translate --data_dir=$TFDS_DATA_DIR

## Configure a fine tuning job

### Configure run name and artifact repo

*We may want to encapsulate the following steps in a utility function, as this is a lot of boilerplate code. For now I removed experiment logging code as we need to rethink and simplify how we do it*

In [11]:
# Short label for this run.
RUN_NAME = 'v2-32-1'
# Unique run id: <experiment>-<run name>-<YYYYmmddHHMM timestamp>.
RUN_ID = '-'.join(
    [EXPERIMENT_NAME, RUN_NAME, datetime.now().strftime('%Y%m%d%H%M')]
)
# Folder holding all artifacts of this run.
RUN_DIR = '/'.join((EXPERIMENT_RUNS, RUN_ID))
# Local file name of the gin config created for this run.
GIN_FILE_NAME = EXPERIMENT_NAME + '-' + RUN_NAME + '.gin'

### Create a gin file

In [12]:
%%writefile {GIN_FILE_NAME}
# Gin config for this fine-tuning run: T5 1.1 XL on the wmt_t2t_ende_v003 task.
from __gin__ import dynamic_registration

import __main__ as train_script
from t5.data import mixtures
from t5x import models
from t5x import partitioning
from t5x import utils

# Base configs: the T5 1.1 XL model definition and the generic fine-tuning run.
include "t5x/examples/t5/t5_1_1/xl.gin"
include "t5x/configs/runs/finetune.gin"

# Run-specific bindings.
MIXTURE_OR_TASK_NAME = "wmt_t2t_ende_v003"
TASK_FEATURE_LENGTHS = {"inputs": 256, "targets": 256}
TRAIN_STEPS = 1_020_000  # 1000000 pre-trained steps + 20000 fine-tuning steps.
DROPOUT_RATE = 0.0
# Checkpoint the fine-tuning starts from (step 1000000 of the pre-trained model).
INITIAL_CHECKPOINT_PATH = "gs://t5-data/pretrained_models/t5x/t5_1_1_xl/checkpoint_1000000"
# NOTE(review): presumably pre-training batch size * target length
# (2048 * 114 = 233472) - confirm against the T5X fine-tuning documentation.
LOSS_NORMALIZING_FACTOR = 233472

BATCH_SIZE = 64
# NOTE(review): presumably the number of model-parallel partitions - confirm it
# matches the accelerator topology requested for the job.
partitioning.PjitPartitioner.num_partitions = 8

Writing fine-tune-t5-xl-3-v2-32-1.gin


#### Copy gin file to the run artifact repo

In [13]:
GIN_FILE_GCS = f'{RUN_DIR}/{GIN_FILE_NAME}'
! gsutil cp {GIN_FILE_NAME} {GIN_FILE_GCS}
GIN_FILE_GCS = GIN_FILE_GCS.replace('gs:/' , '/gcs')

Copying file://fine-tune-t5-xl-3-v2-32-1.gin [Content-Type=application/octet-stream]...
/ [1 files][  646.0 B/  646.0 B]                                                
Operation completed over 1 objects/646.0 B.                                      


### Configure Vertex AI CustomJob

In [14]:
# Mode handed to the training container through --run_mode.
RUN_MODE = 'train'

# Hardware requested for the single worker pool.
MACHINE_TYPE = 'cloud-tpu'
ACCELERATOR_TYPE = 'TPU_V2'
ACCELERATOR_NUM = 32
REPLICA_COUNT = 1

# One worker pool: the machine/accelerator spec plus the container image and
# the command-line arguments it is started with.
worker_pool_specs = [
    dict(
        machine_spec=dict(
            machine_type=MACHINE_TYPE,
            accelerator_type=ACCELERATOR_TYPE,
            accelerator_count=ACCELERATOR_NUM,
        ),
        replica_count=REPLICA_COUNT,
        container_spec=dict(
            image_uri=IMAGE_URI,
            args=[
                '--run_mode=' + RUN_MODE,
                '--gin_file=' + GIN_FILE_GCS,
                # MODEL_DIR is a gin string binding, hence the embedded quotes.
                '--gin.MODEL_DIR="' + RUN_DIR + '"',
                '--tfds_data_dir=' + TFDS_DATA_DIR,
                '--gin.USE_CACHED_TASKS=False',
            ],
        ),
    )
]

# Display the resolved spec for inspection.
worker_pool_specs

[{'machine_spec': {'machine_type': 'cloud-tpu',
   'accelerator_type': 'TPU_V2',
   'accelerator_count': 32},
  'replica_count': 1,
  'container_spec': {'image_uri': 'gcr.io/jk-mlops-dev/t5x-base',
   'args': ['--run_mode=train',
    '--gin_file=/gcs/jk-t5x-staging/experiments/fine-tune-t5-xl-3/runs/fine-tune-t5-xl-3-v2-32-1-202207240306/fine-tune-t5-xl-3-v2-32-1.gin',
    '--gin.MODEL_DIR="gs://jk-t5x-staging/experiments/fine-tune-t5-xl-3/runs/fine-tune-t5-xl-3-v2-32-1-202207240306"',
    '--tfds_data_dir=gs://jk-t5x-staging/experiments/fine-tune-t5-xl-3/dataset',
    '--gin.USE_CACHED_TASKS=False']}}]

### Submit the job

In [15]:
# Define the custom job for this run: it is named after RUN_ID and uses the
# run folder as its base output directory.
job = vertex_ai.CustomJob(
    base_output_dir=RUN_DIR,
    worker_pool_specs=worker_pool_specs,
    display_name=RUN_ID,
)

# Submit the job with sync=False (per the SDK's sync flag the call is not
# expected to block until the job finishes - confirm for your SDK version).
job.run(sync=False)

Creating CustomJob
CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/7532928159608471552
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/7532928159608471552')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/7532928159608471552?project=895222332033
CustomJob projects/895222332033/locations/us-central1/customJobs/7532928159608471552 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/7532928159608471552 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/7532928159608471552 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/7532928159608471552 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/7532928

### Monitor the job with Vertex Tensorboard

*During the Vertex AI Training Preview, native integration with Vertex Tensorboard is not supported. As a mitigation, you can use `tb-gcp-uploader` to manually start uploading the Tensorboard logs.*

*Execute the following command from the terminal window*

In [10]:
# Assemble the uploader command as a single line (surrounded by newlines) so
# it can be copied from the cell output and pasted into a terminal.
cmd = (
    '\n'
    f'tb-gcp-uploader --tensorboard_resource_name {TENSORBOARD_ID} '
    f'--logdir {EXPERIMENT_RUNS} '
    f'--experiment_name {EXPERIMENT_NAME}'
    '\n'
)

print(cmd)


tb-gcp-uploader --tensorboard_resource_name projects/895222332033/locations/us-central1/tensorboards/2937103421045473280 --logdir gs://jk-t5x-staging/experiments/fine-tune-t5-xl-3/runs --experiment_name fine-tune-t5-xl-3

CustomJob projects/895222332033/locations/us-central1/customJobs/8267366742590750720 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/8267366742590750720 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/8267366742590750720 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/8267366742590750720 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/8267366742590750720 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/8267366742590750720 current state:
JobState.JOB_STATE_PENDING
CustomJob proje

In [None]:
# End the current Vertex AI experiment run.
# NOTE(review): no start_run() call is visible in this notebook - confirm that
# end_run() is still needed here and does not fail without an active run.
vertex_ai.end_run()