# T5X fine-tuning

In [1]:
import os
import time

from google.cloud import aiplatform as vertex_ai



In [2]:
# Google Cloud settings shared by the cells below.
PROJECT_ID = "jk-mlops-dev"  # project id; also used to build the image URI
REGION = "us-central1"       # location handed to the Vertex AI SDK
BUCKET = "jk-t5x-staging"    # bare bucket name, without the gs:// scheme

# Training container image, hosted under the project's gcr.io path.
IMAGE_URI = f"gcr.io/{PROJECT_ID}/t5x"

In [3]:
# Configure the Vertex AI SDK once with the project, region and staging
# location; the staging path lives under the bucket's `staging/` prefix.
vertex_ai.init(project=PROJECT_ID, location=REGION, staging_bucket=f'gs://{BUCKET}/staging')

In [8]:
MODEL_DISPLAY_NAME = 'fine-tune-squad'
GIN_FILE = 'small_finetune_wmt.gin'
GIN_FILE_GCS = f'gs://{BUCKET}/staging/{GIN_FILE}'
!gsutil cp {GIN_FILE} {GIN_FILE_GCS}
GIN_FILE_GCS = GIN_FILE_GCS.replace('gs://', '/gcs/')
RUN_MODE = 'train'
MODEL_DIR = f'gs://{BUCKET}/jobs/{time.strftime("%Y%m%d_%H%M%S")}'
TFDS_DATA_DIR = f'gs://{BUCKET}/dataset/{MODEL_DISPLAY_NAME}'

Copying file://small_finetune_wmt.gin [Content-Type=application/octet-stream]...
/ [1 files][  567.0 B/  567.0 B]                                                
Operation completed over 1 objects/567.0 B.                                      


In [9]:
# Command-line arguments handed to the training container.
# MODEL_DIR is wrapped in literal double quotes inside the gin binding
# (presumably so gin parses the value as a string -- confirm).
container_args = [
    f'--run_mode={RUN_MODE}',
    f'--gin_file={GIN_FILE_GCS}',
    f'--gin.MODEL_DIR="{MODEL_DIR}"',
    f'--tfds_data_dir={TFDS_DATA_DIR}',
    '--gin.USE_CACHED_TASKS=False',
]

# A single worker pool: one replica of the training image on a "cloud-tpu"
# machine with 8 TPU_V2 accelerators.
worker_pool_specs = [
    dict(
        machine_spec=dict(
            machine_type="cloud-tpu",
            accelerator_type="TPU_V2",
            accelerator_count=8,
        ),
        replica_count=1,
        container_spec=dict(
            image_uri=IMAGE_URI,
            args=container_args,
        ),
    )
]

# Display the resolved spec so the final paths can be checked before launch.
worker_pool_specs

[{'machine_spec': {'machine_type': 'cloud-tpu',
   'accelerator_type': 'TPU_V2',
   'accelerator_count': 8},
  'replica_count': 1,
  'container_spec': {'image_uri': 'gcr.io/jk-mlops-dev/t5x',
   'args': ['--run_mode=train',
    '--gin_file=/gcs/jk-t5x-staging/staging/small_finetune_wmt.gin',
    '--gin.MODEL_DIR="gs://jk-t5x-staging/jobs/20220720_020546"',
    '--tfds_data_dir=gs://jk-t5x-staging/dataset/fine-tune-squad',
    '--gin.USE_CACHED_TASKS=False']}}]

In [12]:
# Timestamped display name so repeated launches are distinguishable.
job_name = f't5x_{time.strftime("%Y%m%d_%H%M%S")}'

# Wrap the worker pool spec in a Vertex AI custom job.
job = vertex_ai.CustomJob(display_name=job_name, worker_pool_specs=worker_pool_specs)

# sync=True keeps the cell blocked while the job runs; as the recorded output
# shows, a failed job surfaces here as a RuntimeError.
job.run(sync=True)

Creating CustomJob
CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/1474935749911511040
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/1474935749911511040')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/1474935749911511040?project=895222332033
CustomJob projects/895222332033/locations/us-central1/customJobs/1474935749911511040 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/1474935749911511040 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/1474935749911511040 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/1474935749911511040 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/895222332033/locations/us-central1/customJobs/1474935

RuntimeError: Job failed with:
code: 3
message: "The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=895222332033&resource=ml_job%2Fjob_id%2F1474935749911511040&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%221474935749911511040%22"
