# Using TPUs with Vertex AI Training - Hello World

In [27]:
import os
import sys
import shutil
import time

from pathlib import Path

import tensorflow as tf

from google.cloud import aiplatform as aip

In [28]:
print(tf.__version__)
print(aip.__version__)

2.7.0
1.11.0


## Configure environment

In [29]:
PROJECT_ID = 'jk-mlops-dev'
REGION = 'us-central1'
STAGING_BUCKET = 'gs://jk-tpu-staging'
VERTEX_SA = f'vertex-sa@{PROJECT}.iam.gserviceaccount.com'

IMAGE_NAME = 'tpu-test'
IMAGE_URI = f'gcr.io/{PROJECT}/{IMAGE_NAME}'

# Use TPU Accelerators. Temporarily using numeric codes, until types are added to the SDK
#   6 = TPU_V2
#   7 = TPU_V3
TRAIN_TPU, TRAIN_NTPU = (7, 8)
TRAIN_COMPUTE = "cloud-tpu"

## Build a custom training container

In [30]:
TRAINER_FOLDER = 'trainer'

path = Path(TRAINER_FOLDER)
if path.exists():
    shutil.rmtree(path)
path.mkdir() 

In [31]:
%%writefile {TRAINER_FOLDER}/__init__.py
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


Writing trainer/__init__.py


In [32]:
%%writefile {TRAINER_FOLDER}/task.py
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import tensorflow as tf

from typing import Any, Mapping, MutableMapping, Optional, Sequence, Union

from absl import logging
from absl import flags
from absl import app


def train_eval():
    cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
    tf.config.experimental_connect_to_cluster(cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
    print("All devices: ", tf.config.list_logical_devices('TPU'))

def _main(argv):
    train_eval()


FLAGS = flags.FLAGS

flags.DEFINE_list('training_data_paths', None, 'Paths to training datasets')
flags.DEFINE_list('validation_data_paths', None, 'Paths to validation datasets') 
flags.DEFINE_integer('tpu_cores', 8, 'A number of TPU cores')
flags.DEFINE_string('tpu_type', 6, 'TPU type: 6 = TPU_V2, 7 = TPU_V3')


if __name__=='__main__':
    #flags.mark_flags_as_required([
    #    'fasta_path',
    #    'database_paths',
    #    'output_dir'
    #])
    app.run(_main)



Writing trainer/task.py


In [39]:
%%writefile Dockerfile

FROM us-docker.pkg.dev/vertex-ai/training/tf-tpu.2-8

WORKDIR /
ADD trainer /trainer

ENTRYPOINT ["python", "-m", "trainer.task"]

Overwriting Dockerfile


In [40]:
!docker build -t {IMAGE_URI} .
!docker push {IMAGE_URI}

Sending build context to Docker daemon     64kB
Step 1/4 : FROM us-docker.pkg.dev/vertex-ai/training/tf-tpu.2-8
 ---> c182456fc8d1
Step 2/4 : WORKDIR /
 ---> Using cache
 ---> 91ce71b658fc
Step 3/4 : ADD trainer /trainer
 ---> df344db32310
Step 4/4 : ENTRYPOINT ["python", "-m", "trainer.task"]
 ---> Running in cf6650d9204a
Removing intermediate container cf6650d9204a
 ---> 9d68a7fe721d
Successfully built 9d68a7fe721d
Successfully tagged gcr.io/jk-mlops-dev/tpu-test:latest
Using default tag: latest
The push refers to repository [gcr.io/jk-mlops-dev/tpu-test]

[1B9da9bd55: Preparing 
[1B95c7b436: Preparing 
[1Bf663075e: Preparing 
[1B8d708c85: Preparing 
[1B2f24f403: Preparing 
[1B58b94eaf: Preparing 
[2B58b94eaf: Preparing 
[1Bed2ba73d: Preparing 
[1Ba3410843: Preparing 
[1Bb410a533: Preparing 
[1Bcd721701: Preparing 
[1B7bb57e52: Preparing 
[1B4301e47a: Preparing 
[1Bd419225a: Preparing 
[1Bfa7b148c: Preparing 
[1Bb7176e36: Preparing 
[1B867f895f: Preparing 
[1B6191ec

## Configure and submit a custom training job

### Initialize Vertex SDK

In [41]:
aip.init(project=PROJECT_ID, staging_bucket=STAGING_BUCKET)

### Configure worker pool specs

In [42]:
worker_pool_specs = [
    {
        "container_spec": {
            #"args": TRAINER_ARGS,
            "image_uri": IMAGE_URI,
        },
        "replica_count": 1,
        "machine_spec": {
            "machine_type": TRAIN_COMPUTE,
            "accelerator_type": TRAIN_TPU,
            "accelerator_count": TRAIN_NTPU,
        },
    }
]

print(worker_pool_specs[0])

{'container_spec': {'image_uri': 'gcr.io/jk-mlops-dev/tpu-test'}, 'replica_count': 1, 'machine_spec': {'machine_type': 'cloud-tpu', 'accelerator_type': 7, 'accelerator_count': 8}}


### Submit a job

In [43]:
display_name = f'tpu_hello_world_{time.strftime("%Y%m%d_%H%M%S")}'

job = aip.CustomJob(display_name=display_name, worker_pool_specs=worker_pool_specs)

In [44]:
job.run(sync=True)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/5775378888695742464
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/5775378888695742464')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/5775378888695742464?project=895222332033
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/5775378888695742464 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/5775378888695742464 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/

RuntimeError: Job failed with:
code: 3
message: "The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=895222332033&resource=ml_job%2Fjob_id%2F5775378888695742464&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%225775378888695742464%22"
