# Using TPUs with Vertex AI Training - Hello World

In [1]:
import os
import sys
import shutil
import time

from pathlib import Path


from google.cloud import aiplatform as aip

In [2]:
print(aip.__version__)

1.11.0


## Configure environment

In [3]:
PROJECT_ID = 'jk-mlops-dev'
REGION = 'us-central1'
STAGING_BUCKET = 'gs://jk-tpu-sandbox'
VERTEX_SA = f'vertex-sa@{PROJECT_ID}.iam.gserviceaccount.com'

IMAGE_NAME = 'tpu-test'
IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'

# Use TPU Accelerators. Temporarily using numeric codes, until types are added to the SDK
#   6 = TPU_V2
#   7 = TPU_V3
TRAIN_TPU, TRAIN_NTPU = (7, 8)
TRAIN_COMPUTE = "cloud-tpu"

## Build a custom training container

In [4]:
TRAINER_FOLDER = 'trainer'

path = Path(TRAINER_FOLDER)
if path.exists():
    shutil.rmtree(path)
path.mkdir() 

In [5]:
%%writefile {TRAINER_FOLDER}/__init__.py
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


Writing trainer/__init__.py


In [6]:
%%writefile {TRAINER_FOLDER}/task.py
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import tensorflow as tf

from typing import Any, Mapping, MutableMapping, Optional, Sequence, Union

from absl import logging
from absl import flags
from absl import app


def train_eval():
    print("In train_eval")
    cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="local")
    tf.config.experimental_connect_to_cluster(cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
    print("All devices: ", tf.config.list_logical_devices('TPU'))

def _main(argv):
    train_eval()


FLAGS = flags.FLAGS

flags.DEFINE_list('training_data_paths', None, 'Paths to training datasets')
flags.DEFINE_list('validation_data_paths', None, 'Paths to validation datasets') 
flags.DEFINE_integer('tpu_cores', 8, 'A number of TPU cores')
flags.DEFINE_integer('tpu_type', 6, 'TPU type: 6 = TPU_V2, 7 = TPU_V3')


if __name__=='__main__':
    #flags.mark_flags_as_required([
    #    'fasta_path',
    #    'database_paths',
    #    'output_dir'
    #])
    app.run(_main)



Writing trainer/task.py


In [7]:
%%writefile Dockerfile

FROM python:3.8

WORKDIR /
ADD trainer /trainer

RUN wget https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/20210525/libtpu.so -O /lib/libtpu.so
RUN chmod 777 /lib/libtpu.so

RUN wget https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/20210525/tf_nightly-2.6.0-cp38-cp38-linux_x86_64.whl
RUN pip3 install tf_nightly-2.6.0-cp38-cp38-linux_x86_64.whl
RUN rm tf_nightly-2.6.0-cp38-cp38-linux_x86_64.whl


ENTRYPOINT ["python3", "-m", "trainer.task"]

Overwriting Dockerfile


In [9]:
!docker build --quiet -t {IMAGE_URI} .
!docker push {IMAGE_URI}

sha256:ace43cbdc90e6522fe138ee2fc735f121fa09c15ba940d93b564755e80da26b9
Using default tag: latest
The push refers to repository [gcr.io/jk-mlops-dev/tpu-test]

[1Bb8fb2240: Preparing 
[1B5ab029e1: Preparing 
[1B158cd9e6: Preparing 
[1B849ef672: Preparing 
[1Bbfb87a1c: Preparing 
[1B2071043e: Preparing 
[1B4efb9fe5: Preparing 
[1B4d69a0f0: Preparing 
[1Bc2e608b0: Preparing 
[1B9a7a4e4a: Preparing 
[1B9a205adc: Preparing 
[1B98da17f2: Preparing 
[1B69727d80: Preparing 
[1B22fdcc61: Preparing 
[14Bab029e1: Pushed   1.151GB/1.138GB[2K[14A[2K[14A[2K[12A[2K[14A[2K[14A[2K[11A[2K[14A[2K[11A[2K[14A[2K[13A[2K[14A[2K[11A[2K[12A[2K[14A[2K[11A[2K[14A[2K[11A[2K[14A[2K[11A[2K[14A[2K[13A[2K[12A[2K[13A[2K[11A[2K[14A[2K[11A[2K[12A[2K[12A[2K[13A[2K[12A[2K[14A[2K[12A[2K[14A[2K[13A[2K[14A[2K[15A[2K[13A[2K[11A[2K[12A[2K[14A[2K[12A[2K[14A[2K[12A[2K[13A[2K[12A[2K[13A[2K[14A[2K[12A[2K[12A[2K[12A[

## Configure and submit a custom training job

### Initialize Vertex SDK

In [10]:
aip.init(project=PROJECT_ID, staging_bucket=STAGING_BUCKET)

### Configure worker pool specs

In [11]:
worker_pool_specs = [
    {
        "container_spec": {
            #"args": TRAINER_ARGS,
            "image_uri": IMAGE_URI,
        },
        "replica_count": 1,
        "machine_spec": {
            "machine_type": TRAIN_COMPUTE,
            "accelerator_type": TRAIN_TPU,
            "accelerator_count": TRAIN_NTPU,
        },
    }
]

print(worker_pool_specs[0])

{'container_spec': {'image_uri': 'gcr.io/jk-mlops-dev/tpu-test'}, 'replica_count': 1, 'machine_spec': {'machine_type': 'cloud-tpu', 'accelerator_type': 7, 'accelerator_count': 8}}


### Submit a job

In [12]:
display_name = f'tpu_hello_world_{time.strftime("%Y%m%d_%H%M%S")}'

job = aip.CustomJob(display_name=display_name, worker_pool_specs=worker_pool_specs)

In [13]:
job.run(sync=True)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/1666943740347940864
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/1666943740347940864')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/1666943740347940864?project=895222332033
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/1666943740347940864 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/1666943740347940864 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/