# Distributed training with Vertex Reduction Server

In [1]:
import os
import pprint
import sys

from google.cloud import aiplatform

## Set environment constants

In [2]:
# Project, region and staging bucket handed to aiplatform.init() further down;
# PROJECT_ID is also embedded in the training image URI.
PROJECT_ID = "jk-mlops-dev"
REGION = "us-west1"
STAGING_BUCKET = "gs://jk-vertex-staging"

## Prepare a training container

In [3]:
# Base image the training container starts from, and the TensorFlow Model
# Garden tag that gets cloned into it.
BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-gpu.2-5'
MODEL_GARDEN_VERSION = 'v2.5.0'
# Name of the image that is built and pushed by the following cells.
TRAIN_IMAGE = f'gcr.io/{PROJECT_ID}/tf_nlp_toolkit'

# Dockerfile contents: clone the Model Garden at the pinned tag, install its
# requirements and expose the checkout on PYTHONPATH. The entrypoint is plain
# `python`, so whatever follows the image name becomes the script + flags.
dockerfile = f'''
FROM {BASE_IMAGE}
WORKDIR /source
RUN git clone -b {MODEL_GARDEN_VERSION}  --single-branch https://github.com/tensorflow/models.git 
RUN pip install -r models/official/requirements.txt 
ENV PYTHONPATH=/source/models

#ENTRYPOINT ["/bin/bash", "-c"]
#CMD ["echo TensorFlow Model Garden image"]
ENTRYPOINT ["python"]
CMD ["-c", "print('Hello')"]
'''

# Write the Dockerfile into the current directory, which the `docker build`
# cell uses as its build context.
with open('Dockerfile', 'w') as f:
    f.write(dockerfile)

In [4]:
# Build the training image from the Dockerfile in the current directory.
! docker build -t {TRAIN_IMAGE} .

Sending build context to Docker daemon  129.5kB
Step 1/7 : FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-5
 ---> b963122c3c2c
Step 2/7 : WORKDIR /source
 ---> Using cache
 ---> ae3fb96fd36d
Step 3/7 : RUN git clone -b v2.5.0  --single-branch https://github.com/tensorflow/models.git
 ---> Using cache
 ---> 9a6a3b235d6c
Step 4/7 : RUN pip install -r models/official/requirements.txt
 ---> Using cache
 ---> 4f5f993b1f1d
Step 5/7 : ENV PYTHONPATH=/source/models
 ---> Using cache
 ---> c34b5beadd9f
Step 6/7 : ENTRYPOINT ["python"]
 ---> Using cache
 ---> 3f09894d6299
Step 7/7 : CMD ["-c", "print('Hello')"]
 ---> Using cache
 ---> eaa66349bebe
Successfully built eaa66349bebe
Successfully tagged gcr.io/jk-mlops-dev/tf_nlp_toolkit:latest


### Test the container locally

In [34]:
# Inputs for the local smoke test: MNLI TFRecords plus metadata, the
# pretrained BERT checkpoint, and the directory the run writes to.
MNLI_TRAIN_SPLIT = "gs://jk-vertex-demos/datasets/MNLI/mnli_train.tf_record"
MNLI_VALID_SPLIT = "gs://jk-vertex-demos/datasets/MNLI/mnli_valid.tf_record"
MNLI_METADATA = "gs://jk-vertex-demos/datasets/MNLI/metadata.json"
BERT_CHECKPOINT = "gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16"
MODEL_DIR = "gs://jk-vertex-demos/testing/r1"

# Flags forwarded to run_classifier.py by the docker run cell.
task = "MNLI"
mode = "train_and_eval"
global_batch_size = 32
steps_per_loop = 10
learning_rate = 2e-5
num_train_epochs = 3

# Single machine, two GPUs.
distribution_strategy = "mirrored"
num_gpus = 2

In [35]:
# Smoke-test the image locally on all available GPUs. The image entrypoint is
# `python`, so the first argument is the script to run and the rest are its flags.
!docker run -it --rm --gpus all {TRAIN_IMAGE} models/official/nlp/bert/run_classifier.py \
--mode={mode} \
--model_dir={MODEL_DIR} \
--input_meta_data_path={MNLI_METADATA} \
--train_data_path={MNLI_TRAIN_SPLIT} \
--eval_data_path={MNLI_VALID_SPLIT} \
--bert_config_file={BERT_CHECKPOINT}/bert_config.json \
--init_checkpoint={BERT_CHECKPOINT}/bert_model.ckpt \
--train_batch_size={global_batch_size} \
--eval_batch_size={global_batch_size} \
--steps_per_loop={steps_per_loop} \
--learning_rate={learning_rate} \
--num_train_epochs={num_train_epochs} \
--distribution_strategy={distribution_strategy} \
--num_gpus={num_gpus}

2021-05-26 05:14:31.521712: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
  'command line!' % flag_name)
2021-05-26 05:14:35.748749: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-05-26 05:14:37.111185: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-05-26 05:14:37.112128: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-05-26 05:14:37.112313: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA no

### Push the container

In [5]:
# Push the image to the registry so the Vertex training job can pull it.
! docker push {TRAIN_IMAGE}

Using default tag: latest
The push refers to repository [gcr.io/jk-mlops-dev/tf_nlp_toolkit]

[1Be22ef9a1: Preparing 
[1B77ad8be7: Preparing 
[1Ba2fa6014: Preparing 
[1B961a296c: Preparing 
[1B53abc6c2: Preparing 
[1B3723ef37: Preparing 
[1B0089a9c0: Preparing 
[1B3e41a2c0: Preparing 
[1B25162004: Preparing 
[1B99d982dd: Preparing 
[1B6603d114: Preparing 
[1Bc97a79f1: Preparing 
[1Be02b8502: Preparing 
[1Bd34a65ac: Preparing 
[1Bce22e436: Preparing 
[1B7e013d33: Preparing 
[1Baff4f6ee: Preparing 
[1Be4ccb381: Preparing 
[1B90ceec1e: Preparing 
[1B0ab30137: Preparing 
[1Bed8ae595: Preparing 
[1B855df562: Preparing 
[1Bdb3c5655: Preparing 
[1B0a9a6a11: Preparing 
[1B7e8b38e6: Preparing 
[1B8f196cf4: Preparing 
[1B01dbc7de: Preparing 
[1B31d2d72b: Preparing 
[1Ba966f459: Preparing 
[1Bb9e63cdf: Preparing 
[1B49f5bf51: Preparing 
[1Baa2fa9fe: Preparing 
[1B325cc380: Preparing 
[1Bdd81f9fa: Preparing 
[1B09cad0ba: Layer already exists [31A[2K[29A[2K[2

## Submit Vertex Training jobs

### Initialize Vertex AI SDK


In [6]:
# Set SDK-wide defaults so later calls need no explicit project/region/bucket.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

### Define the worker pool specs and submit the job

In [7]:
def prepare_worker_pool_specs(
    image_uri,
    args,
    cmd,
    replica_count=1,
    machine_type='n1-standard-4',
    accelerator_count=0,
    accelerator_type='ACCELERATOR_TYPE_UNSPECIFIED'):
    """Build the ``worker_pool_specs`` list for a custom training job.

    The first pool (the chief) always has exactly one replica. When
    ``replica_count`` is greater than 1, a second pool holding the remaining
    ``replica_count - 1`` replicas is appended. Both pools use the same
    machine and container settings, but each pool gets its own dict/list
    objects, so editing one pool afterwards (for example adding a
    worker-only argument) never changes the other pool or the caller's
    ``args``/``cmd``.

    Args:
        image_uri: Container image to run on every replica.
        args: Arguments passed to the container.
        cmd: Command (entrypoint override) for the container.
        replica_count: Total number of replicas, chief included.
        machine_type: Machine type used for every replica.
        accelerator_count: Accelerators per replica; when 0 the accelerator
            fields are left out of the machine spec entirely.
        accelerator_type: Accelerator type, used only when
            ``accelerator_count`` is greater than 0.

    Returns:
        A list with one pool spec dict (chief only) or two (chief + workers).
    """
    import copy  # local import keeps this cell self-contained

    def _build_pool(pool_replica_count):
        """Return one pool spec made of freshly created dicts and lists."""
        machine_spec = {'machine_type': machine_type}
        if accelerator_count > 0:
            machine_spec['accelerator_type'] = accelerator_type
            machine_spec['accelerator_count'] = accelerator_count

        return {
            'replica_count': pool_replica_count,
            'machine_spec': machine_spec,
            'container_spec': {
                'image_uri': image_uri,
                # Copies decouple the pools from each other and from the caller.
                'args': copy.deepcopy(args),
                'command': copy.deepcopy(cmd),
            },
        }

    # Chief pool first; the worker pool exists only for multi-replica jobs.
    worker_pool_specs = [_build_pool(1)]
    if replica_count > 1:
        worker_pool_specs.append(_build_pool(replica_count - 1))

    return worker_pool_specs

In [8]:
# Data, checkpoint and output locations for the MNLI fine-tuning job.
MNLI_TRAIN_SPLIT = 'gs://jk-vertex-demos/datasets/MNLI/mnli_train.tf_record'
MNLI_VALID_SPLIT = 'gs://jk-vertex-demos/datasets/MNLI/mnli_valid.tf_record'
MNLI_METADATA = 'gs://jk-vertex-demos/datasets/MNLI/metadata.json'
BERT_CHECKPOINT = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16'
# NOTE(review): this is the same directory the local docker test writes to;
# confirm that sharing it with the Vertex job is intended.
MODEL_DIR = 'gs://jk-vertex-demos/testing/r1'

# Flags forwarded to run_classifier.py.
task = 'MNLI'
mode = 'train_and_eval'
global_batch_size = 32
steps_per_loop = 10
learning_rate = 2e-5
num_train_epochs = 3

# The job runs on several replicas (see replica_count), each with num_gpus GPUs.
distribution_strategy = 'multi_worker_mirrored'
num_gpus = 1

# Cluster shape: total replicas (chief included) and per-replica hardware.
replica_count = 2
machine_type = 'n1-standard-8'
accelerator_count = 1
accelerator_type = 'NVIDIA_TESLA_V100'

image_uri = TRAIN_IMAGE

# Overrides the image entrypoint with the BERT classifier script.
cmd = [
    "python", "models/official/nlp/bert/run_classifier.py"
]

args = [
    f'--mode={mode}',
    f'--model_dir={MODEL_DIR}',
    f'--input_meta_data_path={MNLI_METADATA}',
    f'--train_data_path={MNLI_TRAIN_SPLIT}',
    f'--eval_data_path={MNLI_VALID_SPLIT}',
    f'--bert_config_file={BERT_CHECKPOINT}/bert_config.json',
    f'--init_checkpoint={BERT_CHECKPOINT}/bert_model.ckpt',
    f'--train_batch_size={global_batch_size}',
    f'--eval_batch_size={global_batch_size}',
    f'--steps_per_loop={steps_per_loop}',
    f'--learning_rate={learning_rate}',
    f'--num_train_epochs={num_train_epochs}',
    f'--distribution_strategy={distribution_strategy}',
    f'--num_gpus={num_gpus}',
]

worker_pool_specs = prepare_worker_pool_specs(
    image_uri=image_uri,
    args=args,
    cmd=cmd,
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_count=accelerator_count,
    accelerator_type=accelerator_type
)

# Show the generated specs for a visual check before submitting the job.
pp = pprint.PrettyPrinter()
pp.pprint(worker_pool_specs)

[{'container_spec': {'args': ['--mode=train_and_eval',
                              '--model_dir=gs://jk-vertex-demos/testing/r1',
                              '--input_meta_data_path=gs://jk-vertex-demos/datasets/MNLI/metadata.json',
                              '--train_data_path=gs://jk-vertex-demos/datasets/MNLI/mnli_train.tf_record',
                              '--eval_data_path=gs://jk-vertex-demos/datasets/MNLI/mnli_valid.tf_record',
                              '--bert_config_file=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_config.json',
                              '--init_checkpoint=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_model.ckpt',
                              '--train_batch_size=32',
                              '--eval_batch_size=32',
                              '--steps_per_loop=10',
                              '--learning_rate=2e-05',
                              '--num_train_epochs=3',
       

In [9]:
# Name shown for the job in the console.
display_name = "custom-test"

# Wrap the worker pool specs in a CustomJob resource.
job = aiplatform.CustomJob(display_name=display_name, worker_pool_specs=worker_pool_specs)

# sync=False starts the job without blocking this cell.
job.run(sync=False)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-west1/customJobs/8383380337606000640
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-west1/customJobs/8383380337606000640')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-west1/training/8383380337606000640?project=895222332033


In [10]:
# Full resource name of the submitted job.
job.resource_name

'projects/895222332033/locations/us-west1/customJobs/8383380337606000640'

In [None]:
# Block until the job started with sync=False has finished.
job.wait()

INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/8383380337606000640 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/8383380337606000640 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/8383380337606000640 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/8383380337606000640 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/8383380337606000640 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/8383380337606000640 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:CustomJob projects