# Distributed training with Vertex Reduction server

In [33]:
import os
import pprint
import sys

from google.cloud import aiplatform

## Set environment constants

In [34]:
PROJECT_ID = 'jk-mlops-dev'
REGION = 'us-west1'
STAGING_BUCKET = 'gs://jk-vertex-staging'

BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-gpu.2-5'
TRAIN_IMAGE = f'gcr.io/{PROJECT_ID}/tf_nlp_toolkit'

MNLI_TRAIN_SPLIT = 'gs://jk-vertex-demos/datasets/MNLI/mnli_train.tf_record'
MNLI_VALID_SPLIT = 'gs://jk-vertex-demos/datasets/MNLI/mnli_valid.tf_record'

BERT_HUB_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4'

## Train

### Initialize Vertex AI SDK


In [35]:
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET
)

### Create a training script

In [36]:
%%writefile trainer/test.py

import os

import tensorflow as tf

strategy = tf.distribute.MultiWorkerMirroredStrategy()

if strategy.cluster_resolver:    
        task_type, task_id = (strategy.cluster_resolver.task_type,
                              strategy.cluster_resolver.task_id)
else:
        task_type, task_id =(None, None)
        
print('*** task_type: {}, task_id: {}'.format(task_type, task_id))

print(os.environ['AIP_MODEL_DIR'], os.environ['AIP_CHECKPOINT_DIR'], os.environ['AIP_TENSORBOARD_LOG_DIR'])



Overwriting trainer/test.py


In [37]:
print(os.environ['PATH'])

/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games


### Create a training container

In [38]:
dockerfile = f'''
FROM {BASE_IMAGE}

RUN pip install pip install tf-models-official tensorflow-text 

WORKDIR /

# Copies the trainer code to the docker image.
COPY trainer /trainer

# Sets up the entry point to invoke the trainer.
ENTRYPOINT ["python", "-m", "trainer.train"]
'''

with open('Dockerfile', 'w') as f:
    f.write(dockerfile)

In [39]:
! docker build -t {TRAIN_IMAGE} .

Sending build context to Docker daemon  148.5kB
Step 1/5 : FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-5
 ---> b963122c3c2c
Step 2/5 : RUN pip install pip install tf-models-official tensorflow-text
 ---> Using cache
 ---> 486eefe48a12
Step 3/5 : WORKDIR /
 ---> Using cache
 ---> 4ee02d56eb08
Step 4/5 : COPY trainer /trainer
 ---> 08ce1dc21014
Step 5/5 : ENTRYPOINT ["python", "-m", "trainer.train"]
 ---> Running in 2d110f49a008
Removing intermediate container 2d110f49a008
 ---> cf8844a34a23
Successfully built cf8844a34a23
Successfully tagged gcr.io/jk-mlops-dev/tf_nlp_toolkit:latest


### Test the container locally

In [None]:
replica_count = 1
machine_type = 'n1-standard-8'
accelerator_count = 2
accelerator_type = 'NVIDIA_TESLA_T4'

output_dir = 'gs://jk-vertex-demos/rs-demo'
model_dir = f'{output_dir}/model'
tfhub_cache_dir = f'{output_dir}/tfhub-cache'

strategy = 'mirrored'



params_override = [
    'task.train_data.input_path=' + MNLI_TRAIN_SPLIT,
    'task.validation_data.input_path=' + MNLI_VALID_SPLIT,
    'task.hub_module_url=' + BERT_HUB_URL,
    'runtime.num_gpus=' + str(accelerator_count),
    'runtime.distribution_strategy=' + strategy,
]

params = ','.join(params_override)

In [None]:
!docker run -it --rm --gpus all gcr.io/jk-mlops-dev/tf_nlp_toolkit \
--experiment=bert/sentence_prediction \
--mode=train_and_eval \
--model_dir=gs://jk-vertex-staging/test \
--config_file=trainer/glue_mnli_matched.yaml \
--params_override={params}

### Push the container

In [None]:
! docker push {TRAIN_IMAGE}

In [40]:
def prepare_worker_pool_specs(
    image_uri,
    args,
    replica_count=1,
    machine_type='n1-standard-4',
    accelerator_count=0,
    accelerator_type='ACCELERATOR_TYPE_UNSPECIFIED'):

    if accelerator_count > 0:
        machine_spec = {
            'machine_type': machine_type,
            'accelerator_type': accelerator_type,
            'accelerator_count': accelerator_count,
        }
    else:
        machine_spec = {
            'machine_type': machine_type
        }
    
    container_spec = {
        'image_uri': image_uri,
        'args': args
    }
    
    chief_spec = {
        'replica_count': 1,
        'machine_spec': machine_spec,
        'container_spec': container_spec
    }

    worker_pool_specs = [chief_spec]
    if replica_count > 1:
        workers_spec = {
            'replica_count': replica_count - 1,
            'machine_spec': machine_spec,
            'container_spec': container_spec
        }
        worker_pool_specs.append(workers_spec)
    
    return worker_pool_specs

In [66]:
replica_count = 2
machine_type = 'n1-standard-8'
accelerator_count = 1
accelerator_type = 'NVIDIA_TESLA_V100'
global_batch_size = 32

output_dir = 'gs://jk-vertex-demos/rs-demo'
model_dir = f'{output_dir}/model'
tfhub_cache_dir = f'{output_dir}/tfhub-cache'

#strategy = 'mirrored'
strategy = 'multi_worker_mirrored'

summary_interval: 1000
    
# Training data size 392,702 examples, 3 epochs.
train_steps: 36813
validation_interval: 6135

params_override = [
    'task.train_data.input_path=' + MNLI_TRAIN_SPLIT,
    'task.validation_data.input_path=' + MNLI_VALID_SPLIT,
    'task.train_data.global_batch_size=' + str(global_batch_size),
    'task.validation_data.global_batch_size=' + str(global_batch_size),
    'task.hub_module_url=' + BERT_HUB_URL,
    'runtime.num_gpus=' + str(accelerator_count),
    'runtime.distribution_strategy=' + strategy,
]


image_uri = TRAIN_IMAGE
args = [
    '--experiment=bert/sentence_prediction',
    '--mode=train_and_eval',
    '--model_dir=' + model_dir,
    '--config_file=trainer/glue_mnli_matched.yaml',
    '--tfhub_cache_dir=' + tfhub_cache_dir,
    '--params_override=' + ','.join(params_override),
]

worker_pool_specs = prepare_worker_pool_specs(
    image_uri=image_uri,
    args=args,
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_count=accelerator_count,
    accelerator_type=accelerator_type
)

pp = pprint.PrettyPrinter()
print(pp.pformat(worker_pool_specs))

[{'container_spec': {'args': ['--experiment=bert/sentence_prediction',
                              '--mode=train_and_eval',
                              '--model_dir=gs://jk-vertex-demos/rs-demo/model',
                              '--config_file=trainer/glue_mnli_matched.yaml',
                              '--tfhub_cache_dir=gs://jk-vertex-demos/rs-demo/tfhub-cache',
                              '--params_override=task.train_data.input_path=gs://jk-vertex-demos/datasets/MNLI/mnli_train.tf_record,task.validation_data.input_path=gs://jk-vertex-demos/datasets/MNLI/mnli_valid.tf_record,task.train_data.global_batch_size=32,task.validation_data.global_batch_size=32,task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4,runtime.num_gpus=1,runtime.distribution_strategy=multi_worker_mirrored'],
                     'image_uri': 'gcr.io/jk-mlops-dev/tf_nlp_toolkit'},
  'machine_spec': {'accelerator_count': 1,
                   'accelerator_type': 'NVIDIA_TESL

In [67]:
display_name = 'custom-test'

job = aiplatform.CustomJob(
    display_name=display_name,
    worker_pool_specs=worker_pool_specs,
)

job.run(sync=False)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-west1/customJobs/7706714493593583616
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-west1/customJobs/7706714493593583616')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-west1/training/7706714493593583616?project=895222332033


In [68]:
job.resource_name

'projects/895222332033/locations/us-west1/customJobs/7706714493593583616'

In [69]:
job.wait()

INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7706714493593583616 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7706714493593583616 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7706714493593583616 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7706714493593583616 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7706714493593583616 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7706714493593583616 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:CustomJob projects

RuntimeError: Job failed with:
code: 1
message: "CANCELED"
