# Distributed training with Vertex Reduction server

In [1]:
import os
import pprint
import sys
import time

from google.cloud import aiplatform

## Set environment constants

In [2]:
# Google Cloud project; also used to build the training image URI (TRAIN_IMAGE).
PROJECT_ID = 'jk-mlops-dev'
# NOTE(review): REGION is re-assigned to 'us-west1' in the cell that calls
# aiplatform.init(), so this value is not the one the job is submitted with.
REGION = 'us-central1'
# Cloud Storage bucket passed to aiplatform.init() as the staging bucket and
# used as the model_dir root in the local docker test.
STAGING_BUCKET = 'gs://jk-vertex-staging'

## Prepare and test a training container

### Create a Dockerfile

In [3]:
# Base image for the training container. The commented-out assignments are
# alternative bases that were tried; only the Deep Learning Containers
# TF 2.5 GPU image is active.
#BASE_IMAGE = 'tensorflow/tensorflow:2.5.0-gpu'
BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-gpu.2-5'
# NOTE(review): CUDA_VERSION is not referenced anywhere in the visible notebook.
CUDA_VERSION='cuda-11-2'
#BASE_IMAGE = 'gcr.io/deeplearning-platform-release/base-cu110'
# Versions pinned in the `pip install` line of the Dockerfile template below.
MODEL_GARDEN_VERSION = '2.5.0'
# No tag is given, so docker build/push operate on the default `latest` tag.
TRAIN_IMAGE = f'gcr.io/{PROJECT_ID}/model_garden'
TF_TEXT='2.5.0'


# Dockerfile template. ENTRYPOINT is `python`, so whatever is passed as the
# container arguments is handed to the Python interpreter; the default CMD
# only prints a greeting.
dockerfile = f'''
FROM {BASE_IMAGE}

RUN pip install tf-models-official=={MODEL_GARDEN_VERSION} tensorflow-text=={TF_TEXT}

WORKDIR /

# Copies the trainer code to the docker image.
COPY trainer /trainer

ENTRYPOINT ["python"]
CMD ["-c", "print('Hello')"]
'''

# Write the rendered template to ./Dockerfile (overwrites any existing file).
with open('Dockerfile', 'w') as f:
    f.write(dockerfile)

### Build a container image

In [4]:
# Build the training image from ./Dockerfile, using the current directory
# (which must contain the `trainer` folder) as the build context.
! docker build -t {TRAIN_IMAGE} .

Sending build context to Docker daemon  505.9kB
Step 1/6 : FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-5
 ---> 950969e5619c
Step 2/6 : RUN pip install tf-models-official==2.5.0 tensorflow-text==2.5.0
 ---> Using cache
 ---> 691637649ce1
Step 3/6 : WORKDIR /
 ---> Using cache
 ---> bf564ec3645d
Step 4/6 : COPY trainer /trainer
 ---> 6474559b990c
Step 5/6 : ENTRYPOINT ["python"]
 ---> Running in 7181f5c00a06
Removing intermediate container 7181f5c00a06
 ---> b0382c91d062
Step 6/6 : CMD ["-c", "print('Hello')"]
 ---> Running in eda8e45114b7
Removing intermediate container eda8e45114b7
 ---> c43db23b2739
Successfully built c43db23b2739
Successfully tagged gcr.io/jk-mlops-dev/model_garden:latest


### Push the container to Container Registry

In [5]:
# Push the locally built image to the registry path held in TRAIN_IMAGE.
! docker push {TRAIN_IMAGE}

Using default tag: latest
The push refers to repository [gcr.io/jk-mlops-dev/model_garden]

[1B0acc7932: Preparing 
[1B29ea19d2: Preparing 
[1B464d3f17: Preparing 
[1Bdaea14d2: Preparing 
[1Bb28de254: Preparing 
[1B52e30556: Preparing 
[1Bfc085027: Preparing 
[1B7d90a58d: Preparing 
[1B285b3362: Preparing 
[1B0730cb59: Preparing 
[1B18de1f93: Preparing 
[1Bd1dfb5d0: Preparing 
[1B686f5924: Preparing 
[1B5de2196f: Preparing 
[1B383a0e80: Preparing 
[1Beaf882b2: Preparing 
[1B2519572d: Preparing 
[1Bfbfba824: Preparing 
[14B2e30556: Waiting g 
[1B2a1c8291: Preparing 
[15Bc085027: Waiting g 
[1Bb363f69f: Preparing 
[16Bd90a58d: Waiting g 
[16B85b3362: Waiting g 
[16B730cb59: Waiting g 
[1B01dbc7de: Preparing 
[17B8de1f93: Waiting g 
[17B1dfb5d0: Waiting g 
[1Bb9e63cdf: Preparing 
[1B49f5bf51: Preparing 
[19B86f5924: Waiting g 
[1B325cc380: Preparing 
[20Bde2196f: Waiting g 
[34Bacc7932: Pushed lready exists 8kB[30A[2K[25A[2K[23A[2K[18A[2K[12A[2K

## Submit Vertex Training jobs

### Define helper functions

In [12]:
def prepare_worker_pool_specs(
    image_uri,
    args,
    cmd,
    replica_count=1,
    machine_type='n1-standard-4',
    accelerator_count=0,
    accelerator_type='ACCELERATOR_TYPE_UNSPECIFIED',
    reduction_server_count=0,
    reduction_server_machine_type='n1-standard-4',
    reduction_server_image_uri='gcr.io/cloud-aiplatform-restricted/reductionserver'
):
    """Build the ``worker_pool_specs`` list for a Vertex AI CustomJob.

    The returned list is positional:

    * index 0 - the chief (always exactly one replica),
    * index 1 - the remaining ``replica_count - 1`` training workers,
    * index 2 - the Reduction Server replicas.

    Args:
        image_uri: Training container image used by the chief and workers.
        args: List of arguments passed to the training container.
        cmd: Command (list) that overrides the container entrypoint.
        replica_count: Total number of training replicas, chief included.
        machine_type: Machine type for the chief and workers.
        accelerator_count: Accelerators per training replica. When 0, the
            accelerator fields are left out of the machine spec entirely.
        accelerator_type: Accelerator type; only used when
            ``accelerator_count > 0``.
        reduction_server_count: Number of Reduction Server replicas. When 0,
            no Reduction Server pool is added.
        reduction_server_machine_type: Machine type for the reducers.
        reduction_server_image_uri: Reduction Server container image.

    Returns:
        A list of worker pool spec dicts suitable for
        ``aiplatform.CustomJob(worker_pool_specs=...)``.
    """
    machine_spec = {'machine_type': machine_type}
    if accelerator_count > 0:
        machine_spec['accelerator_type'] = accelerator_type
        machine_spec['accelerator_count'] = accelerator_count

    container_spec = {
        'image_uri': image_uri,
        'args': args,
        'command': cmd,
    }

    chief_spec = {
        'replica_count': 1,
        'machine_spec': machine_spec,
        'container_spec': container_spec
    }

    worker_pool_specs = [chief_spec]
    if replica_count > 1:
        workers_spec = {
            'replica_count': replica_count - 1,
            'machine_spec': machine_spec,
            'container_spec': container_spec
        }
        worker_pool_specs.append(workers_spec)

    if reduction_server_count > 0:
        # Pool roles are positional, so the reducers must sit at index 2.
        # With no extra workers, an empty placeholder keeps the reducer pool
        # from being interpreted as the worker pool.
        if len(worker_pool_specs) < 2:
            worker_pool_specs.append({})
        reduction_server_spec = {
            'replica_count': reduction_server_count,
            'machine_spec': {
                'machine_type': reduction_server_machine_type,
            },
            'container_spec': {
                'image_uri': reduction_server_image_uri
            }
        }
        worker_pool_specs.append(reduction_server_spec)

    return worker_pool_specs

### Prepare worker pool specification

In [7]:
# --- Input data and pretrained encoder -------------------------------------
MNLI_TRAIN_SPLIT = 'gs://jk-vertex-demos/datasets/MNLI/mnli_train.tf_record'
MNLI_VALID_SPLIT = 'gs://jk-vertex-demos/datasets/MNLI/mnli_valid.tf_record'
BERT_HUB_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4'

# --- Job identity and output locations --------------------------------------
# The timestamp keeps each submission's model directory separate.
job_name = 'JOB_' + time.strftime('%Y%m%d_%H%M%S')
output_dir = 'gs://jk-vertex-demos/jobs'
model_dir = f'{output_dir}/{job_name}/model'
tfhub_cache_dir = f'{output_dir}/tfhub-cache'

# --- Trainer selection -------------------------------------------------------
experiment = 'bert/sentence_prediction'
mode = 'train_and_eval'
config_file = 'trainer/glue_mnli_matched.yaml'

# --- Hardware per training replica ------------------------------------------
machine_type = 'n1-standard-8'
accelerator_type = 'NVIDIA_TESLA_T4'
accelerator_count = 1

# --- Training schedule (all values are step counts) -------------------------
train_steps = 1200
steps_per_loop = 100
summary_interval = 100
validation_interval = 400
checkpoint_interval = 200

# --- Distribution settings ---------------------------------------------------
replica_count = 2
global_batch_size = 32
all_reduce_alg = 'nccl'
# Alternative for a single replica: strategy = 'mirrored'
strategy = 'multi_worker_mirrored'

reduction_server_count = 0

# Overrides applied on top of the YAML config; joined with commas into a
# single --params_override flag below.
params_override = [
    f'task.train_data.input_path={MNLI_TRAIN_SPLIT}',
    f'task.validation_data.input_path={MNLI_VALID_SPLIT}',
    f'task.train_data.global_batch_size={global_batch_size}',
    f'task.validation_data.global_batch_size={global_batch_size}',
    f'task.hub_module_url={BERT_HUB_URL}',
    f'runtime.num_gpus={accelerator_count}',
    f'runtime.distribution_strategy={strategy}',
    f'runtime.all_reduce_alg={all_reduce_alg}',
    f'trainer.train_steps={train_steps}',
    f'trainer.steps_per_loop={steps_per_loop}',
    f'trainer.summary_interval={summary_interval}',
    f'trainer.validation_interval={validation_interval}',
    f'trainer.checkpoint_interval={checkpoint_interval}',
]

# Container command and its arguments.
cmd = ['python', 'trainer/train.py']
args = [
    f'--experiment={experiment}',
    f'--mode={mode}',
    f'--model_dir={model_dir}',
    f'--config_file={config_file}',
    f'--tfhub_cache_dir={tfhub_cache_dir}',
    '--params_override={}'.format(','.join(params_override)),
]

# Build the worker pool layout for the job. reduction_server_count is passed
# through explicitly so that changing it in the configuration above actually
# reaches the helper instead of being silently ignored.
worker_pool_specs = prepare_worker_pool_specs(
    image_uri=TRAIN_IMAGE,
    args=args,
    cmd=cmd,
    replica_count=replica_count,
    machine_type=machine_type,
    accelerator_count=accelerator_count,
    accelerator_type=accelerator_type,
    reduction_server_count=reduction_server_count,
)

# Print the resulting spec for inspection before submitting.
pp = pprint.PrettyPrinter()
pp.pprint(worker_pool_specs)

[{'container_spec': {'args': ['--experiment=bert/sentence_prediction',
                              '--mode=train_and_eval',
                              '--model_dir=gs://jk-vertex-demos/jobs/JOB_20210610_144019/model',
                              '--config_file=trainer/glue_mnli_matched.yaml',
                              '--tfhub_cache_dir=gs://jk-vertex-demos/jobs/tfhub-cache',
                              '--params_override=task.train_data.input_path=gs://jk-vertex-demos/datasets/MNLI/mnli_train.tf_record,task.validation_data.input_path=gs://jk-vertex-demos/datasets/MNLI/mnli_valid.tf_record,task.train_data.global_batch_size=32,task.validation_data.global_batch_size=32,task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4,runtime.num_gpus=1,runtime.distribution_strategy=mirrored,runtime.all_reduce_alg=nccl,trainer.train_steps=1200,trainer.steps_per_loop=100,trainer.summary_interval=100,trainer.validation_interval=400,trainer.checkpoint_interval=

### Submit and monitor the job

In [8]:
# NOTE(review): this cell re-assigns the constants from the "Set environment
# constants" cell. PROJECT_ID and STAGING_BUCKET keep the same values, but
# REGION changes from 'us-central1' to 'us-west1', so the job below is
# submitted to us-west1.
PROJECT_ID = 'jk-mlops-dev'
REGION = 'us-west1'
STAGING_BUCKET = 'gs://jk-vertex-staging'

# Initialise the Vertex AI SDK with the project, location and staging bucket.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET
)

In [9]:
# Reuse the timestamped job name as the job's display name.
display_name = job_name

# Describe the custom job from the worker pool specs prepared earlier.
job = aiplatform.CustomJob(
    display_name=display_name,
    worker_pool_specs=worker_pool_specs,
)

# sync=False submits the job without blocking this cell; the SDK keeps
# logging job state changes while later cells run.
job.run(sync=False)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-west1/customJobs/7943259027146801152
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-west1/customJobs/7943259027146801152')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-west1/training/7943259027146801152?project=895222332033
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7943259027146801152 current state:
JobState.JOB_STATE_PENDING


In [10]:
# Resource name of the submitted job; it can be passed to
# aiplatform.CustomJob.get(...) to re-attach from another session.
job.resource_name

'projects/895222332033/locations/us-west1/customJobs/7943259027146801152'

INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7943259027146801152 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7943259027146801152 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7943259027146801152 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7943259027146801152 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7943259027146801152 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-west1/customJobs/7943259027146801152 current state:
JobState.JOB_STATE_PENDING


In [None]:
# Block until the run started above with sync=False has completed.
job.wait()

### Test the container image locally

In [None]:
# Data and encoder locations used for the local smoke test.
MNLI_TRAIN_SPLIT = 'gs://jk-vertex-demos/datasets/MNLI/mnli_train.tf_record'
MNLI_VALID_SPLIT = 'gs://jk-vertex-demos/datasets/MNLI/mnli_valid.tf_record'
BERT_HUB_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4'

# Local runs use the single-host strategy across the GPUs of this machine.
# NOTE: this rebinds `strategy` and `params_override`, which the job
# configuration cell also defines.
# Alternative: strategy = 'multi_worker_mirrored'
strategy = 'mirrored'
num_gpus = 2

params_override = [
    f'task.train_data.input_path={MNLI_TRAIN_SPLIT}',
    f'task.validation_data.input_path={MNLI_VALID_SPLIT}',
    f'task.hub_module_url={BERT_HUB_URL}',
    f'runtime.num_gpus={num_gpus}',
    f'runtime.distribution_strategy={strategy}',
]

# Comma-separated value for the --params_override flag.
params = ','.join(params_override)

In [None]:
# Run the trainer inside the image locally. The image's ENTRYPOINT is
# `python`, so `trainer/train.py` and the flags that follow are passed to the
# interpreter; {params} is the comma-joined override string built above.
# NOTE(review): writing to {STAGING_BUCKET}/test presumably requires Google
# Cloud credentials inside the container - confirm how they are provided.
!docker run -it --rm --gpus all {TRAIN_IMAGE} trainer/train.py \
--experiment=bert/sentence_prediction \
--mode=train_and_eval \
--model_dir={STAGING_BUCKET}/test \
--config_file=trainer/glue_mnli_matched.yaml \
--params_override={params}  
