# Distributed training with Vertex Reduction Server

In [60]:
import os
import pprint
import sys
import time
import shutil

from google.cloud import aiplatform
from google.cloud.aiplatform_v1beta1 import types
from google.cloud.aiplatform_v1beta1.services.job_service import \
    JobServiceClient

## Configure GCP settings

In [61]:
# GCP project and region used by every cell below.
PROJECT = 'jk-mlops-dev'
REGION = 'us-central1'
# Regional Vertex AI endpoint and the staging bucket are both derived from REGION.
API_ENDPOINT = REGION + '-aiplatform.googleapis.com'
GCS_BUCKET = 'gs://jk-staging-' + REGION

### Create a GCS bucket

In [62]:
objects = !gsutil ls {GCS_BUCKET}
if objects:
    if 'BucketNotFoundException' in objects[0]:
        print('Creating a new bucket')
        !gsutil mb -l {REGION} {GCS_BUCKET} 

### Initialize Vertex SDK

In [63]:
# Point the Vertex SDK at the project/region and stage artifacts under <bucket>/vertex.
aiplatform.init(
    project=PROJECT, location=REGION, staging_bucket=GCS_BUCKET + '/vertex'
)

## Build a training container

In [64]:
# Container Registry URI of the training image built in the next cell.
TRAIN_IMAGE = 'gcr.io/{}/model_garden'.format(PROJECT)

In [65]:
# Build the image from the local `model_garden_image` directory with Cloud Build
# and tag it as TRAIN_IMAGE.
!gcloud builds submit --tag {TRAIN_IMAGE} model_garden_image

Creating temporary tarball archive of 9 file(s) totalling 27.7 KiB before compression.
Uploading tarball of [model_garden_image] to [gs://jk-mlops-dev_cloudbuild/source/1624546720.217471-bc8f902bedcb42f48c805a20c1b42465.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/jk-mlops-dev/locations/global/builds/40831f61-9b95-4a4b-976e-645965bfb90e].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/40831f61-9b95-4a4b-976e-645965bfb90e?project=895222332033].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "40831f61-9b95-4a4b-976e-645965bfb90e"

FETCHSOURCE
Fetching storage object: gs://jk-mlops-dev_cloudbuild/source/1624546720.217471-bc8f902bedcb42f48c805a20c1b42465.tgz#1624546720586901
Copying gs://jk-mlops-dev_cloudbuild/source/1624546720.217471-bc8f902bedcb42f48c805a20c1b42465.tgz#1624546720586901...
/ [1 files][  6.0 KiB/  6.0 KiB]                                                
Operation completed over 1 o

## Prepare training data

In [66]:
# Destination prefix for the generated TFRecord files and their metadata.
OUTPUT_DIR = GCS_BUCKET + '/datasets'
# GCS directory whose vocab.txt is passed to the data preparation script.
BERT_DIR = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16'
# Task name; used as the sub-directory and file-name prefix of the outputs.
TASK = 'MNLI'

In [67]:
# One CPU-only replica that runs the fine-tuning data preparation script from
# the training image and writes the train/eval TFRecords plus metadata to
# OUTPUT_DIR/TASK.
worker_pool_specs = [
    dict(
        machine_spec=dict(machine_type='n1-standard-8'),
        replica_count=1,
        container_spec=dict(
            image_uri=TRAIN_IMAGE,
            command=['python', 'dataprep/create_finetuning_data.py'],
            args=[
                '--fine_tuning_task_type=classification',
                '--tfds_params=dataset=glue/mnli,text_key=hypothesis,text_b_key=premise,train_split=train,dev_split=validation_matched',
                '--max_seq_length=128',
                '--vocab_file=' + BERT_DIR + '/vocab.txt',
                '--meta_data_file_path={0}/{1}/{1}_meta_data'.format(OUTPUT_DIR, TASK),
                '--train_data_output_path={0}/{1}/{1}_train.tf_record'.format(OUTPUT_DIR, TASK),
                '--eval_data_output_path={0}/{1}/{1}_eval.tf_record'.format(OUTPUT_DIR, TASK),
            ],
        ),
    )
]

In [68]:
# Timestamped display name so repeated runs are distinguishable.
job_name = time.strftime('PREPARE_DATA_%Y%m%d_%H%M%S')

# Create the data preparation job and run it with sync=True.
job = aiplatform.CustomJob(display_name=job_name, worker_pool_specs=worker_pool_specs)
job.run(sync=True)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/8483343536756883456
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/8483343536756883456')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/8483343536756883456?project=895222332033
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/8483343536756883456 current state:
JobState.JOB_STATE_QUEUED
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/8483343536756883456 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/c

## Configure and run MNLI fine-tuning job


### Define a helper function

In [83]:
def prepare_custom_job_spec(
    job_name,
    image_uri,
    args,
    cmd,
    replica_count=1,
    machine_type='n1-standard-4',
    accelerator_count=0,
    accelerator_type='ACCELERATOR_TYPE_UNSPECIFIED',
    reduction_server_count=0,
    reduction_server_machine_type='n1-highcpu-16',
    reduction_server_image_uri='us-docker.pkg.dev/vertex-ai-restricted/training/reductionserver:latest'
):
    """Builds a Vertex AI custom job spec as a plain dictionary.

    The resulting ``worker_pool_specs`` list is laid out as:

    * index 0 - the chief (always exactly one replica),
    * index 1 - the remaining ``replica_count - 1`` workers (or an empty
      placeholder when there are none but reduction servers are requested),
    * index 2 - the reduction servers (only when ``reduction_server_count > 0``).

    Args:
        job_name: Display name of the custom job.
        image_uri: Container image used by the chief and the workers.
        args: List of arguments passed to the container.
        cmd: Container command (entrypoint override) as a list.
        replica_count: Total number of training replicas, chief included.
        machine_type: Machine type of the chief and the workers.
        accelerator_count: Accelerators per replica; when 0 no accelerator
            fields are added to the machine spec.
        accelerator_type: Accelerator type, used only if accelerator_count > 0.
        reduction_server_count: Number of reduction server replicas; 0 disables
            the reduction server pool.
        reduction_server_machine_type: Machine type of the reduction servers.
        reduction_server_image_uri: Container image of the reduction servers.

    Returns:
        A dict with ``display_name`` and ``job_spec.worker_pool_specs`` keys.
    """

    machine_spec = {'machine_type': machine_type}
    if accelerator_count > 0:
        machine_spec['accelerator_type'] = accelerator_type
        machine_spec['accelerator_count'] = accelerator_count

    container_spec = {
        'image_uri': image_uri,
        'args': args,
        'command': cmd,
    }

    chief_spec = {
        'replica_count': 1,
        'machine_spec': machine_spec,
        'container_spec': container_spec
    }

    worker_pool_specs = [chief_spec]
    if replica_count > 1:
        workers_spec = {
            'replica_count': replica_count - 1,
            'machine_spec': machine_spec,
            'container_spec': container_spec
        }
        worker_pool_specs.append(workers_spec)

    # `> 0` (not `> 1`): a single reduction server is a valid request and must
    # not be silently dropped.
    if reduction_server_count > 0:
        # The reduction server pool has to be the third pool. When there is no
        # separate workers pool, keep its slot with an empty spec so the
        # reduction servers are not interpreted as ordinary workers.
        if len(worker_pool_specs) < 2:
            worker_pool_specs.append({})
        reduction_server_spec = {
            'replica_count': reduction_server_count,
            'machine_spec': {
                'machine_type': reduction_server_machine_type,
            },
            'container_spec': {
                'image_uri': reduction_server_image_uri
            }
        }
        worker_pool_specs.append(reduction_server_spec)

    custom_job_spec = {
        'display_name': job_name,
        'job_spec': {
            'worker_pool_specs': worker_pool_specs
        }
    }

    return custom_job_spec

### Configure MNLI experiment settings

#### Base experiment configuration

In [84]:
# Values for the trainer's --experiment, --config_file and --mode flags.
EXPERIMENT = "bert/sentence_prediction"
CONFIG_FILE = "trainer/glue_mnli_matched.yaml"
MODE = "train"

#### Parameter overrides

In [140]:
# TFRecord splits; same paths the data preparation job writes to.
MNLI_TRAIN_SPLIT = '{0}/{1}/{1}_train.tf_record'.format(OUTPUT_DIR, TASK)
MNLI_VALID_SPLIT = '{0}/{1}/{1}_eval.tf_record'.format(OUTPUT_DIR, TASK)
BERT_HUB_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4'

# Cluster size and batch sizing.
REPLICA_COUNT = 8
PER_REPLICA_BATCH_SIZE = 32
GLOBAL_BATCH_SIZE = PER_REPLICA_BATCH_SIZE * REPLICA_COUNT

# Accelerators per replica and the distribution settings.
ACCELERATOR_COUNT = 1
ALL_REDUCE_ALG = 'nccl'
STRATEGY = 'multi_worker_mirrored'

# Training loop schedule (all values are step counts).
TRAINING_STEPS = 2000
STEPS_PER_LOOP = 100
SUMMARY_INTERVAL = 100
VALIDATION_INTERVAL = 2000
CHECKPOINT_INTERVAL = 2000

# Comma-separated `key=value` pairs handed to the trainer via --params_override.
PARAMS_OVERRIDE = ','.join(
    f'{key}={value}'
    for key, value in {
        'task.train_data.input_path': MNLI_TRAIN_SPLIT,
        'task.validation_data.input_path': MNLI_VALID_SPLIT,
        'task.train_data.global_batch_size': GLOBAL_BATCH_SIZE,
        'task.validation_data.global_batch_size': GLOBAL_BATCH_SIZE,
        'task.hub_module_url': BERT_HUB_URL,
        'runtime.num_gpus': ACCELERATOR_COUNT,
        'runtime.distribution_strategy': STRATEGY,
        'runtime.all_reduce_alg': ALL_REDUCE_ALG,
        'trainer.train_steps': TRAINING_STEPS,
        'trainer.steps_per_loop': STEPS_PER_LOOP,
        'trainer.summary_interval': SUMMARY_INTERVAL,
        'trainer.validation_interval': VALIDATION_INTERVAL,
        'trainer.checkpoint_interval': CHECKPOINT_INTERVAL,
    }.items()
)

NameError: name 'mixed_precision_dtype' is not defined

### Create Vertex training custom job spec

In [136]:
# Timestamped job name; the model directory is derived from it.
JOB_NAME = time.strftime('MNLI_%Y%m%d_%H%M%S')
MODEL_DIR = '{}/{}/model'.format(GCS_BUCKET, JOB_NAME)

# Hardware for the chief and the workers.
WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'

# Command and arguments run inside the training container.
WORKER_CMD = ['python', 'trainer/train.py']
WORKER_ARGS = [
    f'--experiment={EXPERIMENT}',
    f'--mode={MODE}',
    f'--model_dir={MODEL_DIR}',
    f'--config_file={CONFIG_FILE}',
    f'--params_override={PARAMS_OVERRIDE}',
]

# Size and machine type of the reduction server pool.
REDUCTION_SERVER_COUNT = 10
REDUCTION_SERVER_MACHINE_TYPE = 'n1-highcpu-16'

In [137]:
# Assemble the job spec from the settings defined in the cells above.
custom_job_spec = prepare_custom_job_spec(
    JOB_NAME,
    TRAIN_IMAGE,
    WORKER_ARGS,
    WORKER_CMD,
    replica_count=REPLICA_COUNT,
    machine_type=WORKER_MACHINE_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
    reduction_server_count=REDUCTION_SERVER_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
)

# Show the spec for inspection before it is submitted.
pp = pprint.PrettyPrinter()
pp.pprint(custom_job_spec)

{'display_name': 'MNLI_20210625_145933',
 'job_spec': {'worker_pool_specs': [{'container_spec': {'args': ['--experiment=bert/sentence_prediction',
                                                                 '--mode=train',
                                                                 '--model_dir=gs://jk-staging-us-central1/MNLI_20210625_145933/model',
                                                                 '--config_file=trainer/glue_mnli_matched.yaml',
                                                                 '--params_override=task.train_data.input_path=gs://jk-staging-us-central1/datasets/MNLI/MNLI_train.tf_record,task.validation_data.input_path=gs://jk-staging-us-central1/datasets/MNLI/MNLI_eval.tf_record,task.train_data.global_batch_size=32,task.validation_data.global_batch_size=32,task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4,runtime.num_gpus=2,runtime.distribution_strategy=mirrored,runtime.all_reduce_alg=nccl,trainer

### Submit and monitor the job

In [138]:
# The job service client talks to the regional endpoint.
options = {'api_endpoint': API_ENDPOINT}
client = JobServiceClient(client_options=options)

# Resource path of the project/location that owns the job.
parent = 'projects/{}/locations/{}'.format(PROJECT, REGION)

response = client.create_custom_job(parent=parent, custom_job=custom_job_spec)

response

InvalidArgument: 400 The accelerator count "2" is not allowed. The allowed numbers of accelerator "NVIDIA_TESLA_A100" to use with machine type "a2-highgpu-1g" are: [1].

In [139]:
# Re-fetch the submitted job by name and display its current state.
client.get_custom_job(
    name=response.name,
).state

<JobState.JOB_STATE_SUCCEEDED: 4>

## Temporary - Upload logs to TensorBoard

In [134]:
# Print the shell variable assignments followed by the uploader command that
# consumes them, ready to be pasted into a terminal.
print('TENSORBOARD=projects/895222332033/locations/us-central1/tensorboards/5983067289333792768')
print(f'LOGDIR={MODEL_DIR}')
print(f'EXPERIMENT={JOB_NAME}')
print(
    './tb-gcp-uploader --tensorboard_resource_name $TENSORBOARD'
    '   --logdir=$LOGDIR'
    '   --experiment_name=$EXPERIMENT'
    ' --one_shot=True'
)

TENSORBOARD=projects/895222332033/locations/us-central1/tensorboards/5983067289333792768
LOGDIR=gs://jk-staging-us-central1/MNLI_20210624_234931/model
EXPERIMENT=MNLI_20210624_234931
./tb-gcp-uploader --tensorboard_resource_name $TENSORBOARD   --logdir=$LOGDIR   --experiment_name=$EXPERIMENT --one_shot=True


## Temporary - Test the container image locally

In [128]:
# Build the image from the `model_garden_image` directory with the local Docker
# daemon, tagged as TRAIN_IMAGE, so it can be run locally in the cells below.
!docker build -t {TRAIN_IMAGE} model_garden_image

Sending build context to Docker daemon  38.91kB
Step 1/8 : FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-5
 ---> 950969e5619c
Step 2/8 : RUN apt remove -y google-fast-socket &&  echo "deb https://packages.cloud.google.com/apt google-fast-socket main" | tee /etc/apt/sources.list.d/google-fast-socket.list &&  curl -s -L https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - &&  apt update && apt install -y google-reduction-server
 ---> Using cache
 ---> 8964668e6bca
Step 3/8 : RUN pip install tf-models-official==2.5.0 tensorflow-text==2.5.0
 ---> Using cache
 ---> 04fc51107496
Step 4/8 : WORKDIR /
 ---> Using cache
 ---> 8eaa3f9eef6c
Step 5/8 : COPY trainer /trainer
 ---> Using cache
 ---> 39b9d75adec1
Step 6/8 : COPY dataprep /dataprep
 ---> Using cache
 ---> 04964b034c96
Step 7/8 : ENTRYPOINT ["python"]
 ---> Using cache
 ---> a7e532e60f9a
Step 8/8 : CMD ["-c", "print('TF Model Garden')"]
 ---> Using cache
 ---> 6659694cd2d3
Successfully built 6659694cd2d3
Successf

In [131]:
# Values for the trainer's --experiment, --config_file and --mode flags.
EXPERIMENT = "bert/sentence_prediction"
CONFIG_FILE = "trainer/glue_mnli_matched.yaml"
MODE = "train"

# TFRecord splits and the BERT encoder used for the local smoke test.
MNLI_TRAIN_SPLIT = '{0}/{1}/{1}_train.tf_record'.format(OUTPUT_DIR, TASK)
MNLI_VALID_SPLIT = '{0}/{1}/{1}_eval.tf_record'.format(OUTPUT_DIR, TASK)
BERT_HUB_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4'

# Single-machine setup: the global batch is split across the local GPUs.
PER_REPLICA_BATCH_SIZE = 16
ACCELERATOR_COUNT = 2
ALL_REDUCE_ALG = 'nccl'
STRATEGY = 'mirrored'
GLOBAL_BATCH_SIZE = PER_REPLICA_BATCH_SIZE * ACCELERATOR_COUNT

# Short training schedule (all values are step counts).
TRAINING_STEPS = 200
STEPS_PER_LOOP = 50
SUMMARY_INTERVAL = 50
VALIDATION_INTERVAL = 200
CHECKPOINT_INTERVAL = 200

# Only referenced by the disabled mixed-precision override below.
MIXED_PRECISION_TYPE = 'mixed_float16'

# Parent directory of the model_dir passed to the local run.
LOCAL_DIR = '/tmp'

# Comma-separated `key=value` pairs handed to the trainer via --params_override.
PARAMS_OVERRIDE = ','.join(
    f'{key}={value}'
    for key, value in {
        'task.train_data.input_path': MNLI_TRAIN_SPLIT,
        'task.validation_data.input_path': MNLI_VALID_SPLIT,
        'task.train_data.global_batch_size': GLOBAL_BATCH_SIZE,
        'task.validation_data.global_batch_size': GLOBAL_BATCH_SIZE,
        'task.hub_module_url': BERT_HUB_URL,
        'runtime.num_gpus': ACCELERATOR_COUNT,
        'runtime.distribution_strategy': STRATEGY,
        'runtime.all_reduce_alg': ALL_REDUCE_ALG,
        # Mixed precision is currently disabled:
        # 'runtime.mixed_precision_dtype': MIXED_PRECISION_TYPE,
        'trainer.train_steps': TRAINING_STEPS,
        'trainer.steps_per_loop': STEPS_PER_LOOP,
        'trainer.summary_interval': SUMMARY_INTERVAL,
        'trainer.validation_interval': VALIDATION_INTERVAL,
        'trainer.checkpoint_interval': CHECKPOINT_INTERVAL,
    }.items()
)

In [132]:
# Run the trainer inside the locally built image with all GPUs exposed.
# `trainer/train.py` is passed as the first container argument (the build log
# above shows the image ENTRYPOINT is `python`); the model is written to
# LOCAL_DIR/test inside the container.
! docker run -it --rm --gpus all {TRAIN_IMAGE} trainer/train.py \
--experiment={EXPERIMENT} \
--mode={MODE} \
--model_dir={LOCAL_DIR}/test \
--config_file={CONFIG_FILE}\
--params_override={PARAMS_OVERRIDE}  


2021-06-25 02:19:56.981187: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
None
I0625 02:20:04.301980 139903761540928 train.py:59] Setting model_dir to: /tmp/test
I0625 02:20:04.317953 139903761540928 train_utils.py:286] Final experiment parameters: {'runtime': {'all_reduce_alg': 'nccl',
             'batchnorm_spatial_persistent': False,
             'dataset_num_private_threads': None,
             'default_shard_dim': -1,
             'distribution_strategy': 'mirrored',
             'enable_xla': False,
             'gpu_thread_mode': None,
             'loss_scale': None,
             'mixed_precision_dtype': None,
             'num_cores_per_replica': 1,
             'num_gpus': 2,
             'num_packs': 1,
             'per_gpu_thread_count': 0,
             'run_eagerly': False,
             'task_index': -1,
             'tpu': None,
             'tpu_enable_xla_dynamic_padder': None,
             'worke