# Distributed training with Vertex AI Reduction Server

In [1]:
import os
import pprint
import sys
import time
import shutil

from google.cloud import aiplatform
from google.cloud.aiplatform_v1beta1 import types
from google.cloud.aiplatform_v1beta1.services.job_service import \
    JobServiceClient

## Configure GCP settings

In [2]:
# Project and region targeted by the Vertex AI calls in this notebook.
PROJECT = 'jk-mlops-dev'
REGION = 'us-central1'
# Regional Vertex AI API endpoint, derived from REGION; passed to JobServiceClient below.
API_ENDPOINT = f'{REGION}-aiplatform.googleapis.com'
# Bucket that holds the SDK staging area, the prepared datasets and the model output.
GCS_BUCKET = f'gs://jk-staging-{REGION}'

### Create a GCS bucket

In [3]:
objects = !gsutil ls {GCS_BUCKET}
if objects:
    if 'BucketNotFoundException' in objects[0]:
        print('Creating a new bucket')
        !gsutil mb -l {REGION} {GCS_BUCKET} 

### Initialize Vertex SDK

In [4]:
# Point the Vertex SDK at the project/region and stage artifacts under <bucket>/vertex.
aiplatform.init(project=PROJECT, location=REGION, staging_bucket=GCS_BUCKET + '/vertex')

## Build a training container

In [5]:
# Container Registry URI of the training image; used for both the data-prep and training jobs.
TRAIN_IMAGE = f'gcr.io/{PROJECT}/model_garden'

In [6]:
# Submit the local `model_garden_image` directory as a build and tag the result as TRAIN_IMAGE.
!gcloud builds submit --tag {TRAIN_IMAGE} model_garden_image

Creating temporary tarball archive of 9 file(s) totalling 27.7 KiB before compression.
Uploading tarball of [model_garden_image] to [gs://jk-mlops-dev_cloudbuild/source/1624314140.348313-aca78fcba9064d28ac28ad8cd52e534e.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/jk-mlops-dev/locations/global/builds/da1dfe86-1c10-4170-b76a-852c5f39cbe4].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/da1dfe86-1c10-4170-b76a-852c5f39cbe4?project=895222332033].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "da1dfe86-1c10-4170-b76a-852c5f39cbe4"

FETCHSOURCE
Fetching storage object: gs://jk-mlops-dev_cloudbuild/source/1624314140.348313-aca78fcba9064d28ac28ad8cd52e534e.tgz#1624314140684650
Copying gs://jk-mlops-dev_cloudbuild/source/1624314140.348313-aca78fcba9064d28ac28ad8cd52e534e.tgz#1624314140684650...
/ [1 files][  6.0 KiB/  6.0 KiB]                                                
Operation completed over 1 o

## Prepare training data

In [7]:
# GCS prefix under which the data-prep job writes the TFRecord splits and metadata.
OUTPUT_DIR = f'{GCS_BUCKET}/datasets'
# Location of the BERT checkpoint files; only vocab.txt is read from it in this notebook.
BERT_DIR = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16'
# Task name, used as the sub-folder and file-name prefix of the prepared data.
TASK = 'MNLI'

In [8]:
# One-replica pool (no accelerator requested) that runs the data-preparation
# script from the training image and writes its outputs under OUTPUT_DIR/TASK.
worker_pool_specs = [
    dict(
        machine_spec=dict(machine_type='n1-standard-8'),
        replica_count=1,
        container_spec=dict(
            image_uri=TRAIN_IMAGE,
            command=['python', 'dataprep/create_finetuning_data.py'],
            args=[
                '--fine_tuning_task_type=classification',
                '--tfds_params=dataset=glue/mnli,text_key=hypothesis,text_b_key=premise,train_split=train,dev_split=validation_matched',
                '--max_seq_length=128',
                '--vocab_file={}/vocab.txt'.format(BERT_DIR),
                '--meta_data_file_path={out}/{task}/{task}_meta_data'.format(out=OUTPUT_DIR, task=TASK),
                '--train_data_output_path={out}/{task}/{task}_train.tf_record'.format(out=OUTPUT_DIR, task=TASK),
                '--eval_data_output_path={out}/{task}/{task}_eval.tf_record'.format(out=OUTPUT_DIR, task=TASK),
            ],
        ),
    ),
]

In [9]:
# A timestamp suffix keeps the display names of repeated runs distinct.
job_name = f"PREPARE_DATA_{time.strftime('%Y%m%d_%H%M%S')}"

# Wrap the data-prep worker pool in a CustomJob and submit it with sync=True.
job = aiplatform.CustomJob(display_name=job_name, worker_pool_specs=worker_pool_specs)
job.run(sync=True)

INFO:google.cloud.aiplatform.jobs:Creating CustomJob
INFO:google.cloud.aiplatform.jobs:CustomJob created. Resource name: projects/895222332033/locations/us-central1/customJobs/2139882724117184512
INFO:google.cloud.aiplatform.jobs:To use this CustomJob in another session:
INFO:google.cloud.aiplatform.jobs:custom_job = aiplatform.CustomJob.get('projects/895222332033/locations/us-central1/customJobs/2139882724117184512')
INFO:google.cloud.aiplatform.jobs:View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/2139882724117184512?project=895222332033
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/2139882724117184512 current state:
JobState.JOB_STATE_QUEUED
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/customJobs/2139882724117184512 current state:
JobState.JOB_STATE_PENDING
INFO:google.cloud.aiplatform.jobs:CustomJob projects/895222332033/locations/us-central1/c

## Configure and run MNLI fine tuning job


### Configure MNLI experiment settings

#### Base experiment configuration

In [10]:
# Values forwarded to trainer/train.py as --experiment, --config_file and --mode.
EXPERIMENT = 'bert/sentence_prediction'
CONFIG_FILE = 'trainer/glue_mnli_matched.yaml'
MODE = 'train'

#### Parameter overrides

In [11]:
# TFRecord splits written by the data-preparation job, plus the TF Hub encoder URL.
MNLI_TRAIN_SPLIT = '{out}/{task}/{task}_train.tf_record'.format(out=OUTPUT_DIR, task=TASK)
MNLI_VALID_SPLIT = '{out}/{task}/{task}_eval.tf_record'.format(out=OUTPUT_DIR, task=TASK)
BERT_HUB_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4'

# Batch sizing: the global batch is the per-replica batch times the replica count.
REPLICA_COUNT = 8
PER_REPLICA_BATCH_SIZE = 32
GLOBAL_BATCH_SIZE = PER_REPLICA_BATCH_SIZE * REPLICA_COUNT

# Runtime / distribution settings.
ACCELERATOR_COUNT = 1
ALL_REDUCE_ALG = 'nccl'
STRATEGY = 'multi_worker_mirrored'

# Training-loop cadence, all expressed in steps.
TRAINING_STEPS = 2000
STEPS_PER_LOOP = 100
SUMMARY_INTERVAL = 100
VALIDATION_INTERVAL = 2000
CHECKPOINT_INTERVAL = 2000

# Comma-separated `name=value` list handed to the trainer via --params_override.
PARAMS_OVERRIDE = ','.join(
    f'{name}={value}'
    for name, value in [
        ('task.train_data.input_path', MNLI_TRAIN_SPLIT),
        ('task.validation_data.input_path', MNLI_VALID_SPLIT),
        ('task.train_data.global_batch_size', GLOBAL_BATCH_SIZE),
        ('task.validation_data.global_batch_size', GLOBAL_BATCH_SIZE),
        ('task.hub_module_url', BERT_HUB_URL),
        ('runtime.num_gpus', ACCELERATOR_COUNT),
        ('runtime.distribution_strategy', STRATEGY),
        ('runtime.all_reduce_alg', ALL_REDUCE_ALG),
        # ('runtime.mixed_precision_dtype', mixed_precision_dtype),  # disabled in the original cell
        ('trainer.train_steps', TRAINING_STEPS),
        ('trainer.steps_per_loop', STEPS_PER_LOOP),
        ('trainer.summary_interval', SUMMARY_INTERVAL),
        ('trainer.validation_interval', VALIDATION_INTERVAL),
        ('trainer.checkpoint_interval', CHECKPOINT_INTERVAL),
    ]
)

### Create Vertex training custom job spec

In [15]:
# Timestamped job name; the model directory is derived from it so each run
# writes to its own location in the bucket.
JOB_NAME = f"MNLI_{time.strftime('%Y%m%d_%H%M%S')}"
MODEL_DIR = '{}/{}/model'.format(GCS_BUCKET, JOB_NAME)

# Machine shape for the training replicas.
WORKER_MACHINE_TYPE = 'a2-highgpu-1g'
ACCELERATOR_TYPE = 'NVIDIA_TESLA_A100'

# Container entry command and its command-line flags.
WORKER_CMD = ['python', 'trainer/train.py']
WORKER_ARGS = [
    f'--experiment={EXPERIMENT}',
    f'--mode={MODE}',
    f'--model_dir={MODEL_DIR}',
    f'--config_file={CONFIG_FILE}',
    f'--params_override={PARAMS_OVERRIDE}',
]

# Reduction Server pool settings; a count of 0 means no reduction servers are requested.
REDUCTION_SERVER_COUNT = 0
REDUCTION_SERVER_MACHINE_TYPE = 'n1-highcpu-16'

In [16]:
def prepare_custom_job_spec(
    job_name,
    image_uri,
    args,
    cmd,
    replica_count=1,
    machine_type='n1-standard-4',
    accelerator_count=0,
    accelerator_type='ACCELERATOR_TYPE_UNSPECIFIED',
    reduction_server_count=0,
    reduction_server_machine_type='n1-highcpu-16',
    reduction_server_image_uri='us-docker.pkg.dev/vertex-ai-restricted/training/reductionserver:latest'
):
    """Build a custom job spec dictionary for a (distributed) training job.

    Worker pools are emitted in positional order: chief first, then the
    remaining training workers, then the reduction servers.

    Args:
        job_name: Display name of the job.
        image_uri: Training container image used by the chief and the workers.
        args: List of command-line arguments passed to the training container.
        cmd: Container command (list of strings).
        replica_count: Total number of training replicas, chief included.
        machine_type: Machine type of the chief and the workers.
        accelerator_count: Accelerators per training replica; when 0 the
            machine spec carries no accelerator fields at all.
        accelerator_type: Accelerator type, used only if accelerator_count > 0.
        reduction_server_count: Number of reduction server replicas; 0 omits
            the reduction server pool.
        reduction_server_machine_type: Machine type of the reduction servers.
        reduction_server_image_uri: Container image of the reduction servers.

    Returns:
        A dict with `display_name` and `job_spec.worker_pool_specs`, in the
        shape passed to `create_custom_job` as `custom_job`.
    """

    def _machine_spec():
        # A fresh dict per pool, so editing one pool never leaks into another.
        spec = {'machine_type': machine_type}
        if accelerator_count > 0:
            spec['accelerator_type'] = accelerator_type
            spec['accelerator_count'] = accelerator_count
        return spec

    def _container_spec():
        return {
            'image_uri': image_uri,
            'args': args,
            'command': cmd,
        }

    # Pool 0: the chief is always a single replica.
    worker_pool_specs = [{
        'replica_count': 1,
        'machine_spec': _machine_spec(),
        'container_spec': _container_spec(),
    }]

    # Pool 1: the remaining training replicas, if any.
    if replica_count > 1:
        worker_pool_specs.append({
            'replica_count': replica_count - 1,
            'machine_spec': _machine_spec(),
            'container_spec': _container_spec(),
        })

    # Pool 2: reduction servers. A single reduction server is a valid request,
    # so the check is `> 0` (the previous `> 1` silently dropped it).
    if reduction_server_count > 0:
        if replica_count <= 1:
            # Pools are positional: keep the reduction servers in the third
            # slot by leaving an empty placeholder for the absent workers pool.
            worker_pool_specs.append({})
        worker_pool_specs.append({
            'replica_count': reduction_server_count,
            'machine_spec': {
                'machine_type': reduction_server_machine_type,
            },
            'container_spec': {
                'image_uri': reduction_server_image_uri
            }
        })

    return {
        'display_name': job_name,
        'job_spec': {
            'worker_pool_specs': worker_pool_specs
        }
    }

In [17]:
# Assemble the training job spec from the notebook-level settings.
custom_job_spec = prepare_custom_job_spec(
    job_name=JOB_NAME,
    image_uri=TRAIN_IMAGE,
    cmd=WORKER_CMD,
    args=WORKER_ARGS,
    replica_count=REPLICA_COUNT,
    machine_type=WORKER_MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    reduction_server_machine_type=REDUCTION_SERVER_MACHINE_TYPE,
    reduction_server_count=REDUCTION_SERVER_COUNT,
)

# Show the spec for inspection before it is submitted.
pp = pprint.PrettyPrinter()
pp.pprint(custom_job_spec)

{'display_name': 'MNLI_20210621_230150',
 'job_spec': {'worker_pool_specs': [{'container_spec': {'args': ['--experiment=bert/sentence_prediction',
                                                                 '--mode=train',
                                                                 '--model_dir=gs://jk-staging-us-central1/MNLI_20210621_230150/model',
                                                                 '--config_file=trainer/glue_mnli_matched.yaml',
                                                                 '--params_override=task.train_data.input_path=gs://jk-staging-us-central1/datasets/MNLI/MNLI_train.tf_record,task.validation_data.input_path=gs://jk-staging-us-central1/datasets/MNLI/MNLI_eval.tf_record,task.train_data.global_batch_size=256,task.validation_data.global_batch_size=256,task.hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4,runtime.num_gpus=1,runtime.distribution_strategy=multi_worker_mirrored,runtime.all_reduce_a

### Submit and monitor the job

In [18]:
# The job service client has to talk to the regional endpoint.
options = {'api_endpoint': API_ENDPOINT}
client = JobServiceClient(client_options=options)

# Parent resource under which the custom job is created.
parent = 'projects/{}/locations/{}'.format(PROJECT, REGION)

response = client.create_custom_job(parent=parent, custom_job=custom_job_spec)

# Display the created job resource as the cell output.
response

name: "projects/895222332033/locations/us-central1/customJobs/349701872237412352"
display_name: "MNLI_20210621_230150"
job_spec {
  worker_pool_specs {
    machine_spec {
      machine_type: "a2-highgpu-1g"
      accelerator_type: NVIDIA_TESLA_A100
      accelerator_count: 1
    }
    replica_count: 1
    disk_spec {
      boot_disk_type: "pd-ssd"
      boot_disk_size_gb: 100
    }
    container_spec {
      image_uri: "gcr.io/jk-mlops-dev/model_garden"
      command: "python"
      command: "trainer/train.py"
      args: "--experiment=bert/sentence_prediction"
      args: "--mode=train"
      args: "--model_dir=gs://jk-staging-us-central1/MNLI_20210621_230150/model"
      args: "--config_file=trainer/glue_mnli_matched.yaml"
      args: "--params_override=task.train_data.input_path=gs://jk-staging-us-central1/datasets/MNLI/MNLI_train.tf_record,task.validation_data.input_path=gs://jk-staging-us-central1/datasets/MNLI/MNLI_eval.tf_record,task.train_data.global_batch_size=256,task.validat

In [19]:
# Re-fetch the job by its resource name and show its current state.
client.get_custom_job(name=response.name).state

<JobState.JOB_STATE_QUEUED: 1>

## Upload logs to TensorBoard

In [None]:
# Print the shell variable assignments and the uploader command needed to push
# the training job's logs to a TensorBoard instance.
# Uses MODEL_DIR and JOB_NAME of the MNLI training job: the lowercase
# `model_dir` was never defined (NameError on a fresh kernel) and `job_name`
# holds the data-preparation job's name.
# NOTE(review): the TensorBoard resource name is hard-coded — confirm it matches your project.
print('TENSORBOARD={}'.format('projects/895222332033/locations/us-central1/tensorboards/5983067289333792768'))
print('LOGDIR={}'.format(MODEL_DIR))
print('EXPERIMENT={}'.format(JOB_NAME))
print('./tb-gcp-uploader --tensorboard_resource_name $TENSORBOARD   --logdir=$LOGDIR   --experiment_name=$EXPERIMENT --one_shot=True')

### Test the container image locally

In [None]:
# Settings for running the training container locally.
# NOTE(review): this rebinds MNLI_TRAIN_SPLIT, MNLI_VALID_SPLIT and BERT_HUB_URL,
# which an earlier cell also assigns — re-run that cell before rebuilding PARAMS_OVERRIDE.
MNLI_TRAIN_SPLIT = 'gs://jk-vertex-demos/datasets/MNLI/mnli_train.tf_record'
MNLI_VALID_SPLIT = 'gs://jk-vertex-demos/datasets/MNLI/mnli_valid.tf_record'
BERT_HUB_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4'

output_dir = '/tmp'
num_gpus = 2
strategy = 'mirrored'  # alternative: 'multi_worker_mirrored'

# Individual `name=value` overrides, joined below into a single comma-separated string.
# NOTE(review): 'tf.float16' is passed through verbatim — confirm the trainer accepts this spelling.
params_override = [
    f'task.train_data.input_path={MNLI_TRAIN_SPLIT}',
    f'task.validation_data.input_path={MNLI_VALID_SPLIT}',
    f'task.hub_module_url={BERT_HUB_URL}',
    f'runtime.num_gpus={num_gpus}',
    f'runtime.distribution_strategy={strategy}',
    'runtime.mixed_precision_dtype=tf.float16',
]

params = ','.join(params_override)

In [None]:
# Run the training script from TRAIN_IMAGE locally with all GPUs, writing to {output_dir}/test.
# NOTE(review): the container command starts with `trainer/train.py` (no `python`);
# presumably the image entrypoint supplies the interpreter — confirm.
! docker run -it --rm --gpus all {TRAIN_IMAGE} trainer/train.py \
--experiment=bert/sentence_prediction \
--mode=train_and_eval \
--model_dir={output_dir}/test \
--config_file=trainer/glue_mnli_matched.yaml \
--params_override={params}  


In [None]:
#from google.cloud.aiplatform.gapic import \
#    JobServiceClient

In [None]:
# Re-initialize the Vertex SDK with a different staging bucket than the one set
# near the top of the notebook.
STAGING_BUCKET = f'gs://jk-vertex-{REGION}'

aiplatform.init(
    project=PROJECT,
    location=REGION,
    staging_bucket=STAGING_BUCKET
)
# Echo the region as the cell output.
REGION

In [None]:
# Submit a CustomJob through the SDK using the current `job_name` and
# `worker_pool_specs` globals.
# NOTE(review): when the notebook is run top to bottom, both names still hold the
# data-preparation values (the PREPARE_DATA_* name and the dataprep pool) —
# confirm that re-running data preparation is what is intended here.
display_name = job_name

job = aiplatform.CustomJob(
    display_name=display_name,
    worker_pool_specs=worker_pool_specs,
)

job.run(sync=True)

# Parking lot

In [None]:
# Scratch: data preparation in a local container from MNLI files mounted at /data/MNLI.
# NOTE(review): MNLI_LOCAL_FOLDER is not defined in any cell visible in this notebook —
# define it before running this cell.
!docker run -it --rm --gpus all \
--volume {MNLI_LOCAL_FOLDER}/MNLI:/data/MNLI \
--env OUTPUT_DIR={OUTPUT_DIR} \
--env TASK=MNLI \
--env BERT_DIR={BERT_DIR} \
{TRAIN_IMAGE} \
'python models/official/nlp/data/create_finetuning_data.py \
 --input_data_dir=/data/MNLI \
 --vocab_file=${BERT_DIR}/vocab.txt \
 --train_data_output_path=${OUTPUT_DIR}/${TASK}/${TASK}_train.tf_record \
 --eval_data_output_path=${OUTPUT_DIR}/${TASK}/${TASK}_eval.tf_record \
 --meta_data_file_path=${OUTPUT_DIR}/${TASK}/${TASK}_meta_data \
 --fine_tuning_task_type=classification --max_seq_length=128 \
 --classification_task_name=${TASK}'

In [None]:
# Scratch: local-container run of the data-prep script with the same flags as
# the Vertex data-preparation job (explicit train/dev splits).
!docker run -it --rm --gpus all \
{TRAIN_IMAGE} \
dataprep/create_finetuning_data.py \
--fine_tuning_task_type=classification \
--tfds_params="dataset=glue/mnli,text_key=hypothesis,text_b_key=premise,train_split=train,dev_split=validation_matched" \
--max_seq_length=128 \
--vocab_file={BERT_DIR}/vocab.txt \
--meta_data_file_path={OUTPUT_DIR}/{TASK}/{TASK}_meta_data \
--train_data_output_path={OUTPUT_DIR}/{TASK}/{TASK}_train.tf_record \
--eval_data_output_path={OUTPUT_DIR}/{TASK}/{TASK}_eval.tf_record 



In [None]:
# Scratch: same local data-prep run, but without train_split/dev_split in --tfds_params.
!docker run -it --rm --gpus all \
{TRAIN_IMAGE} \
dataprep/create_finetuning_data.py \
--fine_tuning_task_type=classification \
--tfds_params="dataset=glue/mnli,text_key=hypothesis,text_b_key=premise" \
--max_seq_length=128 \
--vocab_file={BERT_DIR}/vocab.txt \
--meta_data_file_path={OUTPUT_DIR}/{TASK}/{TASK}_meta_data \
--train_data_output_path={OUTPUT_DIR}/{TASK}/{TASK}_train.tf_record \
--eval_data_output_path={OUTPUT_DIR}/{TASK}/{TASK}_eval.tf_record 

