# Distributed training with Vertex Reduction server

In [1]:
import copy
import gin
import json
import os
import pprint
import requests
import tensorflow as tf

from deepdiff import DeepDiff

from official.common import distribute_utils
from official.common import registry_imports
from official.common import flags as tfm_flags
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance

from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib
from official.nlp.data import sentence_retrieval_lib
from official.nlp.data import squad_lib as squad_lib_wp
from official.nlp.data import squad_lib_sp
from official.nlp.data import tagging_data_lib

from official.core import base_task
from official.core import base_trainer
from official.core import config_definitions
from official.core import exp_factory
from official.modeling import hyperparams

## Set environment constants

In [2]:
# Environment-specific settings; change these before running the notebook elsewhere.
# presumably the Google Cloud project ID — it is not referenced by the cells shown here.
PROJECT_ID = 'jk-mlops-dev'
# GCS prefix under which the MNLI TFRecord files are stored (see output_location).
GCS_LOCATION = 'gs://jk-vertex-demos'
# GCS folder of the BERT checkpoint; its vocab.txt is used for tokenization.
BERT_DIR = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16'

## Prepare data

In [None]:
def generate_mnli_tfrecords(output_gcs_location, vocab_file, mnli_type='matched',
                            max_seq_length=128, do_lower_case=True):
    """Writes the GLUE MNLI train and validation splits as TFRecord files.

    The records are produced with `classifier_data_lib`, i.e. in the format
    used by the TensorFlow NLP Modelling Toolkit. A JSON file with the
    metadata returned by the conversion is written next to them.

    Args:
        output_gcs_location: Path prefix under which `mnli_train.tf_record`,
            `mnli_valid.tf_record` and `metadata.json` are created.
        vocab_file: Vocabulary file handed to `tokenization.FullTokenizer`.
        mnli_type: `'matched'` selects the `validation_matched` dev split;
            any other value selects `validation_mismatched`.
        max_seq_length: Forwarded to `generate_tf_record_from_data_file`.
        do_lower_case: Forwarded to `tokenization.FullTokenizer`.

    Returns:
        A tuple `(train_path, eval_path, metadata_path)` with the three
        output locations.
    """
    # Only the dev split differs between the two MNLI flavours.
    if mnli_type == 'matched':
        dev_split = 'validation_matched'
    else:
        dev_split = 'validation_mismatched'
    tfds_spec = ('dataset=glue/mnli,text_key=hypothesis,text_b_key=premise,'
                 f'train_split=train,dev_split={dev_split}')

    output_paths = tuple(
        f'{output_gcs_location}/{file_name}'
        for file_name in ('mnli_train.tf_record',
                          'mnli_valid.tf_record',
                          'metadata.json'))
    train_path, eval_path, metadata_path = output_paths

    wordpiece_tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    mnli_processor = classifier_data_lib.TfdsProcessor(
        tfds_params=tfds_spec,
        process_text_fn=tokenization.convert_to_unicode)

    record_metadata = classifier_data_lib.generate_tf_record_from_data_file(
        mnli_processor,
        None,  # no data directory is passed; presumably TfdsProcessor reads from TFDS — confirm
        wordpiece_tokenizer,
        train_data_output_path=train_path,
        eval_data_output_path=eval_path,
        max_seq_length=max_seq_length)

    # Persist the conversion metadata alongside the records.
    with tf.io.gfile.GFile(metadata_path, "w") as metadata_writer:
        metadata_writer.write(json.dumps(record_metadata, indent=4) + "\n")

    return output_paths



In [None]:
# vocab.txt under BERT_DIR drives tokenization; the TFRecords go under GCS_LOCATION.
vocab_file = f'{BERT_DIR}/vocab.txt'
output_location = f'{GCS_LOCATION}/datasets/MNLI'

# Uses the function defaults: matched dev split, max_seq_length=128, lower-casing on.
train_file, eval_file, metadata_file = generate_mnli_tfrecords(
    output_gcs_location=output_location,
    vocab_file=vocab_file)

## Train

In [3]:
# Point at the TFRecords produced by the "Prepare data" step without re-running it.
# The paths are derived from GCS_LOCATION so that changing the bucket in the
# constants cell keeps data preparation and training in sync.
train_file = f'{GCS_LOCATION}/datasets/MNLI/mnli_train.tf_record'
eval_file = f'{GCS_LOCATION}/datasets/MNLI/mnli_valid.tf_record'

In [4]:
# Local directory passed to get_task (as logging_dir) and run_experiment (as model_dir).
# NOTE(review): the recorded run restored ckpt-25540 from this directory, so a
# previous run's checkpoints are picked up; presumably it must be emptied to
# train from scratch — confirm.
model_dir = '/tmp/models/bert_mnli'

### Retrieve the default configuration for sentence prediction

In [5]:
# Name of the experiment whose configuration is looked up through exp_factory.
experiment = 'bert/sentence_prediction'

# Default configuration for that experiment; later cells override parts of it
# and diff the result against a copy of these defaults.
params = exp_factory.get_exp_config(experiment)

### Display the default configuration

In [6]:
pp = pprint.PrettyPrinter()

# pprint() writes the same text that print(pformat(...)) would.
pp.pprint(params.as_dict())

{'runtime': {'all_reduce_alg': None,
             'batchnorm_spatial_persistent': False,
             'dataset_num_private_threads': None,
             'default_shard_dim': -1,
             'distribution_strategy': 'mirrored',
             'enable_xla': False,
             'gpu_thread_mode': None,
             'loss_scale': None,
             'mixed_precision_dtype': None,
             'num_cores_per_replica': 1,
             'num_gpus': 0,
             'num_packs': 1,
             'per_gpu_thread_count': 0,
             'run_eagerly': False,
             'task_index': -1,
             'tpu': None,
             'tpu_enable_xla_dynamic_padder': None,
             'worker_hosts': None},
 'task': {'hub_module_url': '',
          'init_checkpoint': '',
          'init_cls_pooler': False,
          'metric_type': 'accuracy',
          'model': {'encoder': {'bert': {'attention_dropout_rate': 0.1,
                                         'dropout_rate': 0.1,
                                  

In [7]:
# Keep an independent copy of the defaults so the overridden config can be
# diffed against them with DeepDiff further down.
default_params = copy.deepcopy(params)

### Fine-tune the configuration for the GLUE MNLI matched experiment

In [8]:
# Values that occur in more than one config entry are named once, so entries
# that are meant to agree (e.g. train_steps and the learning-rate decay_steps)
# cannot drift apart when one of them is edited.
SEQ_LENGTH = 128  # same value as the max_seq_length default of generate_mnli_tfrecords
GLOBAL_BATCH_SIZE = 32  # used for both the training and the validation data
TRAIN_STEPS = 36813
WARMUP_STEPS = TRAIN_STEPS // 10  # == 3681: warm up over the first tenth of training

updated_params = {
    'task': {
        'hub_module_url': 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4',
        'train_data': {
            'input_path': train_file,
            'seq_length': SEQ_LENGTH,
            'global_batch_size': GLOBAL_BATCH_SIZE
        },
        'validation_data': {
            'input_path': eval_file,
            'seq_length': SEQ_LENGTH,
            'global_batch_size': GLOBAL_BATCH_SIZE
        },
        'model': {
            'num_classes': 3
        }
    },

    'trainer': {
        'train_steps': TRAIN_STEPS,
        'validation_steps': 307,
        'validation_interval': 6135,
        'checkpoint_interval': 3000,
        # Export the checkpoint with the highest cls_accuracy under best_ckpt.
        'best_checkpoint_export_subdir': 'best_ckpt',
        'best_checkpoint_eval_metric': 'cls_accuracy',
        'best_checkpoint_metric_comp': 'higher',
        'optimizer_config': {
            'learning_rate': {
                'polynomial': {
                    # The polynomial decay spans the whole training run.
                    'decay_steps': TRAIN_STEPS
                }
            },
            'warmup': {
                'polynomial': {
                    'warmup_steps': WARMUP_STEPS
                }
            }
        }
    },

    'runtime': {
        'num_gpus': 1,
        'distribution_strategy': 'mirrored'
    }
}

# Applies the overrides to `params` in place.
# is_strict=True presumably rejects keys that are absent from the default
# config — confirm against hyperparams.Config.override.
params.override(updated_params, is_strict=True)

### Show the differences from the default configuration

In [9]:
# Diff the saved defaults against the overridden config; being the last
# expression of the cell, the result is displayed as the cell output.
DeepDiff(default_params, params)

{'type_changes': {'root.trainer.optimizer_config.learning_rate.polynomial.decay_steps': {'old_type': NoneType,
   'new_type': int,
   'old_value': None,
   'new_value': 36813},
  'root.trainer.optimizer_config.warmup.polynomial.warmup_steps': {'old_type': NoneType,
   'new_type': int,
   'old_value': None,
   'new_value': 3681}},
 'values_changed': {'root.task.model.num_classes': {'new_value': 3,
   'old_value': 0},
  'root.task.train_data.input_path': {'new_value': 'gs://jk-vertex-demos/datasets/MNLI/mnli_train.tf_record',
   'old_value': ''},
  'root.task.validation_data.input_path': {'new_value': 'gs://jk-vertex-demos/datasets/MNLI/mnli_valid.tf_record',
   'old_value': ''},
  'root.task.hub_module_url': {'new_value': 'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4',
   'old_value': ''},
  'root.trainer.checkpoint_interval': {'new_value': 3000, 'old_value': 1000},
  'root.trainer.train_steps': {'new_value': 36813, 'old_value': 0},
  'root.trainer.validation_steps': 

### Run the experiment

In [10]:
# Build the distribution strategy from the runtime section of the config.
runtime_config = params.runtime
distribution_strategy = distribute_utils.get_distribution_strategy(
    distribution_strategy=runtime_config.distribution_strategy,
    all_reduce_alg=runtime_config.all_reduce_alg,
    num_gpus=runtime_config.num_gpus,
)





INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [11]:
# Create the task from the task section of the config while the distribution
# strategy's scope is active; model_dir doubles as the task's logging directory.
with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)


In [12]:
# Run training with periodic evaluation, using model_dir for checkpoints.
# NOTE(review): the recorded output of this cell resumes from
# /tmp/models/bert_mnli/ckpt-25540 and then fails with ResourceExhaustedError
# (OOM allocating a [32,16,128,128] float tensor on GPU:0). Possibly the global
# batch size of 32 is too large for the GPU used — confirm and re-run before
# sharing the notebook.
train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode='train_and_eval',
      params=params,
      model_dir=model_dir)

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


restoring or initializing model...
restored model from /tmp/models/bert_mnli/ckpt-25540.
restored from checkpoint: /tmp/models/bert_mnli/ckpt-25540
train | step:  25540 | training until step 31675...


ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted:  OOM when allocating tensor with shape[32,16,128,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node while/body/_1/bert_classifier/model/keras_layer/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/bert_encoder/StatefulPartitionedCall/transformer/layer_14/self_attention/einsum/Einsum}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Func/while/body/_1/output_control_node/_4833/_47]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted:  OOM when allocating tensor with shape[32,16,128,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node while/body/_1/bert_classifier/model/keras_layer/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/bert_encoder/StatefulPartitionedCall/transformer/layer_14/self_attention/einsum/Einsum}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored. [Op:__inference_loop_fn_131206]

Function call stack:
loop_fn -> loop_fn
