In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# TPU node sandbox



## Environment Setup

In [33]:
import tensorflow as tf
import numpy as np
import pandas as pd
import json

import os
import tensorflow_datasets as tfds

import tensorflow_hub as hub
import tensorflow_text as text

from cloud_tpu_client import Client

from absl import app
from absl import flags
import gin

from official.common import distribute_utils
# pylint: disable=unused-import
from official.common import registry_imports
# pylint: enable=unused-import
from official.common import flags as tfm_flags
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.nlp import continuous_finetune_lib
from official.core import config_definitions
from official.core import exp_factory
from official.modeling import hyperparams

## Configure GCP settings

In [12]:
PROJECT = 'jk-mlops-dev'
REGION = 'us-central1'
TPU_NODE_NAME = 'jk-tpu-node'
ZONE = 'us-central1-a'
GCS_BUCKET = 'gs://jk-tpu-staging'

c = Client(tpu=TPU_NODE_NAME, zone=ZONE)

print(c.accelerator_type())
print(c.name())
print(c.state())
print(c.runtime_version())

v3-8
jk-tpu-node
READY
2.5.0


In [13]:
c.configure_tpu_version(tf.__version__, restart_type='ifNeeded')
c.wait_for_healthy()



Note: The TPU initialization code has to be at the beginning of your program.

In [14]:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_NODE_NAME)
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))

2021-09-11 18:57:09.250576: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.122.28.50:8470}
2021-09-11 18:57:09.250641: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:33496}
2021-09-11 18:57:09.255275: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.122.28.50:8470}
2021-09-11 18:57:09.255350: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:33496}






INFO:tensorflow:Initializing the TPU system: jk-tpu-node


INFO:tensorflow:Initializing the TPU system: jk-tpu-node


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU')]


In [49]:
DATASET_LOCATION = "gs://cloud-samples-data/vertex-ai/community-content/datasets/MNLI"
TRAIN_FILE = f"{DATASET_LOCATION}/mnli_train.tf_record"
EVAL_FILE = f"{DATASET_LOCATION}/mnli_valid.tf_record"
METADATA_FILE = f"{DATASET_LOCATION}/metadata.json"

CONFIG_FILE = 'glue_mnli_matched.yaml'
EXPERIMENT = 'bert/sentence_prediction'
HUB_MODULE_URL = 'gs://tfhub-modules/tensorflow/bert_en_cased_L-24_H-1024_A-16/4/uncompressed'
INIT_CHECKPOINT = 'gs://jk-tpu-staging/uncased_L-12-H-768_A-12'
LOGGING_DIR = f'{GCS_BUCKET}/job'
MODEL_DIR = f'{GCS_BUCKET}/model'

In [50]:
%%writefile {CONFIG_FILE}

task:
  hub_module_url: ''
  model:
    num_classes: 3
  init_checkpoint: ''
  metric_type: 'accuracy'
  train_data:
    drop_remainder: true
    global_batch_size: 32
    input_path: ''
    is_training: true
    seq_length: 128
    label_type: 'int'
  validation_data:
    drop_remainder: false
    global_batch_size: 32
    input_path: ''
    is_training: false
    seq_length: 128
    label_type: 'int'
trainer:
  checkpoint_interval: 3000
  optimizer_config:
    learning_rate:
      polynomial:
        # 100% of train_steps.
        decay_steps: 36813
        end_learning_rate: 0.0
        initial_learning_rate: 3.0e-05
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        power: 1
        # ~10% of train_steps.
        warmup_steps: 3681
      type: polynomial
  steps_per_loop: 1000
  summary_interval: 1000
  # Training data size 392,702 examples, 3 epochs.
  train_steps: 36813
  validation_interval: 6135
  # Eval data size = 9815 examples.
  validation_steps: 307
  best_checkpoint_export_subdir: 'best_ckpt'
  best_checkpoint_eval_metric: 'cls_accuracy'
  best_checkpoint_metric_comp: 'higher'
runtime:
  distribution_strategy: 'multi_worker_mirrored'
  all_reduce_alg: 'nccl'

Overwriting glue_mnli_matched.yaml


In [51]:

params = hyperparams.override_params_dict(params, CONFIG_FILE, is_strict=True)
params.override({
    'trainer': {
        'train_steps': 2000,
        'steps_per_loop': 100,
        'summary_interval': 100,
        'validation_interval': 2000,
        'checkpoint_interval': 2000,  
    },

    'task': {
        'init_checkpoint': INIT_CHECKPOINT,
        'train_data': {
            'global_batch_size': 128, 
            'input_path': TRAIN_FILE, 
        },
    
        'validation_data': {
            'global_batch_size': 128, 
            'input_path': EVAL_FILE,
        },


    },

    'runtime': {
        'tpu': TPU_NODE_NAME,
        'distribution_strategy': 'tpu'
    }

})

In [52]:
params.task.model.encoder.bert

BertEncoderConfig(vocab_size=30522, hidden_size=1024, num_layers=12, num_attention_heads=16, hidden_activation='gelu', intermediate_size=4096, dropout_rate=0.1, attention_dropout_rate=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, embedding_size=None, output_range=None, return_all_encoder_outputs=False)

In [53]:
distribution_strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        tpu_address=params.runtime.tpu,
        **params.runtime.model_parallelism())

2021-09-11 19:18:41.451121: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.122.28.50:8470}
2021-09-11 19:18:41.451221: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:33496}
2021-09-11 19:18:41.457570: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.122.28.50:8470}
2021-09-11 19:18:41.457624: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:33496}






INFO:tensorflow:Initializing the TPU system: jk-tpu-node


INFO:tensorflow:Initializing the TPU system: jk-tpu-node


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


In [54]:
with distribution_strategy.scope():
      task = task_factory.get_task(params.task, logging_dir=LOGGING_DIR)


In [55]:
train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode='train',
        params=params,
        model_dir=MODEL_DIR)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
2021-09-11 19:18:56.152462: W ./tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h:57] Ignoring an error encountered when deleting remote tensors handles: Invalid argument: Unable to find the relevant tensor remote_handle: Op ID: 6362, Output num: 291
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1631387936.152299498","description":"Error received from peer ipv4:10.122.28.50:8470","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Unable to find the relevant tensor remote_handle: Op ID: 6362, Output num: 291","grpc_status":3}


restoring or initializing model...
initialized model.
train | step:      0 | training until step 2000...


