In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# TPU node sandbox



## Environment Setup

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import json

import os
import tensorflow_datasets as tfds

import tensorflow_hub as hub
import tensorflow_text as text

from cloud_tpu_client import Client

from absl import app
from absl import flags
import gin

from official.common import distribute_utils
# pylint: disable=unused-import
from official.common import registry_imports
# pylint: enable=unused-import
from official.common import flags as tfm_flags
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.nlp import continuous_finetune_lib
from official.core import config_definitions
from official.core import exp_factory
from official.modeling import hyperparams

## Configure GCP settings

In [4]:
PROJECT = 'jk-mlops-dev'
REGION = 'us-central1'
TPU_NODE_NAME = 'jk-tpu-node'
ZONE = 'us-central1-a'
GCS_BUCKET = 'gs://jk-tpu-staging'

c = Client(tpu=TPU_NODE_NAME, zone=ZONE)

print(c.accelerator_type())
print(c.name())
print(c.state())
print(c.runtime_version())

v3-8
jk-tpu-node
READY
2.5.0


In [5]:
c.configure_tpu_version(tf.__version__, restart_type='ifNeeded')
c.wait_for_healthy()



Note: The TPU initialization code has to be at the beginning of your program.

In [6]:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_NODE_NAME)
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))

2021-09-13 20:05:17.051805: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-09-13 20:05:17.051894: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-09-13 20:05:17.051916: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jk-tpu-node-1): /proc/driver/nvidia/version does not exist


INFO:tensorflow:Initializing the TPU system: jk-tpu-node


2021-09-13 20:05:17.223667: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-13 20:05:17.238361: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.122.28.50:8470}
2021-09-13 20:05:17.238415: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:34994}
2021-09-13 20:05:17.255943: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.122.28.50:8470}
2021-09-13 20:05:17.255998: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:34994}
2021-09-

INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU')]


In [7]:
DATASET_LOCATION = "gs://cloud-samples-data/vertex-ai/community-content/datasets/MNLI"
TRAIN_FILE = f"{DATASET_LOCATION}/mnli_train.tf_record"
EVAL_FILE = f"{DATASET_LOCATION}/mnli_valid.tf_record"
METADATA_FILE = f"{DATASET_LOCATION}/metadata.json"

CONFIG_FILE = 'glue_mnli_matched.yaml'
EXPERIMENT = 'bert/sentence_prediction'
HUB_MODULE_URL = 'gs://tfhub-modules/tensorflow/bert_en_cased_L-24_H-1024_A-16/4/uncompressed'

INIT_CHECKPOINT = 'gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12'
LOGGING_DIR = f'{GCS_BUCKET}/job'
MODEL_DIR = f'{GCS_BUCKET}/model'

In [8]:
%%writefile {CONFIG_FILE}

task:
  hub_module_url: ''
  model:
    num_classes: 3
  init_checkpoint: ''
  metric_type: 'accuracy'
  train_data:
    drop_remainder: true
    global_batch_size: 32
    input_path: ''
    is_training: true
    seq_length: 128
    label_type: 'int'
  validation_data:
    drop_remainder: false
    global_batch_size: 32
    input_path: ''
    is_training: false
    seq_length: 128
    label_type: 'int'
trainer:
  checkpoint_interval: 3000
  optimizer_config:
    learning_rate:
      polynomial:
        # 100% of train_steps.
        decay_steps: 36813
        end_learning_rate: 0.0
        initial_learning_rate: 3.0e-05
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        power: 1
        # ~10% of train_steps.
        warmup_steps: 3681
      type: polynomial
  steps_per_loop: 1000
  summary_interval: 1000
  # Training data size 392,702 examples, 3 epochs.
  train_steps: 36813
  validation_interval: 6135
  # Eval data size = 9815 examples.
  validation_steps: 307
  best_checkpoint_export_subdir: 'best_ckpt'
  best_checkpoint_eval_metric: 'cls_accuracy'
  best_checkpoint_metric_comp: 'higher'


Overwriting glue_mnli_matched.yaml


In [9]:
params = exp_factory.get_exp_config(EXPERIMENT)
params = hyperparams.override_params_dict(params, CONFIG_FILE, is_strict=True)
params.override({
    'trainer': {
        'train_steps': 2000,
        'steps_per_loop': 100,
        'summary_interval': 100,
        'validation_interval': 2000,
        'checkpoint_interval': 2000,  
    },

    'task': {
        'init_checkpoint': INIT_CHECKPOINT,
        'train_data': {
            'global_batch_size': 256, 
            'input_path': TRAIN_FILE, 
        },
    
        'validation_data': {
            'global_batch_size': 256, 
            'input_path': EVAL_FILE,
        },


    },

    'runtime': {
        'tpu': TPU_NODE_NAME,
        'distribution_strategy': 'tpu'
    }

})

In [10]:
params.task.model.encoder.bert

BertEncoderConfig(vocab_size=30522, hidden_size=768, num_layers=12, num_attention_heads=12, hidden_activation='gelu', intermediate_size=3072, dropout_rate=0.1, attention_dropout_rate=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, embedding_size=None, output_range=None, return_all_encoder_outputs=False)

In [11]:
distribution_strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        tpu_address=params.runtime.tpu,
        **params.runtime.model_parallelism())



2021-09-13 20:05:40.975600: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.122.28.50:8470}
2021-09-13 20:05:40.975736: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:34994}
2021-09-13 20:05:40.979695: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.122.28.50:8470}
2021-09-13 20:05:40.979744: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:34994}


INFO:tensorflow:Initializing the TPU system: jk-tpu-node


INFO:tensorflow:Initializing the TPU system: jk-tpu-node


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


In [12]:
params.task

SentencePredictionConfig(init_checkpoint='gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12', model=ModelConfig(num_classes=3, use_encoder_pooler=False, encoder=EncoderConfig(type='bert', albert=AlbertEncoderConfig(vocab_size=30000, embedding_width=128, hidden_size=768, num_layers=12, num_attention_heads=12, hidden_activation='gelu', intermediate_size=3072, dropout_rate=0.0, attention_dropout_rate=0.0, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02), bert=BertEncoderConfig(vocab_size=30522, hidden_size=768, num_layers=12, num_attention_heads=12, hidden_activation='gelu', intermediate_size=3072, dropout_rate=0.1, attention_dropout_rate=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, embedding_size=None, output_range=None, return_all_encoder_outputs=False), bigbird=BigBirdEncoderConfig(vocab_size=50358, hidden_size=768, num_layers=12, num_attention_heads=12, hidden_activation='gelu', intermediate_size=3072, dropout_rate=0.1, atte

In [13]:
with distribution_strategy.scope():
      task = task_factory.get_task(params.task, logging_dir=LOGGING_DIR)


In [14]:
train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode='train',
        params=params,
        model_dir=MODEL_DIR)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


restoring or initializing model...
ERROR:tensorflow:Couldn't match files for checkpoint /usr/local/google/home/jacobdevlin/expts/bert_model_releases/uncased_L-12_H-768_A-12/bert_model.ckpt


ERROR:tensorflow:Couldn't match files for checkpoint /usr/local/google/home/jacobdevlin/expts/bert_model_releases/uncased_L-12_H-768_A-12/bert_model.ckpt


initialized model.
train | step:      0 | training until step 2000...








train | step:    100 | steps/sec:    1.6 | output: 
    {'cls_accuracy': 0.3380078,
     'learning_rate': 7.3350293e-07,
     'training_loss': 1.1024282}
saved checkpoint to gs://jk-tpu-staging/model/ckpt-100.
train | step:    200 | steps/sec:    4.3 | output: 
    {'cls_accuracy': 0.34816405,
     'learning_rate': 1.4670059e-06,
     'training_loss': 1.0992756}
train | step:    300 | steps/sec:   12.9 | output: 
    {'cls_accuracy': 0.35996094,
     'learning_rate': 2.2005088e-06,
     'training_loss': 1.0958779}
train | step:    400 | steps/sec:   13.0 | output: 
    {'cls_accuracy': 0.36558592,
     'learning_rate': 2.9340117e-06,
     'training_loss': 1.0923172}
train | step:    500 | steps/sec:   13.2 | output: 
    {'cls_accuracy': 0.3769922,
     'learning_rate': 3.6675146e-06,
     'training_loss': 1.0896847}
train | step:    600 | steps/sec:   13.1 | output: 
    {'cls_accuracy': 0.38214844,
     'learning_rate': 4.4010176e-06,
     'training_loss': 1.0867115}
train | step:   

(<official.nlp.modeling.models.bert_classifier.BertClassifier at 0x7f23202d3350>,
 {})