In [1]:
### This notebook is intended to build a trainer
### using GPT-2 and the CNN/DailyMail dataset

In [2]:
!nvidia-smi

Wed Jun 30 03:35:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.126.02   Driver Version: 418.126.02   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   38C    P0    45W / 300W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:8A:00.0 Off |                    0 |
| N/A   36C    P0    44W / 300W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------

In [3]:
import sys
sys.path.append("/home/USER/TF_NEW/tf-transformers/src/")

In [4]:
import tensorflow as tf
import tqdm
import time

import functools
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf

In [5]:
from datasets import load_from_disk
from transformers import GPT2TokenizerFast

from tf_transformers.data.utils import separate_x_y, auto_batch
from tf_transformers.losses import cross_entropy_loss
from tf_transformers.optimization import create_optimizer
from tf_transformers.models import GPT2Model as Model

In [6]:
# Compose the hydra configuration from `confs/config.yaml` (the path is given to
# `initialize` relative to this notebook) and print it. `cfg` is read by
# `get_dataset` further down.
with initialize(config_path="confs"):
    cfg = compose(config_name='config.yaml')
    print(cfg)

{'dataset': {'take_sample': True, 'name': 'cnn_dailymail', 'batch_size': 32, 'max_seq_length': 512, 'max_target_length': 64, 'train_columns': ['input_ids', 'labels', 'labels_mask'], 'val_columns': ['input_ids', 'labels', 'labels_mask', 'highlights'], 'x_keys': ['input_ids'], 'y_keys': ['labels', 'labels_mask', 'highlights'], 'src_column_name': 'article', 'target_column_name': 'highlights'}, 'model': {'name': 'gpt2'}, 'text_generation': {'max_iterations': 64, 'mode': 'greedy'}}


In [7]:
#### Load Tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# GPT-2 has no dedicated padding token, so the end-of-text token is reused.
# `pad_token` expects the token *string*; assigning the integer id
# (`eos_token_id`) is rejected by recent transformers releases and only resolves
# to the right `pad_token_id` by accident on older ones.
# `get_dataset` reads `tokenizer.pad_token_id` when padding the ragged columns.
tokenizer.pad_token = tokenizer.eos_token

In [8]:
### Load GPT2 Model

# model , model_config = GPT2Model.from_pretrained(model_name="gpt2")

In [8]:
#### Load CNN daily mail dataset
# NOTE(review): absolute, machine-specific path; the notebook only re-runs where
# this directory exists. `dataset` is later indexed with 'train' and 'validation'.
dataset = load_from_disk("/home/jovyan/PRE_MODELS/HuggingFace_models/datasets/cnn_dailymail/")

In [9]:

def map_tokenizer(e,
                  src_column_name,
                  target_column_name,
                  max_seq_length,
                  max_target_length):
    """Tokenize the source and target text columns of `e`.

    Uses the module-level `tokenizer` with truncation and without padding.
    The token ids are written back into `e` under 'input_ids' (source) and
    'target_input_ids' (target), and the same `e` object is returned.
    """
    shared_kwargs = dict(truncation=True, padding=False)
    source_encoding = tokenizer(e[src_column_name], max_length=max_seq_length, **shared_kwargs)
    target_encoding = tokenizer(e[target_column_name], max_length=max_target_length, **shared_kwargs)
    e.update(
        {
            'input_ids': source_encoding['input_ids'],
            'target_input_ids': target_encoding['input_ids'],
        }
    )
    return e

def get_dataset(dataset, tokenizer, mode, cfg):
    """Tokenize a HuggingFace dataset split and turn it into a batched tf.data pipeline.

    Steps: optionally sub-sample the split, tokenize source/target text with
    `map_tokenizer`, concatenate source and target ids into one sequence, derive
    next-token `labels` (ids shifted by one) plus an all-ones `labels_mask`,
    pad ragged integer columns to `max_seq_length + max_target_length`, and
    batch with `auto_batch`.

    Args:
        dataset: HuggingFace dataset split (must support `select`, `map`,
            `set_format`, `len` and column indexing).
        tokenizer: Only `tokenizer.pad_token_id` is read here (padding value).
            Tokenization itself goes through `map_tokenizer`, which uses the
            module-level `tokenizer`.
        mode: 'train' (uses `cfg.dataset.train_columns`, shuffles) or
            'val' (uses `cfg.dataset.val_columns`, no shuffling).
        cfg: Config object exposing the `cfg.dataset.*` fields read below.

    Returns:
        Whatever `auto_batch` returns for the assembled feature dict.

    Raises:
        ValueError: If `mode` is neither 'train' nor 'val'.
    """
    take_sample = cfg.dataset.take_sample
    batch_size = cfg.dataset.batch_size
    x_keys = cfg.dataset.x_keys
    y_keys = cfg.dataset.y_keys
    src_column_name = cfg.dataset.src_column_name
    target_column_name = cfg.dataset.target_column_name
    max_seq_length = cfg.dataset.max_seq_length
    max_target_length = cfg.dataset.max_target_length

    if mode == 'train':
        COLUMNS = cfg.dataset.train_columns
        shuffle = True
    elif mode == 'val':
        COLUMNS = cfg.dataset.val_columns
        shuffle = False
    else:
        # Previously an unknown mode left COLUMNS/shuffle unbound and failed
        # much later with a NameError.
        raise ValueError("Unknown mode {!r}; expected 'train' or 'val'".format(mode))

    if take_sample:
        # Cap at the split size so small splits do not raise an index error.
        dataset = dataset.select(range(min(5000, len(dataset))))

    # Batched is better here
    fn_kwargs = {
        'src_column_name': src_column_name,
        'target_column_name': target_column_name,
        'max_seq_length': max_seq_length,
        'max_target_length': max_target_length,
    }
    dataset = dataset.map(map_tokenizer, batched=True, fn_kwargs=fn_kwargs)
    # Merge it together (for Encoder only models)
    dataset = dataset.map(lambda x: {"input_ids": x["input_ids"] + x["target_input_ids"]})
    # Shift by one position: the model sees ids[:-1] and predicts ids[1:].
    dataset = dataset.map(
        lambda x: {
            "input_ids": x["input_ids"][:-1],
            "labels": x["input_ids"][1:],
            "labels_mask": [1] * len(x["input_ids"][1:]),
        }
    )
    # HF dataset to tf dataset
    dataset.set_format(type="tensorflow", columns=COLUMNS)
    padded_length = max_seq_length + max_target_length
    features = {}
    for x in COLUMNS:
        # Read the column once; every `dataset[x]` access re-materialises it.
        column = dataset[x]
        if isinstance(column, tf.RaggedTensor):
            if column.dtype in [tf.int32, tf.int64]:
                # Padded positions of labels_mask must be 0; other integer
                # columns are padded with the tokenizer's pad id.
                pad_value = 0 if x == 'labels_mask' else tokenizer.pad_token_id
                features[x] = tf.cast(column, dtype=tf.int32).to_tensor(
                    default_value=pad_value, shape=[None, padded_length]
                )
            # NOTE(review): ragged columns of any other dtype are skipped
            # (same as before) -- confirm this is intended.
        else:
            features[x] = column
    tfdataset = tf.data.Dataset.from_tensor_slices(features)
    tfdataset = auto_batch(tfdataset,
                           batch_size,
                           shuffle=shuffle,
                           x_keys=x_keys,
                           y_keys=y_keys)
    return tfdataset
    

In [10]:
# Build the training and validation pipelines from the corresponding splits.
# Both read their settings (columns, batch size, lengths) from `cfg`.
tfdataset =  get_dataset( dataset['train'], tokenizer, "train", cfg)
tfdataset_validation =  get_dataset( dataset['validation'], tokenizer, "val", cfg)

Loading cached processed dataset at /home/jovyan/PRE_MODELS/HuggingFace_models/datasets/cnn_dailymail/train/cache-958823b5499194f8.arrow
Loading cached processed dataset at /home/jovyan/PRE_MODELS/HuggingFace_models/datasets/cnn_dailymail/train/cache-39cffd77994cbea9.arrow
Loading cached processed dataset at /home/jovyan/PRE_MODELS/HuggingFace_models/datasets/cnn_dailymail/train/cache-cd7112db9e70814b.arrow
  return np.array(array, copy=False, **self.np_array_kwargs)


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [43]:
def get_model():
    """Load the pretrained "gpt2" model (with `return_layer=False`).

    `Model.from_pretrained` returns a (model, config) pair; only the model is
    handed back to the caller.
    """
    pretrained_model, _unused_config = Model.from_pretrained("gpt2", return_layer=False)
    return pretrained_model

def get_optimizer(init_lr=2e-05, num_train_steps=1000, num_warmup_steps=100):
    """Create the optimizer via `create_optimizer`.

    The defaults reproduce the previously hard-coded values, so calling this
    with no arguments (as the trainer's `optimizer_fn()` does) is unchanged.

    Args:
        init_lr: Forwarded to `create_optimizer(init_lr=...)`.
        num_train_steps: Forwarded to `create_optimizer(num_train_steps=...)`.
        num_warmup_steps: Forwarded to `create_optimizer(num_warmup_steps=...)`.

    Returns:
        The optimizer; the learning-rate function that `create_optimizer` also
        returns is discarded.
    """
    optimizer, _learning_rate_fn = create_optimizer(
        init_lr=init_lr,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
    )
    return optimizer

def lm_loss(y_true_dict, y_pred_dict):
    """Language-modelling loss wrapper.

    Passes `labels` / `labels_mask` from the label dict and `token_logits`
    from the model output dict to `cross_entropy_loss`, and wraps the result
    in a dict under the key "loss".
    """
    target_ids = y_true_dict['labels']
    predicted_logits = y_pred_dict['token_logits']
    weights = y_true_dict['labels_mask']
    return {
        "loss": cross_entropy_loss(labels=target_ids, logits=predicted_logits, label_weights=weights)
    }

In [50]:
import tensorflow as tf
import tqdm
from absl import logging

from tf_transformers.core import keras_utils
from tf_transformers.core.distribute_utils import get_distribution_strategy
from tf_transformers.core.performance_utils import (
    configure_optimizer,
    get_tf_dtype,
    is_float16,
    set_mixed_precision_policy,
)

# Rename the absl logger so records are emitted under the "trainer" name
# (visible as the `INFO:trainer:` prefix in the training output).
logging.get_absl_logger().name = "trainer"


def flat_metric_dict(metric_dict):
    """Turn {step: {metric_name: value}} into a column-oriented dict.

    The result has a 'steps' entry listing the keys of `metric_dict` in
    order, plus one list per metric name holding its values in the same
    iteration order.
    """
    flattened = {'steps': list(metric_dict.keys())}
    for step_metrics in metric_dict.values():
        for metric_name, metric_value in step_metrics.items():
            flattened.setdefault(metric_name, []).append(metric_value)
    return flattened


def save_model_checkpoints(model, overwrite_checkpoint_dir, model_checkpoint_dir, max_number_of_models):
    """Build a `tf.train.CheckpointManager` for `model`.

    Despite the name, nothing is saved here; the caller invokes `.save()` on
    the returned manager.

    Raises:
        FileExistsError: If `overwrite_checkpoint_dir` is falsy and
            `model_checkpoint_dir` already exists.
    """
    import os

    refuse_existing = not overwrite_checkpoint_dir
    if refuse_existing and os.path.exists(model_checkpoint_dir):
        raise FileExistsError("Model directory exists")

    model_checkpoint = tf.train.Checkpoint(model=model)
    return tf.train.CheckpointManager(
        model_checkpoint,
        directory=model_checkpoint_dir,
        max_to_keep=max_number_of_models,
    )


def get_loss_metric_dict(model, dataset, loss_fn, validation_dataset, validation_loss_fn):
    """Create one `tf.keras.metrics.Mean` per loss name by probing a single batch.

    A single batch is pushed through `model` and the loss function to discover
    which keys the loss dict contains; a Mean metric is created for each key.

    Args:
        model: Callable applied to the batch inputs.
        dataset: Training dataset yielding `(batch_inputs, batch_labels)`;
            must support `.take(1)`.
        loss_fn: Training loss; called as `loss_fn(batch_labels, model_outputs)`
            and expected to return a dict.
        validation_dataset: Optional validation dataset (same structure).
        validation_loss_fn: Optional validation loss (same call convention).

    Returns:
        `(training_loss_dict_metric, validation_loss_dict_metric)`. The training
        dict always contains an extra "learning_rate" metric. The validation
        dict is empty unless both validation arguments are supplied.
    """
    # Start empty so an empty training dataset does not leave the name unbound.
    training_loss_dict_metric = {}
    for (batch_inputs, batch_labels) in dataset.take(1):
        model_outputs = model(batch_inputs)
        train_loss_dict = loss_fn(batch_labels, model_outputs)
        training_loss_dict_metric = {name: tf.keras.metrics.Mean(name, dtype=tf.float32) for name in train_loss_dict}

    training_loss_dict_metric["learning_rate"] = tf.keras.metrics.Mean(
        "learning_rate", dtype=tf.float32
    )  # We store learning rate here and reset after every global steps

    validation_loss_dict_metric = {}
    if validation_dataset is not None and validation_loss_fn:
        # Probe the *validation* dataset: its labels (and therefore the keys
        # the validation loss returns) may differ from the training ones.
        for (batch_inputs, batch_labels) in validation_dataset.take(1):
            model_outputs = model(batch_inputs)
            valid_loss_dict = validation_loss_fn(batch_labels, model_outputs)
            validation_loss_dict_metric = {
                name: tf.keras.metrics.Mean(name, dtype=tf.float32) for name in valid_loss_dict
            }

    return training_loss_dict_metric, validation_loss_dict_metric


def get_and_reset_metric_from_dict(metric_dict):
    """Read every metric's current value, then reset all of them.

    Returns a `{name: metric.result().numpy()}` dict, or an empty dict when
    `metric_dict` is empty or None. All results are collected before any
    metric is reset.
    """
    if not metric_dict:
        return {}

    snapshot = {}
    for metric_name in metric_dict:
        snapshot[metric_name] = metric_dict[metric_name].result().numpy()

    for tracked_metric in metric_dict.values():
        tracked_metric.reset_states()
    return snapshot


def get_tensorboard_writers(model_checkpoint_dir):
    """Create the two summary writers used for TensorBoard logging.

    Returns a `(train_writer, dev_writer)` pair writing to
    `<model_checkpoint_dir>/logs/train` and `<model_checkpoint_dir>/logs/dev`.
    """
    make_writer = tf.summary.create_file_writer
    train_summary_writer = make_writer(model_checkpoint_dir + "/logs/train")
    test_summary_writer = make_writer(model_checkpoint_dir + "/logs/dev")
    return train_summary_writer, test_summary_writer


def write_metrics(metric_dict, writer, step):
    """Write each `name -> value` pair of `metric_dict` as a scalar summary.

    All scalars are written at `step` with `writer` installed as the default
    summary writer.
    """
    default_writer_ctx = writer.as_default()
    with default_writer_ctx:
        for metric_name in metric_dict:
            tf.summary.scalar(metric_name, metric_dict[metric_name], step=step)


def train_and_eval(
    model,
    optimizer,
    strategy,
    epochs,
    steps_per_epoch,
    steps_per_call,
    train_dataset_iter,
    train_loss_fn,
    training_loss_dict_metric,
    validation_dataset_distributed,
    validation_loss_fn,
    validation_loss_dict_metric,
    validation_interval_steps,
    mixed_precision,
    callbacks,
    callbacks_interval_steps,
    trainer_kwargs,
    checkpoint_manager,
    model_checkpoint_dir,
    model_save_interval_steps,
):
    """Run the custom training loop with optional validation, callbacks and checkpointing.

    Each outer iteration calls `do_train` once, which runs `steps_per_call`
    optimizer steps inside a single `tf.function`. An epoch therefore consists
    of `steps_per_epoch // steps_per_call` outer iterations; any remainder
    steps are not executed.

    Args:
        model: Model called as `model(batch_inputs)`.
        optimizer: Optimizer; may be a `LossScaleOptimizer` wrapper.
        strategy: Distribution strategy providing `run` and `reduce`.
        epochs: Number of epochs to run.
        steps_per_epoch: Nominal number of optimizer steps per epoch.
        steps_per_call: Optimizer steps executed per `do_train` call.
        train_dataset_iter: Iterator yielding `(batch_inputs, batch_labels)`.
        train_loss_fn: Called as `fn(batch_labels, model_outputs)`; must return
            a dict containing the key "loss", which is the value differentiated.
        training_loss_dict_metric: Dict of metrics keyed by loss name plus
            "learning_rate"; updated every step, read and reset every call.
        validation_dataset_distributed: Optional iterable of validation batches.
        validation_loss_fn: Optional validation loss (same call convention).
        validation_loss_dict_metric: Dict of metrics keyed by validation loss name.
        validation_interval_steps: If set, validate whenever the global step is
            a multiple of it (in addition to the end of every epoch).
        mixed_precision: Whether loss scaling should be applied; it is only
            applied when `optimizer` is also a `LossScaleOptimizer`.
        callbacks: Optional list of callables invoked as `callback(trainer_kwargs)`.
        callbacks_interval_steps: Optional per-callback step intervals.
        trainer_kwargs: Object handed to each callback.
        checkpoint_manager: Object whose `.save()` writes a checkpoint.
        model_checkpoint_dir: Base directory for the TensorBoard logs.
        model_save_interval_steps: If set, checkpoint whenever the global step
            is a multiple of it (in addition to the end of every epoch).

    Returns:
        `(training_history, validation_history, callback_scores)`. The two
        histories are flattened with `flat_metric_dict`. Only step-interval
        validation results are recorded in `validation_history`.
        `callback_scores` holds the scores of the last callback round, or None.
    """
    is_loss_scale_optimizer = isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer)
    # Scale the loss only when it was requested AND the optimizer can do it.
    # Previously the scaled loss was created under the isinstance check but
    # consumed under the `mixed_precision` flag, so a mismatch raised NameError.
    use_loss_scaling = bool(mixed_precision) and is_loss_scale_optimizer

    def save_model(epoch_end=False):
        """Checkpoint at step intervals, or unconditionally at epoch end."""
        if not epoch_end:
            if model_save_interval_steps:
                if global_step % model_save_interval_steps == 0:
                    checkpoint_manager.save()
                    logging.info("Model saved at step {}".format(global_step))
        else:
            checkpoint_manager.save()
            logging.info("Model saved at epoch {}".format(epoch))

    # Train Functions
    @tf.function
    def do_train(iterator):
        """Run `steps_per_call` training steps and update the training metrics."""

        def train_step(dist_inputs):
            """The computation to run on each device."""
            batch_inputs, batch_labels = dist_inputs
            with tf.GradientTape() as tape:
                model_outputs = model(batch_inputs)
                loss = train_loss_fn(batch_labels, model_outputs)
                if use_loss_scaling:
                    # Only "loss" is differentiated, so only it needs scaling.
                    scaled_loss = optimizer.get_scaled_loss(loss["loss"])
                # TODO
                # Scales down the loss for gradients to be invariant from replicas.
                # loss = loss / strategy.num_replicas_in_sync
            if use_loss_scaling:
                scaled_gradients = tape.gradient(scaled_loss, model.trainable_variables)
                grads = optimizer.get_unscaled_gradients(scaled_gradients)
            else:
                grads = tape.gradient(loss["loss"], model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            # training_loss.update_state(loss * strategy.num_replicas_in_sync)
            return loss

        for _ in tf.range(tf.convert_to_tensor(steps_per_call)):
            dist_inputs = next(iterator)
            loss = strategy.run(train_step, args=(dist_inputs,))
            # strategy reduce
            loss = {
                name: strategy.reduce(tf.distribute.ReduceOp.MEAN, loss_value, axis=None)
                for name, loss_value in loss.items()
            }
            for name, loss_value in loss.items():
                training_loss = training_loss_dict_metric[name]
                training_loss.update_state(loss_value)
            # Get current learning rate (the wrapper keeps the real optimizer
            # in `_optimizer`).
            if is_loss_scale_optimizer:
                current_lr = optimizer._optimizer._decayed_lr(tf.float32)
            else:
                current_lr = optimizer._decayed_lr(tf.float32)
            training_loss_dict_metric["learning_rate"].update_state(current_lr)
            # training_result = get_and_reset_metric_from_dict(training_loss_dict_metric)

    # Defined once so the traced graph is reused across validation runs;
    # building a fresh tf.function per validation call forces a retrace each time.
    @tf.function
    def _validate_step(dist_inputs):
        """Forward pass plus validation loss for one batch."""
        batch_inputs, batch_labels = dist_inputs
        model_outputs = model(batch_inputs)
        loss = validation_loss_fn(batch_labels, model_outputs)
        return loss

    def _run_validation_pass(validation_batches):
        """Iterate once over the validation batches; return and reset the metrics."""
        with tqdm.tqdm(validation_batches, unit=" Val batch ") as val_batches:
            for dist_inputs in val_batches:
                loss = strategy.run(_validate_step, args=(dist_inputs,))
                for name, loss_value in loss.items():
                    loss_value = strategy.reduce(tf.distribute.ReduceOp.MEAN, loss_value, axis=None)
                    validation_loss = validation_loss_dict_metric[name]
                    validation_loss.update_state(loss_value)
        return get_and_reset_metric_from_dict(validation_loss_dict_metric)

    # do validation
    def do_validation(validation_dataset_distributed):
        """Validate at step intervals, or unconditionally at epoch end."""
        if not epoch_end:
            if (
                validation_dataset_distributed
                and validation_loss_fn
                and validation_interval_steps
                and (global_step % validation_interval_steps == 0)
            ):
                logging.info("Validation in progress at step {} . . . .".format(global_step))
                validation_result = _run_validation_pass(validation_dataset_distributed)
                validation_history[global_step] = validation_result
                write_metrics(validation_result, val_summary_writer, global_step)
                # The message used to format the result where the step was meant.
                logging.info("Validation result at step {} is {}".format(global_step, validation_result))
                print("\n")
        else:
            if validation_dataset_distributed and validation_loss_fn:
                logging.info("Validation in progress at epoch end {} . . . .".format(epoch))
                validation_result = _run_validation_pass(validation_dataset_distributed)
                write_metrics(validation_result, val_summary_writer, global_step)
                # validation_history[global_step] = validation_result
                logging.info("Validation result at epoch {} is {}".format(epoch, validation_result))
                print("\n")

    def do_callbacks(callbacks):
        """Invoke callbacks at their step intervals, or all of them at epoch end.

        Returns the list of scores (None for callbacks not due at this step),
        or None when nothing was eligible to run.
        """
        if not epoch_end:
            callback_scores = None
            if callbacks and callbacks_interval_steps:
                logging.info("Callbacks in progress at step {} . . . .".format(global_step))
                callback_scores = []
                for callback, callback_steps in zip(callbacks, callbacks_interval_steps):
                    if callback_steps and (global_step % callback_steps == 0):
                        score = callback(trainer_kwargs)
                        callback_scores.append(score)
                    else:
                        callback_scores.append(None)
            return callback_scores
        else:
            callback_scores = None
            if callbacks:
                logging.info("Callbacks in progress at epoch end {} . . . .".format(epoch))
                callback_scores = []
                for callback in callbacks:
                    score = callback(trainer_kwargs)
                    callback_scores.append(score)
            return callback_scores

    # Loop starts here
    # Get Tensorboard writers
    train_summary_writer, val_summary_writer = get_tensorboard_writers(model_checkpoint_dir)
    validation_history = {}
    training_history = {}
    # Bound up front so the return below is valid even when `epochs` is 0.
    callback_scores = None
    global_step = 0
    epoch_end = False
    # Number of do_train calls per epoch; remainder steps are dropped.
    STEPS = steps_per_epoch // steps_per_call
    for epoch in range(1, epochs + 1):
        # start_epoch_time = time.time()
        with tqdm.trange(STEPS, unit="batch ") as tepoch:
            for step in tepoch:
                steps_covered = (step + 1) * steps_per_call
                global_step += steps_per_call
                tepoch.set_description(
                    "Epoch {}/{} --- Step {}/{} --- ".format(epoch, epochs, steps_covered, steps_per_epoch)
                )
                # Call Train
                do_train(train_dataset_iter)

                # Call Validation
                do_validation(validation_dataset_distributed)

                # Call Callbacks
                callback_scores = do_callbacks(callbacks)

                # Train Metrics
                training_result = get_and_reset_metric_from_dict(training_loss_dict_metric)
                training_history[global_step] = training_result
                write_metrics(training_result, train_summary_writer, global_step)
                # training_result["learning_rate"] = learning_rate_holder.result().numpy()
                # learning_rate_holder.reset_states()
                tepoch.set_postfix(**training_result)

                # Save model
                save_model()

        # Do after every epoch
        epoch_end = True
        save_model(epoch_end)
        do_validation(validation_dataset_distributed)
        callback_scores = do_callbacks(callbacks)
        epoch_end = False

    # Flatten the results
    training_history = flat_metric_dict(training_history)
    validation_history = flat_metric_dict(validation_history)
    return training_history, validation_history, callback_scores


class TrainerNew:
    """Thin driver that builds model/optimizer under a distribution strategy and runs `train_and_eval`."""

    def __init__(
        self,
        distribution_strategy,
        num_gpus=0,
        all_reduce_alg=None,
        num_packs=1,
        tpu_address=None,
        dtype='fp32',
        loss_scale='dynamic',
    ):
        """Create the distribution strategy and set the precision policy.

        Args:
            distribution_strategy, num_gpus, all_reduce_alg, num_packs,
            tpu_address: Forwarded unchanged to `get_distribution_strategy`.
            dtype: Precision name converted with `get_tf_dtype`; the result is
                passed to `set_mixed_precision_policy`.
            loss_scale: Stored and later handed to `configure_optimizer`.
        """
        self.distribution_strategy = get_distribution_strategy(
            distribution_strategy=distribution_strategy,
            num_gpus=num_gpus,
            all_reduce_alg=all_reduce_alg,
            num_packs=num_packs,
            tpu_address=tpu_address,
        )

        self.num_replicas = self.distribution_strategy.num_replicas_in_sync
        self._dtype = get_tf_dtype(dtype)

        # Setting dtype policy
        set_mixed_precision_policy(self._dtype)
        self.use_float16 = is_float16(self._dtype)
        self.loss_scale = loss_scale

        # # TODO
        # if self.use_tpu:
        # params["num_replicas"] = self.distribution_strategy.num_replicas_in_sync
        # else:
        # logging.info("Running transformer with num_gpus = %d", num_gpus)

        # Add keras utils threads

    @property
    def use_tpu(self):
        """True when the strategy is a `tf.distribute.TPUStrategy`, else False."""
        if self.distribution_strategy:
            return isinstance(self.distribution_strategy, tf.distribute.TPUStrategy)
        return False

    def run(
        self,
        model_fn,
        optimizer_fn,
        train_dataset,
        train_loss_fn,
        epochs,
        steps_per_epoch,
        model_checkpoint_dir,
        validation_dataset=None,
        validation_loss_fn=None,
        validation_interval_steps=None,
        steps_per_call=100,
        enable_xla=True,
        callbacks=None,
        callbacks_interval_steps=None,
        overwrite_checkpoint_dir=False,
        max_number_of_models=10,
        model_save_interval_steps=None,
    ):
        """Build model and optimizer, distribute the datasets and train.

        Args:
            model_fn: Zero-argument callable returning the model; called inside
                the strategy scope.
            optimizer_fn: Zero-argument callable returning the optimizer; called
                inside the strategy scope and wrapped by `configure_optimizer`.
            train_dataset: Dataset of `(inputs, labels)` batches; repeated
                `epochs + 1` times before distribution.
            train_loss_fn: Training loss; see `train_and_eval`.
            epochs, steps_per_epoch, steps_per_call: Loop sizes. Each epoch runs
                `steps_per_epoch // steps_per_call` calls of `steps_per_call`
                steps, so a remainder is dropped.
            model_checkpoint_dir: Directory for checkpoints and TensorBoard logs.
            validation_dataset, validation_loss_fn, validation_interval_steps:
                Optional validation configuration.
            enable_xla: Passed to `keras_utils.set_session_config`.
            callbacks: Optional list of callables; each receives this method's
                `locals()` snapshot.
            callbacks_interval_steps: Optional per-callback step intervals. When
                given, it must have the same length as `callbacks`. When
                omitted, callbacks only run at the end of each epoch.
            overwrite_checkpoint_dir, max_number_of_models: Passed to
                `save_model_checkpoints`.
            model_save_interval_steps: Optional step interval for checkpoints.

        Returns:
            Dict with 'training_history', 'validation_history' and 'callbacks'.
            The misspelled key 'validation_hsitory' is kept as an alias of
            'validation_history' for existing callers.
        """
        if steps_per_epoch:
            logging.info("Make sure `steps_per_epoch` should be less than or equal to number of batches in dataset.")
            if steps_per_call and steps_per_epoch % steps_per_call:
                logging.warning(
                    "`steps_per_epoch` ({}) is not a multiple of `steps_per_call` ({}); "
                    "only {} steps run per epoch.".format(
                        steps_per_epoch, steps_per_call, (steps_per_epoch // steps_per_call) * steps_per_call
                    )
                )
        # Intervals are optional: without them the training loop only fires
        # callbacks at epoch end, so there is nothing to length-check.
        if callbacks and callbacks_interval_steps is not None:
            assert len(callbacks) == len(
                callbacks_interval_steps
            ), "`callbacks` and `callbacks_interval_steps` must have the same length"

        # Enable XLA
        keras_utils.set_session_config(enable_xla=enable_xla)
        logging.info("Policy: ----> {}".format(keras_utils.get_policy_name()))
        logging.info("Strategy: ---> {}".format(self.distribution_strategy))
        if self.use_tpu:
            logging.info("Num TPU Devices: ---> {}".format(self.distribution_strategy.num_replicas_in_sync))
        else:
            logging.info("Num GPU Devices: ---> {}".format(self.distribution_strategy.num_replicas_in_sync))

        # Under Strategy Scope
        with self.distribution_strategy.scope():
            # Model
            model = model_fn()

            # Optimizer
            optimizer = optimizer_fn()
            optimizer = configure_optimizer(optimizer, use_float16=self.use_float16, loss_scale=self.loss_scale)

        # Checkpoint manager
        checkpoint_manager = save_model_checkpoints(
            model, overwrite_checkpoint_dir, model_checkpoint_dir, max_number_of_models
        )

        # Get metric dicts before distributing the dataset
        # distributed datasets have no attribute .take
        logging.info("Inferring metric shapes . . . . .")
        training_loss_dict_metric, validation_loss_dict_metric = get_loss_metric_dict(
            model, train_dataset, train_loss_fn, validation_dataset, validation_loss_fn
        )
        # Distribute dataset
        train_dataset_distributed = self.distribution_strategy.experimental_distribute_dataset(
            train_dataset.repeat(epochs + 1)
        )
        validation_dataset_distributed = None
        if validation_dataset:
            validation_dataset_distributed = self.distribution_strategy.experimental_distribute_dataset(
                validation_dataset
            )

        # Make train dataset iterator
        train_dataset_distributed = iter(train_dataset_distributed)

        history = {}
        # NOTE: `locals()` is what callbacks receive as `trainer_kwargs`, so the
        # local variable names in this method are part of the callback contract.
        training_history, validation_history, callback_scores = train_and_eval(
            model,
            optimizer,
            self.distribution_strategy,
            epochs,
            steps_per_epoch,
            steps_per_call,
            train_dataset_distributed,
            train_loss_fn,
            training_loss_dict_metric,
            validation_dataset_distributed,
            validation_loss_fn,
            validation_loss_dict_metric,
            validation_interval_steps,
            self.use_float16,
            callbacks,
            callbacks_interval_steps,
            locals(),
            checkpoint_manager,
            model_checkpoint_dir,
            model_save_interval_steps,
        )
        history['training_history'] = training_history
        history['validation_history'] = validation_history
        # Misspelled key retained so existing callers keep working.
        history['validation_hsitory'] = validation_history
        history['callbacks'] = callback_scores
        return history


In [51]:
class SimpleCallback():
    """Minimal trainer callback that pretty-prints whatever it is called with."""

    def __init__(self):
        """No state to set up."""

    def __call__(self, trainer_kwargs):
        """Pretty-print `trainer_kwargs`; returns None."""
        from pprint import pprint as pretty_print

        pretty_print(trainer_kwargs)


simple_callback = SimpleCallback()

In [52]:
# Trainer on a mirrored strategy across 2 GPUs with NCCL all-reduce; "fp16"
# yields the `mixed_float16` policy shown in the training log below.
trainer = TrainerNew(
            distribution_strategy="mirrored", 
            num_gpus=2, 
            all_reduce_alg='nccl', 
            num_packs=1, 
            dtype="fp16")

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [53]:
# Create a fresh temporary directory for checkpoints and logs.
# NOTE(review): the next cell overwrites `tempdir` with a fixed path.
import tempfile
tempdir = tempfile.mkdtemp()

In [54]:
# NOTE(review): hard-coded override of the directory created by
# `tempfile.mkdtemp()` in the previous cell. Runs therefore share one directory,
# and the `logs/train` listing at the end of the notebook shows event files
# accumulated from several runs.
tempdir = "/tmp/tmp14vkffu3"
print("Temp dir", tempdir)

Temp dir /tmp/tmp14vkffu3


In [55]:
# Short smoke-test run: 1 epoch, validation on the first 10 validation batches,
# and `simple_callback` every 10 steps.
# With steps_per_epoch=21 and steps_per_call=10 the loop performs 21 // 10 = 2
# calls, i.e. 20 optimizer steps (the progress bar below shows "Step 20/21").
# validation_interval_steps=None and model_save_interval_steps=None mean that
# validation and checkpointing only happen at the end of the epoch.
history = trainer.run(
        model_fn = get_model,
        optimizer_fn = get_optimizer,
        train_dataset = tfdataset,
        train_loss_fn = lm_loss,
        epochs = 1,
        steps_per_epoch = 21,
        model_checkpoint_dir= tempdir,
        validation_dataset =  tfdataset_validation.take(10),  # tfdataset_validation.take(10)
        validation_loss_fn=lm_loss,
        validation_interval_steps=None,
        steps_per_call=10,
        enable_xla=False,
        callbacks=[simple_callback],
        callbacks_interval_steps=[10],
        overwrite_checkpoint_dir=True,
        max_number_of_models=10,
        model_save_interval_steps=None
    )

INFO:trainer:Make sure `steps_per_epoch` should be less than or equal to number of batches in dataset.
INFO:trainer:Policy: ----> mixed_float16
INFO:trainer:Strategy: ---> <tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x7fe29d2ec490>
INFO:trainer:Num GPU Devices: ---> 2
You are using a model of type gpt2 to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
INFO:trainer:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/gpt2
INFO:trainer:Using Adamw optimizer
INFO:trainer:Inferring metric shapes . . . . .
Epoch 1/1 --- Step 10/21 --- :   0%|          | 0/2 [00:00<?, ?batch /s]

INFO:tensorflow:batch_all_reduce: 147 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:batch_all_reduce: 147 all-reduces with algorithm = nccl, num_packs = 1






INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').


INFO:tensorflow:batch_all_reduce: 147 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:batch_all_reduce: 147 all-reduces with algorithm = nccl, num_packs = 1
INFO:trainer:Callbacks in progress at step 10 . . . .
Epoch 1/1 --- Step 20/21 --- :  50%|█████     | 1/2 [00:55<00:55, 55.74s/batch , learning_rate=1.1e-6, loss=10.9]

{'callbacks': [<__main__.SimpleCallback object at 0x7fe29d2ec820>],
 'callbacks_interval_steps': [10],
 'checkpoint_manager': <tensorflow.python.training.checkpoint_management.CheckpointManager object at 0x7fe2428311f0>,
 'enable_xla': False,
 'epochs': 1,
 'history': {},
 'max_number_of_models': 10,
 'model': <tf_transformers.core.legacy_model.LegacyModel object at 0x7fe234040e80>,
 'model_checkpoint_dir': '/tmp/tmp14vkffu3',
 'model_fn': <function get_model at 0x7fe2427e6dc0>,
 'model_save_interval_steps': None,
 'optimizer': <tensorflow.python.keras.mixed_precision.loss_scale_optimizer.LossScaleOptimizer object at 0x7fdf8c57e220>,
 'optimizer_fn': <function get_optimizer at 0x7fe2427a00d0>,
 'overwrite_checkpoint_dir': True,
 'self': <__main__.TrainerNew object at 0x7fe242ab3550>,
 'steps_per_call': 10,
 'steps_per_epoch': 21,
 'train_dataset': <PrefetchDataset shapes: ({input_ids: (None, None)}, {labels: (None, None), labels_mask: (None, None)}), types: ({input_ids: tf.int32}, {lab

INFO:trainer:Callbacks in progress at step 20 . . . .
Epoch 1/1 --- Step 20/21 --- : 100%|██████████| 2/2 [01:03<00:00, 31.67s/batch , learning_rate=3.1e-6, loss=10.7]


{'callbacks': [<__main__.SimpleCallback object at 0x7fe29d2ec820>],
 'callbacks_interval_steps': [10],
 'checkpoint_manager': <tensorflow.python.training.checkpoint_management.CheckpointManager object at 0x7fe2428311f0>,
 'enable_xla': False,
 'epochs': 1,
 'history': {},
 'max_number_of_models': 10,
 'model': <tf_transformers.core.legacy_model.LegacyModel object at 0x7fe234040e80>,
 'model_checkpoint_dir': '/tmp/tmp14vkffu3',
 'model_fn': <function get_model at 0x7fe2427e6dc0>,
 'model_save_interval_steps': None,
 'optimizer': <tensorflow.python.keras.mixed_precision.loss_scale_optimizer.LossScaleOptimizer object at 0x7fdf8c57e220>,
 'optimizer_fn': <function get_optimizer at 0x7fe2427a00d0>,
 'overwrite_checkpoint_dir': True,
 'self': <__main__.TrainerNew object at 0x7fe242ab3550>,
 'steps_per_call': 10,
 'steps_per_epoch': 21,
 'train_dataset': <PrefetchDataset shapes: ({input_ids: (None, None)}, {labels: (None, None), labels_mask: (None, None)}), types: ({input_ids: tf.int32}, {lab

INFO:trainer:Model saved at epoch 1
INFO:trainer:Validation in progress at epoch end 1 . . . .
10 Val batch  [00:06,  1.50 Val batch /s]
INFO:trainer:Validation result at epoch 1 is {'loss': 10.51753}
INFO:trainer:Callbacks in progress at epoch end 1 . . . .




{'callbacks': [<__main__.SimpleCallback object at 0x7fe29d2ec820>],
 'callbacks_interval_steps': [10],
 'checkpoint_manager': <tensorflow.python.training.checkpoint_management.CheckpointManager object at 0x7fe2428311f0>,
 'enable_xla': False,
 'epochs': 1,
 'history': {},
 'max_number_of_models': 10,
 'model': <tf_transformers.core.legacy_model.LegacyModel object at 0x7fe234040e80>,
 'model_checkpoint_dir': '/tmp/tmp14vkffu3',
 'model_fn': <function get_model at 0x7fe2427e6dc0>,
 'model_save_interval_steps': None,
 'optimizer': <tensorflow.python.keras.mixed_precision.loss_scale_optimizer.LossScaleOptimizer object at 0x7fdf8c57e220>,
 'optimizer_fn': <function get_optimizer at 0x7fe2427a00d0>,
 'overwrite_checkpoint_dir': True,
 'self': <__main__.TrainerNew object at 0x7fe242ab3550>,
 'steps_per_call': 10,
 'steps_per_epoch': 21,
 'train_dataset': <PrefetchDataset shapes: ({input_ids: (None, None)}, {labels: (None, None), labels_mask: (None, None)}), types: ({input_ids: tf.int32}, {l

In [56]:
!ls /tmp/tmp14vkffu3

checkpoint  ckpt-1.data-00000-of-00001	ckpt-1.index  logs


In [58]:
!ls /tmp/tmp14vkffu3/logs/train/

events.out.tfevents.1624965412.tfnewgpu-0.4005.12076.v2
events.out.tfevents.1624965766.tfnewgpu-0.4005.30017.v2
events.out.tfevents.1624965974.tfnewgpu-0.4469.12065.v2
events.out.tfevents.1624966428.tfnewgpu-0.4911.12065.v2
events.out.tfevents.1624966509.tfnewgpu-0.4911.30006.v2
events.out.tfevents.1624974375.tfnewgpu-0.4911.125679.v2
events.out.tfevents.1624977036.tfnewgpu-0.5707.12065.v2
events.out.tfevents.1624977618.tfnewgpu-0.5707.48989.v2
events.out.tfevents.1624978018.tfnewgpu-0.5707.125427.v2
events.out.tfevents.1624981946.tfnewgpu-0.5707.367256.v2
events.out.tfevents.1624982247.tfnewgpu-0.5707.437411.v2
events.out.tfevents.1625025212.tfnewgpu-0.6603.52046.v2
events.out.tfevents.1625028255.tfnewgpu-0.6603.121581.v2
