In [9]:
import sys
sys.path.append("/home/jovyan/TF_NEW/tf-transformers/src/")


In [10]:
import os
import tempfile
import json
import glob
import datasets
import shutil
import tensorflow as tf

from tf_transformers.data import TFReader, TFWriter
from tf_transformers.models import RobertaModel, EncoderDecoder
from tf_transformers.losses import cross_entropy_loss, cross_entropy_loss_label_smoothing

from transformers import RobertaTokenizer

In [31]:
# Convert data to features using specific length
# into a temp dir (and log it as well for monitoring)

def write_tfrecord(data, 
                    batch_size, 
                    tokenizer, 
                    encoder_max_length, 
                    decoder_max_length, 
                    mode, 
                    tfrecord_dir, 
                    take_sample=False, 
                    verbose=10000):
    
    if mode not in ["train", "eval"]:
        raise ValueError("Inavlid mode `{}` specified. Available mode is ['train', 'eval']".format(mode))
    
    def get_tfrecord_example(data):
        result = {}
        for f in data:
            input_ids = [tokenizer.cls_token] + tokenizer.tokenize(f['document'])[: encoder_max_length-2] + [tokenizer.sep_token] # -2 to add CLS and SEP
            input_ids = tokenizer.convert_tokens_to_ids(input_ids)
            input_mask = [1] * len(input_ids)
            input_type_ids = [0] * len(input_ids)

            decoder_input_ids = [tokenizer.cls_token] + tokenizer.tokenize(f['summary'])[: decoder_max_length-2] + [tokenizer.sep_token]
            decoder_input_ids = tokenizer.convert_tokens_to_ids(decoder_input_ids)
            decoder_input_type_ids = [0] * len(decoder_input_ids)

            result = {}
            result['encoder_input_ids'] = input_ids
            result['encoder_input_mask'] = input_mask
            result['encoder_input_type_ids'] = input_type_ids
            result['decoder_input_ids'] = decoder_input_ids[:-1] # except last word
            result['decoder_input_type_ids'] = decoder_input_type_ids[:-1] # except last word

            result['labels'] = decoder_input_ids[1:] # not including first word
            result['labels_mask'] = [1] * len(decoder_input_ids[1:])

            # Decoder doesnt need input_mask because by default decoder has causal mask mode

            yield result

    schema = {
        "encoder_input_ids": ("var_len", "int"),
        "encoder_input_mask": ("var_len", "int"),
        "encoder_input_type_ids": ("var_len", "int"),
        "decoder_input_ids": ("var_len", "int"),
        "decoder_input_type_ids": ("var_len", "int"),
        "labels": ("var_len", "int"),
        "labels_mask": ("var_len", "int"),
    }
    
    # Create a temp dir
    if mode == "train":
        # Write tf records
        train_data_dir = os.path.join(tfrecord_dir,"train")        
        tfrecord_filename = 'pubmed'
        tfwriter = TFWriter(schema=schema, 
                            file_name=tfrecord_filename, 
                            model_dir=train_data_dir,
                            tag='train',
                            overwrite=True,
                            verbose_counter=verbose
                     )
        data_train = data['train']
        # Take sample
        if take_sample:
            data_train = data_train.select(range(500))
            
        tfwriter.process(parse_fn=get_tfrecord_example(data_train))
    if mode == "eval":
        # Write tfrecords
        eval_data_dir = os.path.join(tfrecord_dir,"eval")
        tfrecord_filename = 'pubmed'
        tfwriter = TFWriter(schema=schema, 
                            file_name=tfrecord_filename, 
                            model_dir=eval_data_dir,
                            tag='eval',
                            overwrite=True,
                            verbose_counter=verbose
                            )
        data_eval = data['validation']
        # Take sample
        if take_sample:
            data_eval = data_eval.select(range(500))
        tfwriter.process(parse_fn=get_tfrecord_example(data_eval))
        
def read_tfrecord(tfrecord_dir, batch_size, shuffle=False, drop_remainder=False):
        # Read tfrecord to dataset
        schema = json.load(open("{}/schema.json".format(tfrecord_dir)))
        stats  = json.load(open('{}/stats.json'.format(tfrecord_dir)))
        all_files = glob.glob("{}/*.tfrecord".format(tfrecord_dir))
        tf_reader = TFReader(schema=schema, 
                            tfrecord_files=all_files)

        x_keys = ['encoder_input_ids', 'encoder_input_type_ids', 'encoder_input_mask', 'decoder_input_ids', 'decoder_input_type_ids']
        y_keys = ['labels', 'labels_mask']
        dataset = tf_reader.read_record(auto_batch=True, 
                                           keys=x_keys,
                                           batch_size=batch_size, 
                                           x_keys = x_keys, 
                                           y_keys = y_keys,
                                           shuffle=shuffle, 
                                           drop_remainder=drop_remainder
                                          )
        return dataset, stats['total_records']

In [12]:
# Load model

def get_model():
    encoder = RobertaModel.from_pretrained("roberta-base", return_layer=True)
    decoder = RobertaModel.from_pretrained("roberta-base",use_decoder=True, return_layer=True, mask_mode='causal')

    # Assign all possible encoder variables to decoder
    encoder_dict = {var.name: var for var in encoder.variables}
    assigned_counter = 0
    for var in decoder.variables:
        if var.name in encoder_dict:
            var.assign(encoder_dict[var.name])
            assigned_counter += 1
    print("Assigned {} variables from encoder to decoder .".format(assigned_counter))
    del encoder_dict
    print("ENncoder variables {} and Decoder variables {}".format(len(encoder.variables), len(decoder.variables)))
    model = EncoderDecoder(encoder=encoder, decoder=decoder, share_embeddings=True) 
    model = model.get_model()
    print("Model variables {}".format(len(model.variables)))

    del encoder
    del decoder
    
    return model

# Load Tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [14]:

# Data specific configuration
encoder_max_seq_length = 512
decoder_max_seq_length = 64

take_sample = False
train_batch_size = 32
eval_batch_size  = 32

# Trainer specifics
device = "gpu"
num_gpus = 2
tpu_address = None
dtype = "fp32"
epochs = 3
strategy = "mirrored"

# Optimizer
learning_rate = 3e-5
loss_type = None
return_all_layer_outputs = False
if loss_type and loss_type == 'joint':
    return_all_layer_outputs = True

# Core data specifics
data_name = "scientific_papers"
#num_classes = cfg.glue.data.num_classes

# Model specific
is_training = True
use_dropout = True

In [15]:
# Autoregressive model

encoder = RobertaModel.from_pretrained("roberta-base", return_layer=True)
decoder = RobertaModel.from_pretrained("roberta-base", mask_mode='causal', use_decoder=True, use_auto_regressive=True, return_layer=True)
# Assign all possible encoder variables to decoder
encoder_dict = {var.name: var for var in encoder.variables}
assigned_counter = 0
for var in decoder.variables:
    if var.name in encoder_dict:
        var.assign(encoder_dict[var.name])
        assigned_counter += 1
print("Assigned {} variables from encoder to decoder .".format(assigned_counter))
del encoder_dict
print("ENncoder variables {} and Decoder variables {}".format(len(encoder.variables), len(decoder.variables)))
model_ar = EncoderDecoder(encoder=encoder, decoder=decoder, share_embeddings=True) 
print("Model variables {}".format(len(model_ar.variables)))

del encoder
del decoder

# Important
model_ar = model_ar.get_model()

You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/roberta-base/ckpt-1
You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/roberta-base/ckpt-1


Assigned 132 variables from encoder to decoder .
ENncoder variables 204 and Decoder variables 324
Model variables 525


In [16]:
import datasets
import tqdm
from tf_transformers.text import TextDecoder

class RougeCallback():
    
    def __init__(self, model, eval_dataset, original_summaries, tokenizer, eos_id, decoder_start_id, max_iterations):
        
        with tf.device('/device:GPU:0'):
            self.model = model
        self.eval_dataset = eval_dataset
        self.original_summaries = original_summaries
        self.tokenizer = tokenizer
        self.eos_id = eos_id
        self.decoder_start_id = decoder_start_id
        self.max_iterations = max_iterations
        
    def __call__(self, train_kwargs):
        
        self.model.set_weights(train_kwargs['model'].get_weights())
        decoder = TextDecoder(self.model, decoder_start_token_id=self.decoder_start_id, input_type_ids=0)
        
        # Predictions
        predicted_summaries = []
        for (batch_inputs, batch_labels) in tqdm.tqdm(eval_dataset):
            del batch_inputs['decoder_input_ids']
            decoder_outputs = decoder.decode(batch_inputs, mode='greedy', max_iterations=self.max_iterations, eos_id=self.eos_id)
            predicted_ids = [item[0] for item in decoder_outputs['predicted_ids'].numpy().tolist()]

            predicted_ids_sliced = []
            for p_id in predicted_ids:
                if self.eos_id in p_id:
                    index = p_id.index(self.eos_id)
                    p_id = p_id[:index]
                predicted_ids_sliced.append(p_id)
                predicted_summaries.append(self.tokenizer.decode(p_id))
                
        rouge = datasets.load_metric("rouge")
        rouge_output2 = rouge.compute(predictions=predicted_summaries, references=self.original_summaries, rouge_types=["rouge2"])["rouge2"].mid
        rouge_output1 = rouge.compute(predictions=predicted_summaries, references=self.original_summaries, rouge_types=["rouge1"])["rouge1"].mid
        rouge_outputL = rouge.compute(predictions=predicted_summaries, references=self.original_summaries, rouge_types=["rougeL"])["rougeL"].mid

        rouge2 = {'rouge2_precision': rouge_output2.precision,
                  'rouge2_recall': rouge_output2.recall,
                  'rouge2_f1': rouge_output2.fmeasure}
        rouge2['rouge1_precision'] = rouge_output1.precision
        rouge2['rouge1_recall'] = rouge_output1.recall
        rouge2['rouge1_f1'] = rouge_output1.fmeasure

        rouge2['rougeL_precision'] = rouge_outputL.precision
        rouge2['rougeL_recall'] = rouge_outputL.recall
        rouge2['rougeL_f1'] = rouge_outputL.fmeasure
        return rouge2


In [None]:
# Load dataset
from datasets import load_dataset
dataset = load_dataset("xsum")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1930.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=954.0, style=ProgressStyle(description_…




Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /home/jovyan/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499...




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [33]:
# Load TFrecords
# tfrecord_dir = tempfile.mkdtemp()
tfrecord_dir = '/tmp/roberta2robera_tfrecordsxsum/'

# # Train Tfrecords
# write_tfrecord(dataset, 
#                train_batch_size,
#                tokenizer, 
#                encoder_max_seq_length, 
#                decoder_max_seq_length, 
#                "train", 
#                tfrecord_dir, 
#                take_sample)

# Eval Tfrecords
# write_tfrecord(dataset, 
#                eval_batch_size,
#                tokenizer, 
#                encoder_max_seq_length, 
#                decoder_max_seq_length, 
#                "eval", 
#                tfrecord_dir, 
#                take_sample)

train_dataset, total_train_examples = read_tfrecord(tfrecord_dir + 'train', train_batch_size, shuffle=False, drop_remainder=False)
eval_dataset, total_eval_examples   = read_tfrecord(tfrecord_dir + 'eval', eval_batch_size,  shuffle=False, drop_remainder=False)

original_summaries = [item['summary'] for item in dataset['validation']]
callback = RougeCallback( model_ar, eval_dataset, original_summaries,
                         tokenizer, tokenizer.sep_token_id, tokenizer.cls_token_id, decoder_max_seq_length)

In [34]:
# Load optimizer fn

from tf_transformers.optimization import create_optimizer
def get_optimizer(learning_rate, examples, batch_size, epochs, learning_rate_type="polynomial"):
    steps_per_epoch = int(examples / batch_size)
    num_train_steps = steps_per_epoch * epochs
    warmup_steps = int(0.1 * num_train_steps)
    
    def optimizer_fn():
        optimizer, learning_rate_fn = create_optimizer(learning_rate,
                                                   num_train_steps,
                                                   num_train_steps, 
                                                      learning_rate_type=learning_rate_type)
        return optimizer
    return optimizer_fn

optimizer_fn = get_optimizer(learning_rate, total_train_examples, train_batch_size, epochs, learning_rate_type="linear")

In [35]:
# Loss fn

def get_loss(y_true_dict, y_pred_dict):
    
    loss = cross_entropy_loss(labels=y_true_dict['labels'], 
                                   logits=y_pred_dict['token_logits'], 
                                      label_weights=y_true_dict['labels_mask'])
    return {'loss': loss}

In [36]:
# Load trainer
from tf_transformers.core import GPUTrainer
trainer = GPUTrainer(distribution_strategy=strategy, 
                    num_gpus=num_gpus, 
                    dtype=dtype)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [37]:
model_fn = get_model
train_loss_fn = get_loss


In [39]:
model_checkpoint_dir = "/tmp/roberta2robera_xsum/"
history = trainer.run(
    model_fn = model_fn,
    optimizer_fn = optimizer_fn,
    train_dataset = train_dataset,
    train_loss_fn = train_loss_fn,
    epochs = epochs,
    steps_per_epoch = 6000,
    model_checkpoint_dir=model_checkpoint_dir,
    batch_size=train_batch_size,
    training_loss_names=None,
    validation_loss_names=None,
    validation_dataset=None,
    validation_loss_fn=None,
    validation_interval_steps=None,
    steps_per_call=100,
    enable_xla=False,
    callbacks=[callback],
    callbacks_interval_steps=None,
    overwrite_checkpoint_dir=True,
    max_number_of_models=10,
    model_save_interval_steps=None,
    repeat_dataset=False,
    latest_checkpoint=None,
)

INFO:absl:Make sure `steps_per_epoch` should be less than or equal to number of batches in dataset.
INFO:absl:Policy: ----> float32
INFO:absl:Strategy: ---> <tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x7fad71d14310>
INFO:absl:Num GPU Devices: ---> 2
You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/roberta-base/ckpt-1
You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/roberta-base/ckpt-1










Assigned 204 variables from encoder to decoder .
ENncoder variables 204 and Decoder variables 324


INFO:absl:Using Adamw optimizer
INFO:absl:No checkpoint found in /tmp/roberta2robera_xsum/
Epoch 1/3 --- Step 100/6000 --- total examples 0:   0%|          | 0/60 [00:00<?, ?batch /s]

Model variables 525








INFO:tensorflow:batch_all_reduce: 515 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:batch_all_reduce: 515 all-reduces with algorithm = nccl, num_packs = 1






INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).










INFO:tensorflow:batch_all_reduce: 515 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:batch_all_reduce: 515 all-reduces with algorithm = nccl, num_packs = 1






INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Epoch 1/3 --- Step 6000/6000 --- total examples 188800: 100%|██████████| 60/60 [1:32:12<00:00, 92.20s/batch , learning_rate=9.33e-6, loss=2.04]
INFO:absl:Model saved at epoch 1
INFO:absl:Callbacks in progress at epoch end 1 . . . .
9it [01:42, 11.36s/it]


KeyboardInterrupt: 

In [40]:
model_ar.load_checkpoint(model_checkpoint_dir)


Two checkpoint references resolved to different objects (<tf_transformers.models.encoder_decoder.encoder_decoder.EncoderDecoder object at 0x7fad71e63d60> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7fabce04cd60>).



Two checkpoint references resolved to different objects (<tf_transformers.models.encoder_decoder.encoder_decoder.EncoderDecoder object at 0x7fad71e63d60> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7fabce04cd60>).
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/roberta2robera_xsum/ckpt-1


In [41]:
len(model_ar.variables)

525

In [42]:
decoder = TextDecoder(model=model_ar, 
                     decoder_start_token_id=tokenizer.cls_token_id, 
                     input_type_ids=0)

In [44]:
# Predictions
predicted_summaries = []
for (batch_inputs, batch_labels) in tqdm.tqdm(train_dataset):
    del batch_inputs['decoder_input_ids']
    decoder_outputs = decoder.decode(batch_inputs, mode='greedy', max_iterations=64, eos_id=tokenizer.sep_token_id)
    predicted_ids = [item[0] for item in decoder_outputs['predicted_ids'].numpy().tolist()]
    
    predicted_ids_sliced = []
    for p_id in predicted_ids:
        if tokenizer.sep_token_id in p_id:
            index = p_id.index(self.eos_id)
            p_id = p_id[:index]
        predicted_ids_sliced.append(p_id)
        predicted_summaries.append(tokenizer.decode(p_id))
    break

0it [00:11, ?it/s]


In [47]:
for item in dataset['train']:
    print(item)
    break

{'document': 'Recent reports have linked some France-based players with returns to Wales.\n"I\'ve always felt - and this is with my rugby hat on now; this is not region or WRU - I\'d rather spend that money on keeping players in Wales," said Davies.\nThe WRU provides £2m to the fund and £1.3m comes from the regions.\nFormer Wales and British and Irish Lions fly-half Davies became WRU chairman on Tuesday 21 October, succeeding deposed David Pickering following governing body elections.\nHe is now serving a notice period to leave his role as Newport Gwent Dragons chief executive after being voted on to the WRU board in September.\nDavies was among the leading figures among Dragons, Ospreys, Scarlets and Cardiff Blues officials who were embroiled in a protracted dispute with the WRU that ended in a £60m deal in August this year.\nIn the wake of that deal being done, Davies said the £3.3m should be spent on ensuring current Wales-based stars remain there.\nIn recent weeks, Racing Metro fla

In [66]:
text = item['document']
DECODER_START_ID = tokenizer.cls_token_id

# model = get_model()
# model.load_checkpoint(model_checkpoint_dir)

# Create inputs
inputs_hf = tokenizer(text, return_tensors='tf', truncation=True, max_length=512)
inputs = {}
inputs['encoder_input_ids'] = inputs_hf['input_ids']
inputs['encoder_input_ids'] = inputs_hf['input_ids']
inputs['encoder_input_type_ids'] = tf.zeros_like(inputs_hf['input_ids'])

inputs['encoder_input_mask'] = inputs_hf['attention_mask']
inputs['decoder_input_type_ids']  = tf.constant([[0]])
inputs['decoder_input_ids']  = tf.constant([[DECODER_START_ID]])

# Iterate
predictions_non_auto_regressive = []
predictions_prob_non_auto_regressive = []

for i in range(64):
    outputs = model(inputs)
    predicted_ids = tf.cast(tf.expand_dims(tf.argmax(outputs["last_token_logits"], axis=1), 1), tf.int32)
    inputs["decoder_input_ids"] = tf.concat([inputs["decoder_input_ids"], predicted_ids], axis=1)
    inputs["decoder_input_type_ids"] = tf.concat([inputs["decoder_input_type_ids"], [[0]]], axis=1)
    predictions_non_auto_regressive.append(predicted_ids)
    predictions_prob_non_auto_regressive.append(
        tf.expand_dims(tf.reduce_max(outputs["last_token_logits"], axis=1), 1)
    )
predictions_non_auto_regressive = tf.concat(predictions_non_auto_regressive, axis=1)
predictions_prob_non_auto_regressive = tf.concat(predictions_prob_non_auto_regressive, axis=1)

In [68]:
print(predictions_non_auto_regressive)

tf.Tensor(
[[  771 44870  5295  2918  2918   472    32     7    28   156    13  5295
     18 12093  8808  2918     6   309     7     5 12093  2918  2918     4
      2     2    18   507  2971     9  5295    18  5295  2918  2918  2918
   2918  2918  2918  2918  2918  2918  2918  2918  2918  2918  2918  2918
   2918  2918  2918  2918  2918  2918  2918  2918  2918  2918     4     2
      2     2     2     2]], shape=(1, 64), dtype=int32)


In [69]:
tokenizer.decode(predictions_non_auto_regressive[0].numpy())

"Welsh Wales union union players are to be made for Wales's Welsh rugby union, according to the Welsh union union.</s></s>'s final secretary of Wales's Wales union union union union union union union union union union union union union union union union union union union union union union union union union.</s></s></s></s></s>"

In [80]:
model_ar.load_checkpoint(model_checkpoint_dir)


Two checkpoint references resolved to different objects (<tf_transformers.models.encoder_decoder.encoder_decoder.EncoderDecoder object at 0x7fad71e63d60> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7fabce04cd60>).



Two checkpoint references resolved to different objects (<tf_transformers.models.encoder_decoder.encoder_decoder.EncoderDecoder object at 0x7fad71e63d60> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7fabce04cd60>).
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/roberta2robera_xsum/ckpt-1


In [81]:
encoder_hidden_dim = 768
num_hidden_layers  = 12
num_attention_heads = 12
attention_head_size = 64

# Inputs
inputs_hf = tokenizer(text, return_tensors='tf', truncation=True, max_length=512)
encoder_input_ids = inputs_hf['input_ids']
encoder_input_mask = inputs_hf['attention_mask']

batch_size = tf.shape(encoder_input_ids)[0]
seq_length = tf.shape(encoder_input_ids)[1]

decoder_input_ids  = tf.reshape([0] * batch_size.numpy(), (batch_size,1))

encoder_hidden_states = tf.zeros((batch_size, seq_length, encoder_hidden_dim))
decoder_all_cache_key = tf.zeros((num_hidden_layers, 
                                  batch_size, 
                                  num_attention_heads, 
                                  seq_length, 
                                  attention_head_size))
decoder_all_cahce_value = tf.zeros((num_hidden_layers, 
                                  batch_size, 
                                  num_attention_heads, 
                                  seq_length, 
                                  attention_head_size))


inputs = {}
inputs['encoder_input_ids'] = encoder_input_ids
inputs['encoder_input_type_ids'] = tf.zeros_like(encoder_input_ids)

inputs['encoder_input_mask']= encoder_input_mask
inputs['decoder_input_ids'] = decoder_input_ids
inputs['decoder_input_ids'] = tf.zeros_like(decoder_input_ids)

inputs['encoder_hidden_states'] = encoder_hidden_states
inputs['decoder_all_cache_key'] = decoder_all_cache_key
inputs['decoder_all_cache_value'] = decoder_all_cahce_value
inputs["decoder_input_type_ids"] = tf.constant([[0]])

# Iterate
predictions_auto_regressive = []
predictions_prob_auto_regressive = []

for i in range(64):
    outputs = model_ar(inputs)
    predicted_ids = tf.cast(tf.expand_dims(tf.argmax(outputs["last_token_logits"], axis=1), 1), tf.int32)
    inputs["decoder_input_ids"] = predicted_ids
    inputs["decoder_input_type_ids"] = tf.constant([[0]])
    inputs["decoder_all_cache_key"] = outputs["decoder_all_cache_key"]
    inputs["decoder_all_cache_value"] = outputs["decoder_all_cache_value"]
    inputs["encoder_hidden_states"] = outputs["encoder_hidden_states"]
    predictions_auto_regressive.append(predicted_ids)
    predictions_prob_auto_regressive.append(
        tf.expand_dims(tf.reduce_max(outputs["last_token_logits"], axis=1), 1)
    )
predictions_auto_regressive = tf.concat(predictions_auto_regressive, axis=1)
predictions_prob_auto_regressive = tf.concat(predictions_prob_auto_regressive, axis=1)

In [82]:
print(predictions_auto_regressive)

tf.Tensor(
[[129 129 129 129  12   4   4   4   4   4   4   4   4   4   4   4   4   4
    4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4
    4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4
    4   4   4   4   4   4   4   4   4   4]], shape=(1, 64), dtype=int32)


In [83]:
predictions_prob_auto_regressive

<tf.Tensor: shape=(1, 64), dtype=float32, numpy=
array([[ 9.428182 ,  9.829342 ,  9.904202 ,  9.8369   ,  9.822728 ,
         9.692421 , 10.047644 , 10.128189 , 10.143772 , 10.136143 ,
        10.11996  , 10.101275 , 10.082702 , 10.065314 , 10.049466 ,
        10.035193 , 10.022442 , 10.011193 , 10.001495 ,  9.993506 ,
         9.987443 ,  9.983559 ,  9.982058 ,  9.9831295,  9.98684  ,
         9.993168 , 10.002007 , 10.013155 , 10.026359 , 10.04134  ,
        10.057796 , 10.075427 , 10.093946 , 10.113109 , 10.132659 ,
        10.152413 , 10.172202 , 10.191876 , 10.211317 , 10.230444 ,
        10.249183 , 10.267481 , 10.285303 , 10.302635 , 10.319451 ,
        10.335759 , 10.351557 , 10.366844 , 10.381651 , 10.395957 ,
        10.409806 , 10.423198 , 10.436155 , 10.448684 , 10.46082  ,
        10.472563 , 10.483942 , 10.494981 , 10.505693 , 10.516117 ,
        10.526254 , 10.53614  , 10.545795 , 10.555249 ]], dtype=float32)>

In [84]:
predictions_prob_non_auto_regressive

<tf.Tensor: shape=(1, 64), dtype=float32, numpy=
array([[17.97845  , 22.559448 , 14.675355 , 13.109577 , 12.68401  ,
        12.34306  , 13.029075 , 14.807979 , 15.790917 , 13.550283 ,
        15.292    , 14.446288 , 12.826156 , 12.286309 , 13.730594 ,
        13.9249735, 12.244875 , 14.737296 , 20.04535  , 14.1511965,
        15.058198 , 15.391763 , 13.476166 , 13.707357 , 23.329325 ,
        23.817316 , 10.714266 ,  9.901299 , 13.1168375, 14.706587 ,
        16.137383 , 13.415479 , 12.671844 , 13.478654 , 13.920757 ,
        13.8897705, 13.857656 , 14.10398  , 14.347718 , 14.068935 ,
        14.331752 , 14.190344 , 14.189717 , 14.124179 , 14.005302 ,
        13.808486 , 13.930423 , 14.312213 , 14.422061 , 13.906468 ,
        13.878039 , 13.6482   , 13.596181 , 13.435531 , 13.24317  ,
        13.064389 , 12.940061 , 13.085591 , 12.919713 , 23.690815 ,
        23.95297  , 11.3611765, 11.078797 , 10.847608 ]], dtype=float32)>

In [42]:
import numpy as  np
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
    
with open("{}/history.json".format(model_checkpoint_dir), "w") as f:
    json.dump(str(history),f, indent=2)

In [38]:
import shutil
model_checkpoint_dir = "/tmp/roberta2robera_pubmed_short/"
shutil.rmtree(model_checkpoint_dir)
# shutil.rmtree(tfrecord_dir)


Two checkpoint references resolved to different objects (<tf_transformers.models.encoder_decoder.encoder_decoder.EncoderDecoder object at 0x7fdb97a1bca0> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7fdb803d8a60>).



Two checkpoint references resolved to different objects (<tf_transformers.models.encoder_decoder.encoder_decoder.EncoderDecoder object at 0x7fdb97a1bca0> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7fdb803d8a60>).
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/roberta2robera_pubmed_short/ckpt-3
