In [3]:
import sys
sys.path.append("/home/jovyan/TF_NEW/tf-transformers/src/")


In [4]:
import os
import tempfile
import json
import glob
import datasets
import shutil
import tensorflow as tf

from tf_transformers.data import TFReader, TFWriter
from tf_transformers.models import RobertaModel, EncoderDecoder
from tf_transformers.losses import cross_entropy_loss, cross_entropy_loss_label_smoothing

from transformers import RobertaTokenizer

In [5]:
# Convert data to features using specific length
# into a temp dir (and log it as well for monitoring)

def write_tfrecord(data, 
                    batch_size, 
                    tokenizer, 
                    encoder_max_length, 
                    decoder_max_length, 
                    mode, 
                    tfrecord_dir, 
                    take_sample=False, 
                    verbose=10000):
    
    if mode not in ["train", "eval"]:
        raise ValueError("Inavlid mode `{}` specified. Available mode is ['train', 'eval']".format(mode))
    
    def get_tfrecord_example(data):
        for f in data:            
            inputs_hf = tokenizer(f['document'], 
                                  truncation=True, 
                                  max_length=encoder_max_length)

            input_ids  = inputs_hf['input_ids']
            input_mask = inputs_hf['attention_mask']
            input_type_ids = [0] * len(input_ids)

            decoder_input_ids = tokenizer(f['summary'], 
                                  truncation=True, 
                                  max_length=decoder_max_seq_length)['input_ids']

            decoder_input_type_ids = [0] * len(decoder_input_ids)

            result = {}
            result['encoder_input_ids'] = input_ids
            result['encoder_input_mask'] = input_mask
            result['encoder_input_type_ids'] = input_type_ids
            result['decoder_input_ids'] = decoder_input_ids[:-1] # except last word
            result['decoder_input_type_ids'] = decoder_input_type_ids[:-1] # except last word

            result['labels'] = decoder_input_ids[1:] # not including first word
            result['labels_mask'] = [1] * len(result['labels'])

                # Decoder doesnt need input_mask because by default decoder has causal mask mode

            yield result

    schema = {
        "encoder_input_ids": ("var_len", "int"),
        "encoder_input_mask": ("var_len", "int"),
        "encoder_input_type_ids": ("var_len", "int"),
        "decoder_input_ids": ("var_len", "int"),
        "decoder_input_type_ids": ("var_len", "int"),
        "labels": ("var_len", "int"),
        "labels_mask": ("var_len", "int"),
    }
    
    # Create a temp dir
    if mode == "train":
        # Write tf records
        train_data_dir = os.path.join(tfrecord_dir,"train")        
        tfrecord_filename = 'pubmed'
        tfwriter = TFWriter(schema=schema, 
                            file_name=tfrecord_filename, 
                            model_dir=train_data_dir,
                            tag='train',
                            overwrite=True,
                            verbose_counter=verbose
                     )
        data_train = data
        # Take sample
        if take_sample:
            data_train = data_train.select(range(500))
            
        tfwriter.process(parse_fn=get_tfrecord_example(data_train))
    if mode == "eval":
        # Write tfrecords
        eval_data_dir = os.path.join(tfrecord_dir,"eval")
        tfrecord_filename = 'pubmed'
        tfwriter = TFWriter(schema=schema, 
                            file_name=tfrecord_filename, 
                            model_dir=eval_data_dir,
                            tag='eval',
                            overwrite=True,
                            verbose_counter=verbose
                            )
        data_eval = data
        # Take sample
        if take_sample:
            data_eval = data_eval.select(range(500))
        tfwriter.process(parse_fn=get_tfrecord_example(data_eval))
        
def read_tfrecord(tfrecord_dir, batch_size, shuffle=False, drop_remainder=False):
        # Read tfrecord to dataset
        schema = json.load(open("{}/schema.json".format(tfrecord_dir)))
        stats  = json.load(open('{}/stats.json'.format(tfrecord_dir)))
        all_files = glob.glob("{}/*.tfrecord".format(tfrecord_dir))
        tf_reader = TFReader(schema=schema, 
                            tfrecord_files=all_files)

        x_keys = ['encoder_input_ids', 'encoder_input_type_ids', 'encoder_input_mask', 'decoder_input_ids', 'decoder_input_type_ids']
        y_keys = ['labels', 'labels_mask']
        dataset = tf_reader.read_record(auto_batch=True, 
                                           keys=x_keys,
                                           batch_size=batch_size, 
                                           x_keys = x_keys, 
                                           y_keys = y_keys,
                                           shuffle=shuffle, 
                                           drop_remainder=drop_remainder
                                          )
        return dataset, stats['total_records']

In [6]:
# Load model

def get_model(is_training, use_dropout):
    
    def model_fn():
        encoder = RobertaModel.from_pretrained("roberta-base", return_layer=True, is_training=is_training,
                                              use_dropout=use_dropout)
        decoder = RobertaModel.from_pretrained("roberta-base",use_decoder=True,
                                               return_layer=True, mask_mode='causal', 
                                              is_training=is_training,
                                              use_dropout=use_dropout)

        # Assign all possible encoder variables to decoder
        encoder_dict = {var.name: var for var in encoder.variables}
        assigned_counter = 0
        for var in decoder.variables:
            if var.name in encoder_dict:
                var.assign(encoder_dict[var.name])
                assigned_counter += 1
        print("Assigned {} variables from encoder to decoder .".format(assigned_counter))
        del encoder_dict
        print("ENncoder variables {} and Decoder variables {}".format(len(encoder.variables), len(decoder.variables)))
        model = EncoderDecoder(encoder=encoder, decoder=decoder, share_embeddings=True) 
        model = model.get_model()
        print("Model variables {}".format(len(model.variables)))

        del encoder
        del decoder

        return model
    return model_fn

# Load Tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [7]:

# Data specific configuration
encoder_max_seq_length = 512
decoder_max_seq_length = 64

take_sample = False
train_batch_size = 32
eval_batch_size  = 32

# Trainer specifics
device = "gpu"
num_gpus = 2
tpu_address = None
dtype = "fp32"
epochs = 3
strategy = "mirrored"

# Optimizer
learning_rate = 3e-5
loss_type = None
return_all_layer_outputs = False
if loss_type and loss_type == 'joint':
    return_all_layer_outputs = True

# Core data specifics
data_name = "scientific_papers"
#num_classes = cfg.glue.data.num_classes

# Model specific
is_training = True
use_dropout = True

In [8]:
# Autoregressive model

encoder = RobertaModel.from_pretrained("roberta-base", return_layer=True)
decoder = RobertaModel.from_pretrained("roberta-base", mask_mode='causal', use_decoder=True, use_auto_regressive=True, return_layer=True)
# Assign all possible encoder variables to decoder
encoder_dict = {var.name: var for var in encoder.variables}
assigned_counter = 0
for var in decoder.variables:
    if var.name in encoder_dict:
        var.assign(encoder_dict[var.name])
        assigned_counter += 1
print("Assigned {} variables from encoder to decoder .".format(assigned_counter))
del encoder_dict
print("ENncoder variables {} and Decoder variables {}".format(len(encoder.variables), len(decoder.variables)))
model_ar = EncoderDecoder(encoder=encoder, decoder=decoder, share_embeddings=True) 
print("Model variables {}".format(len(model_ar.variables)))

del encoder
del decoder

# Important
model_ar = model_ar.get_model()

You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


Robertaaaa


INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/roberta-base/ckpt-1
You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


Robertaaaa


INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/roberta-base/ckpt-1


Assigned 132 variables from encoder to decoder .
ENncoder variables 204 and Decoder variables 324
Model variables 525


In [8]:
import datasets
import tqdm
from tf_transformers.text import TextDecoder

class RougeCallback():
    
    def __init__(self, model, eval_dataset, original_summaries, tokenizer, eos_id, decoder_start_id, max_iterations):
        
        with tf.device('/device:GPU:0'):
            self.model = model
        self.eval_dataset = eval_dataset
        self.original_summaries = original_summaries
        self.tokenizer = tokenizer
        self.eos_id = eos_id
        self.decoder_start_id = decoder_start_id
        self.max_iterations = max_iterations
        
    def __call__(self, train_kwargs):
        
        self.model.set_weights(train_kwargs['model'].get_weights())
        decoder = TextDecoder(self.model, decoder_start_token_id=self.decoder_start_id, input_type_ids=0)
        
        # Predictions
        predicted_summaries = []
        for (batch_inputs, batch_labels) in tqdm.tqdm(eval_dataset):
            del batch_inputs['decoder_input_ids']
            decoder_outputs = decoder.decode(batch_inputs, mode='greedy', max_iterations=self.max_iterations, eos_id=self.eos_id)
            predicted_ids = [item[0] for item in decoder_outputs['predicted_ids'].numpy().tolist()]

            predicted_ids_sliced = []
            for p_id in predicted_ids:
                if self.eos_id in p_id:
                    index = p_id.index(self.eos_id)
                    p_id = p_id[:index]
                predicted_ids_sliced.append(p_id)
                predicted_summaries.append(self.tokenizer.decode(p_id))
                
        rouge = datasets.load_metric("rouge")
        rouge_output2 = rouge.compute(predictions=predicted_summaries, references=self.original_summaries, rouge_types=["rouge2"])["rouge2"].mid
        rouge_output1 = rouge.compute(predictions=predicted_summaries, references=self.original_summaries, rouge_types=["rouge1"])["rouge1"].mid
        rouge_outputL = rouge.compute(predictions=predicted_summaries, references=self.original_summaries, rouge_types=["rougeL"])["rougeL"].mid

        rouge2 = {'rouge2_precision': rouge_output2.precision,
                  'rouge2_recall': rouge_output2.recall,
                  'rouge2_f1': rouge_output2.fmeasure}
        rouge2['rouge1_precision'] = rouge_output1.precision
        rouge2['rouge1_recall'] = rouge_output1.recall
        rouge2['rouge1_f1'] = rouge_output1.fmeasure

        rouge2['rougeL_precision'] = rouge_outputL.precision
        rouge2['rougeL_recall'] = rouge_outputL.recall
        rouge2['rougeL_f1'] = rouge_outputL.fmeasure
        return rouge2


In [10]:
# Load dataset
from datasets import load_dataset
dataset = load_dataset("xsum")



In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [12]:
# Load TFrecords
# tfrecord_dir = tempfile.mkdtemp()
tfrecord_dir = '/tmp/roberta2robera_tfrecordsxsum/'

# # Train Tfrecords
# write_tfrecord(dataset['train'], 
#                train_batch_size,
#                tokenizer, 
#                encoder_max_seq_length, 
#                decoder_max_seq_length, 
#                "train", 
#                tfrecord_dir, 
#                take_sample)

# # # Eval Tfrecords
# write_tfrecord(dataset['validation'], 
#                eval_batch_size,
#                tokenizer, 
#                encoder_max_seq_length, 
#                decoder_max_seq_length, 
#                "eval", 
#                tfrecord_dir, 
#                take_sample)

train_dataset, total_train_examples = read_tfrecord(tfrecord_dir + 'train', train_batch_size, shuffle=False, drop_remainder=False)
eval_dataset, total_eval_examples   = read_tfrecord(tfrecord_dir + 'eval', eval_batch_size,  shuffle=False, drop_remainder=False)

original_summaries = [item['summary'] for item in dataset['validation']]
callback = RougeCallback( model_ar, eval_dataset, original_summaries,
                         tokenizer, tokenizer.sep_token_id, tokenizer.cls_token_id, decoder_max_seq_length)

NameError: name 'RougeCallback' is not defined

In [12]:
# Load optimizer fn

from tf_transformers.optimization import create_optimizer
def get_optimizer(learning_rate, examples, batch_size, epochs, learning_rate_type="polynomial"):
    steps_per_epoch = int(examples / batch_size)
    num_train_steps = steps_per_epoch * epochs
    warmup_steps = int(0.1 * num_train_steps)
    
    def optimizer_fn():
        optimizer, learning_rate_fn = create_optimizer(learning_rate,
                                                   num_train_steps,
                                                   num_train_steps, 
                                                      learning_rate_type=learning_rate_type)
        return optimizer
    return optimizer_fn

optimizer_fn = get_optimizer(learning_rate, total_train_examples, train_batch_size, epochs, learning_rate_type="linear")

In [13]:
# Loss fn

def get_loss(y_true_dict, y_pred_dict):
    
    loss = cross_entropy_loss(labels=y_true_dict['labels'], 
                                   logits=y_pred_dict['token_logits'], 
                                      label_weights=y_true_dict['labels_mask'])
    return {'loss': loss}

In [14]:
# Load trainer
from tf_transformers.core import GPUTrainer
trainer = GPUTrainer(distribution_strategy=strategy, 
                    num_gpus=num_gpus, 
                    dtype=dtype)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [15]:
model_fn = get_model(is_training=True, use_dropout=True)
train_loss_fn = get_loss


In [None]:
model_checkpoint_dir = "/tmp/roberta2robera_xsum2/"
history = trainer.run(
    model_fn = model_fn,
    optimizer_fn = optimizer_fn,
    train_dataset = train_dataset,
    train_loss_fn = train_loss_fn,
    epochs = epochs,
    steps_per_epoch = 6000,
    model_checkpoint_dir=model_checkpoint_dir,
    batch_size=train_batch_size,
    training_loss_names=None,
    validation_loss_names=None,
    validation_dataset=None,
    validation_loss_fn=None,
    validation_interval_steps=None,
    steps_per_call=100,
    enable_xla=False,
    callbacks=[callback],
    callbacks_interval_steps=None,
    overwrite_checkpoint_dir=True,
    max_number_of_models=10,
    model_save_interval_steps=None,
    repeat_dataset=False,
    latest_checkpoint=None,
)

In [19]:
!ls /tmp/roberta2robera_xsum2/

checkpoint		    ckpt-2.data-00000-of-00001	ckpt-3.index
ckpt-1.data-00000-of-00001  ckpt-2.index		logs
ckpt-1.index		    ckpt-3.data-00000-of-00001


In [59]:
model = get_model(is_training=True, use_dropout=True)()

You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


Robertaaaa










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/roberta-base/ckpt-1
You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


Robertaaaa
Embeddings (None, None, 768) Tensor("tf_transformers/roberta/Sum:0", shape=(None, None), dtype=float32)
Pos embeddings (None, None, 768) Tensor("tf_transformers/roberta/Sum_1:0", shape=(1, None), dtype=float32)
Embeddings Pos (None, None, 768) Tensor("tf_transformers/roberta/Sum_2:0", shape=(None, None), dtype=float32)
Embeddings (None, None, 768) Tensor("tf_transformers/roberta/Sum:0", shape=(None, None), dtype=float32)
Pos embeddings (None, None, 768) Tensor("tf_transformers/roberta/Sum_1:0", shape=(1, None), dtype=float32)
Embeddings Pos (None, None, 768) Tensor("tf_transformers/roberta/Sum_2:0", shape=(None, None), dtype=float32)


INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/roberta-base/ckpt-1










Assigned 204 variables from encoder to decoder .
ENncoder variables 204 and Decoder variables 324
Embeddings (None, None, 768) Tensor("roberta_roberta/tf_transformers/roberta/Sum:0", shape=(None, None), dtype=float32)
Pos embeddings (None, None, 768) Tensor("roberta_roberta/tf_transformers/roberta/Sum_1:0", shape=(1, None), dtype=float32)
Embeddings Pos (None, None, 768) Tensor("roberta_roberta/tf_transformers/roberta/Sum_2:0", shape=(None, None), dtype=float32)
Embeddings (None, None, 768) Tensor("roberta_roberta/tf_transformers/roberta/Sum:0", shape=(None, None), dtype=float32)
Pos embeddings (None, None, 768) Tensor("roberta_roberta/tf_transformers/roberta/Sum_1:0", shape=(1, None), dtype=float32)
Embeddings Pos (None, None, 768) Tensor("roberta_roberta/tf_transformers/roberta/Sum_2:0", shape=(None, None), dtype=float32)
Model variables 525


In [60]:
model.load_checkpoint(model_checkpoint_dir, step=tf.Variable(1))

AssertionError: Some Python objects were not bound to checkpointed values, likely due to changes in the Python program: [<tf.Variable 'Variable:0' shape=() dtype=int32, numpy=1>]

In [None]:
from tf_transformers.text import TextDecoder
decoder = TextDecoder(model=model_ar, 
                     decoder_start_token_id=tokenizer.cls_token_id, 
                     input_type_ids=0)


In [55]:
# Predictions
import tqdm
eval_dataset, total_eval_examples   = read_tfrecord(tfrecord_dir + 'eval', 16,  shuffle=False, drop_remainder=False)

predicted_summaries = []
for (batch_inputs, batch_labels) in tqdm.tqdm(eval_dataset):
    del batch_inputs['decoder_input_ids']
    decoder_outputs = decoder.decode(batch_inputs, mode='beam', 
                                     num_beams=4,
                                                                      alpha=2.0,

                                     do_sample=False,
                                     max_iterations=64, eos_id=tokenizer.sep_token_id)
    predicted_ids = [item[0] for item in decoder_outputs['predicted_ids'].numpy().tolist()]
    
    predicted_ids_sliced = []
    for p_id in predicted_ids:
        if tokenizer.sep_token_id in p_id:
            index = p_id.index(tokenizer.sep_token_id)
            p_id = p_id[:index]
        predicted_ids_sliced.append(p_id)
        predicted_summaries.append(tokenizer.decode(p_id))
    

572it [1:52:44, 11.83s/it]


KeyboardInterrupt: 

In [56]:
predicted_summaries

['A US court has ordered the US to pay $1bn (£2.5m) to the US to help the US-based company the company to pay its own service.',
 'England will not be held for the second Test series against South Africa at the end of the season.',
 'A French man has died after being held in the Republic of Ireland in the Republic of Ireland.',
 'Plans for a new £1.5m project to build a new £1m project to help the Scottish Borders.',
 'The number of people in Scotland has fallen by the Scottish government, according to a new report.',
 'The United Nations has said it will not be a new deal with the US and China.',
 'A lorry has been rescued from a car after a car crashed into the road in the Borders.',
 'The government has said it will not be a £2.5m of funding for the government in the next three years.',
 'A football fan has been arrested in connection with the Scottish Cup of Nations match against Dundee.',
 'The Welsh Government has said it will not be able to leave the EU if the EU is a "unprecede

In [19]:
model_ar.load_checkpoint(model_checkpoint_dir)


Two checkpoint references resolved to different objects (<tf_transformers.models.encoder_decoder.encoder_decoder.EncoderDecoder object at 0x7f030abb01c0> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7f02903eb130>).



Two checkpoint references resolved to different objects (<tf_transformers.models.encoder_decoder.encoder_decoder.EncoderDecoder object at 0x7f030abb01c0> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7f02903eb130>).
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/roberta2robera_xsum2/ckpt-3


In [42]:
import numpy as  np
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
    
with open("{}/history.json".format(model_checkpoint_dir), "w") as f:
    json.dump(str(history),f, indent=2)

In [38]:
import shutil
model_checkpoint_dir = "/tmp/roberta2robera_pubmed_short/"
shutil.rmtree(model_checkpoint_dir)
# shutil.rmtree(tfrecord_dir)