In [10]:
import os
import sys
sys.path.append("/mnt/home/TF_NEW/tf-transformers/src/")

In [11]:
!nvidia-smi

Thu Aug 26 09:31:32 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.126.02   Driver Version: 418.126.02   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   34C    P0    43W / 300W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   36C    P0    44W / 300W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------

In [12]:
import tensorflow as tf

from tf_transformers.models import T5Model, EncoderDecoder
from transformers import T5TokenizerFast

In [5]:
from tf_transformers.core import LegacyModel, LegacyLayer
class Long_Model(LegacyLayer):
    """Apply a wrapped model layer to a long input in equal chunks.

    Every input tensor is split into ``num_splits`` pieces along axis 1, each
    set of pieces is passed through ``model_layer`` on its own, the resulting
    ``token_embeddings`` are concatenated back along axis 1 and the
    concatenation is finally passed through a bidirectional GRU.
    """

    def __init__(
        self,
        model_layer,
        num_splits,
        gru_units,
        activation=None,
        is_training=False,
        use_dropout=False,
        **kwargs
    ):
        """Store the wrapped layer, create the GRU and build the layer once.

        Args:
            model_layer: layer to run on each chunk; it must expose ``name``,
                ``_config_dict``, ``_mask_mode``, ``_sequence_length`` and
                ``model_inputs``.
            num_splits: number of chunks each input is split into along axis 1.
            gru_units: units of the GRU wrapped by the bidirectional layer.
            activation: accepted but not used by this class.
            is_training: forwarded to the ``LegacyLayer`` constructor.
            use_dropout: forwarded to the ``LegacyLayer`` constructor.
            **kwargs: forwarded to the ``LegacyLayer`` constructor.
        """
        super().__init__(
            is_training=is_training, use_dropout=use_dropout, name=model_layer.name, **kwargs
        )
        self.model_layer = model_layer
        self.num_splits = num_splits

        recurrent_layer = tf.keras.layers.GRU(
            gru_units, return_sequences=True, name='gru_for_logits', trainable=True
        )
        self.gru_layer = tf.keras.layers.Bidirectional(recurrent_layer)

        # Mirror the attributes of the wrapped layer on this wrapper.
        self._config_dict = model_layer._config_dict
        self._mask_mode = model_layer._mask_mode
        self._sequence_length = model_layer._sequence_length

        # Calling get_model here runs `call` once on symbolic inputs.
        self.model_inputs, self.model_outputs = self.get_model(initialize_only=True)

    def call(self, inputs):
        """Run the wrapped layer chunk by chunk and fuse the outputs with the GRU.

        Args:
            inputs: dict mapping input names to tensors; each tensor is split
                into ``self.num_splits`` pieces along axis 1.

        Returns:
            dict with the single key ``'token_embeddings'`` holding the GRU output.
        """
        # name -> list of `num_splits` pieces
        pieces_by_name = {
            name: tf.split(tensor, self.num_splits, axis=1) for name, tensor in inputs.items()
        }

        chunk_embeddings = []
        for chunk_index in range(self.num_splits):
            chunk_inputs = {name: pieces[chunk_index] for name, pieces in pieces_by_name.items()}
            chunk_outputs = self.model_layer(chunk_inputs)
            chunk_embeddings.append(chunk_outputs['token_embeddings'])

        # Stitch the chunks back together along axis 1 before the GRU.
        stitched = tf.concat(chunk_embeddings, axis=1)
        return {'token_embeddings': self.gru_layer(stitched)}

    def get_model(self, initialize_only=False):
        """Build Keras inputs mirroring the wrapped layer's inputs and call this layer.

        Args:
            initialize_only: when True, return ``(inputs, outputs)`` instead of
                a ``LegacyModel``.

        Returns:
            ``(inputs, layer_output)`` if ``initialize_only`` is True, otherwise
            a ``LegacyModel`` named ``"long_span_selection"`` whose
            ``model_config`` is the wrapped layer's ``_config_dict``.
        """
        symbolic_inputs = {
            name: tf.keras.layers.Input(
                template.shape[1:], batch_size=template.shape[0], name=name, dtype=template.dtype
            )
            for name, template in self.model_layer.model_inputs.items()
        }
        symbolic_outputs = self(symbolic_inputs)
        if initialize_only:
            return symbolic_inputs, symbolic_outputs

        keras_model = LegacyModel(
            inputs=symbolic_inputs, outputs=symbolic_outputs, name="long_span_selection"
        )
        keras_model.model_config = self.model_layer._config_dict
        return keras_model

In [22]:
def get_model(num_splits, gru_units):
    """Return a zero-argument factory that builds the training model.

    Args:
        num_splits: forwarded to ``Long_Model`` as ``num_splits``.
        gru_units: forwarded to ``Long_Model`` as ``gru_units``.

    Returns:
        ``model_fn``; calling it loads "t5-small", takes its encoder and
        decoder layers, instantiates a ``Long_Model`` around the encoder and
        returns ``EncoderDecoder(encoder, decoder).get_model()``.
    """

    def model_fn():
        pretrained = T5Model.from_pretrained("t5-small", return_layer=True)
        encoder, decoder = pretrained._encoder, pretrained._decoder
        del pretrained

        # NOTE(review): `long_model` is constructed here but never handed to
        # EncoderDecoder below, which receives the plain `encoder` -- confirm
        # whether the wrapper was meant to be used as the encoder.
        long_model = Long_Model(encoder, num_splits=num_splits, gru_units=gru_units)

        return EncoderDecoder(encoder=encoder, decoder=decoder).get_model()

    return model_fn
    
    
def get_model_auto_regressive(num_splits=8, gru_units=384):
    """Build the encoder-decoder model with an auto-regressive decoder.

    Loads "t5-small" with ``decoder_kwargs={'use_auto_regressive': True}``,
    takes its encoder and decoder layers, instantiates a ``Long_Model`` around
    the encoder and returns ``EncoderDecoder(encoder, decoder).get_model()``.

    Args:
        num_splits: forwarded to ``Long_Model``. Defaults to 8, the value that
            used to be hard-coded here.
        gru_units: forwarded to ``Long_Model``. Defaults to 384, the value that
            used to be hard-coded here (the original inline comment described
            it as half of the hidden dimension -- TODO confirm for t5-small).

    Returns:
        The model returned by ``EncoderDecoder.get_model()``.
    """
    model = T5Model.from_pretrained(
        "t5-small",
        return_layer=True,
        decoder_kwargs={'use_auto_regressive': True},
    )
    encoder = model._encoder
    decoder = model._decoder
    del model

    # NOTE(review): `long_model` is constructed but never handed to
    # EncoderDecoder below, which receives the plain `encoder` -- confirm
    # whether the wrapper was meant to be used as the encoder.
    long_model = Long_Model(encoder, num_splits=num_splits, gru_units=gru_units)

    model_encoder = EncoderDecoder(encoder=encoder, decoder=decoder)
    return model_encoder.get_model()
    

In [13]:
# Convert data to features using specific length
# into a temp dir (and log it as well for monitoring)

def write_tfrecord(data,
                   batch_size,
                   tokenizer,
                   encoder_max_length,
                   decoder_max_length,
                   mode,
                   tfrecord_dir,
                   take_sample=False,
                   verbose=10000):
    """Tokenize article/abstract pairs and write them as TFRecords.

    Records are written with ``TFWriter`` into ``<tfrecord_dir>/<mode>`` using
    the file name ``'pubmed'`` and ``overwrite=True``.

    Args:
        data: iterable of examples with ``'article'`` and ``'abstract'`` keys.
            When ``take_sample`` is True it must also support ``len`` and
            ``.select(range(...))``.
        batch_size: accepted for interface compatibility; not used here.
        tokenizer: callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'``; must expose ``pad_token_id``.
        encoder_max_length: truncation length for the article.
        decoder_max_length: truncation length for the abstract.
        mode: ``"train"`` or ``"eval"``; selects the sub-directory and the tag.
        tfrecord_dir: parent directory for the output.
        take_sample: when True, only the first 500 examples (or fewer, if the
            dataset is smaller) are written.
        verbose: forwarded to ``TFWriter`` as ``verbose_counter``.

    Raises:
        ValueError: if ``mode`` is not ``"train"`` or ``"eval"``.
    """
    if mode not in ("train", "eval"):
        raise ValueError("Invalid mode `{}` specified. Available mode is ['train', 'eval']".format(mode))

    def get_tfrecord_example(examples):
        """Yield one feature dict per example."""
        for example in examples:
            encoder_features = tokenizer('long summarize: ' + example['article'],
                                         truncation=True,
                                         max_length=encoder_max_length)
            # Drop the final token of the encoded article (the original code
            # comment calls it "sep").
            input_ids = encoder_features['input_ids'][:-1]
            input_mask = encoder_features['attention_mask'][:-1]

            # Fix: truncate with the `decoder_max_length` argument; the
            # previous code read the notebook-global `decoder_max_seq_length`.
            target_ids = tokenizer(example['abstract'],
                                   truncation=True,
                                   max_length=decoder_max_length)['input_ids']

            # Prepend the pad id, then shift by one: the decoder input is
            # everything but the last id, the labels everything but the first.
            shifted_ids = [tokenizer.pad_token_id] + target_ids
            labels = shifted_ids[1:]

            # No decoder input mask is written (the original comment states
            # the decoder uses a causal mask by default).
            yield {
                'encoder_input_ids': input_ids,
                'encoder_input_mask': input_mask,
                'decoder_input_ids': shifted_ids[:-1],
                'labels': labels,
                'labels_mask': [1] * len(labels),
            }

    schema = {
        "encoder_input_ids": ("var_len", "int"),
        "encoder_input_mask": ("var_len", "int"),
        "decoder_input_ids": ("var_len", "int"),
        "labels": ("var_len", "int"),
        "labels_mask": ("var_len", "int"),
    }

    # Train and eval differ only in sub-directory and tag, both equal to `mode`.
    tfwriter = TFWriter(schema=schema,
                        file_name='pubmed',
                        model_dir=os.path.join(tfrecord_dir, mode),
                        tag=mode,
                        overwrite=True,
                        verbose_counter=verbose)

    if take_sample:
        # Cap at the dataset size so small datasets do not fail in `select`.
        data = data.select(range(min(500, len(data))))

    tfwriter.process(parse_fn=get_tfrecord_example(data))
        
def read_tfrecord(tfrecord_dir, max_seq_length, batch_size, shuffle=False, drop_remainder=False):
    """Read the TFRecords in ``tfrecord_dir`` as a batched dataset.

    ``schema.json`` and ``stats.json`` are read from ``tfrecord_dir`` and every
    ``*.tfrecord`` file in it is handed to ``TFReader``.

    Args:
        tfrecord_dir: directory holding ``schema.json``, ``stats.json`` and the
            ``*.tfrecord`` files.
        max_seq_length: padded length for ``encoder_input_ids`` and
            ``encoder_input_mask``; the remaining features use ``[None]``.
        batch_size: forwarded to ``TFReader.read_record``.
        shuffle: forwarded to ``TFReader.read_record``.
        drop_remainder: forwarded to ``TFReader.read_record``.

    Returns:
        ``(dataset, total_records)`` where ``dataset`` is whatever
        ``read_record`` returns and ``total_records`` is
        ``stats['total_records']``.
    """
    padded_shapes = {
        'encoder_input_ids': [max_seq_length, ],
        'encoder_input_mask': [max_seq_length, ],
        'decoder_input_ids': [None, ],
        'labels': [None, ],
        'labels_mask': [None, ],
    }

    # Context managers close the JSON files deterministically.
    with open("{}/schema.json".format(tfrecord_dir)) as schema_file:
        schema = json.load(schema_file)
    with open('{}/stats.json'.format(tfrecord_dir)) as stats_file:
        stats = json.load(stats_file)

    # glob returns files in arbitrary order; sort for a reproducible file list.
    all_files = sorted(glob.glob("{}/*.tfrecord".format(tfrecord_dir)))
    tf_reader = TFReader(schema=schema,
                         tfrecord_files=all_files)

    x_keys = ['encoder_input_ids', 'encoder_input_mask', 'decoder_input_ids']
    y_keys = ['labels', 'labels_mask']
    # NOTE(review): `keys` receives only x_keys while y_keys is passed
    # separately -- confirm against the TFReader.read_record API.
    dataset = tf_reader.read_record(auto_batch=True,
                                    keys=x_keys,
                                    padded_shapes=padded_shapes,
                                    batch_size=batch_size,
                                    x_keys=x_keys,
                                    y_keys=y_keys,
                                    shuffle=shuffle,
                                    drop_remainder=drop_remainder)
    return dataset, stats['total_records']

In [14]:

# --- Data ---
# Truncation lengths used when tokenizing encoder and decoder text.
encoder_max_seq_length = decoder_max_seq_length = 256
take_sample = False
train_batch_size = eval_batch_size = 8

# --- Trainer ---
# Note: strategy, num_gpus, dtype and epochs are assigned again in later cells.
device = "gpu"
num_gpus = 2
tpu_address = None
dtype = "fp32"
epochs = 3
strategy = "mirrored"

# --- Optimizer / loss ---
# Note: learning_rate is overwritten in the optimizer cell further down.
learning_rate = 3e-5
loss_type = None
# All layer outputs are only requested for the 'joint' loss type.
return_all_layer_outputs = loss_type == 'joint'

# --- Core data ---
data_name = "scientific_papers"

# --- Model ---
is_training = use_dropout = False

In [17]:
import glob
import json
from tf_transformers.data import TFWriter, TFReader
# Tokenizer for the "t5-small" checkpoint; later cells pass it to write_tfrecord
# and use it to decode predictions and labels.
tokenizer = T5TokenizerFast.from_pretrained("t5-small")

In [7]:
# Load dataset
# "scientific_papers" with the "pubmed" configuration; the 'train' and
# 'validation' splits are used by the TFRecord cell below.
from datasets import load_dataset
dataset = load_dataset("scientific_papers", "pubmed")

Reusing dataset scientific_papers (/home/jovyan/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)


In [26]:
# Write the train/eval TFRecords, then read them back as batched datasets.
# tfrecord_dir = tempfile.mkdtemp()
# The trailing slash matters: the read calls below build the sub-directory
# paths by plain string concatenation (tfrecord_dir + 'train').
tfrecord_dir = '/tmp/tfrecord_t5_pubmed_long/'

# Train TFRecords (write_tfrecord creates its TFWriter with overwrite=True)
write_tfrecord(dataset['train'], 
               train_batch_size,
               tokenizer, 
               encoder_max_seq_length, 
               decoder_max_seq_length, 
               "train", 
               tfrecord_dir, 
               take_sample, 
               verbose=1000)

# Eval TFRecords, written from the 'validation' split
write_tfrecord(dataset['validation'], 
               eval_batch_size,
               tokenizer, 
               encoder_max_seq_length, 
               decoder_max_seq_length, 
               "eval", 
               tfrecord_dir, 
               take_sample, 
              verbose=1000)



# Read both splits back; only the training dataset is shuffled and drops the
# last partial batch. Each call also returns the record count from stats.json.
train_dataset, total_train_examples = read_tfrecord(tfrecord_dir + 'train', encoder_max_seq_length, train_batch_size, shuffle=True, drop_remainder=True)
eval_dataset, total_eval_examples   = read_tfrecord(tfrecord_dir + 'eval', encoder_max_seq_length,  eval_batch_size,  shuffle=False, drop_remainder=False)

# Disabled ROUGE callback, kept for reference.
# NOTE(review): it reads item['summary'], while write_tfrecord reads
# item['abstract'] -- check the key before re-enabling.
# original_summaries = [item['summary'] for item in dataset['validation']]
# callback = RougeCallback( model_ar, eval_dataset, original_summaries,
#                          tokenizer, tokenizer.sep_token_id, tokenizer.cls_token_id, decoder_max_seq_length)

In [57]:
# Load optimizer fn

from tf_transformers.optimization.adam_weighted import AdamWeightDecay
def get_optimizer(learning_rate, examples, batch_size, epochs, learning_rate_type="polynomial"):
    """Return a zero-argument factory for an ``AdamWeightDecay`` optimizer.

    The optimizer is created with the constant ``learning_rate`` only. No
    warmup or decay schedule is built, so the remaining arguments have no
    effect; they are kept so existing callers continue to work.

    Args:
        learning_rate: learning rate handed to ``AdamWeightDecay``.
        examples: unused (kept for interface compatibility).
        batch_size: unused (kept for interface compatibility).
        epochs: unused (kept for interface compatibility).
        learning_rate_type: unused (kept for interface compatibility).

    Returns:
        ``optimizer_fn``; calling it returns a new ``AdamWeightDecay`` instance.
    """
    # The step/warmup counts previously computed here were never used and
    # could raise ZeroDivisionError for batch_size == 0, so they were removed.

    def optimizer_fn():
        return AdamWeightDecay(learning_rate=learning_rate)

    return optimizer_fn

# Overrides the learning_rate (3e-5) assigned in the configuration cell.
learning_rate = 0.0001
# NOTE(review): learning_rate_type="linear" is passed, but get_optimizer does not
# use it when creating the optimizer -- only learning_rate takes effect.
optimizer_fn = get_optimizer(learning_rate, total_train_examples, train_batch_size, epochs, learning_rate_type="linear")

In [58]:
from tf_transformers.losses import cross_entropy_loss, cross_entropy_loss_label_smoothing

# Loss fn
def get_loss(y_true_dict, y_pred_dict):
    """Compute the label-smoothed cross-entropy training loss.

    Args:
        y_true_dict: dict providing ``'labels'`` and ``'labels_mask'``.
        y_pred_dict: dict providing ``'token_logits'``.

    Returns:
        dict with the single key ``'loss'``.
    """
    targets = y_true_dict['labels']
    logits = y_pred_dict['token_logits']
    weights = y_true_dict['labels_mask']
    smoothed_loss = cross_entropy_loss_label_smoothing(
        labels=targets, logits=logits, label_weights=weights
    )
    return {'loss': smoothed_loss}

In [59]:
# Load trainer
from tf_transformers.core import GPUTrainer
# These three repeat the values already assigned in the configuration cell.
strategy = 'mirrored'
num_gpus = 2
dtype = 'fp32'
# The captured log below shows a MirroredStrategy over GPU:0 and GPU:1.
trainer = GPUTrainer(distribution_strategy=strategy, 
                    num_gpus=num_gpus, 
                    dtype=dtype)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [60]:
# Same value as in the configuration cell.
epochs = 3
# Floor division: a trailing partial batch is not counted as a step.
steps_per_epoch = total_train_examples//train_batch_size
# get_model returns a zero-argument factory; the model itself is only built
# when the factory is called.
model_fn = get_model(num_splits=8, gru_units=384)

In [61]:
# Directory the trainer writes checkpoints to; overwrite_checkpoint_dir=True is
# passed below.
model_checkpoint_dir = "/tmp/model_t5_long/"
# Training only: every validation_* and callbacks* argument is None.
history = trainer.run(
    model_fn = model_fn,
    optimizer_fn = optimizer_fn,
    train_dataset = train_dataset,
    train_loss_fn = get_loss,
    epochs = epochs,
    steps_per_epoch = steps_per_epoch,
    model_checkpoint_dir=model_checkpoint_dir,
    batch_size=train_batch_size,
    training_loss_names=None,
    validation_loss_names=None,
    validation_dataset=None,
    validation_loss_fn=None,
    validation_interval_steps=None,
    steps_per_call=10,
    enable_xla=False,
    callbacks=None,
    callbacks_interval_steps=None,
    overwrite_checkpoint_dir=True,
    max_number_of_models=10,
    model_save_interval_steps=None,
    repeat_dataset=False,
    latest_checkpoint=None,
)

In [64]:
# Inspect what the training run left in the checkpoint directory.
os.listdir(model_checkpoint_dir)

['logs',
 'ckpt-2.index',
 'ckpt-1.index',
 'ckpt-1.data-00000-of-00001',
 'ckpt-2.data-00000-of-00001',
 'checkpoint']

In [11]:
# Same path as in the training cell, assigned again here
# (presumably so the export cells can run without re-running training -- TODO confirm).
model_checkpoint_dir = "/tmp/model_t5_long/"


In [67]:
# Show the most recent checkpoint prefix recorded in the checkpoint directory.
tf.train.latest_checkpoint(model_checkpoint_dir)

'/tmp/model_t5_long/ckpt-2'

In [68]:
# Rebuild the model with the auto-regressive decoder, restore trained weights
# and export it as a serialized module.
# NOTE(review): 'ckpt-1' is requested here, but the captured log below reports
# weights loaded from /tmp/model_t5_long/ckpt-2 -- verify which checkpoint
# load_checkpoint actually restores when both arguments are given.
checkpoint_path = '/tmp/model_t5_long/ckpt-1'
with tf.device('/device:GPU:1'):

    model_ar = get_model_auto_regressive()
    model_ar.load_checkpoint(model_checkpoint_dir, checkpoint_path=checkpoint_path)

    # Save as serialized module
    # NOTE(review): this writes to ".../model_t5_long_serialized_v1/", while the
    # cells below load from "/tmp/model_t5_long_serialized/" -- confirm the path.
    model_ar.save_as_serialize_module("/tmp/model_t5_long_serialized_v1/")


Two checkpoint references resolved to different objects (<tf_transformers.models.encoder_decoder.encoder_decoder.EncoderDecoder object at 0x7f62307e7d00> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7f62307cee80>).



Two checkpoint references resolved to different objects (<tf_transformers.models.encoder_decoder.encoder_decoder.EncoderDecoder object at 0x7f62307e7d00> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7f62307cee80>).
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/t5-small/ckpt-1



Two checkpoint references resolved to different objects (<tf_transformers.models.encoder_decoder.encoder_decoder.EncoderDecoder object at 0x7f65b7d82fa0> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7f62307cee80>).



Two checkpoint references resolved to different objects (<tf_transformers.models.encoder_decoder.encoder_decoder.EncoderDecoder object at 0x7f65b7d82fa0> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7f62307cee80>).
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/model_t5_long/ckpt-2


In [None]:
# Load the serialized model on GPU:0 and wrap it in a TextDecoder for generation.
# NOTE(review): this path differs from the "/tmp/model_t5_long_serialized_v1/"
# directory written by the export cell -- confirm which export is intended.
from tf_transformers.text import TextDecoder
with tf.device('/device:GPU:0'):
    model_pb = tf.saved_model.load("/tmp/model_t5_long_serialized/")
# 0 is tokenizer.pad_token_id's role in write_tfrecord, which prepends the pad id
# to every decoder input -- presumably the same id; TODO confirm.
decoder = TextDecoder(model_pb, decoder_start_token_id=0)

In [21]:
# NOTE(review): this cell is an exact duplicate of the previous one (same import,
# same saved-model path, same TextDecoder arguments); one of the two can be removed.
from tf_transformers.text import TextDecoder
with tf.device('/device:GPU:0'):
    model_pb = tf.saved_model.load("/tmp/model_t5_long_serialized/")
decoder = TextDecoder(model_pb, decoder_start_token_id=0)

In [None]:
# Beam-search decoding over the evaluation set.
# Collects, per example: the full predicted summary, the summary decoded from
# the first 256 predicted ids only, and the reference summary decoded from the labels.
import tqdm
predicted_summaries = []
predicted_summaries_256 = []
original_summaries = []
for (batch_inputs, batch_labels) in tqdm.tqdm(eval_dataset):

    # decoder_input_ids are not passed to decode(); the start token was given
    # when the TextDecoder was created.
    del batch_inputs['decoder_input_ids']
    decoder_outputs = decoder.decode(batch_inputs,
                                     max_iterations=512,
                                     mode='beam',
                                     num_beams=5,
                                     alpha=0.8,
                                     eos_id=1)

    # Slice and convert once and reuse it for both decodes (as the greedy cell
    # does). Index 0 on axis 1 is presumably the best beam -- TODO confirm.
    preds = decoder_outputs['predicted_ids'][:,0,:].numpy()

    predicted_batch_summaries = tokenizer.batch_decode(preds,
                                                       skip_special_tokens=True)
    predicted_summaries.extend(predicted_batch_summaries)

    predicted_batch_summaries_256 = tokenizer.batch_decode(preds[:, :256],
                                                           skip_special_tokens=True)
    predicted_summaries_256.extend(predicted_batch_summaries_256)

    original_batch_summaries = tokenizer.batch_decode(batch_labels['labels'].numpy(), skip_special_tokens=True)
    original_summaries.extend(original_batch_summaries)
    

947it [4:07:00, 14.91s/it]

In [None]:
# Greedy decoding over the evaluation set.
# Re-initialises (and therefore discards) the result lists filled by the
# beam-search cell above.
import tqdm
predicted_summaries = []
predicted_summaries_256 = []
original_summaries = []
# Despite its name this is a list: one decoded length (preds.shape[1]) per batch.
max_length = []
for (batch_inputs, batch_labels) in tqdm.tqdm(eval_dataset):
    
    del batch_inputs['decoder_input_ids'] # We do not need to pass decoder_input_ids , as we provide while initiating
                                          # TextDecoder
    decoder_outputs = decoder.decode(batch_inputs, 
                                 max_iterations=384, 
                                 mode='greedy',
                                 eos_id=1)
    
    # Take index 0 on axis 1 of the predicted ids and convert to numpy once.
    preds = decoder_outputs['predicted_ids'][:,0,:].numpy()
    
    max_length.append(preds.shape[1])
    
    # Full-length prediction.
    predicted_batch_summaries = tokenizer.batch_decode(preds,
                                                       skip_special_tokens=True)
    predicted_summaries.extend(predicted_batch_summaries)
    
    # Prediction decoded from the first 256 ids only.
    predicted_batch_summaries_256 = tokenizer.batch_decode(preds[:, :256],
                                                       skip_special_tokens=True)
    predicted_summaries_256.extend(predicted_batch_summaries_256)
    
    # Reference summaries are recovered by decoding the label ids.
    original_batch_summaries = tokenizer.batch_decode(batch_labels['labels'].numpy(), skip_special_tokens=True)
    original_summaries.extend(original_batch_summaries)
    

In [43]:
from rouge_score import rouge_scorer
from rouge_score import scoring

In [44]:
# ROUGE-1, ROUGE-2 and ROUGE-Lsum with stemming enabled; per-example scores are
# collected by a bootstrap aggregator.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeLsum"], use_stemmer=True)
aggregator = scoring.BootstrapAggregator()

In [46]:
# Score every prediction against its reference; the reference summary is passed
# first and the prediction second. Uses the full-length predictions
# (predicted_summaries), not the 256-id variant.
for (p_summary, o_summary) in tqdm.tqdm(zip(predicted_summaries, original_summaries)):
    score = scorer.score(o_summary, p_summary)
    aggregator.add_scores(score)

6633it [02:22, 46.58it/s]


In [48]:
# Aggregate the collected scores; note this rebinds `score`, which held the
# last per-example result in the scoring loop.
score = aggregator.aggregate()

In [58]:
# Print recall (R), precision (P) and F-measure (F) for each ROUGE type as
# "name,low,mid,high" rows taken from the aggregate's low/mid/high fields.
for k, v in sorted(score.items()):
    print("%s-R,%f,%f,%f\n" %
          (k, v.low.recall, v.mid.recall, v.high.recall))
    print("%s-P,%f,%f,%f\n" %
          (k, v.low.precision, v.mid.precision, v.high.precision))
    print("%s-F,%f,%f,%f\n" %
          (k, v.low.fmeasure, v.mid.fmeasure, v.high.fmeasure))
    print("-------------------------------------------------------")

rouge1-R,0.450247,0.453271,0.456391

rouge1-P,0.350808,0.354154,0.357254

rouge1-F,0.382401,0.385470,0.388405

-------------------------------------------------------
rouge2-R,0.177629,0.180571,0.183634

rouge2-P,0.138939,0.141300,0.143710

rouge2-F,0.151427,0.153809,0.156428

-------------------------------------------------------
rougeLsum-R,0.283254,0.285934,0.288525

rougeLsum-P,0.218282,0.220623,0.222851

rougeLsum-F,0.238778,0.241002,0.243262

-------------------------------------------------------


In [None]:
#### Greedy (ckpt 2) scores

rouge1-R,0.450247,0.453271,0.456391

rouge1-P,0.350808,0.354154,0.357254

rouge1-F,0.382401,0.385470,0.388405

-------------------------------------------------------
rouge2-R,0.177629,0.180571,0.183634

rouge2-P,0.138939,0.141300,0.143710

rouge2-F,0.151427,0.153809,0.156428

-------------------------------------------------------
rougeLsum-R,0.283254,0.285934,0.288525

rougeLsum-P,0.218282,0.220623,0.222851

rougeLsum-F,0.238778,0.241002,0.243262
