In [5]:
import os

os.environ['NO_PROXY'] = '169.254.169.254'

os.environ['HTTP_PROXY'] = '10.239.228.20:8000'

os.environ['HTTPS_PROXY'] = '10.239.228.20:8000'

!cat /etc/resolv.conf

!cat ~/.wgetrcb

!echo "use_proxy=yes\nhttp_proxy=http.proxy.fmr.com:8000\nhttps_proxy=http.proxy.fmr.com:8000" > ~/.wgetrc

 

#cat ~/.wgetrc

!echo $HTTP_PROXY

!echo $HTTPS_PROXY

nameserver 172.16.0.10
search fmr-a642163.svc.gpu-cluster.local svc.gpu-cluster.local gpu-cluster.local fmr.com
options ndots:5
cat: /home/jovyan/.wgetrcb: No such file or directory
10.239.228.20:8000
10.239.228.20:8000


In [6]:
import sys
sys.path.append("/home/jovyan/TF_NEW/tf-transformers/src/")


In [7]:
import os
import tempfile
import json
import glob
import datasets
import shutil
import tensorflow as tf

from tf_transformers.data import TFReader, TFWriter
from tf_transformers.models import RobertaModel, EncoderDecoder
from tf_transformers.losses import cross_entropy_loss, cross_entropy_loss_label_smoothing

from transformers import RobertaTokenizer

In [8]:
# Convert data to features using specific length
# into a temp dir (and log it as well for monitoring)

def get_dataset(data, batch_size, tokenizer, encoder_max_length, decoder_max_length, mode, tfrecord_dir, take_sample=False):
    
    if mode not in ["train", "eval"]:
        raise ValueError("Inavlid mode `{}` specified. Available mode is ['train', 'eval']".format(mode))
    
    def get_tfrecord_example(data):
        result = {}
        for f in data:
            input_ids = [tokenizer.cls_token] + tokenizer.tokenize(f['article'])[: encoder_max_length-2] + [tokenizer.sep_token] # -2 to add CLS and SEP
            input_ids = tokenizer.convert_tokens_to_ids(input_ids)
            input_mask = [1] * len(input_ids)
            input_type_ids = [0] * len(input_ids)

            decoder_input_ids = [tokenizer.cls_token] + tokenizer.tokenize(f['abstract'])[: decoder_max_length-2] + [tokenizer.sep_token]
            decoder_input_ids = tokenizer.convert_tokens_to_ids(decoder_input_ids)
            decoder_input_type_ids = [0] * len(decoder_input_ids)

            result = {}
            result['encoder_input_ids'] = input_ids
            result['encoder_input_mask'] = input_mask
            result['encoder_input_type_ids'] = input_type_ids
            result['decoder_input_ids'] = decoder_input_ids[:-1] # except last word
            result['decoder_input_type_ids'] = decoder_input_type_ids[:-1] # except last word

            result['labels'] = decoder_input_ids[1:] # not including first word
            result['labels_mask'] = [1] * len(decoder_input_ids[1:])

            # Decoder doesnt need input_mask because by default decoder has causal mask mode

            yield result

    schema = {
        "encoder_input_ids": ("var_len", "int"),
        "encoder_input_mask": ("var_len", "int"),
        "encoder_input_type_ids": ("var_len", "int"),
        "decoder_input_ids": ("var_len", "int"),
        "decoder_input_type_ids": ("var_len", "int"),
        "labels": ("var_len", "int"),
        "labels_mask": ("var_len", "int"),
    }
    
    # Create a temp dir
    if mode == "train":
        # Write tf records
        train_data_dir = os.path.join(tfrecord_dir,"train")        
        tfrecord_filename = 'pubmed'
        tfwriter = TFWriter(schema=schema, 
                            file_name=tfrecord_filename, 
                            model_dir=train_data_dir,
                            tag='train',
                            overwrite=False
                     )
        data_train = data['train']
        # Take sample
        if take_sample:
            data_train = data_train.select(range(500))
            
        tfwriter.process(parse_fn=get_tfrecord_example(data_train))
        
        # Read tfrecord to dataset
        schema = json.load(open("{}/schema.json".format(train_data_dir)))
        stats  = json.load(open('{}/stats.json'.format(train_data_dir)))
        all_files = glob.glob("{}/*.tfrecord".format(train_data_dir))
        tf_reader = TFReader(schema=schema, 
                            tfrecord_files=all_files)

        x_keys = ['encoder_input_ids', 'encoder_input_type_ids', 'encoder_input_mask', 'decoder_input_ids', 'decoder_input_type_ids']
        y_keys = ['labels', 'labels_mask']
        train_dataset = tf_reader.read_record(auto_batch=True, 
                                           keys=x_keys,
                                           batch_size=batch_size, 
                                           x_keys = x_keys, 
                                           y_keys = y_keys,
                                           shuffle=True, 
                                           drop_remainder=True
                                          )
        return train_dataset, stats['total_records']
    if mode == "eval":
        # Write tfrecords
        eval_data_dir = os.path.join(tfrecord_dir,"eval")
        tfrecord_filename = 'pubmed'
        tfwriter = TFWriter(schema=schema, 
                            file_name=tfrecord_filename, 
                            model_dir=eval_data_dir,
                            tag='dev',
                            overwrite=False
                            )
        data_eval = data['validation']
        # Take sample
        if take_sample:
            data_eval = data_eval.select(range(500))
        tfwriter.process(parse_fn=get_tfrecord_example(data_eval))
        
        
        # Read tfrecord to dataset
        schema = json.load(open("{}/schema.json".format(eval_data_dir)))
        stats  = json.load(open('{}/stats.json'.format(eval_data_dir)))
        all_files = glob.glob("{}/*.tfrecord".format(eval_data_dir))
        tf_reader = TFReader(schema=schema, 
                            tfrecord_files=all_files)

        x_keys = ['encoder_input_ids', 'encoder_input_type_ids', 'encoder_input_mask', 'decoder_input_ids', 'decoder_input_type_ids']
        y_keys = ['labels', 'labels_mask']
        eval_dataset = tf_reader.read_record(auto_batch=True, 
                                           keys=x_keys,
                                           batch_size=batch_size, 
                                           x_keys = x_keys, 
                                           y_keys = y_keys,
                                           shuffle=False, 
                                           drop_remainder=False
                                          )
        return eval_dataset, stats['total_records']

In [9]:
# Load model

def get_model():
    encoder = RobertaModel.from_pretrained("roberta-base", return_layer=True)
    decoder = RobertaModel.from_pretrained("roberta-base",use_decoder=True, return_layer=True, mask_mode='causal')

    # Assign all possible encoder variables to decoder
    encoder_dict = {var.name: var for var in encoder.variables}
    assigned_counter = 0
    for var in decoder.variables:
        if var.name in encoder_dict:
            var.assign(encoder_dict[var.name])
            assigned_counter += 1
    print("Assigned {} variables from encoder to decoder .".format(assigned_counter))
    del encoder_dict
    print("ENncoder variables {} and Decoder variables {}".format(len(encoder.variables), len(decoder.variables)))
    model = EncoderDecoder(encoder=encoder, decoder=decoder, share_embeddings=True) 
    model = model.get_model()
    print("Model variables {}".format(len(model.variables)))

    del encoder
    del decoder
    
    return model

# Load Tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [10]:

# Data specific configuration
encoder_max_seq_length = 512
decoder_max_seq_length = 64

take_sample = False
train_batch_size = 64
eval_batch_size  = 32

# Trainer specifics
device = "gpu"
num_gpus = 2
tpu_address = None
dtype = "fp32"
epochs = 3
strategy = "mirrored"

# Optimizer
learning_rate = 1e-5
loss_type = None
return_all_layer_outputs = False
if loss_type and loss_type == 'joint':
    return_all_layer_outputs = True

# Core data specifics
data_name = "scientific_papers"
#num_classes = cfg.glue.data.num_classes

# Model specific
is_training = True
use_dropout = True

In [11]:
# Autoregressive model

encoder = RobertaModel.from_pretrained("roberta-base", return_layer=True)
decoder = RobertaModel.from_pretrained("roberta-base", mask_mode='causal', use_decoder=True, use_auto_regressive=True, return_layer=True)
# Assign all possible encoder variables to decoder
encoder_dict = {var.name: var for var in encoder.variables}
assigned_counter = 0
for var in decoder.variables:
    if var.name in encoder_dict:
        var.assign(encoder_dict[var.name])
        assigned_counter += 1
print("Assigned {} variables from encoder to decoder .".format(assigned_counter))
del encoder_dict
print("ENncoder variables {} and Decoder variables {}".format(len(encoder.variables), len(decoder.variables)))
model_ar = EncoderDecoder(encoder=encoder, decoder=decoder, share_embeddings=True) 
print("Model variables {}".format(len(model_ar.variables)))

del encoder
del decoder

# Important
model_ar = model_ar.get_model()

You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/roberta-base/ckpt-1
You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.



Two checkpoint references resolved to different objects (<tf_transformers.models.roberta.roberta.RobertaEncoder object at 0x7f2646e94250> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7f259c198d30>).



Two checkpoint references resolved to different objects (<tf_transformers.models.roberta.roberta.RobertaEncoder object at 0x7f2646e94250> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7f259c198d30>).
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/roberta-base/ckpt-1


Assigned 132 variables from encoder to decoder .
ENncoder variables 204 and Decoder variables 324
Model variables 525


In [12]:
import datasets
import tqdm
from tf_transformers.text import TextDecoder

class RougeCallback():
    
    def __init__(self, model, eval_dataset, original_summaries, tokenizer, eos_id, decoder_start_id, max_iterations):
        
        self.model = model
        self.eval_dataset = eval_dataset
        self.original_summaries = original_summaries
        self.tokenizer = tokenizer
        self.eos_id = eos_id
        self.decoder_start_id = decoder_start_id
        self.max_iterations = max_iterations
        
    def __call__(self, train_kwargs):
        
        self.model.set_weights(train_kwargs['model'].get_weights())
        decoder = TextDecoder(self.model, decoder_start_token_id=self.decoder_start_id, input_type_ids=0)
        
        # Predictions
        predicted_summaries = []
        for (batch_inputs, batch_labels) in tqdm.tqdm(eval_dataset):
            del batch_inputs['decoder_input_ids']
            decoder_outputs = decoder.decode(batch_inputs, mode='greedy', max_iterations=self.max_iterations, eos_id=self.eos_id)
            predicted_ids = [item[0] for item in decoder_outputs['predicted_ids'].numpy().tolist()]

            predicted_ids_sliced = []
            for p_id in predicted_ids:
                if self.eos_id in p_id:
                    index = p_id.index(eos_token)
                    p_id = p_id[:index]
                predicted_ids_sliced.append(p_id)
                predicted_summaries.append(self.tokenizer.decode(p_id))
                
        rouge = datasets.load_metric("rouge")
        rouge_output2 = rouge.compute(predictions=predicted_summaries, references=self.original_summaries, rouge_types=["rouge2"])["rouge2"].mid
        rouge_output1 = rouge.compute(predictions=predicted_summaries, references=self.original_summaries, rouge_types=["rouge1"])["rouge1"].mid
        rouge_outputL = rouge.compute(predictions=predicted_summaries, references=self.original_summaries, rouge_types=["rougeL"])["rougeL"].mid

        rouge2 = {'rouge2_precision': rouge_output2.precision,
                  'rouge2_recall': rouge_output2.recall,
                  'rouge2_f1': rouge_output2.fmeasure}
        rouge2['rouge1_precision'] = rouge_output1.precision
        rouge2['rouge1_recall'] = rouge_output1.recall
        rouge2['rouge1_f1'] = rouge_output1.fmeasure

        rouge2['rougeL_precision'] = rouge_outputL.precision
        rouge2['rougeL_recall'] = rouge_outputL.recall
        rouge2['rougeL_f1'] = rouge_outputL.fmeasure
        return rouge2


In [13]:
# Load dataset
from datasets import load_dataset
dataset = load_dataset("scientific_papers", "pubmed")



In [15]:
# Load TFrecords
tfrecord_dir = tempfile.mkdtemp()

train_dataset, total_train_examples = get_dataset(dataset, train_batch_size,tokenizer, encoder_max_seq_length, decoder_max_seq_length, "train", tfrecord_dir, take_sample)
eval_dataset, total_eval_examples  = get_dataset(dataset, eval_batch_size,tokenizer, encoder_max_seq_length, decoder_max_seq_length, "eval", tfrecord_dir, take_sample)

original_summaries = [item['abstract'] for item in dataset['validation']]
callback = RougeCallback( model_ar, eval_dataset, original_summaries,
                         tokenizer, tokenizer.sep_token_id, tokenizer.cls_token_id, decoder_max_seq_length)

INFO:absl:Total individual observations/examples written is 119924 in 1213.9509325027466 seconds
INFO:absl:All writer objects closed
INFO:absl:Total individual observations/examples written is 6633 in 68.34936857223511 seconds
INFO:absl:All writer objects closed


In [16]:
# Load optimizer fn

from tf_transformers.optimization import create_optimizer
def get_optimizer(learning_rate, examples, batch_size, epochs, learning_rate_type="polynomial"):
    steps_per_epoch = int(examples / batch_size)
    num_train_steps = steps_per_epoch * epochs
    warmup_steps = int(0.1 * num_train_steps)
    
    def optimizer_fn():
        optimizer, learning_rate_fn = create_optimizer(learning_rate,
                                                   num_train_steps,
                                                   num_train_steps, 
                                                      learning_rate_type=learning_rate_type)
        return optimizer
    return optimizer_fn

optimizer_fn = get_optimizer(learning_rate, total_train_examples, train_batch_size, epochs)

In [17]:
# Loss fn

def get_loss(y_true_dict, y_pred_dict):
    
    loss = cross_entropy_loss(labels=y_true_dict['labels'], 
                                   logits=y_pred_dict['token_logits'], 
                                      label_weights=y_true_dict['labels_mask'])
    return {'loss': loss}

In [18]:
# Load trainer
from tf_transformers.core import GPUTrainer
trainer = GPUTrainer(distribution_strategy=strategy, 
                    num_gpus=num_gpus, 
                    dtype=dtype)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [19]:
model_fn = get_model
train_loss_fn = get_loss

In [None]:
model_checkpoint_dir = "/tmp/roberta2robera_pubmed_short/"
history = trainer.run(
    model_fn = model_fn,
    optimizer_fn = optimizer_fn,
    train_dataset = train_dataset,
    train_loss_fn = train_loss_fn,
    epochs = epochs,
    steps_per_epoch = 100000,
    model_checkpoint_dir=model_checkpoint_dir,
    batch_size=train_batch_size,
    training_loss_names=None,
    validation_loss_names=None,
    validation_dataset=eval_dataset,
    validation_loss_fn=train_loss_fn,
    validation_interval_steps=None,
    steps_per_call=100,
    enable_xla=False,
    callbacks=[callback],
    callbacks_interval_steps=None,
    overwrite_checkpoint_dir=True,
    max_number_of_models=10,
    model_save_interval_steps=None,
    repeat_dataset=False,
    latest_checkpoint=None,
)

INFO:absl:Make sure `steps_per_epoch` should be less than or equal to number of batches in dataset.
INFO:absl:Policy: ----> float32
INFO:absl:Strategy: ---> <tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x7f26478ade80>
INFO:absl:Num GPU Devices: ---> 2
You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/roberta-base/ckpt-1
You are using a model of type roberta to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/roberta-base/ckpt-1


Assigned 204 variables from encoder to decoder .
ENncoder variables 204 and Decoder variables 324


INFO:absl:Using Adamw optimizer
INFO:absl:No checkpoint found in /tmp/roberta2robera_pubmed_short/
Epoch 1/3 --- Step 100/100000 --- :   0%|          | 0/1000 [00:00<?, ?batch /s]

Model variables 525








INFO:tensorflow:batch_all_reduce: 515 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:batch_all_reduce: 515 all-reduces with algorithm = nccl, num_packs = 1






INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).










INFO:tensorflow:batch_all_reduce: 515 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:batch_all_reduce: 515 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:absl:Callbacks in progress at step 100 . . . .
Epoch 1/3 --- Step 200/100000 --- :   0%|          | 1/1000 [02:31<42:05:50, 151.70s/batch , learning_rate=8.99e-8, loss=6.14]INFO:absl:Callbacks in progress at step 200 . . . .
Epoch 1/3 --- Step 300/100000 --- :   0%|          | 2/1000 [03:34<34:40:17, 125.07s/batch , learning_rate=2.68e-7, loss=4.74]INFO:absl:Callbacks in progress at step 300 . . . .
Epoch 1/3 --- Step 400/100000 --- :   0%|          | 3/1000 [04:37<29:29:05, 106.46s/batch , learning_rate=4.46e-7, loss=4.24]INFO:absl:Callbacks in progress at step 400 . . . .
Epoch 1/3 --- Step 500/100000 --- :   0%|          | 4/1000 [05:40<25:50:39, 93.41s/batch , learning_rate=6.24e-7, loss=4.02] INFO:absl:Callbacks in progress at step 500 . . . .
Epoch 1/3 --- Step 600/100000 --- :   0%|          | 5/1000 [06:43<23:17:57, 84.30s/batch , lear

Epoch 1/3 --- Step 4600/100000 --- :   4%|▍         | 45/1000 [48:41<16:42:26, 62.98s/batch , learning_rate=7.92e-6, loss=2.42]INFO:absl:Callbacks in progress at step 4600 . . . .
Epoch 1/3 --- Step 4700/100000 --- :   5%|▍         | 46/1000 [49:44<16:41:57, 63.02s/batch , learning_rate=8.1e-6, loss=2.4]  INFO:absl:Callbacks in progress at step 4700 . . . .
Epoch 1/3 --- Step 4800/100000 --- :   5%|▍         | 47/1000 [50:48<16:41:32, 63.06s/batch , learning_rate=8.28e-6, loss=2.37]INFO:absl:Callbacks in progress at step 4800 . . . .
Epoch 1/3 --- Step 4900/100000 --- :   5%|▍         | 48/1000 [51:51<16:40:06, 63.03s/batch , learning_rate=8.45e-6, loss=2.35]INFO:absl:Callbacks in progress at step 4900 . . . .
Epoch 1/3 --- Step 5000/100000 --- :   5%|▍         | 49/1000 [52:54<16:39:36, 63.07s/batch , learning_rate=8.63e-6, loss=2.36]INFO:absl:Callbacks in progress at step 5000 . . . .
Epoch 1/3 --- Step 5100/100000 --- :   5%|▌         | 50/1000 [53:57<16:37:39, 63.01s/batch , learni

Epoch 1/3 --- Step 9200/100000 --- :   9%|▉         | 91/1000 [1:36:59<15:55:10, 63.05s/batch , learning_rate=0, loss=2.21]INFO:absl:Callbacks in progress at step 9200 . . . .
Epoch 1/3 --- Step 9300/100000 --- :   9%|▉         | 92/1000 [1:38:02<15:54:42, 63.09s/batch , learning_rate=0, loss=2.21]INFO:absl:Callbacks in progress at step 9300 . . . .
Epoch 1/3 --- Step 9400/100000 --- :   9%|▉         | 93/1000 [1:39:06<15:54:25, 63.14s/batch , learning_rate=0, loss=2.24]INFO:absl:Callbacks in progress at step 9400 . . . .
Epoch 1/3 --- Step 9500/100000 --- :   9%|▉         | 94/1000 [1:40:09<15:53:23, 63.14s/batch , learning_rate=0, loss=2.22]INFO:absl:Callbacks in progress at step 9500 . . . .
Epoch 1/3 --- Step 9600/100000 --- :  10%|▉         | 95/1000 [1:41:12<15:52:36, 63.16s/batch , learning_rate=0, loss=2.22]INFO:absl:Callbacks in progress at step 9600 . . . .
Epoch 1/3 --- Step 9700/100000 --- :  10%|▉         | 96/1000 [1:42:15<15:50:07, 63.06s/batch , learning_rate=0, loss=2.

Epoch 1/3 --- Step 13800/100000 --- :  14%|█▎        | 137/1000 [2:25:19<15:06:14, 63.01s/batch , learning_rate=0, loss=2.24]INFO:absl:Callbacks in progress at step 13800 . . . .
Epoch 1/3 --- Step 13900/100000 --- :  14%|█▍        | 138/1000 [2:26:22<15:05:35, 63.03s/batch , learning_rate=0, loss=2.24]INFO:absl:Callbacks in progress at step 13900 . . . .
Epoch 1/3 --- Step 14000/100000 --- :  14%|█▍        | 139/1000 [2:27:25<15:05:00, 63.07s/batch , learning_rate=0, loss=2.24]INFO:absl:Callbacks in progress at step 14000 . . . .
Epoch 1/3 --- Step 14100/100000 --- :  14%|█▍        | 140/1000 [2:28:28<15:03:20, 63.02s/batch , learning_rate=0, loss=2.23]INFO:absl:Callbacks in progress at step 14100 . . . .
Epoch 1/3 --- Step 14200/100000 --- :  14%|█▍        | 141/1000 [2:29:31<15:01:39, 62.98s/batch , learning_rate=0, loss=2.23]INFO:absl:Callbacks in progress at step 14200 . . . .
Epoch 1/3 --- Step 14300/100000 --- :  14%|█▍        | 142/1000 [2:30:34<15:01:09, 63.02s/batch , learnin

In [42]:
import numpy as  np
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
    
with open("{}/history.json".format(model_checkpoint_dir), "w") as f:
    json.dump(str(history),f, indent=2)

In [43]:
import shutil
shutil.rmtree(model_checkpoint_dir)
shutil.rmtree(tfrecord_dir)