In [1]:
import sys
sys.path.append("/mnt/home/TF_NEW/tf-transformers/src/")

In [4]:
import datasets
import json
import os
import glob
import time

from tf_transformers.models import GPT2Model
from transformers import GPT2Tokenizer
from tf_transformers.data.squad_utils_sp import (
    read_squad_examples)
from tf_transformers.data import TFWriter, TFReader, TFProcessor
from tf_transformers.losses import cross_entropy_loss_fast
from tf_transformers.core import optimization, SimpleTrainer
from absl import logging
logging.set_verbosity("INFO")

In [None]:
# Load the pretrained 'gpt2' tokenizer; it is used below to tokenize text and map tokens to ids.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


In [5]:
# Local path of the SQuAD v1.1 training file.
input_file_path = '/mnt/home/PRE_MODELS/HuggingFace_models/datasets/squadv1.1/train-v1.1.json'
is_training = True

# 1. Read the examples and report how long the parsing took.
start_time = time.time()
train_examples = read_squad_examples(input_file=input_file_path,
                                     is_training=is_training,
                                     version_2_with_negative=False)
end_time = time.time()
print(f'Time taken {end_time - start_time}')

Time taken 0.8381352424621582


In [38]:
# Token budgets used as slice limits when building examples in parse_train / parse_dev.
max_passage_length = 384  # passage tokens, counting the separator token appended after the passage
max_question_length = 64  # question tokens
max_answer_length = 40  # answer tokens, counting the end token appended after the answer

def parse_train():
    """Yield one training feature dict per entry of `train_examples`.

    The sequence is laid out as
    ``question tokens + passage tokens + bos_token + answer tokens + bos_token``
    and then shifted by one position for next-token prediction.

    Each yielded dict holds four equally long lists:
        input_ids:   every id of the sequence except the last one.
        input_mask:  1 on question/passage/separator positions, 0 on answer positions.
        labels:      every id of the sequence except the first one.
        labels_mask: 1 where the label is an answer token (or the closing
                     bos_token), 0 elsewhere.

    Reads the notebook globals `train_examples`, `tokenizer`,
    `max_question_length`, `max_passage_length` and `max_answer_length`.
    """
    for example in train_examples:
        # Prompt: truncated question, then truncated passage closed by bos_token
        # (one slot of the passage budget is reserved for that token).
        question_tokens = tokenizer.tokenize('question: ' + example['question_text'])[:max_question_length]
        passage_tokens = tokenizer.tokenize('context: ' + example['paragraph_text'])[:max_passage_length - 1]
        passage_tokens = passage_tokens + [tokenizer.bos_token]
        prompt_ids = tokenizer.convert_tokens_to_ids(question_tokens + passage_tokens)

        # Target: truncated answer closed by bos_token, which acts as the end marker.
        answer_tokens = tokenizer.tokenize(example['orig_answer_text'])[:max_answer_length - 1]
        answer_ids = tokenizer.convert_tokens_to_ids(answer_tokens + [tokenizer.bos_token])

        sequence_ids = prompt_ids + answer_ids
        prompt_flags = [1] * len(prompt_ids) + [0] * len(answer_ids)
        answer_flags = [0] * len(prompt_ids) + [1] * len(answer_ids)

        # Shift by one: the model reads position t and is trained to predict position t + 1.
        yield {
            'input_ids': sequence_ids[:-1],
            'input_mask': prompt_flags[:-1],
            'labels': sequence_ids[1:],
            'labels_mask': answer_flags[1:],
        }
        
# Write the parsed examples to TFRecord files with TFWriter.
# (TFProcessor is the alternative for smaller data.)

# Every feature is a variable-length list of integers.
schema = {
    feature_name: ("var_len", "int")
    for feature_name in ("input_ids", "input_mask", "labels", "labels_mask")
}

tfrecord_train_dir = '../OFFICIAL_TFRECORDS/squad_as_generation/gpt2/train'
tfrecord_filename = 'squad'
tfwriter = TFWriter(
    schema=schema,
    file_name=tfrecord_filename,
    model_dir=tfrecord_train_dir,
    tag='train',
    overwrite=True,
)
# parse_train() is called here, so the writer receives the generator object itself.
tfwriter.process(parse_fn=parse_train())

INFO:absl:Wrote 1000 tfrecods
INFO:absl:Wrote 2000 tfrecods
INFO:absl:Wrote 3000 tfrecods
INFO:absl:Wrote 4000 tfrecods
INFO:absl:Wrote 5000 tfrecods
INFO:absl:Wrote 6000 tfrecods
INFO:absl:Wrote 7000 tfrecods
INFO:absl:Wrote 8000 tfrecods
INFO:absl:Wrote 9000 tfrecods
INFO:absl:Wrote 10000 tfrecods
INFO:absl:Wrote 11000 tfrecods
INFO:absl:Wrote 12000 tfrecods
INFO:absl:Wrote 13000 tfrecods
INFO:absl:Wrote 14000 tfrecods
INFO:absl:Wrote 15000 tfrecods
INFO:absl:Wrote 16000 tfrecods
INFO:absl:Wrote 17000 tfrecods
INFO:absl:Wrote 18000 tfrecods
INFO:absl:Wrote 19000 tfrecods
INFO:absl:Wrote 20000 tfrecods
INFO:absl:Wrote 21000 tfrecods
INFO:absl:Wrote 22000 tfrecods
INFO:absl:Wrote 23000 tfrecods
INFO:absl:Wrote 24000 tfrecods
INFO:absl:Wrote 25000 tfrecods
INFO:absl:Wrote 26000 tfrecods
INFO:absl:Wrote 27000 tfrecods
INFO:absl:Wrote 28000 tfrecods
INFO:absl:Wrote 29000 tfrecods
INFO:absl:Wrote 30000 tfrecods
INFO:absl:Wrote 31000 tfrecods
INFO:absl:Wrote 32000 tfrecods
INFO:absl:Wrote 3

In [44]:
# Read Data

# Load the schema stored alongside the TFRecords. The context manager closes the
# file handle deterministically instead of leaving it open until garbage collection.
with open("{}/schema.json".format(tfrecord_train_dir)) as schema_file:
    schema = json.load(schema_file)
all_files = glob.glob("{}/*.tfrecord".format(tfrecord_train_dir))
tf_reader = TFReader(schema=schema, 
                    tfrecord_files=all_files)

# Features fed to the model (x) and features handed to the loss (y).
x_keys = ['input_ids', 'input_mask']
y_keys = ['labels', 'labels_mask']
batch_size = 16
# NOTE(review): `keys` is given only the x keys while `y_keys` names two further
# features — confirm against TFReader.read_record that the labels are still read.
train_dataset = tf_reader.read_record(auto_batch=True, 
                                   keys=x_keys,
                                   batch_size=batch_size, 
                                   x_keys = x_keys, 
                                   y_keys = y_keys,
                                   shuffle=True, 
                                   drop_remainder=True
                                  )

In [46]:
# Print the first batch only. `batch_inputs` stays bound after the loop and is
# reused by the later `model(batch_inputs)` cell.
for (batch_inputs, batch_labels) in train_dataset:
    print(batch_inputs, batch_labels)
    break

{'decoder_input_ids': <tf.Tensor: shape=(8, 41), dtype=int32, numpy=
array([[    0,   250,   693,    18,   809,    34,    57,   303,  1025,
           10, 29957,   790,    11, 16612, 33897,     4,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0],
       [    0, 18187,   677,  8743,  1638,  4834,    40,   310,  3350,
          118, 20486,   967,  6249,    11,   395,    18,  2561,  2117,
          507,    71,   258,   439,   149,    11,  1359,  3880,     4,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0],
       [    0,  9497,  3219,    10,  2997,   891,    18,  3257,    11,
            5,  3497,     9,  2487,    32,  8959,     5,   693,    18,
          744,    25,  1900,     8,  3219,     5,  3302,    14,    79,
           21,     5,  1802,     9,    10,  1900,    12,  9228

In [5]:
import tensorflow as tf

# Build GPT-2 with mask_mode='prefix' and restore weights from a local checkpoint directory.
# NOTE(review): presumably 'prefix' makes the model honour the `input_mask` produced by
# parse_train (1 on the prompt, 0 on the answer) — confirm against tf_transformers.
model_layer, model, config = GPT2Model(model_name='gpt2', mask_mode = 'prefix', return_all_layer_token_embeddings=False)
model.load_checkpoint("/mnt/home/PRE_MODELS/LegacyAI_models/checkpoints/gpt2/")

INFO:absl:Overwride mask_mode with user_defined
INFO:absl:Initialized Variables
INFO:absl:Overwride mask_mode with causal
INFO:absl:Initialized Variables
INFO:absl:Succesful: Model checkpoints matched
INFO:absl:Encoder loaded succesfully from /mnt/home/PRE_MODELS/LegacyAI_models/checkpoints/roberta-base/
INFO:absl:Warm started decoder 197/202 variables
INFO:absl:Inputs -->
INFO:absl:encoder_input_ids ---> Tensor("input_ids:0", shape=(None, None), dtype=int32)
INFO:absl:encoder_input_mask ---> Tensor("input_mask:0", shape=(None, None), dtype=int32)
INFO:absl:encoder_input_type_ids ---> Tensor("input_type_ids:0", shape=(None, None), dtype=int32)
INFO:absl:decoder_input_ids ---> Tensor("decoder_input_ids:0", shape=(None, None), dtype=int32)
INFO:absl:decoder_input_type_ids ---> Tensor("decoder_input_type_ids:0", shape=(None, None), dtype=int32)
INFO:absl:Initialized Variables
INFO:absl:Inputs -->
INFO:absl:encoder_input_ids ---> Tensor("input_ids:0", shape=(None, None), dtype=int32)
INFO:

In [48]:
# Forward pass on the batch left bound by the dataset preview loop.
model_outputs = model(batch_inputs)

In [53]:
def cross_entropy_loss_label_smoothing(labels, logits, smoothing=0.1, label_weights=None):
    """Label-smoothed softmax cross entropy, averaged over the weighted positions.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is spread evenly over the other ``vocab_size - 1``
    classes. The entropy of that soft target distribution (the lowest value
    the cross entropy can reach) is subtracted from every position's loss.

    Args:
        labels: integer class ids of shape (...), one rank lower than `logits`.
        logits: unnormalised scores of shape (..., vocab_size).
        smoothing: probability mass moved away from the true class.
        label_weights: optional weights with the same shape as `labels`;
            when None every position gets weight 1.

    Returns:
        Scalar tensor: sum(weight * loss) / sum(weight), or 0 when the
        weights sum to 0 (``tf.math.divide_no_nan``).
    """
    confidence = 1.0 - smoothing
    vocab_size = tf.shape(logits)[-1]
    vocab_float = tf.cast(vocab_size - 1, tf.float32)
    low_confidence = (1.0 - confidence) / vocab_float
    soft_targets = tf.one_hot(
        labels,
        depth=vocab_size,
        on_value=confidence,
        off_value=low_confidence)
    xentropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=soft_targets)
    # Calculate the best (lowest) possible value of cross entropy, and
    # subtract from the cross entropy loss. Both logs are guarded with a tiny
    # epsilon so smoothing == 0.0 and smoothing == 1.0 give 0 * finite instead
    # of 0 * -inf = NaN.
    normalizing_constant = -(
        confidence * tf.math.log(confidence + 1e-20) + vocab_float *
        low_confidence * tf.math.log(low_confidence + 1e-20))
    xentropy -= normalizing_constant
    if label_weights is None:
        label_weights = tf.ones_like(labels)
    per_example_loss = xentropy * tf.cast(label_weights, xentropy.dtype)
    numerator = tf.reduce_sum(per_example_loss)
    # Sum the weights once and cast to the loss dtype before dividing.
    denominator = tf.cast(tf.reduce_sum(label_weights), numerator.dtype)
    loss = tf.math.divide_no_nan(numerator, denominator)
    return loss

def lm_loss(y_true_dict, y_pred_dict, smoothing=0.1):
    """Label-smoothed language-model loss on dict-shaped targets and predictions.

    Args:
        y_true_dict: mapping with 'labels' (target ids) and 'labels_mask'
            (per-position weights).
        y_pred_dict: mapping with 'token_logits'.
        smoothing: forwarded unchanged to the label-smoothing loss.

    Returns:
        Whatever `cross_entropy_loss_label_smoothing` returns for these inputs.
    """
    target_ids = y_true_dict['labels']
    token_logits = y_pred_dict['token_logits']
    position_weights = y_true_dict['labels_mask']
    return cross_entropy_loss_label_smoothing(
        labels=target_ids,
        logits=token_logits,
        smoothing=smoothing,
        label_weights=position_weights,
    )


In [57]:
# Optimizer from tf_transformers' optimization module, learning rate fixed at 1e-4.
optimizer = optimization.AdamWeightDecay(learning_rate=0.0001)

INFO:absl:using Adamw optimizer


In [60]:
# Keras Fit

# The key is the same 'token_logits' entry that lm_loss reads from the predictions.
keras_loss_fn = {'token_logits': lm_loss}
model.compile2(
    optimizer=optimizer,
    loss=None,
    custom_loss=keras_loss_fn,
    run_eagerly=False,
)
# Short run: 2 epochs of 5 steps each.
history = model.fit(train_dataset, epochs=2, steps_per_epoch=5)

Epoch 1/2
















Epoch 2/2


In [None]:
# Number of training examples assumed when deriving step counts.
train_data_size = 87000
steps_per_epoch = int(train_data_size / batch_size)
EPOCHS = 4

# Custom training
# The dataset is repeated EPOCHS + 1 times; presumably so the trainer never
# runs out of batches before the last epoch finishes (marked important by the author).
history2 = SimpleTrainer(
    model=model,
    optimizer=optimizer,
    loss_fn=lm_loss,
    dataset=train_dataset.repeat(EPOCHS + 1),  # This is important
    epochs=EPOCHS,
    num_train_examples=train_data_size,
    batch_size=batch_size,
    steps_per_call=100,
)

# Persist the fine-tuned weights; an existing checkpoint at this path is overwritten.
model_save_dir = "../OFFICIAL_MODELS/squad_as_generation/gpt2_prefix"
model.save_checkpoint(model_save_dir, overwrite=True)

In [None]:
# Hand-rolled greedy decoding over the first 32 batches of `dev_dataset`.
# NOTE(review): `dev_dataset` is only created in a later cell of this notebook, so
# this cell fails on a fresh top-to-bottom run.
# NOTE(review): `[[p_ids]]` has shape (1, 1), so the concat below presumably expects
# batches holding a single example — confirm against how `dev_dataset` is batched.
batch_counter = 0
top_32_results = []
for batch_inputs in dev_dataset:
    predicted_ids = []
    for i in range(30):  # generate at most 30 tokens per batch
        result = model(batch_inputs)
        # Greedy choice: argmax over the first row of 'last_token_logits'.
        p_ids = tf.cast(tf.argmax(result['last_token_logits'][0]), tf.int32)
        predicted_ids.append(p_ids.numpy())
        # Append the chosen id so the next forward pass sees it as input.
        batch_inputs['input_ids'] = tf.concat([batch_inputs['input_ids'], [[p_ids]]], axis=1)
        #batch_inputs['input_mask'] = tf.concat([batch_inputs['input_mask'], [[1]]], axis=1)
        # bos_token_id doubles as the end-of-answer marker (it closes the answer in parse_train).
        if p_ids.numpy() == tokenizer.bos_token_id:
            print("Found")
            break
    top_32_results.append(tokenizer.decode(predicted_ids, skip_special_tokens=True))
    batch_counter += 1
    if batch_counter == 32:
        break

        


In [None]:
# Load the model by disabling dropout and add pipeline_mode = 'auto-regressive'
# Rebinds `model_layer`, `model` and `config` to an inference-time model
# (is_training=False) and restores the weights saved under `model_save_dir`.

import tensorflow as tf

model_layer, model, config = GPT2Model(model_name='gpt2',
                                       mask_mode='prefix',
                                      return_all_layer_token_embeddings=False,
                                      is_training=False, pipeline_mode='auto-regressive')
model.load_checkpoint(model_save_dir)

In [None]:
# Export the inference model as a serialized module, then load it back with TensorFlow.
saved_model_dir = "{}/saved_model".format(model_save_dir)
model.save_as_serialize_module(saved_model_dir, overwrite=True)
loaded = tf.saved_model.load(saved_model_dir)

In [None]:
# Local path of the SQuAD v1.1 dev file.
dev_input_file_path = '/mnt/home/PRE_MODELS/HuggingFace_models/datasets/squadv1.1/dev-v1.1.json'
is_training = False

# Read the dev examples and report how long the parsing took.
start_time = time.time()
dev_examples = read_squad_examples(input_file=dev_input_file_path,
                                   is_training=is_training,
                                   version_2_with_negative=False)
end_time = time.time()
print(f'Time taken {end_time - start_time}')

def parse_dev():
    """Yield prompt-only model inputs for every entry of `dev_examples`.

    The prompt is ``question tokens + passage tokens + bos_token``, built with
    the same prefixes and truncation limits as the training data; no answer
    tokens are appended.

    Each yielded dict holds:
        input_ids:  ids of the prompt.
        input_mask: a list of 1s with the same length as `input_ids`.

    Reads the notebook globals `dev_examples`, `tokenizer`,
    `max_question_length` and `max_passage_length`.
    """
    for example in dev_examples:
        question_tokens = tokenizer.tokenize('question: ' + example['question_text'])[:max_question_length]
        # One slot of the passage budget is reserved for the closing bos_token.
        passage_tokens = tokenizer.tokenize('context: ' + example['paragraph_text'])[:max_passage_length - 1]
        prompt_ids = tokenizer.convert_tokens_to_ids(
            question_tokens + passage_tokens + [tokenizer.bos_token])
        yield {
            'input_ids': prompt_ids,
            'input_mask': [1] * len(prompt_ids),
        }
        
# Build the dev dataset from the parse_dev() generator with TFProcessor,
# then batch it 32 examples at a time.
tf_processor = TFProcessor()
dev_dataset = tf_processor.process(parse_fn=parse_dev())
dev_dataset = tf_processor.auto_batch(dev_dataset, batch_size=32)

In [None]:
def split_by_id(predicted_ids, eos_id):
    """Truncate every id sequence just before its first `eos_id`.

    Args:
        predicted_ids: iterable of per-example id lists.
        eos_id: id that marks the end of a generated sequence.

    Returns:
        A list with one entry per example: the ids that precede the first
        `eos_id`, or the complete sequence when `eos_id` does not occur.
    """
    all_ids = []
    for per_example_id in predicted_ids:
        try:
            index = per_example_id.index(eos_id)
        except ValueError:
            # No EOS was generated: keep the whole sequence. Falling back to
            # index -1 would silently drop the last predicted token.
            index = len(per_example_id)
        all_ids.append(per_example_id[:index])
    return all_ids

In [None]:



from tf_transformers.text import TextDecoder

# Decoder built on the reloaded SavedModel ...
decoder = TextDecoder(model=loaded, 
                      input_mask_ids=1,
                            )
# OR if keras.model
# ... immediately replaced by one built on the Keras model, so the loop below
# always decodes with `model`.
decoder = TextDecoder(model=model, 
                            input_mask_ids=1
                            )

batch_counter = 0
start_time = time.time()
predicted_answers = []
for batch_inputs in dev_dataset:
    # -1 at every position whose input_mask is 0, 0 elsewhere.
    padded_mask = tf.cast(tf.equal(batch_inputs['input_mask'], 0), tf.int32) * -1
    # NOTE(review): this turns padded ids into -1 only if padding uses id 0 — confirm.
    batch_inputs['input_ids'] = batch_inputs['input_ids'] + padded_mask # we need -1 (not 0) for padded positions
    model_outputs = decoder.decode(batch_inputs, 
                   mode='greedy', 
                   max_iterations=40,
                   do_sample=False,
                   eos_id=tokenizer.bos_token_id)

    # Take index 0 along the second axis of 'predicted_ids' for every example.
    predicted_ids = model_outputs['predicted_ids'][:, 0, :].numpy().tolist()
    # Cut each prediction at the end marker before turning it back into text.
    predicted_ids_sliced = split_by_id(predicted_ids, tokenizer.bos_token_id)
    predicted_text = [tokenizer.decode(p_ids, skip_special_tokens=True) for p_ids in predicted_ids_sliced]

    predicted_answers.extend(predicted_text)
    batch_counter += 1
    # 32 is the batch size used when `dev_dataset` was batched.
    print("batch {}/{}".format(batch_counter, len(dev_examples)//32))

end_time = time.time()
print("Time taken is {}".format(end_time-start_time))

# Read the raw dev file for scoring; the context manager closes the file handle
# instead of leaking it.
with open(dev_input_file_path) as dev_file:
    squad_dev_data = json.load(dev_file)['data']
# Predictions are matched to examples by position, so `predicted_answers` must be
# in the same order as `dev_examples`.
qas_id_answer  = {item['qas_id']: predicted_answers[i] for(i, item) in enumerate(dev_examples)}
# NOTE(review): `evaluate_v1` is not defined or imported anywhere in the visible
# notebook — it must be brought into scope before this line can run.
eval_results = evaluate_v1(squad_dev_data, qas_id_answer)

# {'exact_match': 47.52128666035951, 'f1': 63.22016537129672}