In [1]:
import sys

sys.path.append("/Users/PRVATE/Documents/tf_transformers/src/")

In [3]:
import tensorflow as tf
import tensorflow_hub as hub

from transformers import TFGPT2Model
from tf_transformers.models import GPT2Encoder

from tf_transformers.core import LegacyModule

from tf_transformers.utils import convert_gpt2_hf_to_tf_transformers
import json
import time
import os

# Load HuggingFace Model

We will load the HuggingFace model and use its weights to assign the variables of the tf_transformers model.


We will be using `convert_gpt2_hf_to_tf_transformers` function.


In [12]:
# Load the pretrained HuggingFace GPT-2 model whose weights are copied into
# the tf_transformers model further down.
# Always clear the Keras session first.
tf.keras.backend.clear_session()

# Directory holding a local copy of the HF model. Set it to "" (or None) to
# pass the bare model name to `from_pretrained` instead.
local_dir = "/Users/PRVATE/HUggingFace_Models/"
hf_model_name = "gpt2"
# Always bind hf_model_location: previously an empty local_dir left it
# undefined and the next line failed with a NameError.
hf_model_location = os.path.join(local_dir, hf_model_name) if local_dir else hf_model_name

model_hf = TFGPT2Model.from_pretrained(hf_model_location)

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at /Users/PRVATE/HUggingFace_Models/gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


# Load tf_transformers Model

Configs are in the `model_configs` folder in the root of the repo.


We will be using the `convert_gpt2_hf_to_tf_transformers` function.

Always use `is_training=False` to load the model and pass this model for conversion.

Do not enable `pipeline_mode='auto-regressive'` while converting, because the variable names

differ due to `tf.cond` usage.

# Steps:

1. Load a model using **`is_training=False`**

2. Convert it using conversion functions from `tf_transformers.utils`

3. Save the `checkpoint` .

4. For auto-regressive tasks (text generation) use **`pipeline_mode='auto-regressive'`**

   along with **`is_training=False`** and load from the checkpoint.

In [13]:
# Load tf_transformers model
# Build the tf_transformers GPT-2 model from its JSON config and copy the
# HuggingFace weights (model_hf) into it.

# Default configs for the model

model_config_dir = '/Users/PRVATE/Documents/tf_transformers/model_configs/'
model_name = 'gpt2_base'
config_location = os.path.join(model_config_dir, model_name, 'config.json')
# Read the config through a context manager so the file handle is closed
# as soon as parsing finishes (a bare open() leaked it).
with open(config_location) as config_file:
    config = json.load(config_file)

# Always do this
tf.keras.backend.clear_session()

# tf_transformers Layer (an extension of Keras Layer)
# This is not Keras model, but extension of keras Layer

# NOTE: pipeline_mode='auto-regressive' is intentionally NOT set here. Per the
# notes in this notebook, variable names differ in that mode, so conversion is
# done on the plain model; the auto-regressive model is later restored from
# the checkpoint saved after conversion.

model_layer = GPT2Encoder(
    config=config,
    name="gpt2",
    mask_mode=config["mask_mode"],
    is_training=False,
)

# Convert to tf.keras.Model
model_tf_transformers = model_layer.get_and_load_model(model_dir=None)
# Assign the HuggingFace weights to the tf_transformers variables.
convert_gpt2_hf_to_tf_transformers(model_hf, model_tf_transformers, config)

INFO:absl:We are overwriding `is_training` is False to `is_training`                     to True with `use_dropout` is False, no effects on your inference pipeline
INFO:absl:Inputs -->
INFO:absl:input_ids ---> Tensor("input_ids:0", shape=(None, None), dtype=int32)
INFO:absl:Initialized Variables
INFO:absl:Inputs -->
INFO:absl:input_ids ---> Tensor("input_ids_1:0", shape=(None, None), dtype=int32)
INFO:absl:Deleteing huggingface model for saving memory
INFO:absl:Done assigning variables weights . Total 100


# Save checkpoints

In [14]:
# Save the converted weights as a TF checkpoint. `checkpoint_dir` is reused
# by the cell that restores the auto-regressive model.

checkpoint_dir = 'model_ckpt'
checkpoint = tf.train.Checkpoint(model=model_tf_transformers)
# max_to_keep=1 is passed so the manager keeps a single checkpoint around.
manager = tf.train.CheckpointManager(
    checkpoint, directory=checkpoint_dir, max_to_keep=1
)
manager.save()
print("Saved at {}".format(manager.latest_checkpoint))

Saved at model_ckpt/ckpt-1


# Check for reference

Have a look at `tf_transformers/extra/*.py` for reference values, to make sure model

has loaded correctly

In [15]:
# Sanity check: run a fixed batch through the converted model and print the
# sum and shape of every tensor output, to compare against the reference
# values in tf_transformers/extra/*.py.

input_ids = tf.constant([[1, 9, 10, 11, 23], [1, 22, 234, 432, 2349]])
# Created alongside input_ids but not passed to the model below.
input_mask = tf.ones_like(input_ids)
input_type_ids = tf.ones_like(input_ids)

inputs = {"input_ids": input_ids}

results_tf_transformers = model_tf_transformers(inputs)
for output_name, output_value in results_tf_transformers.items():
    # List-valued outputs are skipped; only single tensors are summarised.
    if not isinstance(output_value, list):
        print(output_name, "-->", tf.reduce_sum(output_value), "-->", output_value.shape)

token_embeddings --> tf.Tensor(2371.2754, shape=(), dtype=float32) --> (2, 5, 768)
token_logits --> tf.Tensor(-34781264.0, shape=(), dtype=float32) --> (2, 5, 50257)
last_token_logits --> tf.Tensor(-8346981.0, shape=(), dtype=float32) --> (2, 50257)


# Load Model for Auto-Regressive tasks

**Text generation / auto-regressive decoding** requires caching of the `K` and `V` values.

This means that, for the model to make use of serialization, these values have to be a part of the model.

So, K and V are extra inputs required for inference **(only in the case of text generation)**.

As a result, training and testing need different pipelines (for auto-regressive tasks).

**`Note: We have the necessary wrappers to do all of this; the user doesn't have to worry about any of it.`**

In [16]:
# Rebuild the encoder with pipeline_mode="auto-regressive" and restore the
# converted weights from the checkpoint saved earlier in this notebook.
model_layer = GPT2Encoder(
    config=config,
    name="gpt2",
    mask_mode=config["mask_mode"],
    is_training=False,
    pipeline_mode="auto-regressive",
)

# Convert to tf.keras.Model
model_tf_transformers = model_layer.get_and_load_model(model_dir=None)

# And now load the checkpoints from the previously saved model

checkpoint = tf.train.Checkpoint(model=model_tf_transformers)
manager = tf.train.CheckpointManager(
    checkpoint, directory=checkpoint_dir, max_to_keep=1
)
status = checkpoint.restore(manager.latest_checkpoint)

# Important: assert_existing_objects_matched() signals a mismatch by raising
# AssertionError and otherwise returns the status object, so it is called as a
# plain statement rather than used as an `if` condition.
status.assert_existing_objects_matched()
print("Model checkpoints matched")

INFO:absl:Inputs -->
INFO:absl:input_ids ---> Tensor("input_ids_2:0", shape=(None, None), dtype=int32)
INFO:absl:all_cache_key ---> Tensor("all_cache_key:0", shape=(None, None, 12, None, 64), dtype=float32)
INFO:absl:all_cache_value ---> Tensor("all_cache_value:0", shape=(None, None, 12, None, 64), dtype=float32)
INFO:absl:past_length ---> Tensor("past_length:0", shape=(1, None), dtype=int32)
INFO:absl:Initialized Variables
INFO:absl:Inputs -->
INFO:absl:input_ids ---> Tensor("input_ids_3:0", shape=(None, None), dtype=int32)
INFO:absl:all_cache_key ---> Tensor("all_cache_key_1:0", shape=(None, None, 12, None, 64), dtype=float32)
INFO:absl:all_cache_value ---> Tensor("all_cache_value_1:0", shape=(None, None, 12, None, 64), dtype=float32)
INFO:absl:past_length ---> Tensor("past_length_1:0", shape=(1, None), dtype=int32)



Two checkpoint references resolved to different objects (<tf_transformers.models.gpt2.GPT2Encoder object at 0x13edbdee0> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x13ed0bb80>).



Two checkpoint references resolved to different objects (<tf_transformers.models.gpt2.GPT2Encoder object at 0x13edbdee0> and <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x13ed0bb80>).


Model checkpoints matched


# Save the model .pb (saved_model)

To make use of the benefits of serialization, we have to save the model.

Now, why not simply call `model_tf_transformers.save("model_pb", save_format='tf')`?

The reason is that when we save the model this way, TF does not preserve the proper output node names.

It will assign generic names like `['gpt_output1', ...]` instead.

To preserve the names in the `saved_model` , we have small wrapper function called `LegacyModule`

In [17]:
# Save the model as a SavedModel (.pb). It is wrapped in LegacyModule, which
# this notebook uses to preserve the output names in the saved_model.
saved_model_dir = "model_pb"
tf_transformers_module = LegacyModule(model_tf_transformers)
tf_transformers_module.save(saved_model_dir)

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


INFO:tensorflow:Assets written to: model_pb/assets


INFO:tensorflow:Assets written to: model_pb/assets


# Benchmark models

We will benchmark on gpt2 model with following approaches.

1. Use the model in `saved_model_dir` with the **`TextDecoder`** API. This API will consume the model

   **(supports `saved_model`, `tf.keras.Model`, `hub.KerasLayer`)**. The recommended choice is `saved_model`

   or `hub.KerasLayer`. The **`TextDecoder`** API is a **pure Python function**, which uses `for` loops for decoding.
   
   
2. We will use **`TextDecoderSerializable`**. This is a `tf.while_loop` implementation. The advantage is that

   we can serialize the entire operation as it is, so your decoding method will be a part of the `saved_model`

   graph.
   

In [18]:
from tf_transformers.text import TextDecoder
from tf_transformers.text import TextDecoderSerializable

# Save Greedy Decoding (serialized model)

1. We can also use `mode="beam"` and `mode="top_k_top_p"` for beam search and top-K / nucleus (top-p) sampling

   respectively.

In [22]:
# Wrap the auto-regressive model in the serializable greedy decoder and
# export the whole decoding operation as its own SavedModel.
saved_model_dir_strategy = 'model_pb_greedy'

decoder_layer = TextDecoderSerializable(
    model_tf_transformers,
    mode="greedy",
    do_sample=False,
    eos_id=-100,
    max_iterations=None,
    num_attention_heads=12,
    num_layers=12,
    hidden_dimension=768,
)

# Convert the whole operation to a model and save it through LegacyModule.
decoder_module = LegacyModule(decoder_layer.get_model())
decoder_module.save(saved_model_dir_strategy)
print("Saved")



INFO:tensorflow:Assets written to: model_pb_greedy/assets


INFO:tensorflow:Assets written to: model_pb_greedy/assets


Saved


In [24]:
# reser variables
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [26]:
# Garbage collection: force a collection pass after the namespace reset.
# The cell's displayed value is whatever gc.collect() returns.
import gc
gc.collect()

1398

In [None]:
# Load the saved model and benchmark 


from functools import wraps
import time
import numpy as np

# Number of times each benchmarked call is executed. The first execution is
# treated as a warm-up and excluded from the reported average.
average_runs = 4

def timeit(my_func):
    """Decorator that benchmarks `my_func` instead of returning its result.

    The wrapped function is called `average_runs` times with the given
    arguments. The first call is discarded as a warm-up and the mean
    wall-clock duration (in seconds) of the remaining calls is returned.
    If only a single run is configured, that run's duration is returned.

    Args:
        my_func: callable to benchmark.

    Returns:
        A wrapper with the same signature as `my_func` that returns the mean
        elapsed time in seconds; the return value of `my_func` is discarded.
    """
    @wraps(my_func)
    def timed(*args, **kw):
        time_list = []
        for _ in range(average_runs):
            # perf_counter is a monotonic, high-resolution clock for intervals.
            tstart = time.perf_counter()
            my_func(*args, **kw)
            time_list.append(time.perf_counter() - tstart)

        # Drop the warm-up run; keep it only when it is the sole measurement.
        measured = time_list[1:] or time_list
        return np.mean(measured)
    return timed




import tensorflow as tf
from tf_transformers.text import TextDecoder

# Load the exported SavedModel and take its 'serving_default' signature; this
# `model` callable is what the TextDecoder benchmark cells below are given.
saved_model_dir = "model_pb"
loaded = tf.saved_model.load(saved_model_dir)
model = loaded.signatures['serving_default']

# Greedy

In [None]:
# Benchmark greedy decoding with the Python-loop TextDecoder over a grid of
# batch sizes and generation lengths. Results go to timing_results['greedy'].
timing_results = {}
batch_size_list = [1, 3, 5, 10]
max_seq_length_list = [15, 25, 50, 100, 150]

decoder_layer = TextDecoder(
    model=model,
    num_attention_heads=12,
    num_layers=12,
    hidden_dimension=768,
    input_mask_ids=None,
    input_type_ids=None,
)


@timeit
def call_greedy(input_dict, length):
    """Greedy-decode `input_dict` for `length` iterations.

    Because of the `@timeit` decorator, calling this name yields the averaged
    elapsed time rather than the decoder output.
    """
    # eos_id=-100: presumably an id that is never produced, so decoding always
    # runs for the full `length` iterations -- TODO confirm.
    return decoder_layer.decode(
        input_dict,
        mode='greedy',
        eos_id=-100,
        max_iterations=length,
        do_sample=False,
    )


# Random prompt ids are drawn from [minval, maxval); every prompt has a fixed length.
minval, maxval = 1, 50256
input_tensor_length = 100

timing_holder = {}
for batch in batch_size_list:
    prompt_shape = (batch, input_tensor_length)
    for sequence_length in max_seq_length_list:
        # `sequence_length` new tokens are generated on top of the prompt.
        input_tensor = tf.random.uniform(
            shape=prompt_shape, minval=minval, maxval=maxval, dtype=tf.int32
        )
        input_dict = {'input_ids': input_tensor}
        timing_holder[(batch, sequence_length)] = call_greedy(input_dict, sequence_length)
        print("Done batch_size {} sequence length {}".format(batch, sequence_length))

timing_results['greedy'] = timing_holder

# Beam

In [None]:
# Benchmark beam-search decoding with the Python-loop TextDecoder over a grid
# of batch sizes, generation lengths and beam sizes.
batch_size_list = [1, 3, 5, 10]
max_seq_length_list = [15, 25, 50, 100, 150]
beam_list = [1, 2, 5, 7, 10]

decoder_layer = TextDecoder(
    model=model,
    num_attention_heads=12,
    num_layers=12,
    hidden_dimension=768,
    input_mask_ids=None,
    input_type_ids=None
)

@timeit
def call_beam(input_dict, length, beam):
    """Beam-search decode `input_dict` for `length` iterations with `beam` beams.

    Because of the `@timeit` decorator, calling this name yields the averaged
    elapsed time rather than the decoder output.
    """
    # No printing in here: everything in this body is part of the measurement.
    result = decoder_layer.decode(input_dict,
                                  mode='beam',
                                  eos_id=-100,
                                  max_iterations=length,
                                  beam_size=beam,
                                  do_sample=False)
    return result

minval = 1
maxval = 50256
input_tensor_length = 100

timing_holder = {}
for batch in batch_size_list:
    for sequence_length in max_seq_length_list:
        for beam in beam_list:
            input_tensor = tf.random.uniform(minval=minval, maxval=maxval, shape=(batch, input_tensor_length), dtype=tf.int32)
            # because we need to generate that many tokens from input_tensor_length
            input_dict = {'input_ids': input_tensor}
            average_time_taken = call_beam(input_dict, sequence_length, beam)
            # Key on the full (batch, length, beam) tuple only. The former
            # '{batch}_{length}' string key was overwritten by every beam size.
            timing_holder[(batch, sequence_length, beam)] = average_time_taken
            print("Done batch_size {} sequence length {} beam size {}".format(batch, sequence_length, beam))

# Record the beam timings so they are included when timing_results is pickled;
# previously they were computed and then discarded.
timing_results['beam'] = timing_holder

# top K top P

In [None]:
# Benchmark top-k / top-p sampling with the Python-loop TextDecoder over a
# grid of batch sizes, generation lengths and numbers of returned sequences,
# then pickle all timings collected in timing_results.
batch_size_list = [1, 3, 5, 10]
max_seq_length_list = [15, 25, 50, 100, 150]
num_return_sequence_list = [1, 2, 5, 7, 10]

decoder_layer = TextDecoder(
    model=model,
    num_attention_heads=12,
    num_layers=12,
    hidden_dimension=768,
    input_mask_ids=None,
    input_type_ids=None
)

@timeit
def call_top_k_top_p(input_dict, length, beam):
    """Sample from `input_dict` for `length` iterations with top_k=25, top_p=0.75.

    Note that `beam` is forwarded as `num_return_sequences`, not as a beam size.
    Because of the `@timeit` decorator, calling this name yields the averaged
    elapsed time rather than the decoder output.
    """
    result = decoder_layer.decode(input_dict,
                                  mode='top_k_top_p',
                                  eos_id=-100,
                                  max_iterations=length,
                                  num_return_sequences=beam,
                                  top_k=25,
                                  top_p=0.75,
                                  do_sample=True)
    return result

minval = 1
maxval = 50256
input_tensor_length = 100

timing_holder = {}
for batch in batch_size_list:
    for sequence_length in max_seq_length_list:
        for num_return_sequence in num_return_sequence_list:
            input_tensor = tf.random.uniform(minval=minval, maxval=maxval, shape=(batch, input_tensor_length), dtype=tf.int32)
            # because we need to generate that many tokens from input_tensor_length
            input_dict = {'input_ids': input_tensor}
            average_time_taken = call_top_k_top_p(input_dict, sequence_length, num_return_sequence)
            timing_holder[(batch, sequence_length, num_return_sequence)] = average_time_taken
            # Include num_return_sequences so each progress line is unique,
            # matching the beam benchmark's progress message.
            print("Done batch_size {} sequence length {} num_return_sequences {}".format(
                batch, sequence_length, num_return_sequence))


timing_results['top_k_top_p'] = timing_holder

# Persist every timing gathered so far for later analysis.
import pickle
with open("tf_transformers_gpt2_benchmark.pkl", "wb") as f:
    pickle.dump(timing_results, f)

# Use the completely serialized saved_model for Greedy

In [None]:
# Benchmark the fully serialized greedy decoder (model_pb_greedy): the
# decoding loop lives inside the SavedModel, so each measurement is a single
# call to the serving signature.
saved_model_dir = "model_pb_greedy/"
loaded = tf.saved_model.load(saved_model_dir)
model = loaded.signatures['serving_default']

timing_results = {}
batch_size_list = [1, 3, 5, 10]
max_seq_length_list = [15, 25, 50, 100, 150]


@timeit
def call_greedy(input_dict):
    """Invoke the serialized decoder with `input_dict` expanded as keyword inputs.

    Because of the `@timeit` decorator, calling this name yields the averaged
    elapsed time rather than the model output.
    """
    return model(**input_dict)


# Random prompt ids are drawn from [minval, maxval); every prompt has a fixed length.
minval, maxval = 1, 50256
input_tensor_length = 100

timing_holder = {}
for batch in batch_size_list:
    prompt_shape = (batch, input_tensor_length)
    for sequence_length in max_seq_length_list:
        input_tensor = tf.random.uniform(
            shape=prompt_shape, minval=minval, maxval=maxval, dtype=tf.int32
        )
        # The number of tokens to generate is passed to the graph as the
        # 'iterations' input.
        input_dict = {
            'input_ids': input_tensor,
            'iterations': tf.constant([[sequence_length]]),
        }
        timing_holder[(batch, sequence_length)] = call_greedy(input_dict)
        print("Done batch_size {} sequence length {}".format(batch, sequence_length))

timing_results['greedy'] = timing_holder

import pickle
with open("tf_transformers_gpt2_benchmark_serializable.pkl", "wb") as results_file:
    pickle.dump(timing_results, results_file)