In [None]:
# Install tf-transformers from github

### Load Encoder based models like (GPT2, BERT, Albert, Roberta)

* There are 2 ways to load Encoder Models

**a.** Load a model using "model_name". This is essentially a wrapper around the core class.

**b.** Load a model by passing config to the core class

#### a. Load using model name

In [3]:
from tf_transformers.models import BertModel

# BertModel(model_name=...) is unpacked into three values:
#   model_layer --> LegacyLayer inherited from tf.keras.layers.Layer
#   model       --> LegacyModel inherited from tf.keras.Model
#   config      --> Python dict config of the model
model_layer, model, config = BertModel(model_name="bert-base-uncased")

INFO:absl:We are overwriding `is_training` is False to                         `is_training` to True with `use_dropout` is False, no effects on your inference pipeline
INFO:absl:Initialized Variables


#### b. Load by passing config to the core class

In [7]:
from tf_transformers.models.model_configs.bert import bert_base_uncased
from tf_transformers.models.bert import BERTEncoder
# Configs for other models in tf-transformers can be accessed by replacing "bert"
# in the import path with the corresponding model name, e.g. gpt2, t5, roberta, albert.

# bert_config --> Python dict config of the model
# model_layer --> LegacyLayer inherited from tf.keras.layers.Layer
# model       --> LegacyModel inherited from tf.keras.Model
# NOTE: this cell rebinds `model_layer` and `model` from the "Load using model name"
# cell, and stores the config as `bert_config` rather than `config`.
bert_config = bert_base_uncased.config
model_layer = BERTEncoder(config=bert_config, name='bert', is_training=False)
model = model_layer.get_model()

### Convert HuggingFace models to tf-transformers

* We can convert (GPT2, BERT, Albert, Roberta, t5, mt5) models from HF to tft

In [13]:
import tensorflow as tf
from transformers import TFBertModel
from tf_transformers.utils import convert_bert_hf_to_tf_transformers

# The HF model was downloaded beforehand and saved locally.
# NOTE(review): this is a machine-specific absolute path; point it at your own copy.
model_hf_location = '/mnt/home/PRE_MODELS/HuggingFace_models/bert_base_uncased/'
model_hf = TFBertModel.from_pretrained(model_hf_location)

# tf_transformers.utils also provides converters for other models (GPT2, t5, etc.)
# Convert: `model` and `config` are the notebook-level names bound by earlier cells
# (`config` is only bound by the "Load using model name" cell).
convert_bert_hf_to_tf_transformers(model_hf, model, config)

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at /mnt/home/PRE_MODELS/HuggingFace_models/bert_base_uncased/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
INFO:absl:Deleteing huggingface model for saving memory
INFO:absl:Done assigning variables weights. Total 199


### Save the model to tensorflow checkpoint

In [None]:
#### Every LegacyModel (inherited from tf.keras.Model) has a save_checkpoint method

model_save_dir = "tf_transformer_models/bert"
model.save_checkpoint(model_save_dir)

# The model can be loaded back from the same directory using load_checkpoint

model.load_checkpoint(model_save_dir)

### Write TFRecords using TFWriter

* TFWriter is a simple class with which you can write TFRecords of any type (int, float, string)
* Just pass a generator that yields dicts (yielding a dict is required)

* TFWriter expects a schema which is very simple

* I recommend using **var_len** in the **schema**, so that you can have **dynamic batching** while batching using **TFReader**

In [None]:
import datasets

# MNLI dataset from HuggingFace, previously saved to local disk
mnli_dataset_dir = "/mnt/home/PRE_MODELS/HuggingFace_models/datasets/glue/mnli/"
examples = datasets.load_from_disk(mnli_dataset_dir)
train_examples = examples["train"]

# Per-sentence token budget used by the parse function below
max_seq_length = 64

# You can prepare data in any way you want
# Preprocess it the way you like and return what the model expects as (dict)
def parse_train():
    """Yield one feature dict per example in ``train_examples``.

    Reads the notebook-level names ``train_examples``, ``tokenizer`` and
    ``max_seq_length``; all three must be defined before the generator is
    iterated (``tokenizer`` is not created in this cell).

    Each yielded dict has the keys ``input_ids``, ``input_mask`` and
    ``input_type_ids`` (sequences of equal length) plus ``labels`` (the
    example's ``label`` value, passed through unchanged).

    NOTE(review): each sentence is truncated on its own, so the combined
    sequence can hold up to ``2 * max_seq_length`` ids -- confirm that this is
    intended rather than a single overall cap.
    """
    for example in train_examples:
        # Segment 1: [CLS] hypothesis [SEP]; -2 leaves room for CLS and SEP.
        tokens_s1 = (
            [tokenizer.cls_token]
            + tokenizer.tokenize(example['hypothesis'])[: max_seq_length - 2]
            + [tokenizer.sep_token]
        )
        input_ids_s1 = tokenizer.convert_tokens_to_ids(tokens_s1)
        input_type_ids_s1 = [0] * len(input_ids_s1)  # 0 for s1

        # Segment 2: premise [SEP]; -1 leaves room for SEP.
        tokens_s2 = tokenizer.tokenize(example['premise'])[: max_seq_length - 1] + [tokenizer.sep_token]
        input_ids_s2 = tokenizer.convert_tokens_to_ids(tokens_s2)
        input_type_ids_s2 = [1] * len(input_ids_s2)  # 1 for s2

        input_ids = input_ids_s1 + input_ids_s2
        input_type_ids = input_type_ids_s1 + input_type_ids_s2
        input_mask = [1] * len(input_ids)  # every position is a real token (no padding here)

        yield {
            'input_ids': input_ids,
            'input_mask': input_mask,
            'input_type_ids': input_type_ids,
            'labels': example['label'],
        }
        
# Let's write the records using TFWriter
# TFWriter was used without ever being imported, which fails on a fresh kernel.
from tf_transformers.data import TFWriter

# The schema is simple:
# "var_len"   --> variable-length feature (tf.io.VarLenFeature)
# "fixed_len" --> fixed-length feature (tf.io.FixedLenFeature)

# Make sure the schema has an entry for every key in the dicts yielded by the parse function
schema = {
    "input_ids": ("var_len", "int"),
    "input_mask": ("var_len", "int"),
    "input_type_ids": ("var_len", "int"),
    "labels": ("var_len", "int"),
}

# Create TFRecords
tfrecord_train_dir = '../../OFFICIAL_TFRECORDS/glue/alberta/mnli/train' # Directory
tfrecord_filename = 'mnli' # Filename, not required by default

# Auto shuffling if "tag=train"
# Overwrite the directory if "overwrite=True"
tfwriter = TFWriter(schema=schema,
                    file_name=tfrecord_filename,
                    model_dir=tfrecord_train_dir,
                    tag='train',
                    overwrite=True
                    )

# Pass the generator created by calling your parse function
tfwriter.process(parse_fn=parse_train())

### Read TFRecords using TFReader

* TFReader is a simple class with which you can read TFRecords of any type (int, float, string)
* TFReader expects a schema; the schema is automatically written as a JSON file when creating TFRecords with TFWriter

In [None]:
# Read Data
# json, glob and TFReader were used without being imported, which fails on a fresh kernel.
import glob
import json

from tf_transformers.data import TFReader

# Load the schema from the TFRecord directory.
# A context manager closes the file handle instead of leaking it.
with open("{}/schema.json".format(tfrecord_train_dir)) as schema_file:
    schema = json.load(schema_file)
all_files = glob.glob("{}/*.tfrecord".format(tfrecord_train_dir))
tf_reader = TFReader(schema=schema,
                     tfrecord_files=all_files)

# Provide the keys you require in inputs and labels
# If you don't want to separate keys, just don't pass "x_keys" and "y_keys"
x_keys = ['input_ids', 'input_type_ids', 'input_mask']
y_keys = ['labels']

# Auto Batch :-)
# NOTE(review): `keys` lists only the input keys while `y_keys` names 'labels' --
# confirm that read_record still returns the labels with this combination.
batch_size = 32
train_dataset = tf_reader.read_record(auto_batch=True,
                                      keys=x_keys,
                                      batch_size=batch_size,
                                      x_keys=x_keys,
                                      y_keys=y_keys,
                                      shuffle=True,
                                      drop_remainder=True
                                      )

### Tasks

* Most NLP downstream tasks, such as Classification, Span_Selection (QA) and Token_Classification (NER), are defined by separate task models that are independent of the base models

In [None]:
# Classification Model Example

from tf_transformers.tasks import Classification_Model, Span_Selection_Model, Token_Classification_Model

# Every task below wraps the base `model` loaded earlier in the notebook and
# follows the same two-step pattern: build the task layer, then call get_model().
classification_layer = Classification_Model(model=model,
                                            num_classes=3,
                                            use_all_layers=True,
                                            is_training=True)
classification_model = classification_layer.get_model()


# Span_Selection_Model (QA) example


span_selection_layer = Span_Selection_Model(model=model,
                                            is_training=True)
span_selection_model = span_selection_layer.get_model()

# Token Classification Model (NER) example

# NOTE(review): `slot_map` is not defined in any visible cell; it must exist before
# this runs (presumably a mapping of NER tags to ids -- confirm).
# The layer gets its own name so `model_ner` is only ever bound to the model,
# matching the *_layer / *_model naming used for the other tasks.
token_classification_layer = Token_Classification_Model(model=model,
                                                        token_vocab_size=len(slot_map), # Vocab Size
                                                        use_all_layers=True,
                                                        is_training=True)
model_ner = token_classification_layer.get_model()


### Save as Serialized model (Optimized and Fast)

* Serialized models are the crux of TF2.0, and most of the performance benefits of the model come only from this
* LegacyModel has a method called "save_as_serialize_module", which saves a model as "tf.saved_model" without altering the output format

In [None]:
# Export the model as a serialized module under <model_save_dir>/saved_model
serialized_model_dir = "{}/saved_model".format(model_save_dir)
model.save_as_serialize_module(serialized_model_dir)