In [2]:
# Install tf-transformers from github

In [22]:
import datasets
import json
import glob
import tensorflow as tf
import numpy as np

from tf_transformers.data import TFWriter, TFReader, TFProcessor
from tf_transformers.models import BertModel
from tf_transformers.tasks import Classification_Model
from tf_transformers.core import optimization, SimpleTrainer
from tf_transformers.losses import cross_entropy_loss
from tf_transformers.pipeline import Classification_Pipeline
from transformers import BertTokenizer

### Load Tokenizer

In [7]:
# Load HuggingFace Tokenizer
# Used below to tokenize sentences and to convert the tokens to ids.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

### Load COLA dataset from Huggingface datasets

In [9]:
# Load the dataset from a copy previously saved on local disk.
# NOTE(review): hard-coded absolute path — adjust it for your environment.
examples = datasets.load_from_disk("/mnt/home/PRE_MODELS/HuggingFace_models/datasets/glue/cola/")
# Training split; the "validation" split is read from `examples` later in the notebook.
train_examples = examples["train"]


In [11]:
# Print only the first training example to inspect its fields.
for item in train_examples:
    print(item)
    break

{'idx': 0, 'label': 1, 'sentence': "Our friends won't buy this analysis, let alone the next one we propose."}


In [12]:
max_seq_length = 128


def parse_train():
    """Yield one feature dict per training example.

    Reads the notebook-level ``train_examples``, ``tokenizer`` and
    ``max_seq_length``.

    Yields:
        dict with the keys
            ``input_ids``      - ids of ``[CLS] + tokens + [SEP]``
            ``input_mask``     - list of 1s, same length as ``input_ids``
            ``input_type_ids`` - list of 0s, same length as ``input_ids``
            ``labels``         - the example's ``label`` value, passed through unchanged
    """
    for f in train_examples:
        # -2 leaves room for the CLS and SEP special tokens.
        tokens = tokenizer.tokenize(f['sentence'])[: max_seq_length - 2]
        tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        yield {
            'input_ids': input_ids,
            'input_mask': [1] * len(input_ids),
            'input_type_ids': [0] * len(input_ids),
            'labels': f['label'],
        }
        
# Let's write the training examples using TFWriter.
# Use TFProcessor for smaller data.

# Every feature is declared as ("var_len", "int").
# NOTE(review): presumably variable-length integer features — confirm against the TFWriter docs.
schema = {
    "input_ids": ("var_len", "int"),
    "input_mask": ("var_len", "int"),
    "input_type_ids": ("var_len", "int"),
    "labels": ("var_len", "int"),
}

tfrecord_train_dir = '../../OFFICIAL_TFRECORDS/glue/bert/cola/train'
tfrecord_filename = 'cola'
tfwriter = TFWriter(schema=schema, 
                    file_name=tfrecord_filename, 
                    model_dir=tfrecord_train_dir,
                    tag='train',
                    overwrite=True
                    )
# parse_train() is called here, so a generator object (not the function) is passed.
tfwriter.process(parse_fn=parse_train())

INFO:absl:Wrote 1000 tfrecods
INFO:absl:Wrote 2000 tfrecods
INFO:absl:Wrote 3000 tfrecods
INFO:absl:Wrote 4000 tfrecods
INFO:absl:Wrote 5000 tfrecods
INFO:absl:Wrote 6000 tfrecods
INFO:absl:Wrote 7000 tfrecods
INFO:absl:Wrote 8000 tfrecods
INFO:absl:Total individual observations/examples written is 8551
INFO:absl:All writer objects closed


### Read TFRecords using TFReader

In [17]:
# Read Data

# Load the schema file from the TFRecord directory. The context manager
# closes the file handle as soon as the JSON has been parsed.
with open("{}/schema.json".format(tfrecord_train_dir)) as schema_file:
    schema = json.load(schema_file)
all_files = glob.glob("{}/*.tfrecord".format(tfrecord_train_dir))
tf_reader = TFReader(schema=schema, 
                    tfrecord_files=all_files)

# x_keys become the model inputs, y_keys the targets of each batch.
x_keys = ['input_ids', 'input_type_ids', 'input_mask']
y_keys = ['labels']
batch_size = 32
# NOTE(review): `keys` is given only x_keys while y_keys is passed separately —
# confirm against TFReader.read_record that the labels are still read.
train_dataset = tf_reader.read_record(auto_batch=True, 
                                   keys=x_keys,
                                   batch_size=batch_size, 
                                   x_keys = x_keys, 
                                   y_keys = y_keys,
                                   shuffle=True, 
                                   drop_remainder=True
                                  )

In [None]:
# Print a single (inputs, labels) batch to inspect what the dataset yields.
for (batch_inputs, batch_labels) in train_dataset.take(1):
    print(batch_inputs, batch_labels)

### Load BERT base-uncased Model 

In [21]:
# Let's load the BERT model (bert-base-uncased)

model_layer, model, config = BertModel(model_name='bert-base-uncased', 
                   is_training=True, 
                   use_dropout=False 
                   )
# Restore the pretrained weights from a local checkpoint directory.
# NOTE(review): hard-coded absolute path — adjust it for your environment.
model.load_checkpoint("/mnt/home/PRE_MODELS/LegacyAI_models/checkpoints/bert_base_uncased/bert_base_uncased/")

# model_layer -> Legacylayer inherited from tf.keras.Layer
# model -> legacyModel inherited from tf.keras.Model

INFO:absl:Initialized Variables
INFO:absl:Succesful: Model checkpoints matched


### Load Classification Model

In [25]:

# Build a 2-class classification model on top of the loaded `model`.
classification_layer = Classification_Model(model=model,
                                      num_classes=2,
                                      use_all_layers=False, 
                                      is_training=True)
# The model returned by get_model() is the object compiled and trained below.
classification_model = classification_layer.get_model()

In [27]:
# Drop the references that are no longer needed, to save memory.
del model, model_layer, classification_layer

### Define Loss

Loss function is simple.
* labels: 2D (batch_size x 1) # class indices; squeezed to 1D (batch_size) inside the loss
* logits: 2D (batch_size x num_classes)


In [29]:
def loss_fn(y_true_dict, y_pred_dict):
    """Mean sparse softmax cross-entropy between the labels and the class logits.

    Args:
        y_true_dict: mapping holding the targets under ``'labels'``.
        y_pred_dict: mapping holding the model outputs under ``'class_logits'``.

    Returns:
        Scalar tensor: the per-example losses averaged with ``tf.reduce_mean``.
    """
    class_logits = y_pred_dict['class_logits']
    # Remove axis 1 from the labels before computing the loss.
    class_indices = tf.squeeze(y_true_dict['labels'], axis=1)
    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=class_indices,
        logits=class_logits,
    )
    return tf.reduce_mean(per_example_loss)

### Define Optimizer

In [30]:
# Take the number of training examples from the dataset itself so the
# learning-rate schedule stays in sync with the data actually used.
train_data_size = len(train_examples)
learning_rate   = 2e-5
# Integer division: partial batches are dropped by the training dataset.
steps_per_epoch = train_data_size // batch_size
EPOCHS = 3
num_train_steps = steps_per_epoch * EPOCHS
# Warm up over the first 10% of the training steps.
warmup_steps = int(0.1 * num_train_steps)
# creates an optimizer with learning rate schedule
optimizer_type = 'adamw'
optimizer, learning_rate_fn = optimization.create_optimizer(learning_rate,
                                                num_train_steps,
                                                warmup_steps,
                                                optimizer_type)

INFO:absl:using Adamw optimizer


### Train Using Keras :-)

- ```compile2``` lets you pass the model outputs as well as the batch dataset outputs directly into the loss function, without any further complexity.

Note: For ```compile2```, ```loss``` must be None and ```custom_loss``` must be provided. Metrics are not supported for the time being.

In [32]:
# Compile
# The loss is supplied through `custom_loss`, keyed by the model output name;
# the regular `loss` argument is left as None.
keras_loss_fn = {'class_logits': loss_fn}
classification_model.compile2(optimizer=optimizer, 
                             loss=None, 
                             custom_loss=keras_loss_fn)
# Only 10 steps per epoch are run here.
# Change steps per epoch to large value/ ignore it completely to train
# on full dataset
history = classification_model.fit(train_dataset, epochs=2, steps_per_epoch=10)

Epoch 1/2
















Epoch 2/2


### Train using SimpleTrainer (part of tf-transformers)

In [None]:
# Train with the tf-transformers SimpleTrainer instead of Keras `fit`.
# NOTE(review): the dataset is repeated EPOCHS+1 times — presumably so it is not
# exhausted before all epochs finish; confirm against the SimpleTrainer docs.
history = SimpleTrainer(model = classification_model,
             optimizer = optimizer,
             loss_fn = loss_fn,
             dataset = train_dataset.repeat(EPOCHS+1), # This is important
             epochs = EPOCHS, 
             num_train_examples = train_data_size, 
             batch_size = batch_size, 
             steps_per_call=100, 
             gradient_accumulation_steps=None)

### Save Models 

You can save models as checkpoints using the ```.save_checkpoint``` method, which is available on all ```LegacyModels```.

In [None]:
# Directory for the fine-tuned checkpoint; later cells also write the
# SavedModel and TFLite exports under it.
model_save_dir = "../../OFFICIAL_MODELS/glue/cola/bert"
classification_model.save_checkpoint(model_save_dir)

### Parse validation data

We use ```TFProcessor``` to create validation data, because dev data is small

In [37]:
dev_examples = examples['validation']


def parse_dev():
    """Yield one feature dict per validation example.

    Reads the notebook-level ``dev_examples``, ``tokenizer`` and
    ``max_seq_length``.

    Yields:
        dict with the keys
            ``input_ids``      - ids of ``[CLS] + tokens + [SEP]``
            ``input_mask``     - list of 1s, same length as ``input_ids``
            ``input_type_ids`` - list of 0s, same length as ``input_ids``
            ``labels``         - the example's ``label`` value, passed through unchanged
    """
    for f in dev_examples:
        # -2 leaves room for the CLS and SEP special tokens.
        tokens = tokenizer.tokenize(f['sentence'])[: max_seq_length - 2]
        tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        yield {
            'input_ids': input_ids,
            'input_mask': [1] * len(input_ids),
            'input_type_ids': [0] * len(input_ids),
            'labels': f['label'],
        }
        
# Model-input keys and target keys for the batched validation set.
x_keys = ['input_ids', 'input_type_ids', 'input_mask']
y_keys = ['labels']

# Process the validation examples with TFProcessor, then batch them
# without shuffling and without dropping the final partial batch.
tf_processor = TFProcessor()
dev_dataset = tf_processor.process(parse_fn=parse_dev())
dev_dataset = tf_processor.auto_batch(
    dev_dataset,
    shuffle=False,
    x_keys=x_keys,
    y_keys=y_keys,
    batch_size=32,
    drop_remainder=False,
)

INFO:absl:Total individual observations/examples written is 1043


### Evaluate dev dataset - Matthews Correlation

In [38]:
from sklearn.metrics import matthews_corrcoef

predictions = []
original_labels = []
for (batch_inputs, batch_labels) in dev_dataset:
    model_outputs = classification_model(batch_inputs)['class_logits']

    # Predicted class = index of the largest logit; stored as numpy like the labels.
    predictions.append(tf.argmax(model_outputs, axis=1).numpy())
    original_labels.append(batch_labels['labels'].numpy())

# scikit-learn's signature is matthews_corrcoef(y_true, y_pred).
eval_metrics = matthews_corrcoef(np.hstack(original_labels), np.hstack(predictions))
print("Mathews corelation", eval_metrics)

# Mathews corelation 0.5952198946938653

### Save as Serialized version 

- Now we can use ```save_as_serialize_module``` to save a model directly to saved_model

In [None]:
# Export the fine-tuned model as a SavedModel under model_save_dir, then load it back.
# NOTE(review): overwrite=False — presumably an existing export is not replaced on re-run; confirm.
classification_model.save_as_serialize_module("{}/saved_model".format(model_save_dir), overwrite=False)
classification_model_serialized = tf.saved_model.load("{}/saved_model".format(model_save_dir))

### TFLite Conversion

TFLite conversion requires:
- static batch size
- static sequence length

In [None]:
# Rebuild the model with a static batch size and sequence length for TFLite.
model_layer, model, config = BertModel(model_name='bert-base-uncased', 
                                     batch_size=1, 
                                     sequence_length=128, 
                                     is_training=False
                                     )


classification_layer = Classification_Model(model=model,
                                      num_classes=2,
                                      is_training=False)
classification_model = classification_layer.get_model()
# Restore the fine-tuned weights saved earlier in the notebook.
classification_model.load_checkpoint(model_save_dir)

# Save to .pb format , we need it for tflite
tflite_saved_model_dir = "{}/saved_model_for_tflite".format(model_save_dir)
classification_model.save_as_serialize_module(tflite_saved_model_dir)


converter = tf.lite.TFLiteConverter.from_saved_model(tflite_saved_model_dir) # path to the SavedModel directory
converter.experimental_new_converter = True

tflite_model = converter.convert()

# Context manager so the file is flushed and closed right after writing.
with open("{}/converted_model.tflite".format(model_save_dir), "wb") as tflite_file:
    tflite_file.write(tflite_model)

### **In production**

- We can use either ```tf.keras.Model``` or ```saved_model```. I recommend ```saved_model```, which is much faster and avoids the hassle of keeping the architecture code around.

In [None]:
from tf_transformers.data import pad_dataset_normal

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_seq_length = 128


@pad_dataset_normal
def tokenizer_fn(texts):
    """
    Convert an iterable of raw texts into per-example feature lists.

    Returns a dict with 'input_ids', 'input_type_ids' and 'input_mask',
    each a list holding one list per input text.
    """
    batch_ids, batch_type_ids, batch_mask = [], [], []
    for text in texts:
        # -2 leaves room for the CLS and SEP special tokens.
        pieces = tokenizer.tokenize(text)[: max_seq_length - 2]
        ids = tokenizer.convert_tokens_to_ids(
            [tokenizer.cls_token] + pieces + [tokenizer.sep_token]
        )

        batch_ids.append(ids)
        batch_type_ids.append([0] * len(ids))
        batch_mask.append([1] * len(ids))

    return {
        'input_ids': batch_ids,
        'input_type_ids': batch_type_ids,
        'input_mask': batch_mask,
    }


# Build a classification pipeline around the serialized (SavedModel) model
# Maps the predicted class index to a readable label.
label_map_reverse = {0: 'unacceptable', 1: 'acceptable'}
pipeline = Classification_Pipeline( model = classification_model_serialized, 
                tokenizer_fn = tokenizer_fn, 
                label_map = label_map_reverse,
                batch_size=32)

# A few example sentences to run through the pipeline.
sentences = ['In which way is Sandy very anxious to see if the students will be able to solve the homework problem?',
            'The book was written by John.', 
            'Play Carnatic Fusion by Various Artists', 
            'She voted herself.']
result = pipeline(sentences)

### Sanity Check for TFLite

In [None]:
# Check same model with tflite

import numpy as np
import tensorflow as tf

# Load the TFLite model and allocate tensors.
interpreter = tf.lite.Interpreter(model_path="{}/converted_model.tflite".format(model_save_dir))
interpreter.allocate_tensors()

tflite_seq_length = 128

# Get input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Random token ids with an all-zero type-id tensor and an all-one mask.
sample_inputs = {}
sample_inputs['input_ids'] = tf.random.uniform(minval=0, maxval=100, 
                                               shape=(1, tflite_seq_length), dtype=tf.int32)
sample_inputs['input_type_ids'] = tf.zeros_like(sample_inputs['input_ids'])
sample_inputs['input_mask'] = tf.ones_like(sample_inputs['input_ids'])


def _tflite_input_index(feature_name, fallback_position):
    """Return the interpreter tensor index to use for `feature_name`.

    Looks for an input whose reported name contains `feature_name`; when no
    name matches, falls back to the entry at `fallback_position` in
    `input_details`.
    """
    for detail in input_details:
        if feature_name in detail['name']:
            return detail['index']
    return input_details[fallback_position]['index']


# Bind each feature by name where possible rather than relying only on the
# order of `input_details`; values are handed over as numpy arrays.
for position, feature_name in enumerate(['input_ids', 'input_mask', 'input_type_ids']):
    interpreter.set_tensor(_tflite_input_index(feature_name, position),
                           sample_inputs[feature_name].numpy())

interpreter.invoke()

# The function `get_tensor()` returns a copy of the tensor data.
# Use `tensor()` in order to get a pointer to the tensor.
tflite_output = interpreter.get_tensor(output_details[0]['index'])

model_output = classification_model_serialized(**sample_inputs) # Why ** ? because it is a saved model

# Check tf.reduce_sum(tflite_output), tf.reduce_sum(model_output['class_logits'])
# NOTE(review): 'class_logits' is the key exposed by the Keras model above —
# confirm the serialized model uses the same key.
# Both should match :-)