In [None]:
# Install tf-transformers from github

### SNIPS NLU

#### credit -> https://colab.research.google.com/drive/1wgWdxUpKf3FWJgqA6ogBGDEzxAosjJMI

Snips NLU consists of 2 tasks (Slot Filling and Classification)

Slot filling can be formulated as NER

# NER + Albert base using Fast Sentence Piece Alignment + Joint loss + TFlite Conversion

This tutorial contains code to fine-tune an Albert Model for NER (Token Classification).

In this notebook:
- Load the data + create ```tf.data.Dataset``` using TFProcesor
- Load Albert Model V2 and use it to create a Token Classification Model
- Train using ```tf.keras.Model.fit``` and ```Custom Trainer``` 
- Minimze loss per layer to find optimal layer
- Evaluate EXACT MATCH per layer
- Convert TFlite
- In production using faster ```tf.SavedModel``` + no architecture code

### Download Data

In [None]:
from urllib.request import urlretrieve
from pathlib import Path


SNIPS_DATA_BASE_URL = (
    "https://github.com/ogrisel/slot_filling_and_intent_detection_of_SLU/blob/"
    "master/data/snips/"
)
for filename in ["train", "valid", "test", "vocab.intent", "vocab.slot"]:
    path = Path(filename)
    if not path.exists():
        print(f"Downloading {filename}...")
        urlretrieve(SNIPS_DATA_BASE_URL + filename + "?raw=true", path)

In [21]:

import json
import tensorflow as tf
import time
import glob

from tf_transformers.utils.tokenization import BasicTokenizer, SPIECE_UNDERLINE # Special Piece for Albert
from tf_transformers.data.ner_utils_sp import fast_tokenize_and_align_sentence_for_ner

from tf_transformers.data import TFWriter, TFReader, TFProcessor
from tf_transformers.models import AlbertModel
from tf_transformers.tasks import Token_Classification_Model
from tf_transformers.core import optimization, SimpleTrainer
from tf_transformers.losses import cross_entropy_loss
from tf_transformers.pipeline import Token_Classification_Pipeline

from transformers import AlbertTokenizer
from absl import logging
logging.set_verbosity("INFO")

In [4]:
# Read SNIPS data to dataframe

import pandas as pd
import numpy as np
from pathlib import Path


def parse_line(line):
    utterance_data, intent_label = line.split(" <=> ")
    items = utterance_data.split()
    words = [item.rsplit(":", 1)[0] for item in items]
    word_labels = [item.rsplit(":", 1)[1] for item in items]
    return {
        "intent_label": intent_label,
        "words": " ".join(words),
        "word_labels": " ".join(word_labels),
        "length": len(words),
    }

lines_train = Path("train").read_text().strip().splitlines()
lines_valid = Path("valid").read_text().strip().splitlines()
lines_test  = Path("test").read_text().strip().splitlines()

df_train = pd.DataFrame([parse_line(line) for line in lines_train])
df_valid = pd.DataFrame([parse_line(line) for line in lines_valid])
df_test  = pd.DataFrame([parse_line(line) for line in lines_test])

# Slot labels
slot_names = ["[PAD]"]
slot_names += Path("vocab.slot").read_text().strip().splitlines()
slot_map = {}
for label in slot_names:
    slot_map[label] = len(slot_map)
    
# id ->  labels
slot_map_reverse = {v:k for k,v in slot_map.items()}

### Load Tokenizer

In [5]:
# Load HuggingFace Tokenizer
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

### Parse train data 

We will use a simple ```generator``` to iterate over the data and use ```TFProcessor``` to convert it to ```tf.data.Dataset``` in 2 lines

In [6]:
is_training = True
label_pad_token="[PAD]"

# Convert train examples to properly aligned examples
ignored_examples_index = []

all_flat_tokens = []
all_flat_labels = []
for index, row in df_train.iterrows():
    sentence = row["words"]
    labels = row["word_labels"]
    word_tokens = sentence.split()
    label_tokens = labels.split()
    if len(word_tokens) != len(label_tokens):
        ignored_examples_index.append(index)
        continue
    aligned_words, sub_words_mapped, flat_tokens, flat_labels = fast_tokenize_and_align_sentence_for_ner(
        tokenizer, sentence, word_tokens, SPIECE_UNDERLINE, is_training, label_tokens, label_pad_token=label_pad_token)
    all_flat_tokens.append(flat_tokens)
    all_flat_labels.append(flat_labels)
    

# Convert tokens to id and add type_ids
# input_mask etc
# This is user specific/ tokenizer specific
# Eg: Roberta has input_type_ids = 0, BERT has input_type_ids = [0, 1]

def parse_train():
    """
    convert text to inputs (ids)
    """
    
    for i in range(len(all_flat_tokens)):
        
        flat_tokens = all_flat_tokens[i]
        flat_labels = all_flat_labels[i]
        # Tokenizer will automatically set [BOS] <text> [EOS]
        result = {}
        result["input_ids"] = tokenizer.convert_tokens_to_ids([tokenizer.cls_token] +  flat_tokens + [tokenizer.bos_token])
        result["input_mask"] = [1] * len(result["input_ids"])
        result["input_type_ids"] = [0] * len(result["input_ids"])
        labels = [slot_map[token] for token in flat_labels]
        labels = [slot_map[label_pad_token]] + labels + [slot_map[label_pad_token]]  # for [CLS] and [SEP]
        
        # We dont want label_pad_token (which is mostly because of multiple subowrds)
        label_mask = []
        for token in flat_labels:
            if token == [label_pad_token]:
                label_mask.append(0)
                continue
            label_mask.append(1)
        label_mask = [0] + label_mask + [0]  # for [CLS] and [SEP]
        
        result["labels"] = labels
        result["label_mask"] = label_mask
        yield result


# use TFProcessor only if your data is in range of 10k - 20k maximum
# otherwise use TFWriter
# This will create a (batch_inputs, batch_labels) tuple dataset
batch_size = 32
tf_processor = TFProcessor()
train_dataset = tf_processor.process(parse_fn=parse_train())
x_keys = ['input_ids', 'input_type_ids', 'input_mask']
y_keys = ['labels', 'label_mask']
train_dataset = tf_processor.auto_batch(train_dataset, batch_size=batch_size,
                                        x_keys = x_keys,
                                        y_keys = y_keys,
                                        shuffle=True, 
                                        drop_remainder=True)

INFO:absl:Processed  10000 examples so far
INFO:absl:Total individual observations/examples written is 13073


### Load Albert V2 Model 

We will use Albert Model and load the checkpoints.
To convert Huggingface models to checkpoints, refer ```conversion``` notebooks


In [31]:
# Lets load Albert Model

model_layer, model, config = AlbertModel(model_name='albert_base_v2', 
                   is_training=True, 
                   use_dropout=False
                   )
model.load_checkpoint("/mnt/home/PRE_MODELS/LegacyAI_models/checkpoints/albert-base-v2/")

# model_layer -> Legacylayer inherited from tf.keras.Layer
# model -> legacyModel inherited from tf.keras.Model

INFO:absl:Initialized Variables
INFO:absl:Succesful: Model checkpoints matched


### Load Token Classification Model

In [32]:

model_ner = Token_Classification_Model(model=model,
                                      token_vocab_size=len(slot_map), # Vocab Size
                                      use_all_layers=True, 
                                      is_training=True)
model_ner = model_ner.get_model()

# Delete models to save memory

del model
del model_layer

### Define Loss

Loss function is simple, softmax over NER token vocab .

In [9]:
def token_loss(y_true_dict, token_logits):
    loss = cross_entropy_loss(
        labels=y_true_dict["labels"],
        logits=token_logits,
        label_weights=y_true_dict["label_mask"],
    )
    return loss

def token_loss_all_layers(y_true_dict, y_pred_dict):
    layer_loss = []
    for token_logits in y_pred_dict['token_logits']:
        loss = token_loss(y_true_dict, token_logits)
        layer_loss.append(loss)
    return tf.reduce_mean(layer_loss)

### Define Optimizer

In [10]:
train_data_size = 13000
learning_rate   = 2e-5
steps_per_epoch = int(train_data_size / batch_size)
EPOCHS = 3
num_train_steps = steps_per_epoch * EPOCHS
warmup_steps = int(0.1 * num_train_steps)
# creates an optimizer with learning rate schedule
optimizer_type = 'adamw'
optimizer, learning_rate_fn = optimization.create_optimizer(learning_rate,
                                                steps_per_epoch * EPOCHS,
                                                warmup_steps,
                                                optimizer_type)

INFO:absl:using Adamw optimizer


### Train Using Keras :-)

- ```compile2``` allows you to have directly use model outputs as well batch dataset outputs into the loss function, without any further complexity.

Note: For ```compile2```, loss_fn must be None, and custom_loss_fn must be active. Metrics are not supprted for time being.

In [11]:
# # Compile
loss_fn = {'token_logits': token_loss_all_layers}
model_ner.compile2(optimizer=optimizer, 
                             loss=None, 
                             custom_loss=loss_fn)

# Change steps per epoch to large value/ ignore it completely to train
# on full dataset
history = model_ner.fit(train_dataset, epochs=2, steps_per_epoch=10)

Epoch 1/2
















Epoch 2/2


### Train using SimpleTrainer (part of tf-transformers)

In [None]:
history = SimpleTrainer(model = model_ner,
             optimizer = optimizer,
             loss_fn = token_loss_all_layers,
             dataset = train_dataset.repeat(EPOCHS+1), # This is important
             epochs = EPOCHS, 
             num_train_examples = train_data_size, 
             batch_size = batch_size, 
             steps_per_call=100)

### Save Models 

You can save models as checkpoints using ```.save_checkpoint``` attribute, which is a part of all ```LegacyModels```

In [None]:
model_save_dir = "../OFFICIAL_MODELS/snips/albert_base"
model_ner.save_checkpoint(model_save_dir)

### Parse validation data

Like before we use ```TFProcessor``` to create datasets.

In [14]:


# Convert train examples to properly aligned examples
ignored_examples_index = []

dev_flat_tokens = []
dev_flat_labels = []
for index, row in df_valid.iterrows():
    sentence = row["words"]
    labels = row["word_labels"]
    word_tokens = sentence.split()
    label_tokens = labels.split()
    if len(word_tokens) != len(label_tokens):
        ignored_examples_index.append(index)
        continue
    aligned_words, sub_words_mapped, flat_tokens, flat_labels = fast_tokenize_and_align_sentence_for_ner(
        tokenizer, sentence, word_tokens, SPIECE_UNDERLINE, is_training, label_tokens, label_pad_token="[PAD]")
    dev_flat_tokens.append(flat_tokens)
    dev_flat_labels.append(flat_labels)
    

# Convert tokens to id and add type_ids
# input_mask etc
# This is user specific/ tokenizer specific
# Eg: Roberta has input_type_ids = 0, BERT has input_type_ids = [0, 1]

label_pad_token="[PAD]"
def parse_dev():
    """
    convert text to inputs (ids)
    """
    
    for i in range(len(dev_flat_tokens)):
        
        flat_tokens = dev_flat_tokens[i]
        flat_labels = dev_flat_labels[i]
        # Tokenizer will automatically set [BOS] <text> [EOS]
        result = {}
        result["input_ids"] = tokenizer.convert_tokens_to_ids([tokenizer.cls_token] +  flat_tokens + [tokenizer.bos_token])
        result["input_mask"] = [1] * len(result["input_ids"])
        result["input_type_ids"] = [0] * len(result["input_ids"])
        labels = [slot_map[token] for token in flat_labels]
        labels = [slot_map[label_pad_token]] + labels + [slot_map[label_pad_token]]  # for [CLS] and [SEP]
        
        # We dont want label_pad_token (which is mostly because of multiple subowrds)
        label_mask = []
        for token in flat_labels:
            if token == [label_pad_token]:
                label_mask.append(0)
                continue
            label_mask.append(1)
        label_mask = [0] + label_mask + [0]  # for [CLS] and [SEP]
        result["labels"] = labels
        result["label_mask"] = label_mask
        yield result

batch_size = 32
tf_processor = TFProcessor()
dev_dataset = tf_processor.process(parse_fn=parse_dev())
x_keys = ['input_ids', 'input_type_ids', 'input_mask']
y_keys = ['labels', 'label_mask']
dev_dataset = tf_processor.auto_batch(dev_dataset, batch_size=batch_size,
                                        x_keys = x_keys,
                                        y_keys = y_keys,
                                        shuffle=False, drop_remainder=False)

INFO:absl:Total individual observations/examples written is 700


### EXACT MATCH based evaluation
Lets see our idea of jointly minimze loss will bring
some benefits or not. If so, that will reduce the overall latency :-)

In [15]:
# Exact Match Evaluation .

num_layers = 12
prediction_per_layer = {i:[] for i in range(num_layers)}
original_labels = []
for (batch_inputs, batch_labels) in dev_dataset:
    results = model_ner(batch_inputs)
    model_logits = results['token_logits'][-1]
    
    for i, model_logits in enumerate(results['token_logits']):
    
        # Iterate over each example
        for index, per_example_logits in enumerate(model_logits):
            per_example_length = tf.reduce_sum(batch_inputs['input_mask'][index])
            per_example_label  = batch_labels['labels'][index][:per_example_length][1:-1] # we dont want pad positions and 1:-1 is to remove CLS and SEP
            per_example_logits = per_example_logits[:per_example_length][1:-1] # 1:-1 CLS and SEP
            per_example_preds  = tf.argmax(per_example_logits, axis=-1)
            prediction_per_layer[i].append(per_example_preds)
            
            # We want the original label only once
            if i == 0:
                original_labels.append(per_example_label)
    
    
# We have 700 examples
for layer_iter in range(num_layers):
    result = prediction_per_layer[layer_iter]
    
    pred_list = []
    for i in range(700):
        pred = list(result[i].numpy())
        orig = list(original_labels[i].numpy())
        if pred == orig:
            pred_list.append(1)
        else:
            pred_list.append(0)
    print("Layer {} exact match {} / 700".format(layer_iter, sum(pred_list)))
    
    
# Layer 0 exact match 223 / 700
# Layer 1 exact match 428 / 700
# Layer 2 exact match 511 / 700
# Layer 3 exact match 558 / 700
# Layer 4 exact match 571 / 700
# Layer 5 exact match 576 / 700
# Layer 6 exact match 587 / 700
# Layer 7 exact match 594 / 700
# Layer 8 exact match 592 / 700
# Layer 9 exact match 591 / 700
# Layer 10 exact match 596 / 700
# Layer 11 exact match 586 / 700

Layer 0 exact match 223 / 700
Layer 1 exact match 428 / 700
Layer 2 exact match 511 / 700
Layer 3 exact match 558 / 700
Layer 4 exact match 571 / 700
Layer 5 exact match 576 / 700
Layer 6 exact match 587 / 700
Layer 7 exact match 594 / 700
Layer 8 exact match 592 / 700
Layer 9 exact match 591 / 700
Layer 10 exact match 596 / 700
Layer 11 exact match 586 / 700


### Save as Serialized version 

- Now we can use ```save_as_serialize_module``` to save a model directly to saved_model

In [36]:
# Reload with only 5 layers 
model_layer, model, config = AlbertModel(model_name='albert-base-v2', 
                                     num_hidden_layers=5, 
                                     is_training=False
                                     )


model_ner = Token_Classification_Model(model=model,
                                      token_vocab_size=len(slot_map),
                                      use_all_layers=False, 
                                      is_training=False)
model_ner = model_ner.get_model()
model_ner.load_checkpoint(model_save_dir)

model_ner.save_as_serialize_module("{}/saved_model".format(model_save_dir), overwrite=True)


INFO:absl:Overwride num_hidden_layers with 5
INFO:absl:We are overwriding `is_training` is False to                         `is_training` to True with `use_dropout` is False, no effects on your inference pipeline
INFO:absl:Initialized Variables
INFO:absl:Succesful: Model checkpoints matched


INFO:tensorflow:Assets written to: ../OFFICIAL_MODELS/snips/albert_base/saved_model/assets


INFO:tensorflow:Assets written to: ../OFFICIAL_MODELS/snips/albert_base/saved_model/assets


### TFLite Conversion

TFlite conversion requires:
- static batch size
- static sequence length

In [17]:
# So, layer 5 itself is giving 576/700 
# Thats great. Lets finalize the model with 5 hidden layers (instead of 12)

model_layer, model, config = AlbertModel(model_name='albert-base-v2', 
                                     batch_size=1, 
                                     sequence_length=45,  # 45 is enough for SNIP 
                                     num_hidden_layers=5, 
                                     is_training=False
                                     )


model_ner = Token_Classification_Model(model=model,
                                      token_vocab_size=len(slot_map),
                                      use_all_layers=False, 
                                      is_training=False)
model_ner = model_ner.get_model()
model_ner.load_checkpoint(model_save_dir)

# Save to .pb format , we need it for tflite

model_ner.save_as_serialize_module("{}/saved_model_for_tflite".format(model_save_dir))


converter = tf.lite.TFLiteConverter.from_saved_model("{}/saved_model_for_tflite".format(model_save_dir)) # path to the SavedModel directory
converter.experimental_new_converter = True

tflite_model = converter.convert()

open("{}/converted_model.tflite".format(model_save_dir), "wb").write(tflite_model)

INFO:absl:Using experimental converter: If you encountered a problem please file a bug. You can opt-out by setting experimental_new_converter=False


44829296

### **In production**

- We can use either ```tf.keras.Model``` or ```saved_model```. I recommend saved_model, which is much much faster and no hassle of having architecture code

In [None]:
def tokenizer_fn(feature):
    """
    feature: tokenized text (tokenizer.tokenize)
    """
    result = {}
    result["input_ids"] = tokenizer.convert_tokens_to_ids([tokenizer.cls_token] +  feature['input_ids'] + [tokenizer.bos_token])
    result["input_mask"] = [1] * len(result["input_ids"])
    result["input_type_ids"] = [0] * len(result["input_ids"])
    return result

# load serialized model
model_ner = tf.saved_model.load("{}/saved_model".format(model_save_dir))
slot_map_reverse = {v:k for k,v in slot_map.items()}
pipeline = Token_Classification_Pipeline( model = model_ner, 
                tokenizer = tokenizer, 
                tokenizer_fn = tokenizer_fn, 
                SPECIAL_PIECE = SPIECE_UNDERLINE,
                label_map = slot_map_reverse,
                max_seq_length = 128,
                batch_size=32)

sentences = ['I would love to listen to Carnatic music by Yesudas', 
            'Play Carnatic Fusion by Various Artists', 
            'Please book 2 tickets from Bangalore to Kerala']
result = pipeline(sentences)

In [38]:
result

[{'sentence': 'I would love to listen to Carnatic music by Yesudas',
  'original_words': ['I',
   'would',
   'love',
   'to',
   'listen',
   'to',
   'Carnatic',
   'music',
   'by',
   'Yesudas'],
  'predicted_labels': ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-album',
   'O',
   'O',
   'B-artist'],
  'predicted_probs': array([2.1276660e-01, 5.5502349e-01, 8.1709892e-02, 3.0246880e-02,
         4.6255462e-02, 4.7849871e-02, 8.4232342e-06, 2.8101532e-03,
         2.2861654e-02, 4.6765100e-04], dtype=float32)},
 {'sentence': 'Play Carnatic Fusion by Various Artists',
  'original_words': ['Play', 'Carnatic', 'Fusion', 'by', 'Various', 'Artists'],
  'predicted_labels': ['O', 'B-album', 'I-album', 'O', 'O', 'O'],
  'predicted_probs': array([0.57828164, 0.00286884, 0.00589877, 0.31501585, 0.03300228,
         0.06493266], dtype=float32)},
 {'sentence': 'Please book 2 tickets from Bangalore to Kerala',
  'original_words': ['Please',
   'book',
   '2',
   'tickets',
   'from',
   

### Sanity Check for TFlite

In [40]:
# Check same model with tflite

import numpy as np
import tensorflow as tf

# Load the TFLite model and allocate tensors.
interpreter = tf.lite.Interpreter(model_path="{}/converted_model.tflite".format(model_save_dir))
interpreter.allocate_tensors()

tflite_seq_length = 45

# Get input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

sample_inputs = {}
sample_inputs['input_ids'] = tf.random.uniform(minval=0, maxval=100, 
                                                                    shape=(1, tflite_seq_length), dtype=tf.int32)
sample_inputs['input_type_ids'] = tf.zeros_like(sample_inputs['input_ids'])
sample_inputs['input_mask'] = tf.ones_like(sample_inputs['input_ids'])

interpreter.set_tensor(input_details[0]['index'], sample_inputs['input_ids'])
interpreter.set_tensor(input_details[1]['index'],  sample_inputs['input_mask'])
interpreter.set_tensor(input_details[2]['index'], sample_inputs['input_type_ids'])

interpreter.invoke()

# The function `get_tensor()` returns a copy of the tensor data.
# Use `tensor()` in order to get a pointer to the tensor.
tflite_output = interpreter.get_tensor(output_details[0]['index'])

model_output = model_ner(**sample_inputs) # Why ** ? because it is a saved model

# Check tf.reduce_sum(tflite_output), tf.reduce_sum(model_output['token_logits'])
# Both matches :-)