In [None]:
# Install tf-transformers from github

In [13]:
import json
import tensorflow as tf
import time
import glob
import collections

from tf_transformers.utils.tokenization import BasicTokenizer, SPIECE_UNDERLINE
from tf_transformers.utils import fast_sp_alignment
from tf_transformers.data.squad_utils_sp import (
    read_squad_examples,
    post_clean_train_squad,
    example_to_features_using_fast_sp_alignment_train,
    example_to_features_using_fast_sp_alignment_test, 
    _get_best_indexes
)
from tf_transformers.data import TFWriter, TFReader, TFProcessor
from tf_transformers.models import AlbertModel
from tf_transformers.core import optimization, SimpleTrainer
from tf_transformers.tasks import Span_Selection_Model

from transformers import AlbertTokenizer
from absl import logging
logging.set_verbosity("INFO")

from tf_transformers.pipeline.span_extraction_pipeline import Span_Extraction_Pipeline

### Load Tokenizer

In [3]:
# Two tokenizers are needed: the HuggingFace ALBERT tokenizer for sub-word
# pieces, and a case-preserving BasicTokenizer that is handed to the SQuAD
# clean-up helper further down.
tokenizer = AlbertTokenizer.from_pretrained(
    "albert-base-v2"
)
basic_tokenizer = BasicTokenizer(
    do_lower_case=False
)

### Convert train data to Features

* Using Fast SentencePiece Alignment, we convert text to features (text -> list of sub-words).

In [4]:
input_file_path = '/mnt/home/PRE_MODELS/HuggingFace_models/datasets/squadv1.1/train-v1.1.json'

is_training = True

# Step 1: read the raw SQuAD json into examples and report the read time.
start_time = time.time()
train_examples = read_squad_examples(
    input_file=input_file_path,
    is_training=is_training,
    version_2_with_negative=False,
)
end_time = time.time()
print('Time taken {}'.format(end_time - start_time))

# Step 2: post-process the examples (cleans the text to avoid unwanted unicode
# characters); examples that could not be processed come back separately.
train_examples_processed, failed_examples = post_clean_train_squad(
    train_examples,
    basic_tokenizer,
    is_training=is_training,
)

# Step 3: turn question, context and answer into features made of tokenized
# words (not word indices yet).
feature_generator = example_to_features_using_fast_sp_alignment_train(
    tokenizer,
    train_examples_processed,
    max_seq_length=384,
    max_query_length=64,
    doc_stride=128,
    SPECIAL_PIECE=SPIECE_UNDERLINE,
)

# Drain the generator into a list so the features can be iterated again later.
all_features = list(feature_generator)
end_time = time.time()
# This second figure is measured from the same start_time, so it covers all three steps.
print("time taken {} seconds".format(end_time - start_time))

Time taken 0.7315692901611328


INFO:absl:Time taken 41.1900794506073
INFO:absl:Wrote 1012 pos and 0 neg examples
INFO:absl:Wrote 2019 pos and 0 neg examples
INFO:absl:Wrote 3031 pos and 0 neg examples
INFO:absl:Wrote 4036 pos and 0 neg examples
INFO:absl:Wrote 5042 pos and 0 neg examples
INFO:absl:Wrote 6042 pos and 0 neg examples
INFO:absl:Wrote 7043 pos and 0 neg examples
INFO:absl:Wrote 8043 pos and 0 neg examples
INFO:absl:Wrote 9056 pos and 0 neg examples
INFO:absl:Wrote 10056 pos and 0 neg examples
INFO:absl:Wrote 11058 pos and 0 neg examples
INFO:absl:Wrote 12064 pos and 0 neg examples
INFO:absl:Wrote 13065 pos and 0 neg examples
INFO:absl:Wrote 14065 pos and 0 neg examples
INFO:absl:Wrote 15067 pos and 0 neg examples
INFO:absl:Wrote 16067 pos and 0 neg examples
INFO:absl:Wrote 17067 pos and 0 neg examples
INFO:absl:Wrote 18067 pos and 0 neg examples
INFO:absl:Wrote 19067 pos and 0 neg examples
INFO:absl:Wrote 20072 pos and 0 neg examples
INFO:absl:Wrote 21081 pos and 0 neg examples
INFO:absl:Wrote 22081 pos 

time taken 173.22707796096802 seconds


### Convert features to TFRecords using TFWriter

In [11]:
# Convert tokens to id and add type_ids
# input_mask etc
# This is user specific/ tokenizer specific
# Eg: Roberta has input_type_ids = 0, BERT has input_type_ids = [0, 1]

def parse_train():
    """Yield one record dict per entry of the module-level ``all_features``.

    Each feature's ``'input_ids'`` entry holds tokens, which are converted to
    ids with the module-level ``tokenizer``. Positions before the first
    separator token get type id 0; the separator and everything after it get
    type id 1. The mask is 1 for every position.

    Yields:
        dict with keys ``input_ids``, ``input_type_ids``, ``input_mask``,
        ``start_position`` and ``end_position``. A new dict is created for
        every feature, so a consumer that keeps references to earlier items
        does not see them overwritten by later ones.
    """
    for f in all_features:
        sep_index = f['input_ids'].index(tokenizer.sep_token)
        input_ids = tokenizer.convert_tokens_to_ids(f['input_ids'])
        input_type_ids = [0] * len(input_ids[:sep_index]) + [1] * len(input_ids[sep_index:])
        input_mask = [1] * len(input_ids)

        # Build the record inside the loop: the previous version mutated and
        # re-yielded one shared dict, so all yielded items were the same object.
        yield {
            'input_ids': input_ids,
            'input_type_ids': input_type_ids,
            'input_mask': input_mask,
            'start_position': f['start_position'],
            'end_position': f['end_position'],
        }
        

# Lets write using TF Writer
# Use TFProcessor for smaller data

# Every field is declared with the same ("var_len", "int") spec.
schema = {
    field_name: ("var_len", "int")
    for field_name in (
        'input_ids',
        'input_type_ids',
        'input_mask',
        'start_position',
        'end_position',
    )
}

tfrecord_train_dir = '../../OFFICIAL_TFRECORDS/squad/albert/train'
tfrecord_filename = 'squad'
tfwriter = TFWriter(
    schema=schema,
    file_name=tfrecord_filename,
    model_dir=tfrecord_train_dir,
    tag='train',
    overwrite=True,
)
# parse_train() is called here, so the writer receives the generator object itself.
tfwriter.process(parse_fn=parse_train())

INFO:absl:Wrote 1000 tfrecods
INFO:absl:Wrote 2000 tfrecods
INFO:absl:Wrote 3000 tfrecods
INFO:absl:Wrote 4000 tfrecods
INFO:absl:Wrote 5000 tfrecods
INFO:absl:Wrote 6000 tfrecods
INFO:absl:Wrote 7000 tfrecods
INFO:absl:Wrote 8000 tfrecods
INFO:absl:Wrote 9000 tfrecods
INFO:absl:Wrote 10000 tfrecods
INFO:absl:Wrote 11000 tfrecods
INFO:absl:Wrote 12000 tfrecods
INFO:absl:Wrote 13000 tfrecods
INFO:absl:Wrote 14000 tfrecods
INFO:absl:Wrote 15000 tfrecods
INFO:absl:Wrote 16000 tfrecods
INFO:absl:Wrote 17000 tfrecods
INFO:absl:Wrote 18000 tfrecods
INFO:absl:Wrote 19000 tfrecods
INFO:absl:Wrote 20000 tfrecods
INFO:absl:Wrote 21000 tfrecods
INFO:absl:Wrote 22000 tfrecods
INFO:absl:Wrote 23000 tfrecods
INFO:absl:Wrote 24000 tfrecods
INFO:absl:Wrote 25000 tfrecods
INFO:absl:Wrote 26000 tfrecods
INFO:absl:Wrote 27000 tfrecods
INFO:absl:Wrote 28000 tfrecods
INFO:absl:Wrote 29000 tfrecods
INFO:absl:Wrote 30000 tfrecods
INFO:absl:Wrote 31000 tfrecods
INFO:absl:Wrote 32000 tfrecods
INFO:absl:Wrote 3

### Read TFRecords using TFReader

In [18]:
# Read Data


# Load the schema file stored next to the TFRecords. The context manager closes
# the file handle, which a bare json.load(open(...)) would leave open.
with open("{}/schema.json".format(tfrecord_train_dir)) as schema_file:
    schema = json.load(schema_file)
all_files = glob.glob("{}/*.tfrecord".format(tfrecord_train_dir))
tf_reader = TFReader(schema=schema,
                    tfrecord_files=all_files)

# Model inputs (x) and training targets (y) are requested by key name.
x_keys = ['input_ids', 'input_type_ids', 'input_mask']
y_keys = ['start_position', 'end_position']
batch_size = 16
train_dataset = tf_reader.read_record(auto_batch=True,
                                   keys=x_keys,
                                   batch_size=batch_size,
                                   x_keys = x_keys,
                                   y_keys = y_keys,
                                   shuffle=True,
                                   drop_remainder=True
                                  )

### Load Albert V2 Model

In [12]:
# Lets load Albert Model

# AlbertModel returns three objects: the layer, the model built from it, and its config.
model_layer, model, config = AlbertModel(
    model_name='albert_base_v2',
    is_training=True,
    use_dropout=False,
)
# Load the pretrained weights from the local checkpoint directory.
model.load_checkpoint(
    "/mnt/home/PRE_MODELS/LegacyAI_models/checkpoints/albert-base-v2/"
)

# model_layer -> Legacylayer inherited from tf.keras.Layer
# model -> legacyModel inherited from tf.keras.Model

INFO:absl:Initialized Variables
INFO:absl:Succesful: Model checkpoints matched


### Load Span Selection Model

In [14]:

# Wrap the ALBERT model in a span-selection layer. use_all_layers=True goes with
# the per-layer logits that the loss functions below iterate over.
span_selection_layer = Span_Selection_Model(
    model=model,
    use_all_layers=True,
    is_training=True,
)
span_selection_model = span_selection_layer.get_model()

In [15]:
# Delete to save up memory

# Only span_selection_model is used from here on; drop the other names.
del model, model_layer, span_selection_layer

### Define Loss

Loss function is simple.
* labels: 1D (batch_size) # start or end positions
* logits: 2D (batch_size x sequence_length)

**Joint loss** - We minimize the loss over every hidden layer.

In [16]:


def span_loss(position, logits):
    """Mean sparse softmax cross-entropy between target positions and logits.

    Args:
        position: integer targets; axis 1 is squeezed away before use, so a
            trailing singleton dimension is expected.
        logits: unnormalised scores passed as ``logits`` to the cross-entropy op.

    Returns:
        Scalar tensor: the cross-entropy averaged over all entries.
    """
    labels = tf.squeeze(position, axis=1)
    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels)
    return tf.reduce_mean(per_example_loss)

    
def start_span_loss_all_layers(y_true_dict, y_pred_dict):
    """Average the start-position span loss over every entry of ``start_logits``.

    Args:
        y_true_dict: mapping that holds the targets under ``'start_position'``.
        y_pred_dict: mapping whose ``'start_logits'`` value is an iterable of
            logits tensors (one span loss is computed per item).

    Returns:
        Scalar tensor: mean of the per-item span losses.
    """
    targets = y_true_dict['start_position']
    layer_loss = [
        span_loss(targets, start_logits)
        for start_logits in y_pred_dict['start_logits']
    ]
    return tf.reduce_mean(layer_loss)

def end_span_loss_all_layers(y_true_dict, y_pred_dict):
    """Average the end-position span loss over every entry of ``end_logits``.

    Args:
        y_true_dict: mapping that holds the targets under ``'end_position'``.
        y_pred_dict: mapping whose ``'end_logits'`` value is an iterable of
            logits tensors (one span loss is computed per item).

    Returns:
        Scalar tensor: mean of the per-item span losses.
    """
    targets = y_true_dict['end_position']
    layer_loss = [
        span_loss(targets, end_logits)
        for end_logits in y_pred_dict['end_logits']
    ]
    return tf.reduce_mean(layer_loss)

# Mean of start_loss and end_loss (their sum divided by two)
def joint_loss(y_true_dict, y_pred_dict):
    """Combine the start and end span losses into a single training loss.

    Args:
        y_true_dict: targets, forwarded unchanged to both per-head losses.
        y_pred_dict: model outputs, forwarded unchanged to both per-head losses.

    Returns:
        ``(start_loss + end_loss) / 2.0``.
    """
    start_loss = start_span_loss_all_layers(y_true_dict, y_pred_dict)
    end_loss = end_span_loss_all_layers(y_true_dict, y_pred_dict)
    total = start_loss + end_loss
    return total / 2.0

In [19]:
# Sanity check: print the inputs and labels of a single batch.
for batch_inputs, batch_labels in train_dataset.take(1):
    print(batch_inputs, batch_labels)

{'input_ids': <tf.Tensor: shape=(16, 323), dtype=int32, numpy=
array([[   2,   19,   98, ...,    0,    0,    0],
       [   2,   98, 3338, ...,    0,    0,    0],
       [   2,  496,   20, ...,    0,    0,    0],
       ...,
       [   2,  303,   29, ...,    0,    0,    0],
       [   2,  184,  151, ...,    0,    0,    0],
       [   2,   98, 2091, ...,    0,    0,    0]], dtype=int32)>, 'input_mask': <tf.Tensor: shape=(16, 323), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>, 'input_type_ids': <tf.Tensor: shape=(16, 323), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>} {'end_position': <tf.Tensor: shape=(16,

### Define Optimizer

In [29]:
# Training-size bookkeeping; train_data_size is hard-coded here.
train_data_size = 89000
learning_rate = 2e-5
steps_per_epoch = train_data_size // batch_size
EPOCHS = 3
num_train_steps = steps_per_epoch * EPOCHS
# Warm up over the first 10% of all training steps.
warmup_steps = int(0.1 * num_train_steps)

# Build the optimizer together with its learning-rate schedule.
optimizer_type = 'adamw'
optimizer, learning_rate_fn = optimization.create_optimizer(
    learning_rate,
    num_train_steps,
    warmup_steps,
    optimizer_type,
)

INFO:absl:using Adamw optimizer


### Train Using Keras :-)

- ```compile2``` lets you pass model outputs and batch dataset outputs directly into the loss function, without any further complexity.

Note: For ```compile2```, ```loss``` must be None and ```custom_loss``` must be provided. Metrics are not supported for the time being.

In [20]:
# Keras Fit

# One loss function per output key.
keras_loss_fn = {
    'start_logits': start_span_loss_all_layers,
    'end_logits': end_span_loss_all_layers,
}
# This demo compiles with a default Adam instance, not the scheduled
# optimizer created in the previous section.
span_selection_model.compile2(
    optimizer=tf.keras.optimizers.Adam(),
    loss=None,
    custom_loss=keras_loss_fn,
)
# Short run: 2 epochs of 10 steps each.
history = span_selection_model.fit(
    train_dataset,
    epochs=2,
    steps_per_epoch=10,
)

Epoch 1/2
















Epoch 2/2


### Train using SimpleTrainer (part of tf-transformers)

In [None]:
# Custom training
history = SimpleTrainer(
    model=span_selection_model,
    optimizer=optimizer,
    loss_fn=joint_loss,
    # This is important: the dataset is repeated for one more pass than EPOCHS.
    dataset=train_dataset.repeat(EPOCHS + 1),
    epochs=EPOCHS,
    num_train_examples=train_data_size,
    batch_size=batch_size,
    steps_per_call=100,
    gradient_accumulation_steps=None,
)

INFO:absl:Global Steps 165
  0%|          | 0/165 [00:00<?, ?it/s]









### Save Models 

You can save models as checkpoints using ```.save_checkpoint``` attribute, which is a part of all ```LegacyModels```

In [9]:
# Directory that receives the fine-tuned span-selection checkpoint.
model_save_dir = "../../OFFICIAL_MODELS/joint_loss/squad/albert"
span_selection_model.save_checkpoint(
    model_save_dir
)

INFO:absl:Succesful: Model checkpoints matched


### Parse validation data

We use ```TFProcessor``` to create validation data, because dev data is small

In [8]:
# Convert to features
dev_input_file_path = '/mnt/home/PRE_MODELS/HuggingFace_models/datasets/squadv1.1/dev-v1.1.json'

is_training = False

# Read the dev examples and report how long the read took.
start_time = time.time()
dev_examples = read_squad_examples(
    input_file=dev_input_file_path,
    is_training=is_training,
    version_2_with_negative=False,
)
end_time = time.time()
print('Time taken {}'.format(end_time - start_time))

# NOTE(review): the training cell unpacks two values from this helper, while
# here the result is kept as one object -- confirm what it returns when
# is_training=False.
dev_examples_cleaned = post_clean_train_squad(
    dev_examples,
    basic_tokenizer,
    is_training=False,
)

# The test-time converter returns per-question info keyed by qas_id, plus the feature list.
qas_id_info, dev_features = example_to_features_using_fast_sp_alignment_test(
    tokenizer,
    dev_examples_cleaned,
    max_seq_length=384,
    max_query_length=64,
    doc_stride=128,
    SPECIAL_PIECE=SPIECE_UNDERLINE,
)

# Features to TF Dataset


def parse_dev():
    """Yield one model-input dict per entry of the module-level ``dev_features``.

    Each feature's ``'input_ids'`` entry holds tokens, which are converted to
    ids with the module-level ``tokenizer``. Positions before the first
    separator token get type id 0; the separator and everything after it get
    type id 1. The mask is 1 for every position.

    Yields:
        dict with keys ``input_ids``, ``input_type_ids`` and ``input_mask``.
        A new dict is created for every feature, so a consumer that keeps
        references to earlier items does not see them overwritten.
    """
    for f in dev_features:
        sep_index = f['input_ids'].index(tokenizer.sep_token)
        input_ids = tokenizer.convert_tokens_to_ids(f['input_ids'])
        input_type_ids = [0] * len(input_ids[:sep_index]) + [1] * len(input_ids[sep_index:])
        input_mask = [1] * len(input_ids)

        # Build the record inside the loop: the previous version mutated and
        # re-yielded one shared dict, so all yielded items were the same object.
        yield {
            'input_ids': input_ids,
            'input_type_ids': input_type_ids,
            'input_mask': input_mask,
        }

# Build the dev dataset from the generator, then batch it 32 at a time.
tf_processor = TFProcessor()
dev_dataset = tf_processor.process(
    parse_fn=parse_dev()
)
dev_dataset = tf_processor.auto_batch(
    dev_dataset,
    batch_size=32,
)

Time taken 0.07536649703979492


### Evaluate Exact Match

* Make Predictions
* Extract Answers
* Evaluate

### Make Batch Predictions

In [28]:
def extract_from_dict(dict_items, key):
    """Collect ``item[key]`` for every item of ``dict_items``, in order.

    Args:
        dict_items: iterable of objects that support ``item[key]``.
        key: the key looked up in each item.

    Returns:
        A list with one looked-up value per item.
    """
    return [entry[key] for entry in dict_items]
# Per-feature bookkeeping, taken in dev_features order.
qas_id_list = extract_from_dict(dev_features, 'qas_id')
doc_offset_list = extract_from_dict(dev_features, 'doc_offset')

# Make batch predictions; one list of per-batch logits is kept for each layer index.
num_layers = 12
per_layer_start_logits = {layer_idx: [] for layer_idx in range(num_layers)}
per_layer_end_logits = {layer_idx: [] for layer_idx in range(num_layers)}

start_time = time.time()
for batch_inputs in dev_dataset:
    model_outputs = span_selection_model(batch_inputs)
    # Each output key yields a sequence of logits; item i goes to bucket i.
    for i, start_logits in enumerate(model_outputs['start_logits']):
        per_layer_start_logits[i].append(start_logits)
    for i, end_logits in enumerate(model_outputs['end_logits']):
        per_layer_end_logits[i].append(end_logits)
end_time = time.time()
print('Time taken {}'.format(end_time - start_time))

### Extract Answers (text) from Predictions

* It's a little tricky, as there will be multiple features for one example if it is longer than max_seq_length.

In [None]:
n_best_size = 20  # top N answers
max_answer_length = 30  # Max answer length

# Read the dev set inside a context manager so the file handle is closed.
with open(dev_input_file_path) as squad_dev_file:
    squad_dev_data = json.load(squad_dev_file)['data']

# Record for one candidate span. Built once up front: it depends on neither
# the layer nor the question, so there is no need to recreate it in the loop.
_PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
    "PrelimPrediction",
    ["feature_index", "start_index", "end_index",
     "start_log_prob", "end_log_prob"])

# NOTE: evaluate_v1 is defined in the cell that follows this one. That cell
# has to be executed first, otherwise the call below raises NameError.
layer_results = []
for layer in range(num_layers):

    # Flatten the per-batch logits of this layer into one tensor per feature;
    # they are indexed below in parallel with dev_features.
    start_logits_unstacked = []
    end_logits_unstacked = []
    for batch_start_logits in per_layer_start_logits[layer]:
        start_logits_unstacked.extend(tf.unstack(batch_start_logits))
    for batch_end_logits in per_layer_end_logits[layer]:
        end_logits_unstacked.extend(tf.unstack(batch_end_logits))

    # Group the features (and their logits) by question id. A question has
    # several features when it was split into multiple windows.
    qas_id_logits = {}
    for i in range(len(qas_id_list)):
        qas_id = qas_id_list[i]
        example = qas_id_info[qas_id]
        feature = dev_features[i]
        assert qas_id == feature['qas_id']
        if qas_id not in qas_id_logits:
            qas_id_logits[qas_id] = {'tok_to_orig_index': example['tok_to_orig_index'],
                                     'aligned_words': example['aligned_words'],
                                     'feature_length': [len(feature['input_ids'])],
                                     'doc_offset': [doc_offset_list[i]],
                                     'passage_start_pos': [feature['input_ids'].index(tokenizer.sep_token) + 1],
                                     'start_logits': [start_logits_unstacked[i]],
                                     'end_logits': [end_logits_unstacked[i]]}

        else:
            qas_id_logits[qas_id]['start_logits'].append(start_logits_unstacked[i])
            qas_id_logits[qas_id]['end_logits'].append(end_logits_unstacked[i])
            qas_id_logits[qas_id]['feature_length'].append(len(feature['input_ids']))
            qas_id_logits[qas_id]['doc_offset'].append(doc_offset_list[i])
            qas_id_logits[qas_id]['passage_start_pos'].append(feature['input_ids'].index(tokenizer.sep_token) + 1)

    qas_id_answer = {}
    skipped = []       # questions whose best span could not be mapped back to words
    skipped_null = []  # questions with no valid candidate span at all
    # Walks dev_features in step with the grouped questions; the assert below
    # checks that each question's features are contiguous in dev_features.
    global_counter = 0
    for qas_id in qas_id_logits:

        current_example = qas_id_logits[qas_id]

        prelim_predictions = []
        example_features = []
        for i in range(len(current_example['start_logits'])):
            f = dev_features[global_counter]
            assert f['qas_id'] == qas_id
            example_features.append(f)
            global_counter += 1
            passage_start_pos = current_example['passage_start_pos'][i]
            feature_length = current_example['feature_length'][i]

            # Truncate to the feature's own length so positions beyond it are never candidates.
            start_log_prob_list = current_example['start_logits'][i].numpy().tolist()[:feature_length]
            end_log_prob_list = current_example['end_logits'][i].numpy().tolist()[:feature_length]
            start_indexes = _get_best_indexes(start_log_prob_list, n_best_size)
            end_indexes = _get_best_indexes(end_log_prob_list, n_best_size)

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index < passage_start_pos or end_index < passage_start_pos:
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    start_log_prob = start_log_prob_list[start_index]
                    end_log_prob = end_log_prob_list[end_index]
                    # Make the indices relative to the start of the passage.
                    start_idx = start_index - passage_start_pos
                    end_idx = end_index - passage_start_pos

                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=i,
                            start_index=start_idx,
                            end_index=end_idx,
                            start_log_prob=start_log_prob,
                            end_log_prob=end_log_prob))

        # Best candidate first: highest start score plus end score.
        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_log_prob + x.end_log_prob),
            reverse=True)

        if prelim_predictions:
            best_index = prelim_predictions[0].feature_index
            aligned_words = current_example['aligned_words']
            try:
                tok_to_orig_index = current_example['tok_to_orig_index']
                # doc_offset shifts the window-relative index back into the full passage.
                reverse_start_index_align = tok_to_orig_index[prelim_predictions[0].start_index + example_features[best_index]['doc_offset']]  # aligned index
                reverse_end_index_align = tok_to_orig_index[prelim_predictions[0].end_index + example_features[best_index]['doc_offset']]

                predicted_words = [w for w in aligned_words[reverse_start_index_align: reverse_end_index_align + 1] if w != SPIECE_UNDERLINE]
                predicted_text = ' '.join(predicted_words)
                qas_id_answer[qas_id] = predicted_text
            except Exception:
                # Still best-effort (falls back to an empty answer), but unlike a
                # bare `except:` this lets KeyboardInterrupt/SystemExit through.
                qas_id_answer[qas_id] = ""
                skipped.append(qas_id)
        else:
            qas_id_answer[qas_id] = ""
            skipped_null.append(qas_id)
    eval_results = evaluate_v1(squad_dev_data, qas_id_answer)
    layer_results.append(eval_results)
    print("Layer {} , results {}".format(layer, eval_results))


with open("squad_albert_joint_loss.json", 'w') as f:
    json.dump(layer_results, f)

In [None]:
import collections
import string
import six
import re
    
from tf_transformers.data.squad_utils_sp import _get_best_indexes, _compute_softmax

####### following are from official SQuAD v1.1 evaluation scripts
def normalize_answer_v1(s):
  """Normalise an answer string for SQuAD v1.1 scoring.

  Steps, in order: lower-case, delete every ``string.punctuation`` character,
  replace the stand-alone words a/an/the with a space, then collapse all
  whitespace runs into single spaces and trim the ends.
  """
  lowered = s.lower()
  no_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
  no_articles = re.sub(r"\b(a|an|the)\b", " ", no_punctuation)
  return " ".join(no_articles.split())


def f1_score(prediction, ground_truth):
  """Token-level F1 between a predicted answer and one reference answer.

  Both strings are normalised with ``normalize_answer_v1`` and split on
  whitespace; the overlap is the multiset intersection of the two token lists.

  Returns:
    The integer 0 when no token is shared, otherwise the harmonic mean of
    precision and recall as a float.
  """
  predicted = normalize_answer_v1(prediction).split()
  reference = normalize_answer_v1(ground_truth).split()
  overlap = collections.Counter(predicted) & collections.Counter(reference)
  num_same = sum(overlap.values())
  if num_same == 0:
    return 0
  precision = 1.0 * num_same / len(predicted)
  recall = 1.0 * num_same / len(reference)
  return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
  """Return True when both strings are equal after ``normalize_answer_v1``."""
  normalized_prediction = normalize_answer_v1(prediction)
  normalized_truth = normalize_answer_v1(ground_truth)
  return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
  """Score ``prediction`` against every reference and keep the best score.

  Args:
    metric_fn: callable invoked as ``metric_fn(prediction, ground_truth)``.
    prediction: the predicted answer.
    ground_truths: iterable of reference answers; ``max`` raises ValueError
      when it is empty.

  Returns:
    The largest value ``metric_fn`` produced.
  """
  return max(metric_fn(prediction, reference) for reference in ground_truths)


def evaluate_v1(dataset, predictions):
  """Compute SQuAD v1.1 exact-match and F1 percentages.

  Args:
    dataset: iterable of articles; each has ``"paragraphs"``, each paragraph
      has ``"qas"``, and each qa has an ``"id"`` and a list of ``"answers"``
      with a ``"text"`` field.
    predictions: mapping from question id to the predicted answer text.

  Returns:
    ``{"exact_match": float, "f1": float}``, both on a 0-100 scale. Questions
    missing from ``predictions`` still count toward the total, so they score
    zero. A dataset with no questions yields 0.0 for both metrics instead of
    raising ZeroDivisionError.
  """
  f1 = exact_match = total = 0
  for article in dataset:
    for paragraph in article["paragraphs"]:
      for qa in paragraph["qas"]:
        total += 1
        if qa["id"] not in predictions:
          # Unanswered question: counted in `total`, contributes no score.
          continue
        ground_truths = [x["text"] for x in qa["answers"]]
        prediction = predictions[qa["id"]]
        exact_match += metric_max_over_ground_truths(exact_match_score,
                                                     prediction, ground_truths)
        f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)

  if total == 0:
    # Nothing to score; avoid dividing by zero.
    return {"exact_match": 0.0, "f1": 0.0}

  exact_match = 100.0 * exact_match / total
  f1 = 100.0 * f1 / total

  return {"exact_match": exact_match, "f1": f1}

