In [1]:
!pip install sentencepiece
!pip install tf-models-nightly
!pip install tf-nightly

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 5.3MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95
Collecting tf-models-nightly
[?25l  Downloading https://files.pythonhosted.org/packages/61/52/0705a5a0a6aff94c3cde1a4e52042103fc7c38d691417d77976cb85149fe/tf_models_nightly-2.4.0.dev20210110-py2.py3-none-any.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 7.3MB/s 
Collecting tf-nightly
[?25l  Downloading https://files.pythonhosted.org/packages/44/b8/f30541be399375a7d3b6803d5c070cf90b3b056796bd57e2eb5699526d06/tf_nightly-2.5.0.dev20210110-cp36-cp36m-manylinux2010_x86_64.whl (399.9MB)
[K     |████████████████████████████████| 399.9MB 40kB/s 
Collecting pyyaml>=5.1
[?25l  Downloading https://files.pythonhoste

In [2]:
import tensorflow as tf

In [3]:
tf.__version__

'2.5.0-dev20210110'

In [None]:
import tensorflow_hub as hub

from official.nlp.bert.tokenization import FullTokenizer
from official.nlp.bert.input_pipeline import create_squad_dataset
from official.nlp.data.squad_lib import generate_tf_record_from_json_file

from official.nlp import optimization

from official.nlp.data.squad_lib import (
    read_squad_examples, FeatureWriter,
    convert_examples_to_features, write_predictions)

In [None]:
import numpy as np
import math
import random
import time
import json
import collections
import os

from google.colab import drive

## Data processing

In [None]:
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
input_meta_data = generate_tf_record_from_json_file(
    "/content/drive/MyDrive/BERT/data/squad/train-v1.1.json",
    "/content/drive/MyDrive/BERT/data/squad/vocab.txt",
    "/content/drive/MyDrive/BERT/data/squad/train-v1.1.tf_record"
)

In [None]:
with tf.io.gfile.GFile("/content/drive/MyDrive/BERT/data/squad/train_meta_data.gfile", "w") as writer:
  writer.write(json.dumps(input_meta_data, indent=4) + '\n')

In [None]:
BATCH_SIZE = 4

train_dataset = create_squad_dataset(
    "/content/drive/MyDrive/BERT/data/squad/train-v1.1.tf_record",
    input_meta_data['max_seq_length'], # 384
    BATCH_SIZE,
    is_training=False
)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: '<' not supported between instances of 'str' and 'Literal'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: '<' not supported between instances of 'str' and 'Literal'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: '<' not supported between instances of 'str' and 'Literal'


# Model building

## squad layer

In [None]:
class BertSquadLayer(tf.keras.layers.Layer):

  def __init__(self):
    super(BertSquadLayer, self).__init__()
    self.final_dense = tf.keras.layers.Dense(
        units=2,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02)
    )

  def call(self, inputs):
    logits = self.final_dense(inputs) # (batch_size, seq_len, 2)

    logits = tf.transpose(logits, [2, 0, 1]) # (2, batch_size, seq_len)
    unstacked_logits = tf.unstack(logits, axis=0) # [(batch_size, seq_len), (batch_size, seq_len)]
    return unstacked_logits[0], unstacked_logits[1]

## whole model

In [None]:
class BertSquad(tf.keras.Model):

  def __init__(self, name="bert_squad"):
    super(BertSquad, self).__init__()
    self.bert_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3",
        trainable=True
    )
    self.squad_layer = BertSquadLayer()

  def apply_bert(self, inputs):
    _, sequence_output = self.bert_layer(
        inputs["input_word_ids"],
        inputs["input_mask"],
        inputs["input_type_ids"]
    )
    return sequence_output

  def call(self, inputs):
    seq_output = self.apply_bert(inputs)
    start_logits, end_logits = self.squad_layer(seq_output)
    return start_logits, end_logits

# Training

## creating the AI

In [16]:
TRAIN_DATA_SIZE = 88641 # this is our training dataset
NB_BATCHES_TRAIN = 2000 # we will not use the full one as fine tuning bert is quite heavy
BATCH_SIZE = 4
NB_EPOCHS = 3 # 2 should be fine but we are using 3
INIT_LR = 5e-4 # we are going to use a different optimiser that is given by google specifically for BERT
WARMUP_STEPS = int(NB_BATCHES_TRAIN * 0.1)

In [14]:
train_dataset_light = train_dataset.take(NB_BATCHES_TRAIN)

In [15]:
bert_squad = BertSquad()

In [17]:
optimizer = optimization.create_optimizer(
    init_lr=INIT_LR,
    num_train_steps=NB_BATCHES_TRAIN,
    num_warmup_steps=WARMUP_STEPS
)

In [18]:
def squad_loss_function(labels, model_outputs):
  start_positions = labels["start_positions"]
  end_positions = labels["end_positions"]
  start_logits, end_logits = model_outputs

  start_loss = tf.keras.backend.sparse_categorical_crossentropy(
      start_positions, start_logits, from_logits=True
  )
  end_loss = tf.keras.backend.sparse_categorical_crossentropy(
      end_positions, end_logits, from_logits=True
  )

  total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2
  return total_loss

  train_loss = tf.keras.metrics.Mean(name='train_loss')

In [19]:
next(iter(train_dataset_light))

({'input_mask': <tf.Tensor: shape=(4, 384), dtype=int32, numpy=
  array([[1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0],
         [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
  'input_type_ids': <tf.Tensor: shape=(4, 384), dtype=int32, numpy=
  array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>,
  'input_word_ids': <tf.Tensor: shape=(4, 384), dtype=int32, numpy=
  array([[  101,  2000,  3183, ...,     0,     0,     0],
         [  101,  2054,  2003, ...,     0,     0,     0],
         [  101,  1996, 13546, ...,     0,     0,     0],
         [  101,  2054,  2003, ...,     0,     0,     0]], dtype=int32)>,
  'unique_ids': <tf.Tensor: shape=(4,), dtype=int32, numpy=array([1000000000, 1000000001, 1000000002, 1000000003], dtype=int32)>},
 {})