The current notebook requires TF2 to run (see [databricks 6.1ML Runtime for how to install TF2](https://docs.databricks.com/applications/deep-learning/single-node-training/tensorflow.html#install-tensorflow-20-on-dbr-61-ml))

TODO
- [ ] Add tf.summary to monitor all statistics in tensorboard (see [tutorial](https://www.tensorflow.org/tensorboard/migrate))
- [x] Change how inference is done: currently no mask is used when validation accuracy is measured
- [x] Add an option to use MSE LOSS
- [x] Add a function to load initializations from the teacher model to the student model

In [3]:
import json
import sys

import tensorflow as tf
import tensorflow_datasets
from transformers import *
import tensorflow_hub as hub

In [4]:
print(tf.__version__)
print(hub.__version__)

Define `DistilMyBertConfig` as a subclass of `PretrainedConfig` that holds the distillation and training hyperparameters listed below

In [6]:
class DistilMyBertConfig(PretrainedConfig):
    """Hyperparameters for the distillation experiment.

    Args:
        vocab_size_or_config_json_file: Either the path (str) of a JSON file
            whose key/value pairs are copied verbatim onto the instance, or an
            int, in which case the keyword arguments below are stored instead.
            The int value itself is not stored.
        num_classes: Number of target classes (depth of the one-hot labels).
        distill_temperature: Temperature the logits are divided by before the
            softmax in the soft-target loss.
        task_balance: Stored on the config; presumably the weight between the
            hard-label and soft-target losses -- TODO confirm.
        max_seq_len: Maximum sequence length used when converting examples to
            features.
        epoch: Number of training epochs.
        learning_rate: Learning rate passed to the Adam optimizer.
        adam_epsilon: Epsilon passed to the Adam optimizer.
        max_grad_norm: Gradient clipping norm passed to the optimizer.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    def __init__(self,
                 vocab_size_or_config_json_file=33333,
                 num_classes=2,
                 distill_temperature=2.0,
                 task_balance=0.5,
                 max_seq_len=128,
                 epoch=5,
                 learning_rate=5e-4,
                 adam_epsilon=1e-6,
                 max_grad_norm=5.0,
                 **kwargs):
        # Start the MRO lookup at this class so PretrainedConfig.__init__ really
        # runs, and forward kwargs as keywords (the previous `*kwargs` unpacked
        # the dict's keys as positional arguments).
        super(DistilMyBertConfig, self).__init__(**kwargs)

        # The former Python 2 `unicode` branch was dropped: `open(..., encoding=)`
        # below is Python 3 only, so that branch could never have worked.
        if isinstance(vocab_size_or_config_json_file, str):
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            # Every key in the file becomes an attribute of the config.
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.num_classes = num_classes
            self.distill_temperature = distill_temperature
            self.task_balance = task_balance
            self.max_seq_len = max_seq_len
            self.epoch = epoch
            self.learning_rate = learning_rate
            self.adam_epsilon = adam_epsilon
            self.max_grad_norm = max_grad_norm

In [7]:
# Prepare Config
config = DistilMyBertConfig()

In [8]:
# Get student and teacher NN architecture classes (config, model, tokenizer)

student_config_class, student_model_class, student_tokenizer_class = DistilBertConfig, TFDistilBertForSequenceClassification, DistilBertTokenizer
teacher_config_class, teacher_model_class, teacher_tokenizer_class = BertConfig, TFBertForSequenceClassification, BertTokenizer

# Load the teacher model and its tokenizer from a saved checkpoint directory.
# NOTE(review): nothing in this cell actually freezes the teacher layers --
# confirm the teacher's variables are kept out of the optimizer elsewhere.
teacher = teacher_model_class.from_pretrained("/dbfs/ml/judith/transformers/mrpc/1")#, output_hidden_states=True)
teacher_tokenizer = teacher_tokenizer_class.from_pretrained('/dbfs/ml/judith/transformers/mrpc/1') 

# # Load student model from config
# # Student model no longer has token type embedding and the position embedding
# student_config_path = "/dbfs/ml/judith/transformers/distilbert-base-uncased.json"
# stu_architecture_config = student_config_class.from_pretrained(student_config_path)
# stu_architecture_config.sinusoidal_pos_embds = False
# stu_architecture_config.n_layers = 1
# student = student_model_class(stu_architecture_config)
# print("Is the student weights are empty before training? ", student.weights==[])

In [9]:
# Build the student with the teacher's model class but a single encoder layer.
teacher_config = teacher_config_class.from_pretrained('bert-base-uncased')
teacher_config.num_hidden_layers = 1
student = teacher_model_class(teacher_config)
# The message used to say "teacher", but the object being inspected is the student.
print("Are the student model weights empty before the first call? ", student.weights==[])
# Run the model once on its dummy inputs in inference mode.
student(student.dummy_inputs, training=False)

In [10]:
# Load weights from the saved teacher HDF5 file into the student, matching layers by name.
# NOTE(review): this file is written by `teacher.save_pretrained` in a later cell (In [13]);
# on a fresh kernel that cell has to run first.
student.load_weights('/tmp/experiment3/teacher/tf_model.h5', by_name=True)
# Forward pass on the dummy inputs to inspect the output after loading.
student(student.dummy_inputs, training=False)

In [11]:
# Show the student's output on the dummy inputs before the teacher weights are applied.
print(student(student.dummy_inputs, training=False))
# Student encoder layer 0 is initialized from teacher encoder layer 0.
layer_mapping = {0:0}
# NOTE(review): extract_weights_from_teacher is defined in a later cell (In [17]);
# on a fresh kernel that cell has to run before this one.
extract_weights_from_teacher(teacher, student, layer_mapping)
# student(student.dummy_inputs, training=False)

In [12]:
student(student.dummy_inputs, training=False)

In [13]:
# save pre-trained weights
teacher = teacher_model_class.from_pretrained('bert-base-uncased')
teacher.save_pretrained('/tmp/experiment3/teacher')

# build teacher architecture
teacher_config = teacher_config_class.from_pretrained('bert-base-uncased')
teacher = teacher_model_class(teacher_config)
teacher(teacher.dummy_inputs, training=False)

# initialize teacher from pre-trained weights
teacher.load_weights('/tmp/experiment3/teacher/tf_model.h5')
teacher(teacher.dummy_inputs, training=False)

In [14]:
# num_hidden_layers = 1
# training=True

# teacher = teacher_model_class.from_pretrained('bert-base-uncased')

# input_ids = tf.keras.layers.Input(shape=(config.max_seq_len,), dtype=tf.int32,
#                                        name="input_word_ids")
# attention_mask = tf.keras.layers.Input(shape=(config.max_seq_len,), dtype=tf.int32,
#                                    name="attention_mask")
# token_type_ids = tf.keras.layers.Input(shape=(config.max_seq_len,), dtype=tf.int32,
#                                     name="token_type_ids")

# extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
# extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
# extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
# head_mask = [None] * num_hidden_layers
# position_ids = None

# embedding_output = teacher.bert.embeddings([input_ids, position_ids, token_type_ids], training=training)
# encoder_outputs = teacher.bert.encoder.layer[0]([embedding_output, extended_attention_mask, head_mask], training=training)

# sequence_output = encoder_outputs[0]
# pooled_output = teacher.bert.pooler(sequence_output)

In [15]:
%sh
ls /tmp/experiment3/teacher

In [16]:
# layer_num = 0
# print("the {}th encoder layer has {} variables and {} trainable variables".format(
#   layer_num,
#   len(teacher.layers[0].encoder.layer[layer_num].variables),
#   len(teacher.layers[0].encoder.layer[layer_num].trainable_variables)
# ))

# print("the inputs are transformed by embeddings layer first")
# for v in teacher.bert.embeddings.variables:
#   print(v.name)
  
# print("each encoder layer has attention, intermediate and bert_output layer")
# for v in teacher.bert.encoder.layer[layer_num].attention.variables:
#   print(v.name)
  
# for v in teacher.bert.encoder.layer[layer_num].intermediate.variables:
#   print(v.name)

# for v in teacher.bert.encoder.layer[layer_num].bert_output.variables:
#   print(v.name)

# print("the output of the encoder is a sequence output then is transformed into a pooled output through a pooler layer on top")
# for v in teacher.bert.pooler.variables:
#   print(v.name)


In [17]:
def extract_weights_from_teacher(teacher, student, layer_maps):
  """Initialize the student's BERT weights with values copied from the teacher.

  The values are assigned into the student's *existing* variables. The previous
  implementation called `add_weight(getter=...)`, which appended the teacher's
  own variable objects to the student as extra trainable weights and never
  touched the weights the student's forward pass actually uses.

  Both models must already have their variables created (e.g. by calling them
  once on their dummy inputs) before this function is used.

  Args:
    teacher: BertSequenceClassifier providing the source weights
    student: BertSequenceClassifier whose weights are overwritten in place
    layer_maps: e.g., {s: t} will initialize s'th student
                encoder layer using the weights from t'th teacher encoder layer

  Raises:
    ValueError: if a teacher sub-layer and the matching student sub-layer do
      not expose the same number of variables. Incompatible variable shapes
      are reported by the variable's own `assign`.
  """
  def _copy_variables(src_layer, dst_layer, what):
    # Pair variables positionally; identical architectures create them in the same order.
    src_vars = list(src_layer.variables)
    dst_vars = list(dst_layer.variables)
    if len(src_vars) != len(dst_vars):
      raise ValueError(
        "Cannot copy {}: teacher has {} variables but student has {}".format(
          what, len(src_vars), len(dst_vars)))
    for src, dst in zip(src_vars, dst_vars):
      dst.assign(src)

  # copy the embedding weights
  _copy_variables(teacher.bert.embeddings, student.bert.embeddings, "embeddings")

  # initialize each mapped student encoder layer from its teacher layer
  for s, t in layer_maps.items():
    teacher_layer = teacher.bert.encoder.layer[t]
    student_layer = student.bert.encoder.layer[s]
    for part in ("attention", "intermediate", "bert_output"):
      _copy_variables(
        getattr(teacher_layer, part),
        getattr(student_layer, part),
        "encoder layer {} -> {} ({})".format(t, s, part))

  # copy the pooler weights
  _copy_variables(teacher.bert.pooler, student.bert.pooler, "pooler")
    
#   # Forward the callable's regularization losses (if any).
#   if hasattr(teacher, "regularization_losses"):
#     for l in self._func.regularization_losses:
#       if not callable(l):
#         raise ValueError(
#             "hub.KerasLayer(obj) expects obj.regularization_losses to be an "
#             "iterable of callables, each returning a scalar loss term.")
#       self.add_loss(self._call_loss_if_trainable(l))  # Supports callables.

In [18]:
# input_ids = tf.keras.layers.Input(shape=(config.max_seq_len,), dtype=tf.int32, name="input_ids")
# student_logits = student(input_ids)
# student_model = tf.keras.Model(inputs=input_ids,
#                        outputs=student_logits)

# teacher_input_ids = tf.keras.layers.Input(shape=(config.max_seq_len,), dtype=tf.int32, name="teacher_input_ids")
# teacher_logits = teacher(teacher_input_ids)
# teacher_model = tf.keras.Model(inputs=teacher_input_ids,
#                                outputs=teacher_logits)

In [19]:
teacher.summary()

In [20]:
student.summary()

Prepare Data from MRPC

In [22]:
data = tensorflow_datasets.load('glue/mrpc')

In [23]:
train_dataset = glue_convert_examples_to_features(data['train'], teacher_tokenizer, max_length=config.max_seq_len, task='mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], teacher_tokenizer, max_length=config.max_seq_len, task='mrpc')
train_dataset = train_dataset.shuffle(100).batch(16).repeat(2)
valid_dataset = valid_dataset.batch(32)

Training the student model ONLY

Training the knowledge distiller

In [26]:
# loss1 = ce_loss_fct_torch(torch.tensor([1.0,1.0]), torch.tensor([0.0, 0.0]), config)
# loss2 = ce_loss_fct(tf.constant([1.0,1.0]), tf.constant([0.0,0.0]), config)
# assert(loss1.numpy()==loss2.numpy())

# Sanity check: the PyTorch and TensorFlow MSE implementations must agree on the same inputs.
loss1 = mse_loss_fct_torch(torch.tensor([[1.0,1.0],[2.0,2.0]]), torch.tensor([[0.0,0.0],[1.0, 1.0]]))
loss2 = mse_loss_fct(tf.constant([[1.0,1.0],[2.0,2.0]]), tf.constant([[0.0,0.0],[1.0, 1.0]]))
# Compare with a tolerance: exact float equality across two frameworks is brittle.
assert abs(float(loss1) - float(loss2)) < 1e-6, (float(loss1), float(loss2))

In [27]:
import torch.nn.functional as F
import torch

def loss_fn_v2(teacher_logits, student_logits, targets, config):
  """Hard-label cross-entropy of the student plus the MSE between teacher and student logits."""
  hard_label_term = standard_loss_fct(student_logits, targets, config)
  logit_matching_term = mse_loss_fct(teacher_logits, student_logits)
  total = hard_label_term + logit_matching_term
  return total

def loss_fn_v1(teacher_logits, student_logits, targets, config):
  """Hard-label cross-entropy of the student plus the temperature-scaled soft-target loss."""
  hard_label_term = standard_loss_fct(student_logits, targets, config)
  soft_target_term = ce_loss_fct(student_logits, teacher_logits, config)
  total = hard_label_term + soft_target_term
  return total

def standard_loss_fct(student_logits, targets, config):
  """Mean softmax cross-entropy between the student logits and the one-hot encoded targets."""
  labels_one_hot = tf.one_hot(targets, config.num_classes, dtype=tf.float32)
  per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
    labels=labels_one_hot,
    logits=student_logits,
  )
  return tf.reduce_mean(per_example_loss)

def ce_loss_fct(student_logits, teacher_logits, config):
  """Soft-target cross-entropy between temperature-scaled teacher and student logits.

  The teacher distribution is wrapped in stop_gradient, and the mean loss is
  multiplied by the squared temperature.
  """
  inverse_temperature = 1.0 / config.distill_temperature
  soft_labels = tf.nn.softmax(tf.multiply(teacher_logits, inverse_temperature))
  scaled_student_logits = tf.multiply(student_logits, 1.0 / config.distill_temperature)
  per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.stop_gradient(soft_labels),
    logits=scaled_student_logits,
  )
  mean_loss = tf.reduce_mean(per_example_loss)
  # scale soft target obj to match hard target obj. scale
  return mean_loss * tf.square(config.distill_temperature)

def ce_loss_fct_torch(student_logits, teacher_logits, config):
  """PyTorch reference: batch-mean KL divergence on temperature-scaled logits, times temperature squared."""
  temperature = config.distill_temperature
  student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
  teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
  kl_criterion = torch.nn.KLDivLoss(reduction='batchmean')
  return kl_criterion(student_log_probs, teacher_probs) * temperature ** 2

def mse_loss_fct(teacher_logits, student_logits):
  """Mean over the batch of the per-example mean squared error between the two logit tensors."""
  per_example_mse = tf.keras.losses.MSE(teacher_logits, student_logits)
  return tf.reduce_mean(per_example_mse)

def mse_loss_fct_torch(teacher_logits, student_logits):
  """PyTorch reference: mean squared error over all elements of the two logit tensors."""
  criterion = torch.nn.MSELoss(reduction='mean')
  return criterion(teacher_logits, student_logits)

In [28]:
def get_loss_acc(student, teacher, valid_dataset, config):
  """Return (mean distillation loss, student accuracy) over all batches of a dataset.

  The loss of each batch is computed with loss_fn_v1; accuracy compares the
  labels with the softmax of the student logits.
  """
  loss_tracker = tf.keras.metrics.Mean()
  accuracy_tracker = tf.keras.metrics.SparseCategoricalAccuracy()

  for features, labels in valid_dataset:
    student_logits = student(features)[0]
    teacher_logits = teacher(features)[0]
    batch_loss = loss_fn_v1(teacher_logits, student_logits, labels, config)
    accuracy_tracker.update_state(labels, tf.nn.softmax(student_logits))
    loss_tracker.update_state(batch_loss)

  final_loss = loss_tracker.result().numpy()
  final_accuracy = accuracy_tracker.result().numpy()
  return final_loss, final_accuracy

In [29]:
# optimizer = tf.keras.optimizers.Adam(learning_rate=config.learning_rate, epsilon=config.adam_epsilon, clipnorm=config.max_grad_norm)
# ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer, net=student)
# manager = tf.train.CheckpointManager(ckpt, '/tmp/distil', max_to_keep=5)
# ckpt = tf.train.Checkpoint(step=tf.Variable(1), opt=optimizer, net=student)
# ckpt.restore('/dbfs/ml/judith/transformers/ckpts/ckpt-2').assert_consumed()

In [30]:
%fs
ls dbfs:/ml/judith/transformers/distilbert/experiment3

path,name,size
dbfs:/ml/judith/transformers/distilbert/experiment3/config.json,config.json,188


In [31]:
import os

# setup directory
save_dir = "/dbfs/ml/judith/transformers/distilbert/experiment3"
local_dir = "/tmp/experiment3/"
# makedirs(exist_ok=True) also creates missing parent directories and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(local_dir, exist_ok=True)
os.makedirs(save_dir, exist_ok=True)

# optimizer
config.learning_rate = 5e-4
# Persist the hyperparameters next to the experiment outputs.
config.save_pretrained(save_dir)
optimizer = tf.keras.optimizers.Adam(learning_rate=config.learning_rate, epsilon=config.adam_epsilon, clipnorm=config.max_grad_norm)

In [32]:
# Per-epoch training statistics
train_loss = []
train_acc = []

for epoch in range(config.epoch):
  # Fresh metric objects so each epoch's statistics start from zero
  epoch_end_loss = tf.keras.metrics.Mean()
  epoch_end_acc = tf.keras.metrics.SparseCategoricalAccuracy()

  # Training loop - using batches of 16
  for step, (x, y) in enumerate(train_dataset):
    targets = y
    # The teacher only provides targets: run it in inference mode and outside
    # the tape so its forward pass is not recorded for differentiation.
    teacher_logits = teacher(x, training=False)[0]

    with tf.GradientTape() as tape:
      # training=True so that dropout is active while the student is being trained
      student_logits = student(x, training=True)[0]
      loss = loss_fn_v1(teacher_logits, student_logits, targets, config)

    # only optimize student weights
    gradients = tape.gradient(loss, student.trainable_variables)
    optimizer.apply_gradients(zip(gradients, student.trainable_variables))

    # logging
    if step % 100 == 0:
      print(step, float(loss))

    # log accuracy
    pred = tf.nn.softmax(student_logits)
    epoch_end_acc(targets, pred)
    epoch_end_loss(loss)

  # End epoch
  train_acc.append(epoch_end_acc.result())
  train_loss.append(epoch_end_loss.result())

  val_loss, val_acc = get_loss_acc(student, teacher, valid_dataset, config)
  print("epoch : {} training loss: {} training acc: {}".format(epoch, epoch_end_loss.result(), epoch_end_acc.result()))
  print("epoch : {} validation loss: {} validation acc: {}".format(epoch, val_loss, val_acc))

  # save checkpoint
#   ckpt.step.assign_add(1)
#   save_path = manager.save()
#   print("Saved checkpoint for epoch {} at {}".format(int(ckpt.step), save_path))
#   print("epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(epoch, epoch_end_loss.result(), epoch_end_acc.result()))

  # save model weights
  student.save_weights(os.path.join(local_dir,"ckpt-{}".format(epoch)), save_format='tf')

In [33]:
get_loss_acc(student, teacher, valid_dataset, config)

In [34]:
# Re-evaluate each epoch's saved student weights on the training set.
# NOTE(review): weights are loaded from /dbfs/ml/judith/transformers/ckpts/<epoch>, while the
# training loop saves them under local_dir as "ckpt-<epoch>" -- confirm these paths line up.
for epoch in range(config.epoch):
  student.load_weights("/dbfs/ml/judith/transformers/ckpts/{}".format(epoch))
  loss, acc = get_loss_acc(student, teacher, train_dataset, config)
  print("epoch : {} training loss: {} training acc: {}".format(epoch, loss, acc))

In [35]:
# Re-evaluate each epoch's saved student weights on the validation set.
# NOTE(review): weights are loaded from /dbfs/ml/judith/transformers/ckpts/<epoch>, while the
# training loop saves them under local_dir as "ckpt-<epoch>" -- confirm these paths line up.
for epoch in range(config.epoch):
  student.load_weights("/dbfs/ml/judith/transformers/ckpts/{}".format(epoch))
  loss, acc = get_loss_acc(student, teacher, valid_dataset, config)
  print("epoch : {} validation loss: {} validation acc: {}".format(epoch, loss, acc))

In [36]:
# Report train and validation accuracy for each epoch's saved student weights.
# NOTE(review): get_valid_acc is not defined in any visible cell -- confirm where it comes from.
# NOTE(review): train_acc is rebound here; it was the per-epoch list filled by the training loop.
for epoch in range(config.epoch):
  student.load_weights("/dbfs/ml/judith/transformers/ckpts/{}".format(epoch))
  train_acc = get_valid_acc(student, train_dataset)
  acc = get_valid_acc(student, valid_dataset)
  print("epoch : {} train acc: {} validation acc: {}".format(epoch, train_acc.numpy(), acc.numpy()))

In [37]:
res = get_valid_acc(student, valid_dataset)
res.numpy()

In [38]:
%fs
cp -r file:/tmp/distilbert dbfs:/ml/judith/transformers/ckpts

In [39]:
tf.argmax(results,1)

In [40]:
# NOTE(review): the second tf.argmax() is called with no arguments, so this expression
# looks unfinished -- supply the tensor the predicted classes should be compared against.
tf.equal(tf.argmax(results, 1), tf.argmax())

In [41]:
input_words = ["dogs are on the ground", "cats are in the cloud"]
input_word_ids = tf.constant([teacher_tokenizer.encode(text) for text in input_words])
targets = tf.constant([1, 0])

In [42]:
teacher_logits = teacher(input_word_ids)
student_logits = student(input_word_ids)

In [43]:
# NOTE(review): `loss_fn` is not defined in any visible cell; the visible losses with this
# signature are loss_fn_v1 and loss_fn_v2 -- confirm which one is meant.
loss_fn(teacher_logits, student_logits, targets, config)

In [44]:
# Fixed the `amax_seq_length` typo: the Input layers below read `max_seq_length`.
max_seq_length = 128  # Your choice here.
# Three int32 inputs of length max_seq_length, passed to the hub BERT layer below.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
attention_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="attention_mask")
token_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="token_type_ids")
# Load the BERT module from its saved location as a trainable Keras layer.
bert_layer = hub.KerasLayer("/dbfs/ml/smrt-hub/bert-base-uncased/1",
                            trainable=True)
sequence_output, pooled_output = bert_layer([input_word_ids, attention_mask, token_type_ids])


In [45]:
dir(bert_layer.resolved_object)

In [46]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()

In [47]:
# Prepare dataset for GLUE as a tf.data.Dataset instance

data = tensorflow_datasets.load('glue/mrpc')

# train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
# valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
# train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
# valid_dataset = valid_dataset.batch(64)

Train DistillBert

In [49]:
student_config_class, student_model_class, student_tokenizer_class = DistilBertConfig, TFDistilBertForSequenceClassification, DistilBertTokenizer
teacher_config_class, teacher_model_class, teacher_tokenizer_class = BertConfig, TFBertForSequenceClassification, BertTokenizer

In [50]:
tokenizer = teacher_tokenizer_class.from_pretrained('bert-base-uncased')

In [51]:
# DistilBERT architecture settings as a Python dict.
# JSON's lowercase `true` is not a Python name (NameError); Python booleans are `True`/`False`.
student_config = {
    "activation": "gelu",
    "attention_dropout": 0.1,
    "dim": 768,
    "dropout": 0.1,
    "hidden_dim": 3072,
    "initializer_range": 0.02,
    "max_position_embeddings": 512,
    "n_heads": 12,
    "n_layers": 6,
    "sinusoidal_pos_embds": True,
    "tie_weights_": True,
    "vocab_size": 30522
  }

In [52]:
# Student
stu_architecture_config = student_config_class.from_pretrained("/dbfs/ml/judith/transformers/distilbert-base-uncased.json")
stu_architecture_config.sinusoidal_pos_embds = False
student = student_model_class(stu_architecture_config)

In [53]:
teacher = teacher_model_class.from_pretrained("bert-base-uncased")#, output_hidden_states=True)

Evaluation using fine-tuned model

In [55]:
# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = DistilBertTokenizer.from_pretrained('/dbfs/ml/judith/transformers/mrpc/2')
model = TFDistilBertForSequenceClassification.from_pretrained('/dbfs/ml/judith/transformers/mrpc/2')

In [56]:
results = model.predict(valid_dataset)

In [57]:
gpus = tf.config.experimental.list_physical_devices('GPU')
# tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

In [58]:
# Load dataset, tokenizer, model from pretrained model/vocabulary
teacher_tokenizer = BertTokenizer.from_pretrained('/dbfs/ml/judith/transformers/mrpc/1')
teacher_model = TFBertForSequenceClassification.from_pretrained('/dbfs/ml/judith/transformers/mrpc/1')

In [59]:
results = teacher_model.predict(valid_dataset)

Prepare Dataset from MRPC

In [61]:
data = tensorflow_datasets.load('glue/mrpc')

In [62]:
# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
valid_dataset = valid_dataset.batch(64)

DistillBert for MRPC

In [64]:
# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

In [65]:
%fs
mkdirs dbfs:/ml/judith/transformers/mrpc/2

In [66]:
%sh
# -p creates the missing parent /tmp/mrpc and does not fail if the directory already exists
mkdir -p /tmp/mrpc/2

In [67]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train and evaluate using tf.keras.Model.fit()
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)


In [68]:
# Save model
model.save_pretrained("/tmp/mrpc/2/")
tokenizer.save_pretrained("/tmp/mrpc/2/")

In [69]:
%fs
ls dbfs:/ml/judith/transformers/mrpc/1

path,name,size
dbfs:/ml/judith/transformers/mrpc/1/added_tokens.json,added_tokens.json,2
dbfs:/ml/judith/transformers/mrpc/1/config.json,config.json,543
dbfs:/ml/judith/transformers/mrpc/1/special_tokens_map.json,special_tokens_map.json,112
dbfs:/ml/judith/transformers/mrpc/1/tf_model.h5,tf_model.h5,433518744
dbfs:/ml/judith/transformers/mrpc/1/tokenizer_config.json,tokenizer_config.json,59
dbfs:/ml/judith/transformers/mrpc/1/vocab.txt,vocab.txt,213450


In [70]:
%fs
cp -r file:/tmp/mrpc/2 dbfs:/ml/judith/transformers/mrpc/2

In [71]:
# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')

In [72]:
data = tensorflow_datasets.load('glue/mrpc')

In [73]:
# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
valid_dataset = valid_dataset.batch(64)

In [74]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [75]:
# Train and evaluate using tf.keras.Model.fit()
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)

In [76]:
model.save_pretrained('/tmp/mrpc')

In [77]:
tokenizer.save_pretrained('/tmp/mrpc')

In [78]:
%fs
cp -r file:/tmp/mrpc dbfs:/ml/judith/transformers/mrpc/1

In [79]:
%fs
ls dbfs:/ml/judith/transformers/mrpc

path,name,size
dbfs:/ml/judith/transformers/mrpc/1/,1/,0
dbfs:/ml/judith/transformers/mrpc/2/,2/,0
dbfs:/ml/judith/transformers/mrpc/config.json,config.json,543


In [80]:
model1 = TFBertForSequenceClassification.from_pretrained("/tmp/mrpc")

In [81]:
sentence_0 = "This research was consistent with his findings."
sentence_1 = "His findings were compatible with this research."
sentence_2 = "His findings were not compatible with this research."
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='tf')
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='tf')

In [82]:
model1(inputs_1)[0]

In [83]:
model1(inputs_2)[0]

How to download GLUE dataset?

In [85]:
%sh
curl https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py --output /tmp/download_glue_data.py

In [86]:
%fs
cp -r file:/databricks/driver/glue_data dbfs:/ml/judith/datasets/glue_data

In [87]:
%fs
ls dbfs:/ml/judith/datasets/glue_data

path,name,size
dbfs:/ml/judith/datasets/glue_data/CoLA/,CoLA/,0
dbfs:/ml/judith/datasets/glue_data/MNLI/,MNLI/,0
dbfs:/ml/judith/datasets/glue_data/MRPC/,MRPC/,0
dbfs:/ml/judith/datasets/glue_data/QNLI/,QNLI/,0
dbfs:/ml/judith/datasets/glue_data/QQP/,QQP/,0
dbfs:/ml/judith/datasets/glue_data/RTE/,RTE/,0
dbfs:/ml/judith/datasets/glue_data/SNLI/,SNLI/,0
dbfs:/ml/judith/datasets/glue_data/SST-2/,SST-2/,0
dbfs:/ml/judith/datasets/glue_data/STS-B/,STS-B/,0
dbfs:/ml/judith/datasets/glue_data/WNLI/,WNLI/,0


In [88]:
%sh
/databricks/python/bin/python -V
/databricks/python/bin/python /tmp/download_glue_data.py