## Get word embeddings

In [1]:
import tensorflow as tf
from transformers import TFBertForMaskedLM, BertConfig
class bert_with_mask(tf.keras.Model):
    """Masked-LM BERT whose output is overridden wherever a boolean mask is False.

    The last element of the model input is treated as the mask; everything
    before it is forwarded to the wrapped ``TFBertForMaskedLM``. Positions
    where the mask is True keep the BERT output, all other positions receive
    the fixed ``onehot_mask`` vector instead.
    """

    def __init__(self, config, onehot_mask):
        """Create the wrapped BERT and remember the replacement vector.

        Args:
            config: configuration object handed to ``TFBertForMaskedLM``.
            onehot_mask: 1-D vector substituted for the BERT output at
                positions the mask rejects (indexed as ``[None, None, :]``
                in ``call``).
        """
        super().__init__()
        self.bert = TFBertForMaskedLM(config)
        self.onehot_mask = onehot_mask

    def call(self, inputs):
        """Run BERT and replace its output where the mask is False.

        Args:
            inputs: sequence whose final element is the mask, indexed as a
                rank-2 tensor (expected to be boolean, as required by
                ``tf.where``); the preceding elements are the BERT inputs.

        Returns:
            A 1-tuple holding the masked output tensor.
        """
        keep_position = inputs[-1]  # trailing element is the mask
        bert_inputs = inputs[:-1]
        bert_output = self.bert(bert_inputs)[0]

        # Broadcast the mask over the last axis and the replacement vector
        # over the first two axes.
        replacement = self.onehot_mask[None, None, :]
        masked_output = tf.where(keep_position[:, :, None], bert_output, replacement)

        return (masked_output,)

I0416 19:15:27.636775 47039360943104 file_utils.py:57] TensorFlow version 2.1.0 available.


In [2]:
# Put the project's '../code' directory on the import path so the local
# `pretrained_tokenizer` module can be imported.
import sys
sys.path.insert(1, '../code')
import pretrained_tokenizer

# NOTE(review): despite its name, `t_fr` is built here with language='en' from
# the English tokenizer data; the French variant is kept commented out below.
# Later cells load a checkpoint under 'tinyBERT_fr2' and write a file tagged
# 'FR' — confirm which language is intended before re-running.
# t_fr = pretrained_tokenizer.Tokenizer(language='fr', path='../tokenizer_data_fr_30k')
t_fr = pretrained_tokenizer.Tokenizer(language='en', path='../tokenizer_data_en_30k')

I0416 19:15:41.362657 47039360943104 configuration_utils.py:281] loading configuration file ../tokenizer_data_en_30k/config.json
I0416 19:15:41.366433 47039360943104 configuration_utils.py:319] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 12,
  

In [3]:
import numpy as np

# Replacement vector handed to bert_with_mask: all zeros over the tokenizer's
# vocabulary except for a single 1.0 at the pad-token id.
pad_token_id = t_fr.tokenizer.pad_token_id
# Guard the index explicitly: NumPy treats `None` as np.newaxis, so
# `onehot_mask[None] = 1.0` would silently set EVERY entry to 1.0, and a
# negative id would silently wrap around to the end of the vocabulary.
if pad_token_id is None or pad_token_id < 0:
    raise ValueError(
        "tokenizer has no usable pad_token_id (got %r); cannot build onehot_mask"
        % (pad_token_id,)
    )
onehot_mask = np.zeros(len(t_fr.tokenizer), dtype=np.float32)
onehot_mask[pad_token_id] = 1.0

In [4]:
# Rebuild the model from the tiny BERT configuration, with the vocabulary
# size taken from the tokenizer that is currently loaded.
config = BertConfig.from_pretrained('../code/bert_config_tiny.json')
config.vocab_size = len(t_fr.tokenizer)
new_model = bert_with_mask(config, onehot_mask)

# Training setup: the loss is computed from raw logits, and Adam clips
# gradients by norm.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-3,
    epsilon=1e-08,
    clipnorm=1.0,
)
new_model.compile(loss=loss, optimizer=optimizer)

I0416 19:15:42.284523 47039360943104 configuration_utils.py:281] loading configuration file ../code/bert_config_tiny.json
I0416 19:15:42.288191 47039360943104 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "bert",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 2,
  "num_beam

In [5]:
# Run a single training step on dummy data. This creates the variables used
# by the optimizer, as well as any stateful metric variables.
SENTENCE_LENGTH = t_fr.MAX_LENGTH
new_model.train_on_batch(
    x=(
        # The first two tensors are forwarded to the wrapped BERT ...
        tf.zeros(shape=(1, SENTENCE_LENGTH), dtype=tf.int32),
        tf.ones(shape=(1, SENTENCE_LENGTH), dtype=tf.bool),
        # ... and the trailing one is the mask consumed by bert_with_mask.call.
        tf.ones(shape=(1, SENTENCE_LENGTH), dtype=tf.bool),
    ),
    y=tf.zeros(shape=(1, SENTENCE_LENGTH), dtype=tf.int32),
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
W0416 19:15:50.788982 47039360943104 optimizer_v2.py:1043] Gradients do not exist for variables ['bert_with_mask/tf_bert_for_masked_lm/bert/pooler/dense/kernel:0', 'bert_with_mask/tf_bert_for_masked_lm/bert/pooler/dense/bias:0'] when minimizing the loss.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
W0416 19:15:52.709235 47039360943104 optimizer_v2.py:1043] Gradients do not exist for variables ['bert_with_mask/tf_bert_for_masked_lm/bert/pooler/dense/kernel:0', 'bert_with_mask/tf_bert_for_masked_lm/bert/pooler/dense/bias:0'] when minimizing the loss.


10.209963

In [26]:
# Load the state of the old model
# Restores variables from the 'tinyBERT_fr2/tinyBERT' checkpoint prefix into
# new_model; the call's return value (a load-status object) is displayed below.
# NOTE(review): the status is only displayed, never checked — consider
# asserting on it to confirm that the checkpoint actually matched the model.
new_model.load_weights('tinyBERT_fr2/tinyBERT')
# new_model.load_weights('ckpts/weights-improvement-49-9.00.hdf5')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2ac8476c2ad0>

In [27]:
# Take the first variable of the wrapped BERT as a NumPy array and display it.
# NOTE(review): index 0 is presumably the word-embedding matrix (per the
# notebook title) — confirm against the variable's name, since positional
# indexing into `weights` depends on variable creation order.
weights = new_model.bert.weights[0].numpy()
weights

array([[-0.04424642,  0.06636524,  0.18779634, ...,  0.12359824,
         0.20507875,  0.00632676],
       [ 0.02300078,  0.04526478,  0.1138289 , ...,  0.18306506,
         0.36607835,  0.08191235],
       [ 0.2874621 , -0.02671925,  0.21807905, ...,  0.41321686,
         0.36394203,  0.05828147],
       ...,
       [-0.06198958,  0.34959   ,  0.22715902, ...,  0.14978306,
         0.23692305,  0.12474256],
       [ 0.01955202,  0.23921558,  0.13941337, ...,  0.24168685,
         0.4770586 ,  0.13954757],
       [-0.13496612,  0.40748537,  0.1220889 , ...,  0.16994001,
         0.24111958,  0.00428365]], dtype=float32)

In [28]:
# Write the second variable of the wrapped BERT (index 1) to a .npy file.
# Alternative (EN) target file name kept from the original cell:
#     'tf_bert_for_masked_lm_epoch_49_loss_9.00_EN_30k_position.npy'
np.save(
    file='tf_bert_for_masked_lm_epoch_40_loss_8.78_FR_30k_position.npy',
    arr=new_model.bert.weights[1].numpy(),
)

In [31]:
# Read the saved array back from disk; the EN file name is kept as a
# commented-out alternative.
# weights2 = np.load('tf_bert_for_masked_lm_epoch_49_loss_9.00_EN_30k_position.npy')
weights2 = np.load('tf_bert_for_masked_lm_epoch_40_loss_8.78_FR_30k_position.npy')

In [32]:
# Round-trip check: the array read back from disk must equal the live model
# variable exactly. np.testing.assert_array_equal fails on a shape mismatch,
# reports the mismatching entries, treats NaNs in the same positions as equal,
# and is not stripped under `python -O` the way a bare `assert` is.
np.testing.assert_array_equal(new_model.bert.weights[1].numpy(), weights2)

In [33]:
# Shape of the array read back from disk.
# NOTE(review): presumably (max_position_embeddings, hidden_size) of the tiny
# BERT config — confirm.
weights2.shape

(512, 128)

In [25]:
# Display the loaded array.
# NOTE(review): this cell's execution count (25) is lower than that of the
# cells above it, so it was run out of order — re-run the notebook top to
# bottom to confirm the displayed values belong to the file loaded above.
weights2

array([[ 0.02549606, -0.34520984, -0.11114695, ...,  0.00919494,
        -0.02353353,  0.17783244],
       [-0.08381344, -0.24735476, -0.02676942, ...,  0.16489263,
        -0.04131527, -0.02563108],
       [-0.11810085, -0.20583269, -0.05165746, ...,  0.3128433 ,
         0.07927801, -0.19550008],
       ...,
       [-0.02872846, -0.00938647,  0.00218509, ...,  0.01731755,
         0.03368582, -0.00059017],
       [ 0.00795601,  0.00063555, -0.02629197, ...,  0.00309679,
        -0.01845641,  0.02721068],
       [ 0.00711415,  0.02857519, -0.03790667, ...,  0.00750328,
        -0.0061818 ,  0.00392741]], dtype=float32)