## Get word embeddings

In [3]:
import tensorflow as tf
from transformers import TFBertForMaskedLM, BertConfig
class bert_with_mask(tf.keras.Model):
    """Masked-LM BERT whose output is overwritten at selected positions.

    Wraps a ``TFBertForMaskedLM`` built from ``config``. Wherever the mask
    passed as the last model input is false, the BERT output is replaced by
    the fixed vector ``onehot_mask``.
    """

    def __init__(self, config, onehot_mask):
        """Build the wrapped BERT model and remember the fill vector.

        Args:
            config: configuration object handed to ``TFBertForMaskedLM``.
            onehot_mask: 1-D vector substituted for the BERT output at
                positions where the mask is false.
        """
        super().__init__()
        self.bert = TFBertForMaskedLM(config)
        self.onehot_mask = onehot_mask

    def call(self, inputs):
        """Run BERT and apply the position mask to its first output.

        Args:
            inputs: sequence whose last element is the mask; all preceding
                elements are forwarded unchanged to the wrapped BERT model.
                Assumes the mask is a boolean (batch, seq) tensor and the
                BERT output is (batch, seq, vocab) -- TODO confirm.

        Returns:
            A 1-tuple holding the masked output tensor.
        """
        keep = inputs[-1]  # the mask travels as the final input
        bert_inputs = inputs[:-1]
        scores = self.bert(bert_inputs)[0]

        # Add a trailing axis to the mask and two leading axes to the fill
        # vector so both broadcast against the BERT output.
        keep_3d = keep[:, :, None]
        fill = self.onehot_mask[None, None, :]

        return (tf.where(keep_3d, scores, fill),)

In [8]:
import sys
# Put the relative `code` directory on the module search path (at index 1)
# before importing from it; presumably `pretrained_tokenizer` lives there.
# NOTE(review): the path is relative, so this depends on the kernel's working directory.
sys.path.insert(1, 'code')
import pretrained_tokenizer

# French tokenizer wrapper; per the recorded log it reads its configuration
# from the `tokenizer_data_fr_30k` directory.
t_fr = pretrained_tokenizer.Tokenizer(language='fr', path='tokenizer_data_fr_30k')

I0413 16:40:40.681236 47228017591296 configuration_utils.py:281] loading configuration file tokenizer_data_fr_30k/config.json
I0413 16:40:40.682730 47228017591296 configuration_utils.py:319] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 12,
  "nu

In [10]:
import numpy as np
# Fill vector handed to bert_with_mask: all zeros over the tokenizer's
# vocabulary except a 1.0 at the padding-token id.
pad_id = t_fr.tokenizer.pad_token_id
# Indexing with None would add an axis and broadcast the assignment, silently
# setting *every* entry to 1.0 -- fail loudly if there is no padding token.
if pad_id is None:
    raise ValueError("tokenizer defines no pad_token_id; cannot build onehot_mask")
onehot_mask = np.zeros(len(t_fr.tokenizer), dtype=np.float32)
onehot_mask[pad_id] = 1.0

In [11]:
# Recreate the model: start from the tiny BERT config and resize its
# vocabulary to the French tokenizer's length.
config = BertConfig.from_pretrained('code/bert_config_tiny.json')
config.vocab_size = len(t_fr.tokenizer)

new_model = bert_with_mask(config, onehot_mask)

# Compile with an optimizer and loss so a training step can be run; the loss
# treats the model output as unnormalized logits.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-3,
    epsilon=1e-08,
    clipnorm=1.0,
)
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
)
new_model.compile(
    optimizer=optimizer,
    loss=loss,
)

I0413 16:41:04.660619 47228017591296 configuration_utils.py:281] loading configuration file code/bert_config_tiny.json
I0413 16:41:04.662433 47228017591296 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "bert",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 2,
  "num_beams":

In [12]:
# Run a single training step on dummy data. This initializes the variables
# used by the optimizers, as well as any stateful metric variables.
SENTENCE_LENGTH = t_fr.MAX_LENGTH
batch_shape = (1, SENTENCE_LENGTH)

dummy_token_ids = tf.zeros(shape=batch_shape, dtype=tf.int32)
# Second BERT input -- the attention mask by the library's positional
# convention (TODO confirm).
dummy_attention_mask = tf.ones(shape=batch_shape, dtype=tf.bool)
# Last input: the mask that bert_with_mask.call strips off and applies itself.
dummy_keep_mask = tf.ones(shape=batch_shape, dtype=tf.bool)
dummy_labels = tf.zeros(shape=batch_shape, dtype=tf.int32)

new_model.train_on_batch(
    (dummy_token_ids, dummy_attention_mask, dummy_keep_mask),
    dummy_labels,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
W0413 16:41:28.132115 47228017591296 optimizer_v2.py:1043] Gradients do not exist for variables ['bert_with_mask/tf_bert_for_masked_lm/bert/pooler/dense/kernel:0', 'bert_with_mask/tf_bert_for_masked_lm/bert/pooler/dense/bias:0'] when minimizing the loss.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
W0413 16:41:30.052045 47228017591296 optimizer_v2.py:1043] Gradients do not exist for variables ['bert_with_mask/tf_bert_for_masked_lm/bert/pooler/dense/kernel:0', 'bert_with_mask/tf_bert_for_masked_lm/bert/pooler/dense/bias:0'] when minimizing the loss.


10.353072

In [14]:
# Load the state of the old model
# NOTE(review): this checkpoint name says "20", while the array exported
# further down is labelled "epoch_27" -- confirm which epoch this really is.
new_model.load_weights('notebooks/ckpts/weights-improvement-20-8.78.hdf5')

In [32]:
# Pull the first weight tensor of the wrapped BERT model out as a NumPy array.
# NOTE(review): this relies on weights[0] being the word-embedding matrix; the
# shape check below guards that ordering assumption.
weights = new_model.bert.weights[0].numpy()
assert weights.shape == (config.vocab_size, config.hidden_size), weights.shape
weights

array([[-0.03686362, -0.05840969, -0.04828245, ..., -0.04347925,
         0.11179028, -0.05221564],
       [-0.20925385,  0.08379307,  0.02067706, ..., -0.03660351,
         0.08842378,  0.0282498 ],
       [ 0.16639018, -0.19323124,  0.01101069, ...,  0.18592238,
         0.17581482,  0.02437549],
       ...,
       [-0.15946755,  0.06413963, -0.05813764, ...,  0.03509144,
         0.16632266,  0.09055192],
       [-0.09907825, -0.07877128, -0.19373311, ..., -0.03466598,
         0.25243932,  0.11537454],
       [-0.24498609,  0.19010551, -0.09455546, ..., -0.05772343,
        -0.07767264, -0.01369407]], dtype=float32)

In [33]:
# Export the extracted weight matrix to a .npy file (path is relative to the
# kernel's working directory).
np.save('tf_bert_for_masked_lm_epoch_27_loss_8.78.npy', weights)

In [35]:
# Read the exported array back from disk so it can be compared with the
# in-memory copy.
weights2 = np.load('tf_bert_for_masked_lm_epoch_27_loss_8.78.npy')

In [39]:
# Round-trip check: the reloaded array must match the in-memory one in shape
# and in every element. Unlike a bare `assert`, this is not stripped under
# `python -O`, reports which elements differ, and treats NaNs at matching
# positions as equal.
np.testing.assert_array_equal(weights, weights2)

In [40]:
# Display the shape of the reloaded array; the recorded run shows (30000, 128).
weights2.shape

(30000, 128)