# Intialize

## Load all files

In [None]:
import tensorflow as tf

en_file1 = '../data/train.lang1'
en_file2 = '../data/train.en.no-punctuation/unaligned.en'
fr_file1 = '../data/train.lang2'
fr_file2 = '../data/train.fr.tokenized/unaligned.fr'

# Tokenize

## Tokenize all files (whole strings)

In [2]:
import sys
sys.path.insert(1, '../code')
import pretrained_tokenizer

I0418 05:23:03.394031 47112253547520 file_utils.py:57] TensorFlow version 2.1.0 available.


In [3]:
!pwd
t_fr = pretrained_tokenizer.Tokenizer(language='en', path='../tokenizer_data_en_30k')
# t_fr = pretrained_tokenizer.Tokenizer(language='fr', path='../tokenizer_data_fr_30k')
# t_fr = pretrained_tokenizer.Tokenizer(language='en', path='../tokenizer_data_en_30k')

/home/guest139/Translation-Team08-IFT6759/notebooks


I0418 05:23:19.509272 47112253547520 configuration_utils.py:281] loading configuration file ../tokenizer_data_en_30k/config.json
I0418 05:23:19.513222 47112253547520 configuration_utils.py:319] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 12,
  

In [4]:
import numpy as np
def tokenize_file(filename):
    with open(filename) as f:
        tokens = []
        attention_mask = []
        for idx, line in enumerate(f):
            encoded = t_fr.encode(line)
            tokens += [np.array(encoded['input_ids'])[None,:]]
            attention_mask += [np.array(encoded['attention_mask'])[None,:]]
    return np.concatenate(tokens,axis=0), np.concatenate(attention_mask,axis=0)

x_true, attention_mask = tokenize_file(en_file2)
x_true_val, attention_mask_val = tokenize_file(en_file1)

(474000, 64)

# Define masking strategy

In [5]:
SENTENCE_LENGTH = t_fr.MAX_LENGTH
def mask_tokens(true_tokens, attention_mask):
    random_masking_seed = np.random.uniform(0,1,(SENTENCE_LENGTH,)) * attention_mask
    
    masking_targets = 0.85 < random_masking_seed # 15%
    mask_token_targets = np.logical_and(0.85 < random_masking_seed, random_masking_seed < 0.85 + 0.15*0.8) # 80% of 15%
    random_token_targets = np.logical_and(1.0 - 0.1*0.15 < random_masking_seed, random_masking_seed < 1.0) # 10% of 15%
    
    masked_tokens = true_tokens.copy()
    masked_tokens[mask_token_targets] = t_fr.tokenizer.mask_token_id
    masked_tokens[random_token_targets] = np.random.randint(0,len(t_fr.tokenizer),(random_token_targets.sum(),))

    masked_true_tokens = true_tokens.copy()
    masked_true_tokens[~masking_targets] = t_fr.tokenizer.pad_token_id
    
    return masked_tokens, masking_targets, masked_true_tokens

# Override model to include masking

In [6]:
from transformers import TFBertForMaskedLM, BertConfig

## Overriding functions

In [7]:
class bert_with_mask(tf.keras.Model):
    def __init__(self, config, onehot_mask):
        super(bert_with_mask, self).__init__()
        self.bert = TFBertForMaskedLM(config)
        self.onehot_mask = onehot_mask

    def call(self, inputs):
        mask = inputs[-1] # unpack mask from inputs
        inputs = inputs[:-1]
        outputs = self.bert(inputs)[0]
        
        outputs = tf.where(mask[:,:,None], outputs, self.onehot_mask[None,None,:])
        
        return (outputs,)

## Define masking function

In [8]:
onehot_mask = np.zeros(len(t_fr.tokenizer), dtype=np.float32)
onehot_mask[t_fr.tokenizer.pad_token_id] = 1.0

## Test the model

In [9]:
config = BertConfig.from_pretrained('../code/bert_config_tiny.json')
config.vocab_size = len(t_fr.tokenizer)

I0418 05:25:25.154355 47112253547520 configuration_utils.py:281] loading configuration file ../code/bert_config_tiny.json
I0418 05:25:25.156252 47112253547520 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "bert",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 2,
  "num_beam

In [10]:
model2 = bert_with_mask(config, onehot_mask)

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model2.compile(optimizer=optimizer, loss=loss)

# Define a dataset

In [11]:
BATCH_SIZE = 32
def data_generator_fn():
    for x, atn in zip(x_true, attention_mask):
        x_train, targets_train, masked_x_true = mask_tokens(x, atn)
        yield (x_train, atn, targets_train), masked_x_true

# dataset object
dataset = tf.data.Dataset.from_generator(
    data_generator_fn,
    output_types=((tf.int32, tf.bool, tf.bool), tf.int32),
    output_shapes=(( tf.TensorShape((SENTENCE_LENGTH,)), tf.TensorShape((SENTENCE_LENGTH,)), tf.TensorShape((SENTENCE_LENGTH,)) ), tf.TensorShape((SENTENCE_LENGTH,)) )
)
dataset = dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [12]:
def data_generator_fn_val():
    for x, atn_val in zip(x_true_val, attention_mask_val):
        x_train, targets_train, masked_x_true = mask_tokens(x, atn_val)
        yield (x_train, atn_val, targets_train), masked_x_true

# dataset object
dataset_val = tf.data.Dataset.from_generator(
    data_generator_fn_val,
    output_types=((tf.int32, tf.bool, tf.bool), tf.int32),
    output_shapes=(( tf.TensorShape((SENTENCE_LENGTH,)), tf.TensorShape((SENTENCE_LENGTH,)), tf.TensorShape((SENTENCE_LENGTH,)) ), tf.TensorShape((SENTENCE_LENGTH,)) )
)
dataset_val = dataset_val.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

## Training loop

In [13]:
# Recreate the model
model2 = bert_with_mask(config, onehot_mask)
model2.compile(optimizer=optimizer, loss=loss)

# This initializes the variables used by the optimizers,
# as well as any stateful metric variables
model2.train_on_batch(dataset.take(1))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
W0418 05:25:37.774441 47112253547520 optimizer_v2.py:1043] Gradients do not exist for variables ['bert_with_mask_1/tf_bert_for_masked_lm_1/bert/pooler/dense/kernel:0', 'bert_with_mask_1/tf_bert_for_masked_lm_1/bert/pooler/dense/bias:0'] when minimizing the loss.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
W0418 05:25:39.714071 47112253547520 optimizer_v2.py:1043] Gradients do not exist for variables ['bert_with_mask_1/tf_bert_for_masked_lm_1/bert/pooler/dense/kernel:0', 'bert_with_mask_1/tf_bert_for_masked_lm_1/bert/pooler/dense/bias:0'] when minimizing the loss.


9.36928

In [16]:
!pwd
model2.load_weights('ckpts/weights-improvement-10-9.01.hdf5')
# model2.load_weights('tinyBERT_en/tinyBERT')
# model2.load_weights('ckpts/weights-improvement-20-8.78.hdf5')
# model2.load_weights('tinyBERT_fr2/tinyBERT')

/home/guest139/Translation-Team08-IFT6759/notebooks


In [None]:
!mkdir ckpts
from tensorflow.keras.callbacks import ModelCheckpoint 
filepath="ckpts/weights-improvement-{epoch:02d}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [17]:
hist = model2.fit(dataset, validation_data=dataset_val, initial_epoch=10, epochs=25, callbacks=callbacks_list)

mkdir: cannot create directory 'ckpts': File exists
Epoch 11/25
  14813/Unknown - 1930s 130ms/step - loss: 9.0233
Epoch 00011: val_loss improved from inf to 9.01219, saving model to ckpts/weights-improvement-11-9.01.hdf5
Epoch 12/25
Epoch 00012: val_loss improved from 9.01219 to 9.00961, saving model to ckpts/weights-improvement-12-9.01.hdf5
Epoch 13/25
Epoch 00013: val_loss did not improve from 9.00961
Epoch 14/25
Epoch 00014: val_loss improved from 9.00961 to 9.00778, saving model to ckpts/weights-improvement-14-9.01.hdf5
Epoch 15/25
Epoch 00015: val_loss improved from 9.00778 to 9.00720, saving model to ckpts/weights-improvement-15-9.01.hdf5
Epoch 16/25
Epoch 00016: val_loss improved from 9.00720 to 9.00695, saving model to ckpts/weights-improvement-16-9.01.hdf5
Epoch 17/25
Epoch 00017: val_loss improved from 9.00695 to 9.00420, saving model to ckpts/weights-improvement-17-9.00.hdf5
Epoch 18/25
Epoch 00018: val_loss did not improve from 9.00420
Epoch 19/25
Epoch 00019: val_loss did 

In [18]:
hist.history

{'loss': [9.023326345129858,
  9.0209677878915,
  9.019792253711556,
  9.019181977284106,
  9.017312573976154,
  9.017025964986424,
  9.01628955329219,
  9.015283525748595,
  9.014510984267867,
  9.014428088868218,
  9.013857906035733,
  9.013160177753948,
  9.012782871133668,
  9.012200083221565,
  9.0124870794755],
 'val_loss': [9.012193613274153,
  9.009608476660972,
  9.012180081633634,
  9.007779406946758,
  9.007198236709417,
  9.006946231043615,
  9.004197935725367,
  9.004905312560325,
  9.005924216536588,
  9.006016800569933,
  9.003250912178395,
  9.003198166226232,
  9.003413519193959,
  9.002597262693007,
  9.003135553626127]}

# Save model weights

In [None]:
!rm -rf tinyBERT_fr2
!mkdir tinyBERT_fr2
model2.save_weights('tinyBERT_fr2/tinyBERT', save_format='tf')