<a href="https://colab.research.google.com/github/notAlex2/Translation-Team08-IFT6759/blob/master/notebooks/harman_bert_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os

project_path = "/content/drive/My Drive/machine-translation"
os.chdir(project_path)

data_path = os.path.join(project_path, 'data')

In [0]:
! pip install transformers

In [0]:
import transformers
import numpy as np
import tensorflow as tf
import io
import tokenizers
import json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import (
    PreTrainedTokenizer, 
    AutoTokenizer, 
    BertConfig, 
    TFBertModel
)
                          
from typing import Tuple

def set_seed(seed):
    np.random.seed(seed)
    tf.random.set_seed(seed)

set_seed(10)

In [0]:
unaligned_en_path = os.path.join(data_path, 'unaligned_small.en')

VOCAB_EXIST = True
VOCAB_SIZE = 30000
pretrained_tokenizer_path = "/content/drive/My Drive/machine-translation/transBERTo"
tokenizer_name = "bpe_tokenizer"

if not VOCAB_EXIST:
    # create vocab from scratch
    bpe_tokenizer = tokenizers.ByteLevelBPETokenizer()
    special_tokens = ["<s>","<pad>","</s>","<unk>","<mask>"]

    bpe_tokenizer.train(
                    files=[unaligned_en_path], 
                    vocab_size=VOCAB_SIZE, 
                    min_frequency=2, 
                    show_progress=True,
                    special_tokens=special_tokens
                    )

    # Save files to disk
    bpe_tokenizer.save(pretrained_tokenizer_path, tokenizer_name)
    
    # rename files to vocab.json and merges.txt
    src = os.path.join(pretrained_tokenizer_path, tokenizer_name + "-vocab.json")
    dst = os.path.join(pretrained_tokenizer_path, "vocab.json")
    os.rename(src, dst)

    src = os.path.join(pretrained_tokenizer_path, tokenizer_name + "-merges.txt")
    dst = os.path.join(pretrained_tokenizer_path, "merges.txt")
    os.rename(src, dst)

config = {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.3,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "num_attention_heads": 1,
  "num_hidden_layers": 1,
  "vocab_size": VOCAB_SIZE,
  "intermediate_size": 256,
  "max_position_embeddings": 256, 
  "model_type": "roberta" # roBERTa model is better than BERT for language modelling
}

config_path = os.path.join(pretrained_tokenizer_path, "config.json")
with open(config_path, 'w') as fp:
    json.dump(config, fp)

tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_path, cache_dir=None)

In [0]:
def max_length(tensor):
    return max(len(t) for t in tensor)

def tokenize_string(tokenizer, raw_string):
    return tokenizer.encode(raw_string)
  
def encode_dataset(file_path, pretrained_tokenizer_path, num_examples):
    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_path)
    sentences = io.open(file_path, encoding='UTF-8').read().strip().split('\n')
    encoded_sequences = [tokenize_string(tokenizer, sentence) for sentence in sentences[:num_examples]]
    encoded_sequences = pad_sequences(encoded_sequences, padding='post', value=tokenizer.pad_token_id)
    return encoded_sequences

NUM_EXAMPLES = 6 
encoded_sequences = encode_dataset(unaligned_en_path, pretrained_tokenizer_path, NUM_EXAMPLES)

def data_generator_fn():
    for seq in encoded_sequences:
        yield seq

In [0]:
BATCH_SIZE = 2

dataset = tf.data.Dataset.from_generator(
    data_generator_fn,
    output_types=(tf.int32)
    ).batch(BATCH_SIZE)

In [7]:
for inputs in dataset.take(2):
    print(inputs)

tf.Tensor(
[[  0 475 286 264 716 619 290 264 483 550  87 313 225  78 427 648 308 353
   74 540  82  88 271  77  94 282  16 602  16 723  16 644 297 387 401  69
   17  80 504  18 330  88 389 712  18   2   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1]
 [  0 225  43 269  75 648 386 412 293 556 264 692 577 288  89 441  16 514
  330 489 264 720 375  55 414  74 269 325 271 296 265  88 441 284 428 272
  274 403  76 592  16 263 390 301 406 405 385 279 464  71 307 286 319 482
  577 324  81  93 403 291 567 415  87 587  81 266 272 401 318  18   2]], shape=(2, 71), dtype=int32)
tf.Tensor(
[[  0 330  11  81 659 381 430 261  77 485  18   2   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1]
 [  0 330  88 312 712 314 264 276 469 272 312 280 307 568 264 730 523

In [0]:
def mask_tokens(inputs: tf.Tensor, 
                tokenizer: PreTrainedTokenizer, 
                mlm_probability=0.15) -> Tuple[tf.Tensor, tf.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """

    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling. \
            Remove the --mlm flag if you want to use this tokenizer."
        )
    labels = inputs
    # We sample a few tokens in each sequence for masked-LM training 
    # (with probability mlm_probability defaults to 0.15 in Bert/RoBERTa)
    probability_matrix = np.full(labels.shape, mlm_probability)
    special_tokens_mask = [
            tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.numpy().tolist()
        ]
    probability_matrix = np.ma.array(probability_matrix, mask=special_tokens_mask).filled(0)
    if tokenizer._pad_token is not None:
        padding_mask = (labels == tokenizer.pad_token_id)
        probability_matrix = np.ma.array(probability_matrix, mask=padding_mask).filled(0)

    masked_indices = tf.compat.v1.distributions.Bernoulli(probs=probability_matrix)
    masked_indices = masked_indices.sample(1)[0].numpy()
    masked_indices = np.array(masked_indices, dtype=np.bool)
    labels = labels.numpy() 
    labels[~masked_indices] = -100 # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    choose_mask_tokens = tf.compat.v1.distributions.Bernoulli(probs=np.full(labels.shape, 0.8))
    choose_mask_tokens = choose_mask_tokens.sample(1)[0].numpy()
    choose_mask_tokens = np.array(choose_mask_tokens, dtype=np.bool)
    indices_replaced = choose_mask_tokens & masked_indices
    inputs = inputs.numpy()
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    choose_random_tokens = tf.compat.v1.distributions.Bernoulli(probs=np.full(labels.shape, 0.5))
    choose_random_tokens = choose_random_tokens.sample(1)[0].numpy()
    choose_random_tokens = np.array(choose_random_tokens, dtype=np.bool)
    indices_random = choose_random_tokens & masked_indices & ~indices_replaced
    indices_random, choose_random_tokens
    random_words = np.random.randint(0, len(tokenizer), labels.shape, dtype=np.int32)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return tf.convert_to_tensor(inputs), tf.convert_to_tensor(labels)

In [9]:
mask_tokens(inputs, tokenizer, mlm_probability=0.15)

Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


(<tf.Tensor: shape=(2, 71), dtype=int32, numpy=
 array([[  0, 330, 527,  81, 659, 381, 156, 261,  77, 485,  18,   2,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1],
        [  0, 330,  88, 312,   4, 314, 264, 276, 469, 272, 312,   4, 307,
         568, 264, 730, 523,  11, 702, 677, 437, 564, 358, 381,   4,   4,
         316, 304,  18,   2,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1]], dtype=int32)>,
 <tf.Tensor: shape=(2, 71), dtype=int32, numpy=
 array([[-100, -100,   11, -100, -100, -100,  430, -100, -100, -100,

In [10]:
configuration = BertConfig.from_pretrained(pretrained_tokenizer_path)
bert_model = TFBertModel(config=configuration)

input = np.random.randint(0,10, size=(2,3))
output = bert_model(input)
output[0].shape, output[1].shape

(TensorShape([2, 3, 128]), TensorShape([2, 128]))

In [11]:
bert_model.summary()

Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  4022272   
Total params: 4,022,272
Trainable params: 4,022,272
Non-trainable params: 0
_________________________________________________________________
