In [1]:
# Corpus locations, relative to the notebook directory. Each language
# prefix (en_/fr_) has two files: a `train.lang*` file and an `unaligned.*` file.
en_file1, en_file2 = (
    '../data/train.lang1.no-punctuation/train.lang1',
    '../data/train.en.no-punctuation/unaligned.en',
)
fr_file1, fr_file2 = (
    '../data/train.lang2.no-punctuation/train.lang2',
    '../data/train.fr.no-punctuation/unaligned.fr',
)
# TODO: exclude the test set

# Every file that contributes to the vocabulary, English files first.
lang_files = [en_file1, en_file2, fr_file1, fr_file2]

In [2]:
from collections import Counter

# Add control tokens
# Each control token starts with a count of 2 so that it survives the
# `frequencies > 1` vocabulary cut applied further down. Insertion order
# matters: it determines the integer id each control token receives.
my_counter = Counter(dict.fromkeys(
    ["<START>", "<STOP>", "<UNK>", "<MASK>", "<SEP>", "<PAD>"],
    2,
))

In [3]:
def string_to_tokens(raw_string):
    """Split `raw_string` on runs of whitespace and return the pieces as a list.

    Leading/trailing whitespace (including the newline at the end of a file
    line) produces no empty tokens; an empty or all-whitespace string yields [].
    """
    pieces = raw_string.split(sep=None)
    return pieces

# Count every whitespace-delimited token of every corpus file into
# `my_counter`, and record the token count of each line in `line_lengths`.
#
# NOTE(review): this cell mutates `my_counter` in place, so running it twice
# doubles every corpus count; re-run the control-token cell first.
# NOTE(review): files are opened with the platform default text encoding, the
# same way `tokenize_file` opens them; pin an encoding in both places together
# if the notebook must behave identically across platforms.
line_lengths = []
for lang_file in lang_files:
    with open(lang_file) as f:
        for line in f:
            # Named `line_tokens` so the module-level `tokens` array built in a
            # later cell is not preceded by a same-named list.
            line_tokens = string_to_tokens(line)
            line_lengths.append(len(line_tokens))
            # One bulk update per line instead of one single-element list and
            # one Counter.update call per token.
            my_counter.update(line_tokens)

In [4]:
import numpy as np
# Unpack the counter into two parallel arrays: tokens[i] occurred
# frequencies[i] times. Both follow the counter's insertion order, so the
# control tokens come first.
tokens = np.array([token for token in my_counter])
frequencies = np.array([my_counter[token] for token in my_counter])

In [5]:
# Vocabulary cut: drop every token that was counted only once.
keep_mask = frequencies > 1
retained_tokens = tokens[keep_mask]

In [6]:
# Vocabulary size before and after the frequency cut, one number per line.
print(len(tokens), len(retained_tokens), sep="\n")

127878
86384


In [7]:
# Consecutive integer ids 0..N-1, one per retained token. np.arange builds
# the array directly (no Python-level range iteration) and always yields an
# integer array, even when there are no retained tokens.
indices = np.arange(len(retained_tokens))

In [8]:
# Display the id array as a sanity check (numpy abbreviates long arrays).
indices

array([    0,     1,     2, ..., 86381, 86382, 86383])

In [9]:
# Lookup table: token string -> integer id. Ids follow the order of
# `retained_tokens`.
my_tokenizer_lut = {
    token: token_id for token, token_id in zip(retained_tokens, indices)
}

In [10]:
def pad_tokens(token_list, max_length):
    """Return a copy of `token_list` that is exactly `max_length` ids long.

    Reads the `<STOP>` and `<PAD>` ids from the module-level `my_tokenizer_lut`.

    Args:
        token_list: sequence of token ids; it is never modified.
        max_length: required output length (non-negative int).

    Returns:
        A new list of length `max_length`. Inputs that are too long are
        truncated and their last kept id is overwritten with `<STOP>`; inputs
        that are too short are right-padded with `<PAD>`. `max_length == 0`
        yields an empty list.

    Raises:
        ValueError: if `max_length` is negative.
    """
    if max_length < 0:
        raise ValueError("max_length must be non-negative, got %r" % (max_length,))
    if max_length == 0:
        # Nothing fits, not even <STOP>; avoids indexing into an empty list.
        return []

    if len(token_list) >= max_length:
        truncated = list(token_list[:max_length])
        # The sequence must still terminate, so the last slot becomes <STOP>.
        truncated[-1] = my_tokenizer_lut["<STOP>"]
        return truncated

    # Pad in one step rather than re-concatenating the list once per <PAD>.
    n_missing = max_length - len(token_list)
    return list(token_list) + [my_tokenizer_lut["<PAD>"]] * n_missing

def tokenize_string(raw_string, max_length=32): # TODO: Better definition of sentence length
    """Convert one sentence into a fixed-length row of token ids.

    The sentence is split with `string_to_tokens`; each piece is looked up in
    `my_tokenizer_lut`, falling back to the `<UNK>` id when absent. The ids are
    wrapped in `<START>` ... `<STOP>` and passed through `pad_tokens`.

    Args:
        raw_string: the sentence to encode.
        max_length: length handed to `pad_tokens`.

    Returns:
        numpy array with a leading axis of size 1 holding the padded ids.
    """
    token_ids = [my_tokenizer_lut["<START>"]]
    token_ids.extend(
        my_tokenizer_lut[word] if word in my_tokenizer_lut else my_tokenizer_lut["<UNK>"]
        for word in string_to_tokens(raw_string)
    )
    token_ids.append(my_tokenizer_lut["<STOP>"])

    fixed_length_ids = pad_tokens(token_ids, max_length)

    # Add a leading batch axis so rows can be concatenated into a matrix.
    return np.array(fixed_length_ids)[np.newaxis, :]
        
# Smoke test on a sentence with capitalisation and punctuation. In the
# recorded output the first and last words come back as id 2 -- presumably
# <UNK>, since "The" and "dog." are looked up verbatim with no normalisation.
tokenize_string("The quick brown fox jumped over the lazy dog.")

array([[   0,    2, 2018, 2659, 1169, 8195,   30,    9, 1244,    2,    1,
           5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,
           5,    5,    5,    5,    5,    5,    5,    5,    5,    5]])

In [11]:
def tokenize_file(filename):
    """Tokenize every line of a text file and stack the rows into one array.

    Each line goes through `tokenize_string` with its default `max_length`;
    the per-line arrays are joined along axis 0.

    Args:
        filename: path of the text file, one sentence per line.

    Returns:
        numpy array with one row per line of the file.

    Raises:
        ValueError: raised by `np.concatenate` if the file has no lines.
    """
    with open(filename) as f:
        rows = [tokenize_string(line) for line in f]
    return np.concatenate(rows, axis=0)

# Ground-truth (uncorrupted) token ids for the first English file; these are
# later used as the training targets.
x_true = tokenize_file(en_file1)
# One row per line of the file, one column per token position.
x_true.shape

(11000, 32)

In [12]:
def mask_tokens(true_tokens):
    """Randomly corrupt token ids for masked-language-model training.

    Every non-`<PAD>` position receives its own uniform draw in [0, 1):

    * draw in (0.85, 0.97)  -> replaced by the `<MASK>` id (80% of the 15%)
    * draw in (0.985, 1.0)  -> replaced by a random id from the whole
      vocabulary (10% of the 15%)
    * draw in [0.97, 0.985] -> selected but left unchanged (10% of the 15%)
    * anything else         -> untouched

    `<PAD>` positions have their draw zeroed and are therefore never changed.
    The `<PAD>` / `<MASK>` ids and the vocabulary size come from the
    module-level `my_tokenizer_lut`.

    NOTE(review): only `<PAD>` is excluded, so `<START>` / `<STOP>` can be
    corrupted, and a random replacement can be any id, control tokens included.

    Args:
        true_tokens: numpy integer array of token ids, any shape.

    Returns:
        A corrupted copy with the same shape; `true_tokens` is not modified.
    """
    non_pad_tokens = true_tokens != my_tokenizer_lut["<PAD>"]
    # Draw one value per token. A single row of draws would be broadcast over
    # the batch and corrupt every sentence at the same positions; it would
    # also tie the function to one fixed sequence length.
    random_masking_seed = np.random.uniform(0, 1, true_tokens.shape) * non_pad_tokens

    masking_targets = 0.85 < random_masking_seed # 15% # TODO: Use masking targets
    mask_token_targets = np.logical_and(masking_targets, random_masking_seed < 0.85 + 0.15*0.8) # 80% of 15%
    random_token_targets = np.logical_and(1.0 - 0.1*0.15 < random_masking_seed, random_masking_seed < 1.0) # 10% of 15%

    masked_tokens = true_tokens.copy()
    masked_tokens[mask_token_targets] = my_tokenizer_lut["<MASK>"]
    # One independent random id per position selected for random replacement.
    masked_tokens[random_token_targets] = np.random.randint(
        0, len(my_tokenizer_lut), (random_token_targets.sum(),)
    )

    return masked_tokens

# Corrupted model input derived from the ground truth; x_true itself is left
# intact because mask_tokens works on a copy.
x_train = mask_tokens(x_true)
# First corrupted sentence, for comparison against x_true[0].
x_train[0]

array([ 0,  6,  7,  8,  9, 10, 11, 12, 13,  3, 15, 16,  1,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5])

In [13]:
# True at every real (non-padding) position. The mask is derived from the
# uncorrupted ids: mask_tokens can substitute a random vocabulary id at a real
# position, and that id may coincide with <PAD>, which would wrongly hide the
# position if the mask were computed from x_train. Padding positions are
# identical in x_true and x_train, so nothing else changes.
attention_mask = x_true != my_tokenizer_lut["<PAD>"]
attention_mask[0]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

In [14]:
# Uncorrupted first sentence; compare with x_train[0] to see which positions
# were altered by mask_tokens.
x_true[0]

array([ 0,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,  1,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5])

In [15]:
from transformers import BertConfig, TFBertForMaskedLM
# Read the architecture settings from the local tiny-BERT config file.
config = BertConfig.from_pretrained('../uncased_L-2_H-128_A-2/bert_config_tiny.json')
# Resize the vocabulary to match the custom lookup table built above.
config.vocab_size = len(my_tokenizer_lut)
# Built from the config only (no checkpoint is loaded here), so the model
# presumably starts from freshly initialised weights -- TODO confirm.
model = TFBertForMaskedLM(config)

In [16]:
import tensorflow as tf
# Forward pass on a single sentence before any training, as a shape check.
input_ids = tf.constant(tokenize_string("Hello, my dog is cute"))
# The inputs are passed as a 1-tuple; no attention mask is supplied here.
outputs = model((input_ids,))
# Recorded output: a 1-tuple holding one tensor of shape
# (1, 32, 86384), i.e. one score per vocabulary entry for each position.
outputs

(<tf.Tensor: shape=(1, 32, 86384), dtype=float32, numpy=
 array([[[ 0.09791588,  0.13342728, -0.10703319, ..., -0.16127932,
           0.03996952,  0.16378266],
         [-0.04859214,  0.06760185,  0.04196171, ..., -0.1572957 ,
           0.13940057,  0.11954289],
         [ 0.28934202, -0.15358104,  0.14815764, ..., -0.129953  ,
           0.16664478, -0.0650624 ],
         ...,
         [ 0.04984542, -0.04067166, -0.11882814, ..., -0.1577919 ,
           0.32993722, -0.09779587],
         [ 0.11055933, -0.20321994,  0.1207041 , ..., -0.332794  ,
          -0.02643372, -0.06331111],
         [ 0.10005414, -0.18562782,  0.24596229, ..., -0.3783547 ,
          -0.07882662,  0.02330005]]], dtype=float32)>,)

In [17]:
# Vocabulary size; matches the last dimension of the model output above.
len(my_tokenizer_lut)

86384

In [18]:
# Inspect the encoded test sentence that was fed to the model.
input_ids

<tf.Tensor: shape=(1, 32), dtype=int64, numpy=
array([[   0,    2,   55, 1695,   15, 9325,    1,    5,    5,    5,    5,
           5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,
           5,    5,    5,    5,    5,    5,    5,    5,    5,    5]])>

In [19]:
# Adam with a small learning rate and gradient-norm clipping at 1.0.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# from_logits=True: the loss is given raw scores, and the targets are integer
# token ids rather than one-hot vectors.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss)
# Inputs: corrupted ids plus attention mask; targets: the uncorrupted ids.
# No epochs or batch_size are passed, so the Keras defaults apply.
# NOTE(review): the targets cover every position (unmasked tokens and <PAD>
# included) and no sample weights are given, so the loss is not restricted
# to the corrupted positions.
model.fit((x_train,attention_mask), x_true)

Train on 11000 samples


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




<tensorflow.python.keras.callbacks.History at 0x7fa1d8368438>

In [20]:
# Probe the trained model with a hand-written sentence. "<MASK>" is a
# whitespace-separated token of its own, so it is looked up in the table like
# any other token and maps to the <MASK> id.
my_input = tokenize_string("I drove to <MASK>")
# The numpy array is passed directly; no attention mask is supplied.
my_output = model(my_input)

In [21]:
# Inspect the encoded probe; in the recorded output the <MASK> id (3) sits at
# position 4.
my_input

array([[    0,     2, 10439,   119,     3,     1,     5,     5,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5,     5,     5,     5,     5,
            5,     5,     5,     5,     5]])

In [28]:
# Alternative view, left disabled: id of the highest-scoring token at position 4.
# tf.argmax(my_output[0][0,4,:])
# Scores of the first 10 vocabulary ids at position 4 of the only batch item
# (position 4 is where <MASK> sits in my_input).
my_output[0][0,4,:][:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([ 1.4085811 ,  1.6936857 ,  0.54693615, -0.08038695, -1.3574069 ,
        2.6084661 ,  0.86079395,  0.1713116 ,  0.7250719 ,  1.8401847 ],
      dtype=float32)>

In [23]:
# Reverse lookup table: integer id -> token string.
inv_tokenizer_lut = dict(zip(my_tokenizer_lut.values(), my_tokenizer_lut.keys()))

In [24]:
# Sanity check of the reverse table: id 5 should decode to the padding token.
inv_tokenizer_lut[5]

'<PAD>'

In [25]:
# Spot check: id assigned to an ordinary corpus word.
my_tokenizer_lut['work']

953