In [1]:
!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt

--2021-02-11 17:45:55--  https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.98.229
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.98.229|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 231508 (226K) [text/plain]
Saving to: ‘bert-base-uncased-vocab.txt.1’


2021-02-11 17:45:57 (325 KB/s) - ‘bert-base-uncased-vocab.txt.1’ saved [231508/231508]



In [1]:
from tokenizers import BertWordPieceTokenizer

# Build a lower-casing BERT WordPiece tokenizer from the downloaded vocab file.
tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)

# Dump the vocabulary mapping, then report how many entries it holds.
vocabs = tokenizer.get_vocab()
print(vocabs)
print('number of vocabs: ', tokenizer.get_vocab_size())

# Encode a handful of sample sentences in a single batch call.
sample_sentences = [
  'hello world',
  'this is a piece of text',
  'this is a hard word: edema',
]
outputs = tokenizer.encode_batch(sample_sentences)

# For each encoding show, one per line: tokens, ids, attention mask.
for output in outputs:
  print(output.tokens, output.ids, output.attention_mask, sep='\n')

number of vocabs:  30522
['[CLS]', 'hello', 'world', '[SEP]']
[101, 7592, 2088, 102]
[1, 1, 1, 1]
['[CLS]', 'this', 'is', 'a', 'piece', 'of', 'text', '[SEP]']
[101, 2023, 2003, 1037, 3538, 1997, 3793, 102]
[1, 1, 1, 1, 1, 1, 1, 1]
['[CLS]', 'this', 'is', 'a', 'hard', 'word', ':', 'ed', '##ema', '[SEP]']
[101, 2023, 2003, 1037, 2524, 2773, 1024, 3968, 14545, 102]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [2]:
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
!unzip wikitext-103-raw-v1.zip

--2021-02-13 00:38:15--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.110.173
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.110.173|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191984949 (183M) [application/zip]
Saving to: ‘wikitext-103-raw-v1.zip.1’

-v1.zip.1             1%[                    ]   2.49M   367KB/s    eta 7m 5s  ^C
Archive:  wikitext-103-raw-v1.zip
replace wikitext-103-raw/wiki.test.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [2]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# BPE model that falls back to '[UNK]', with input split by the Whitespace pre-tokenizer.
tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
tokenizer.pre_tokenizer = Whitespace()

# Trainer that reserves the BERT-style special tokens in the vocabulary.
trainer = BpeTrainer(
  special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'],
)

# Train on all three raw WikiText-103 splits (test, train, valid - in that order).
files = [
  f'wikitext-103-raw/wiki.{split}.raw'
  for split in ('test', 'train', 'valid')
]
tokenizer.train(files, trainer)

In [3]:
# Write the current `tokenizer` to a JSON file inside the wikitext-103-raw/
# directory (the same directory the training files were read from).
tokenizer.save("wikitext-103-raw/tokenizer-wiki.json")

In [4]:
# Report the vocabulary size of `tokenizer`, then dump the whole vocabulary.
# NOTE(review): the second print emits every entry of the vocabulary in one
# line of output - consider printing only a sample when sharing the notebook.
print('number of vocabs: ', tokenizer.get_vocab_size())
print('vocabs: ', tokenizer.get_vocab())

number of vocabs:  30000


In [5]:
import tensorflow_datasets as tfds

# Load the TED pt->en dataset; as_supervised=True makes each example a 2-tuple,
# unpacked below as (pt, en).
examples, metadata = tfds.load(
  'ted_hrlr_translate/pt_to_en',
  with_info=True,
  as_supervised=True,
)
train_examples = examples['train']
val_examples = examples['validation']
print(train_examples)

# Collect the training split into two parallel lists of decoded Python strings.
pt_text, en_text = [], []
for pt, en in train_examples:
  pt_text.append(pt.numpy().decode('utf-8'))
  en_text.append(en.numpy().decode('utf-8'))

# Peek at the first ten sentences of each language (English first).
print(en_text[:10], pt_text[:10], sep='\n')

# The two lists must stay aligned sentence-for-sentence.
assert len(pt_text) == len(en_text)

<DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.string)>
['and when you improve searchability , you actually take away the one advantage of print , which is serendipity .', 'but what if it were active ?', "but they did n't test for curiosity .", 'and this conscious defiance is why i , as an agnostic , can still have faith .', 'you can use everything on the table on me .', "`` i write a lot about `` '' security theater , '' '' which are products that make people feel secure , but do n't actually do anything . ''", "and they 've put it deep down in an iron mine in minnesota , ok , deep under the ground , and in fact , in the last couple of days announced the most sensitive results so far .", 'see , some people might fear girls not liking them back .', "no , what happened to us , chris , is that power , it 's priced off the margin .", 'back to my question : why did i stay ?']
['e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .', 'mas

In [6]:
def train_tokenizer(text, vocab_size=30000):
  """Train a new BPE tokenizer on an in-memory collection of strings.

  Args:
    text: Iterable of strings (for example a list of sentences) used as the
      training corpus.
    vocab_size: Target size of the learned vocabulary, passed to BpeTrainer.
      Defaults to 30000, the value BpeTrainer uses when no size is given, so
      existing one-argument calls keep their behaviour.

  Returns:
    The trained Tokenizer. It uses '[UNK]' as its unknown token, splits input
    with the Whitespace pre-tokenizer, and reserves the special tokens
    '[PAD]', '[UNK]', '[START]' and '[END]'.
  """
  tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
  tokenizer.pre_tokenizer = Whitespace()
  trainer = BpeTrainer(
    vocab_size=vocab_size,
    special_tokens=['[PAD]', '[UNK]', '[START]', '[END]'],
  )
  tokenizer.train_from_iterator(text, trainer)
  return tokenizer

# Train one independent tokenizer per language of the parallel corpus.
en_tokenizer = train_tokenizer(en_text)
pt_tokenizer = train_tokenizer(pt_text)

# Every line names its language: the two vocabulary dumps are very long, and
# with identical labels they could not be told apart in the cell output.
print('number of english vocabs: ', en_tokenizer.get_vocab_size())
print('number of pt vocabs: ', pt_tokenizer.get_vocab_size())
print('english vocabs: ', en_tokenizer.get_vocab())
print('pt vocabs: ', pt_tokenizer.get_vocab())

number of english vocabs:  30000
number of pt vocabs:  30000
vocabs:  {'matthe': 18535, 'gha': 22935, 'coletivos': 20598, 'injustiça': 5780, 'm': 52, 'decidiram': 4927, 'inventar': 3335, 'tiam': 3928, 'historia': 9396, 'fúria': 14423, 'andei': 9085, 'mapeámos': 18072, 'psiquia': 7238, 'entram': 6246, 'modes': 24063, 'transpor': 2099, 'irritadi': 27519, 'piratas': 8430, 'dolor': 4973, 'preenchem': 21865, 'medu': 23776, 'hierar': 10915, 'st': 2895, 'tantes': 2696, 'sário': 3626, 'dio': 886, 'parentes': 16762, 'recebidos': 27208, 'persegu': 6102, 'kyi': 21512, 'ร': 103, 'contagiosas': 22379, 'considerei': 26235, 'olfato': 13751, 'árqui': 17617, 'sinai': 18623, 'revelo': 19311, 'ainda': 536, 'aulas': 3136, 'artístico': 8424, 'importava': 10855, 'natanz': 14895, 'compa': 9564, 'macro': 14771, 'urg': 12711, 'léxico': 11194, 'namorada': 8540, 'finito': 17292, 'melhorando': 20091, 'imaginados': 24865, 'glicinas': 27915, 'maçar': 19258, 'tímidos': 21896, 'nife': 24225, 'taz': 13478, 'igas': 198

In [10]:
# Encode the first English sentence with the English tokenizer and print the
# resulting ids.
output = en_tokenizer.encode(en_text[0])
print(output.ids)

[99, 255, 117, 1994, 18305, 1522, 13, 117, 345, 428, 799, 90, 171, 3891, 106, 1916, 13, 331, 97, 17333, 15]
