In [32]:
import pprint as pp
import pandas as pd
import numpy as np
import urllib.request, json 

from transformers import (
    BertConfig, 
    TFBertForMaskedLM
)

from tokenizers import (
    Tokenizer,
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    processors,
    decoders
)

from utils import (
    get_tweet_list,
    get_tweet_iterator
)

### Config

In [30]:
# Vocabulary budget shared by the BERT config (embedding size) and the
# WordPiece trainer, so the model and tokenizer stay in sync.
VOCAB_SIZE = 1_000
# Tweet file that the tokenizer is trained on.
TWEETS_PATH = "../data/birthyear.1950_1969.lowercase"

### Create Untrained BERT Model

In [19]:
# BertConfig's defaults are equivalent to the bert-base-uncased config
# (https://huggingface.co/google-bert/bert-base-uncased/resolve/main/config.json),
# so that file is not downloaded; only the vocabulary size is overridden.
config = BertConfig(vocab_size=VOCAB_SIZE)

# Instantiate an untrained masked-LM BERT from the config and build it
# before printing the layer/parameter summary.
bert_model = TFBertForMaskedLM(config=config)
bert_model.build()

bert_model.summary()

Model: "tf_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  86218752  
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  1757416   
                                                                 
Total params: 86811880 (331.16 MB)
Trainable params: 86811880 (331.16 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Create and Train BERT Tokenizer
Reference: [Building a WordPiece tokenizer from scratch](https://huggingface.co/learn/nlp-course/en/chapter6/8#building-a-wordpiece-tokenizer-from-scratch) (Hugging Face NLP course)

In [22]:
# Create a BERT-style WordPiece tokenizer; unknown words map to [UNK].
tokenizer = Tokenizer(model=models.WordPiece(unk_token='[UNK]'))

# Normalisation: decompose characters to Unicode NFD, then strip the accents.
# Lowercasing is intentionally skipped because the tweets are already lowercase.
normalizer = normalizers.Sequence([
    normalizers.NFD(),
    normalizers.StripAccents(),
])
# Split the text into words/punctuation before WordPiece is applied.
pre_tokenizer = pre_tokenizers.Whitespace()

tokenizer.normalizer = normalizer
tokenizer.pre_tokenizer = pre_tokenizer

# Create a WordPiece trainer.
# Special tokens are assigned the first ids in list order. [PAD] is listed
# first so that its id is 0, which is the default `pad_token_id` of the
# BertConfig used for the model (and the layout of bert-base-uncased).
# With [UNK] first, the model's padding id would point at [UNK] instead.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=special_tokens,
)

In [None]:
# Train the tokenizer from a tweet iterator (alternative to the list-based
# training cell below; presumably avoids loading the whole file at once --
# TODO confirm against utils.get_tweet_iterator).
# The path comes from the config cell instead of being repeated here.
# NOTE: running both training cells trains the tokenizer twice; run only one.
tweets = get_tweet_iterator(TWEETS_PATH)
tokenizer.train_from_iterator(tweets, trainer=trainer)

In [23]:
# Load the tweets from the configured corpus file and fit the WordPiece
# vocabulary on them with the trainer configured above.
tweets = get_tweet_list(TWEETS_PATH)
tokenizer.train_from_iterator(
    tweets,
    trainer=trainer,
)






In [None]:
# Wrap every encoded sequence in BERT's boundary tokens:
#   single sequence -> [CLS] A [SEP]            (all type id 0)
#   sequence pair   -> [CLS] A [SEP] B [SEP]    (B and its [SEP] get type id 1)
# The token ids only exist once the tokenizer has been trained; token_to_id
# returns None for a token that is not in the vocabulary, so fail fast with a
# clear message if this cell is run before training.
cls_token_id = tokenizer.token_to_id('[CLS]')
sep_token_id = tokenizer.token_to_id('[SEP]')
if cls_token_id is None or sep_token_id is None:
    raise ValueError(
        "[CLS]/[SEP] are missing from the tokenizer vocabulary; "
        "run the tokenizer training cell before this one."
    )

post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)
tokenizer.post_processor = post_processor

In [None]:
# Attach a WordPiece decoder so decoded text merges "##"-prefixed
# continuation pieces back into whole words.
decoder = decoders.WordPiece(prefix='##')
tokenizer.decoder = decoder