In [1]:
import pprint as pp
import pandas as pd
import numpy as np
import urllib.request, json 

from transformers import (
    BertConfig, 
    TFBertForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    PreTrainedTokenizerFast,
    AdamWeightDecay
)

from tokenizers import (
    Tokenizer,
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    processors,
    decoders
)

from datasets import IterableDataset, load_dataset, load_from_disk

from utils import (
    TweetIterator
)

  from .autonotebook import tqdm as notebook_tqdm


### Config

In [2]:
# Vocabulary size shared by the BERT config and the WordPiece trainer below.
VOCAB_SIZE = 1_000

# Tweet corpus file, given relative to this notebook's working directory.
TWEETS_PATH = "../data/birthyear.1950_1969.lowercase"

### Create Untrained BERT Model

In [3]:
# Only the vocabulary size is overridden; every other BertConfig setting keeps its default.
# Per the original author's note, those defaults are equivalent to the bert-base-uncased
# config (https://huggingface.co/google-bert/bert-base-uncased/resolve/main/config.json),
# so that file is not downloaded here.
config = BertConfig(vocab_size=VOCAB_SIZE)

# Instantiate the masked-LM BERT model from the config alone (no pretrained weights are
# loaded) and build it so that summary() can report parameter counts.
bert_model = TFBertForMaskedLM(config=config)
bert_model.build()

# `learning_rate` is the supported keyword; the `lr` alias used previously is deprecated
# by Keras and triggers a warning when the optimizer is constructed.
# No loss is passed to compile() here.
bert_model.compile(
    optimizer=AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
)

bert_model.summary()

Model: "tf_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  86218752  
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  1757416   
                                                                 
Total params: 86811880 (331.16 MB)
Trainable params: 86811880 (331.16 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


  super().__init__(name, **kwargs)


### Create and Train BERT Tokenizer
Reference: [Hugging Face NLP course — Building a WordPiece tokenizer from scratch](https://huggingface.co/learn/nlp-course/en/chapter6/8#building-a-wordpiece-tokenizer-from-scratch)

In [4]:
# Tokens that the WordPiece trainer is told to treat as special.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# Empty WordPiece tokenizer; unknown pieces map to "[UNK]".
tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))

# Text normalization: decompose to Unicode NFD, then strip accents.
# normalizers.Lowercase() is deliberately left out because the tweets are already lowercase.
normalizer = normalizers.Sequence(
    [
        normalizers.NFD(),
        normalizers.StripAccents(),
    ]
)
tokenizer.normalizer = normalizer

# Whitespace pre-tokenization is applied before the WordPiece model runs.
pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.pre_tokenizer = pre_tokenizer

# Trainer that will learn a vocabulary of VOCAB_SIZE entries.
trainer = trainers.WordPieceTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=special_tokens,
)

In [5]:
# Wrap the corpus file in the project's TweetIterator rather than loading it as one
# collection (presumably because the tweets do not fit in memory - TODO confirm).
tweets = TweetIterator(TWEETS_PATH)

# Learn the WordPiece vocabulary from the tweets using the trainer configured earlier.
tokenizer.train_from_iterator(
    tweets,
    trainer=trainer,
)






In [6]:
# Look the special-token ids up once. token_to_id() returns None for a token that is not
# in the vocabulary, so fail early with a clear message instead of building a broken template.
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
assert cls_token_id is not None and sep_token_id is not None, (
    "[CLS]/[SEP] are missing from the vocabulary - train the tokenizer first"
)

# Template that wraps encoded sequences in [CLS] ... [SEP]; the number after each colon is
# the type id assigned to that piece (0 for the first sequence, 1 for the second).
# Plain string literals are used: the templates contain no placeholders, so no f-prefix.
post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)
tokenizer.post_processor = post_processor

# WordPiece decoder configured with the "##" continuation prefix.
decoder = decoders.WordPiece(prefix="##")
tokenizer.decoder = decoder

In [7]:
# Names of the special tokens, keyed by the PreTrainedTokenizerFast argument they fill.
special_token_names = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}

# Expose the trained tokenizer through the transformers PreTrainedTokenizerFast interface
# so it can be handed to a DataCollator and Trainer.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,  # alternatively, load from file: tokenizer_file="tokenizer.json"
    **special_token_names,
)

## Create the Training Dataset

In [8]:
# Language-modeling collator built around the wrapped tokenizer.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=wrapped_tokenizer,
    return_tensors="tf",   # produce TensorFlow tensors
    mlm_probability=0.15,  # probability that a token is masked
)

In [9]:
# One-off preprocessing, kept commented out: reads the tweet file at TWEETS_PATH as a
# "text" dataset and tokenizes it in batches with wrapped_tokenizer.
# Uncomment to rebuild the tokenized dataset from the raw tweets.
# tweets_ds = load_dataset(
#     path = "text",
#     data_files = TWEETS_PATH,
#     split = "train", # train on everything
# )
# tweets_ds = tweets_ds.map(
#     function = lambda x: wrapped_tokenizer(x['text']),
#     batched = True
# )

In [10]:
# Kept commented out: writes the tokenized dataset to the directory that the later
# load_from_disk call reads. Uncomment after rebuilding tweets_ds.
# tweets_ds.save_to_disk('birthyear.1950_1969.hf')

In [11]:
# Directory holding the tokenized tweets in the `datasets` on-disk format.
TOKENIZED_TWEETS_DIR = 'birthyear.1950_1969.hf'

try:
    # Fast path: reuse the tokenized dataset cached by an earlier run.
    tweets_ds = load_from_disk(TOKENIZED_TWEETS_DIR)
except FileNotFoundError:
    # No cache yet (e.g. a fresh checkout): rebuild it from the raw tweet file so the
    # notebook still runs top to bottom, then cache it for the next run.
    # Delete the directory to force re-tokenization after changing the tokenizer.
    tweets_ds = load_dataset(
        path="text",
        data_files=TWEETS_PATH,
        split="train",  # train on everything
    ).map(
        function=lambda batch: wrapped_tokenizer(batch["text"]),
        batched=True,
    )
    tweets_ds.save_to_disk(TOKENIZED_TWEETS_DIR)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
# Let the model wrap the tokenized dataset for Keras training: batches of 32 examples,
# each assembled by the language-modeling collator.
tweets_ds_tf = bert_model.prepare_tf_dataset(
    dataset=tweets_ds,
    batch_size=32,
    collate_fn=data_collator,
)

In [13]:
# Train for a single pass over the prepared dataset; the return value of fit() is left as
# the cell's last expression so the notebook displays it.
bert_model.fit(
    tweets_ds_tf,
    epochs=1,
)

Cause: for/else statement not yet supported
Cause: for/else statement not yet supported
