In [19]:
import pandas as pd
import numpy as np
import urllib.request, json, os

from transformers import (
    BertConfig,
    TFBertForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    PreTrainedTokenizerFast,
    AdamWeightDecay
)

from tokenizers import (
    Tokenizer,
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    processors,
    decoders
)

from datasets import IterableDataset, load_dataset, load_from_disk

In [40]:
# When running on Google Colab, mount Google Drive and put the notebook folder
# on sys.path so the local `utils` module can be imported. Elsewhere the
# Colab-specific setup is skipped.
try:
    from google.colab import drive
    import sys

    colab_notebooks_dir = '/content/gdrive/My Drive/Colab Notebooks'
    drive.mount('/content/gdrive/')
    # Re-running this cell must not keep appending the same entry.
    if colab_notebooks_dir not in sys.path:
        sys.path.append(colab_notebooks_dir)
except ImportError:
    # google.colab could not be imported -- most likely we are not on Colab.
    pass
except Exception as mount_error:
    # Stay best-effort (do not abort the notebook), but report the problem
    # instead of silently hiding it. KeyboardInterrupt/SystemExit are no
    # longer swallowed.
    print(f"Warning: Google Drive setup failed: {mount_error!r}")

from utils import (
    TweetIterator
)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


### Config

In [41]:
# ---- Tunable settings ----
VOCAB_SIZE = 30522        # vocabulary size shared by the tokenizer trainer and the BERT config
DATA_DIR = '/content/gdrive/My Drive/Colab Notebooks'   # folder holding the data
TWEETS_PATH = 'birthyear.1990_2009.lowercase'           # file name of the tweets file
TEST_PCT = 0.1            # share of the dataset held out for validation; the rest is used for training
TRAIN_BATCH_SIZE = 64
TEST_BATCH_SIZE = 16

# ---- Derived names and paths ----
# The dataset name is the tweets file name with its last dot-separated
# component replaced by "hf"; the tokenizer name appends "_tokenizer".
name_parts = TWEETS_PATH.split('.')
dataset_path = '.'.join(name_parts[:-1]) + '.hf'
tokenizer_path = f'{TWEETS_PATH}_tokenizer'

full_tweets_path, full_dataset_path, full_tokenizer_path = (
    os.path.join(DATA_DIR, entry)
    for entry in (TWEETS_PATH, dataset_path, tokenizer_path)
)

In [22]:
# Count the lines of the tweets file. The total is later handed to tokenizer
# training as the `length` of the tweet iterator.
# The encoding is given explicitly so decoding does not depend on the
# machine's locale default.
with open(full_tweets_path, 'r', encoding='utf-8') as file:
    num_tweets = sum(1 for _ in file)

### Create Untrained BERT Model

### Create and Train BERT Tokenizer
Reference: [Building a WordPiece tokenizer from scratch (Hugging Face NLP course)](https://huggingface.co/learn/nlp-course/en/chapter6/8#building-a-wordpiece-tokenizer-from-scratch)

In [32]:
# Load the tokenizer from disk if an earlier run saved it; otherwise train a
# WordPiece tokenizer on the tweets file, wrap it for use with transformers,
# and save it so later runs can skip the training step.
if os.path.isdir(full_tokenizer_path):
    print(f"You've saved this tokenizer before at {full_tokenizer_path}.")
    print("Loading from disk...")

    wrapped_tokenizer = PreTrainedTokenizerFast.from_pretrained(full_tokenizer_path)
    print("Tokenizer loaded!")
else:
    print("You've never saved this tokenizer before. Creating a tokenizer from scratch...")
    # create a BERT-style WordPiece tokenizer
    tokenizer = Tokenizer(model=models.WordPiece(unk_token='[UNK]'))
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),            # Normalize characters to Unicode NFD
        # Lowercasing is skipped on purpose: the tweets are already lowercase.
        normalizers.StripAccents()    # Remove all accents from characters
    ])
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    # create a WordPiece trainer
    special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    trainer = trainers.WordPieceTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=special_tokens
    )
    print(f"Creating an iterator from tweets at {full_tweets_path}...")
    # Feed the tweets through TweetIterator (from utils) instead of a list --
    # presumably so the whole file never has to sit in memory; confirm in utils.
    tweets = TweetIterator(full_tweets_path)
    print("Training your tokenizer...")
    # `length` tells the trainer how many sequences the iterator will yield.
    tokenizer.train_from_iterator(tweets, trainer=trainer, length=num_tweets)

    # Wrap every encoded sequence (or pair) in [CLS] ... [SEP]; the ids can
    # only be looked up now that the vocabulary has been trained.
    cls_id = tokenizer.token_to_id('[CLS]')
    sep_id = tokenizer.token_to_id('[SEP]')
    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
        special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)]
    )

    tokenizer.decoder = decoders.WordPiece(prefix="##")

    # wrap our tokenizer in a PreTrainedTokenizerFast object
    # so we can use it with a DataCollator and Trainer
    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        # tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    print('Saving tokenizer to disk...')
    wrapped_tokenizer.save_pretrained(full_tokenizer_path)
    print(f'Tokenizer saved at {full_tokenizer_path}')

You've saved this tokenizer before at /content/gdrive/My Drive/Colab Notebooks/birthyear.1990_2009.lowercase_tokenizer.
Loading from disk...
Tokenizer loaded!


## Create Train and Test Datasets from Iterator

In [5]:
# Batch collator for masked-language-model training; it is handed to
# prepare_tf_dataset as the collate function and returns TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=wrapped_tokenizer,
    return_tensors="tf",
    mlm_probability=0.15,  # chance that any given token gets masked
)

In [35]:
# Load the tokenized dataset from disk if an earlier run saved it; otherwise
# read the tweets file as a text dataset, tokenize it, and save the result so
# later runs can skip tokenization.
if os.path.isdir(full_dataset_path):
    print(f"You've saved this dataset before at {full_dataset_path}.")
    print("Loading from disk...")
    tweets_ds = load_from_disk(full_dataset_path)
    print("Dataset loaded!")
else:
    print(f"You've never saved this dataset before. Loading dataset from {full_tweets_path}...")
    tweets_ds = load_dataset(
        path="text",
        data_files=full_tweets_path,
        split="train",  # load every line into one split; train/validation are split off later
    )
    print('Tokenizing dataset...')
    tweets_ds = tweets_ds.map(
        function=lambda batch: wrapped_tokenizer(batch['text']),
        batched=True  # the tokenizer receives a list of texts per call
    )
    print('Saving dataset to disk...')
    tweets_ds.save_to_disk(full_dataset_path)
    print(f'Dataset saved at {full_dataset_path}')

You've never saved this dataset before. Loading dataset from /content/gdrive/My Drive/Colab Notebooks/birthyear.1990_2009.lowercase...


Generating train split: 0 examples [00:00, ? examples/s]

Tokenizing dataset...


Map:   0%|          | 0/5447916 [00:00<?, ? examples/s]

Saving dataset to disk...


Saving the dataset (0/2 shards):   0%|          | 0/5447916 [00:00<?, ? examples/s]

Dataset saved at /content/gdrive/My Drive/Colab Notebooks/birthyear.1990_2009.hf


In [37]:
# Split the dataset into train and validation parts.
# A fixed seed keeps the split identical across kernel restarts; without it,
# tweets used for validation in one session could be trained on in the next.
tweets_ds_split = tweets_ds.train_test_split(test_size=TEST_PCT, seed=42)

## Initialize Untrained BERT Model

In [38]:
# Create a BERT masked-language model from a fresh config (no pretrained
# weights are loaded) with the same vocabulary size as the tokenizer trainer.
config = BertConfig(
    vocab_size=VOCAB_SIZE
)
bert_model = TFBertForMaskedLM(config=config)
bert_model.build()
# `learning_rate` replaces the deprecated `lr` keyword.
# NOTE(review): no loss is passed to compile() -- presumably the model's
# built-in loss is used; confirm against the transformers documentation.
bert_model.compile(optimizer=AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01))

bert_model.summary()

Model: "tf_bert_for_masked_lm_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


  super().__init__(name, **kwargs)


## Prepare Dataset for Training Loop

In [39]:
# Convert both splits into batched datasets that use the MLM data collator.
tweets_ds_tf_train = bert_model.prepare_tf_dataset(
    dataset=tweets_ds_split["train"],
    collate_fn=data_collator,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True
)

# The validation split is NOT shuffled: fit() only evaluates a limited number
# of validation batches per epoch, and shuffling would score every epoch on a
# different random subset, making the validation numbers incomparable.
tweets_ds_tf_test = bert_model.prepare_tf_dataset(
    dataset=tweets_ds_split["test"],
    collate_fn=data_collator,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False
)

In [15]:
# Short training run: 10 epochs of 3 training batches each, with a single
# validation batch evaluated per epoch.
bert_model.fit(
    x=tweets_ds_tf_train,
    epochs=10,
    steps_per_epoch=3,
    validation_data=tweets_ds_tf_test,
    validation_steps=1,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x790b0578d630>