In [2]:
# if in Google colab, need to install datasets
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00

In [21]:
import json
import math
import os
import urllib.request

import numpy as np
import pandas as pd
import tensorflow as tf

from datasets import IterableDataset, load_dataset, load_from_disk

from tokenizers import (
    Tokenizer,
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    processors,
    decoders
)

from transformers import (
    BertConfig,
    TFBertForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    PreTrainedTokenizerFast,
    AdamWeightDecay,
    create_optimizer
)

In [2]:
# When running inside Google Colab, mount Google Drive and put the
# "Colab Notebooks" folder on sys.path so the local `utils` module can be imported.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not installed, so this is not a Colab runtime; `utils` must
    # already be importable from the current environment.
    pass
else:
    import sys

    # Outside the try block on purpose: a failed mount should surface as an error
    # instead of being silently ignored and breaking later cells.
    drive.mount('/content/gdrive/')
    sys.path.append('/content/gdrive/My Drive/Colab Notebooks')

from utils import (
    TweetIterator
)

Mounted at /content/gdrive/


### Config

In [22]:
# ---- tunable settings ----
VOCAB_SIZE = 30522
DATA_DIR = '/content/gdrive/My Drive/Colab Notebooks'  # folder holding the data
TWEETS_PATH = 'birthyear.1990_2009.lowercase'          # file name of the tweets file
# Fraction of the dataset held out for validation; the remaining
# 1 - TEST_PCT is used for training.
TEST_PCT = 0.1
TRAIN_BATCH_SIZE = 64
TEST_BATCH_SIZE = 16
INITIAL_LEARNING_RATE = 0.0001  # same as init rate in BERT paper
WEIGHT_DECAY = 0.01             # same as weight decay in BERT paper

# ---- paths derived from the settings above ----
full_tweets_path = os.path.join(DATA_DIR, TWEETS_PATH)

# tokenizer directory: the tweets file name with a '_tokenizer' suffix
tokenizer_path = f'{TWEETS_PATH}_tokenizer'
full_tokenizer_path = os.path.join(DATA_DIR, tokenizer_path)

# dataset directory: the tweets file name with its last dot-suffix swapped for '.hf'
dataset_path = TWEETS_PATH.rpartition('.')[0] + '.hf'
full_dataset_path = os.path.join(DATA_DIR, dataset_path)

In [11]:
# Count the lines of the tweets file (each line is counted as one tweet),
# streaming through the file handle instead of reading it all at once.
with open(full_tweets_path, 'r') as tweets_file:
    num_tweets = sum(1 for _ in tweets_file)
print(f"Total Number of Tweets: {num_tweets:,.0f}")

Total Number of Tweets: 5,447,916


### Create Untrained BERT Model

### Create and Train BERT Tokenizer
https://huggingface.co/learn/nlp-course/en/chapter6/8#building-a-wordpiece-tokenizer-from-scratch

In [6]:
# Reuse the tokenizer saved by a previous run when one exists; otherwise train a
# WordPiece tokenizer on the tweets, wrap it for use with transformers, and save it.
if os.path.exists(full_tokenizer_path):
    print(f"You've saved this tokenizer before at {full_tokenizer_path}.")
    print("Loading from disk...")

    wrapped_tokenizer = PreTrainedTokenizerFast.from_pretrained(full_tokenizer_path)
    print("Tokenizer loaded!")
else:
    print("You've never saved this tokenizer before. Creating a tokenizer from scratch...")
    # create a BERT-style WordPiece tokenizer
    tokenizer = Tokenizer(model=models.WordPiece(unk_token='[UNK]'))
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),            # Normalize characters to Unicode NFD
        # Lowercasing is skipped on purpose: the tweets are already lowercase.
        normalizers.StripAccents()    # Remove all accents from characters
    ])
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    # create a WordPiece trainer
    special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    trainer = trainers.WordPieceTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=special_tokens
    )
    print(f"Creating an iterator from tweets at {full_tweets_path}...")
    # Feed the tweets through an iterator rather than a list
    # (presumably so the whole file never has to be held in memory — see utils.TweetIterator).
    tweets = TweetIterator(full_tweets_path)
    print("Training your tokenizer...")
    # `length` is passed alongside the iterator; it is the tweet count computed earlier
    tokenizer.train_from_iterator(tweets, trainer=trainer, length=num_tweets)

    # wrap every sequence (or pair of sequences) in [CLS] ... [SEP] with BERT-style type ids
    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id('[CLS]')),
            ("[SEP]", tokenizer.token_to_id('[SEP]')),
        ]
    )

    tokenizer.decoder = decoders.WordPiece(prefix="##")

    # wrap our tokenizer in a PreTrainedTokenizerFast object
    # so we can use it with a DataCollator and Trainer
    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        # tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    print('Saving tokenizer to disk...')
    wrapped_tokenizer.save_pretrained(full_tokenizer_path)
    print(f'Tokenizer saved at {full_tokenizer_path}')

You've saved this tokenizer before at /content/gdrive/My Drive/Colab Notebooks/birthyear.1990_2009.lowercase_tokenizer.
Loading from disk...
Tokenizer loaded!


## Create Train and Test Datasets from Iterator

In [8]:
# Collator for masked-language-model batches: uses the tokenizer built above,
# masks tokens with the given probability, and returns TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=wrapped_tokenizer,
    return_tensors="tf",
    mlm_probability=0.15,  # probability that a token is masked
)

In [9]:
# Reuse the tokenized dataset saved by a previous run when one exists; otherwise
# load the raw tweets as a text dataset, tokenize them, and save the result.
if os.path.exists(full_dataset_path):
    print(f"You've saved this dataset before at {full_dataset_path}.")
    print("Loading from disk...")
    tweets_ds = load_from_disk(full_dataset_path)
    print("Dataset loaded!")
else:
    print(f"You've never saved this dataset before. Loading dataset from {full_tweets_path}...")
    tweets_ds = load_dataset(
        path="text",
        data_files=full_tweets_path,
        split="train",  # load everything as one split; train/validation are split later
    )
    print('Tokenizing dataset...')
    # batched=True hands the tokenizer a batch of 'text' values per call
    tweets_ds = tweets_ds.map(
        function=lambda x: wrapped_tokenizer(x['text']),
        batched=True
    )
    print('Saving dataset to disk...')
    tweets_ds.save_to_disk(full_dataset_path)
    print(f'Dataset saved at {full_dataset_path}')

You've saved this dataset before at /content/gdrive/My Drive/Colab Notebooks/birthyear.1990_2009.hf.
Loading from disk...
Dataset loaded!


In [12]:
# Split the tokenized tweets into train and validation sets: TEST_PCT of the rows
# go to the "test" split and the rest to "train".
# A fixed seed makes the split identical after every kernel restart; without it the
# train/validation membership changes from run to run.
SPLIT_SEED = 42
tweets_ds_split = tweets_ds.train_test_split(test_size=TEST_PCT, seed=SPLIT_SEED)

## Initialize Untrained BERT Model

In [32]:
# Optimizer and learning-rate schedule.
# The original BERT model was trained for 1,000,000 steps in total; the first 1%
# (10,000 steps) were warm-up steps and the learning rate was decayed linearly afterwards.
# The same 1% warm-up share is applied here to TOTAL_NUM_STEPS.

TOTAL_NUM_STEPS = 100  # to modify after testing compute time
num_warmup_steps = math.floor(0.01 * TOTAL_NUM_STEPS)

optimizer, lr_schedule = create_optimizer(
    init_lr=INITIAL_LEARNING_RATE,
    weight_decay_rate=WEIGHT_DECAY,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=TOTAL_NUM_STEPS,
)

In [33]:
# Build a BERT masked-LM from a config (not from pretrained weights). Only the
# vocabulary size is set explicitly; every other BertConfig value keeps its default.
config = BertConfig(vocab_size=VOCAB_SIZE)
bert_model = TFBertForMaskedLM(config=config)

# build the model, then attach the optimizer created earlier
bert_model.build()
bert_model.compile(optimizer=optimizer)

bert_model.summary()

Model: "tf_bert_for_masked_lm_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Prepare Dataset for Training Loop

In [34]:
def _as_tf_dataset(split_name, batch_size):
    """Turn one split of ``tweets_ds_split`` into a shuffled, collated TF dataset."""
    return bert_model.prepare_tf_dataset(
        dataset=tweets_ds_split[split_name],
        collate_fn=data_collator,
        batch_size=batch_size,
        shuffle=True,
    )


# both splits go through the same pipeline; only the batch size differs
tweets_ds_tf_train = _as_tf_dataset("train", TRAIN_BATCH_SIZE)
tweets_ds_tf_test = _as_tf_dataset("test", TEST_BATCH_SIZE)

In [35]:
# Train the model.
# The learning-rate schedule was created for TOTAL_NUM_STEPS optimizer steps, so the
# number of steps per epoch is derived from it instead of being hard-coded: changing
# TOTAL_NUM_STEPS then keeps training length and schedule in agreement.
NUM_EPOCHS = 10
steps_per_epoch = max(1, TOTAL_NUM_STEPS // NUM_EPOCHS)  # never drop to zero steps

# Keep the returned History object so the per-epoch losses can be inspected later.
history = bert_model.fit(
    x=tweets_ds_tf_train,
    validation_data=tweets_ds_tf_test,
    epochs=NUM_EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_steps=1,  # a single validation batch per epoch
    verbose=1,
)

Epoch 1/10


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x7a85fc169c60>

## Extracting Word Embeddings from Trained Model

In [66]:
def get_token_embedding(tokenizer, embedding_layer, token):
    """Return the output of ``embedding_layer`` for a single token.

    Args:
        tokenizer: object exposing ``convert_tokens_to_ids``; used to map
            ``token`` to its vocabulary id.
        embedding_layer: callable applied to a tensor of token ids.
        token: the token string to look up.

    Returns:
        Whatever ``embedding_layer`` returns when called on a 1x1 tensor
        holding the id of ``token``.
    """
    # use the tokenizer and token that were passed in, not module-level globals
    token_id = tokenizer.convert_tokens_to_ids(token)
    # the layer is called on a nested list [[token_id]], i.e. one row with one id
    return embedding_layer(tf.constant([[token_id]]))

In [65]:
# retrieve the embedding layer from our bert model
embedding_layer = bert_model.bert.embeddings

get_token_embedding(
    wrapped_tokenizer,
    embedding_layer,
    'dog'
)

<tf.Tensor: shape=(1, 1, 768), dtype=float32, numpy=
array([[[ 7.88608253e-01,  9.68113363e-01,  1.57739028e-01,
         -9.38848078e-01,  2.14821160e-01, -1.34463608e+00,
         -2.38287240e-01, -7.72886351e-02,  2.11218882e+00,
          1.18822300e+00, -8.77568543e-01, -1.21817708e+00,
         -9.78406966e-01,  1.11187971e+00,  1.63094485e+00,
          2.72095978e-01, -5.70081413e-01, -1.39192247e+00,
          7.87784576e-01, -1.60849595e+00, -9.85559225e-01,
         -6.31287754e-01, -1.64200544e+00,  8.29331994e-01,
          3.43962401e-01, -3.06470066e-01,  3.86720568e-01,
          1.11212635e+00,  1.45007074e+00,  2.59744465e-01,
          9.94849861e-01, -9.12237465e-01,  9.45935130e-01,
         -3.84635329e-01, -8.45617533e-01, -9.22208190e-01,
         -4.82530177e-01,  7.82035351e-01,  9.06472802e-01,
          6.43554628e-01,  1.09253883e+00, -3.95650268e-01,
         -1.09564853e+00, -9.34504569e-01, -9.19237852e-01,
          5.71515143e-01, -1.51330042e+00, -1.5

In [47]:
import tensorflow as tf

In [None]:
# Look up word-piece vectors straight from the embedding matrix.
# `bert_model` is a TFBertForMaskedLM, whose main layer is `.bert`; it has no `.distilbert`.
# NOTE(review): assumes the TF BERT embeddings layer keeps its word-embedding matrix
# in `.weight` — confirm against the installed transformers version.
example_token_ids = wrapped_tokenizer.convert_tokens_to_ids(['dog'])
tf.gather(bert_model.bert.embeddings.weight, example_token_ids)