In [1]:
# if in Google colab, need to install datasets
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00

In [1]:
import pandas as pd
import numpy as np
import urllib.request, json, os, math

from transformers import (
    BertConfig,
    TFBertForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    PreTrainedTokenizerFast,
    AdamWeightDecay,
    create_optimizer
)

from tokenizers import (
    Tokenizer,
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    processors,
    decoders
)

from datasets import IterableDataset, load_dataset, load_from_disk

In [2]:
# When running in Google Colab, mount Google Drive and make the notebooks
# folder importable so that the local `utils` module can be found.
try:
    from google.colab import drive
    import sys

    drive.mount('/content/gdrive/')
    colab_notebooks_dir = '/content/gdrive/My Drive/Colab Notebooks'
    # avoid piling up duplicate entries when this cell is re-run
    if colab_notebooks_dir not in sys.path:
        sys.path.append(colab_notebooks_dir)
except ImportError:
    # not running in Colab: nothing to mount, `utils` must already be importable
    pass
except Exception as exc:
    # keep the cell best-effort, but report the failure instead of hiding it
    print(f"Could not mount Google Drive: {exc!r}")

from utils import (
    TweetIterator
)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


## Config

In [3]:
# Tunable settings for the whole notebook.
PROJECT_DIR = '/content/gdrive/My Drive/Colab Notebooks/w266_final_proj' # filepath to store model/tokenizer/data artifacts
TWEETS_PATH = 'birthyear.1950_1969.lowercase'                            # name of tweets file
VOCAB_SIZE = 30522                                                       # same as vocab size in BERT paper
TEST_PCT = 0.1                                                           # defines pct of total dataset to use for validation
TRAIN_BATCH_SIZE = 64                                                    # batch size for training
TEST_BATCH_SIZE = 32                                                     # batch size for validation
INITIAL_LEARNING_RATE = 0.0001                                           # LR passed to the optimizer's warmup/decay schedule, same as BERT paper
WEIGHT_DECAY = 0.01                                                      # Regularization weight, same as BERT paper
TOTAL_NUM_STEPS = 1000                                                   # number of batches to use for training

# path to raw tweet data
full_tweets_path = os.path.join(PROJECT_DIR, TWEETS_PATH)

# path to save or load pre-processed tweets dataset (will load if pre-processed dataset already exists)
# the final extension of the tweets file is swapped for '.hf'; splitext also
# handles a file name without any extension (it is kept whole instead of
# collapsing to just '.hf')
dataset_path = os.path.splitext(TWEETS_PATH)[0] + '.hf'
full_dataset_path = os.path.join(PROJECT_DIR, dataset_path)

# path to save or load tokenizer to (will load if pre-trained tokenizer already exists)
tokenizer_path = TWEETS_PATH + '_tokenizer'
full_tokenizer_path = os.path.join(PROJECT_DIR, tokenizer_path)

# path to save BERT model to
model_path = f'{TWEETS_PATH}_{TRAIN_BATCH_SIZE}batch_size_{TOTAL_NUM_STEPS}steps'
full_model_path = os.path.join(PROJECT_DIR, model_path)

In [4]:
# Count the tweets in the raw file (one tweet per line) without loading it into memory.
# The encoding is stated explicitly so the count does not depend on the platform default.
with open(full_tweets_path, 'r', encoding='utf-8') as file:
    num_tweets = sum(1 for _ in file)

# an empty file would otherwise surface as a ZeroDivisionError in the percentage below
if num_tweets == 0:
    raise ValueError(f"No tweets found in {full_tweets_path}")
print(f"Total Number of Tweets: {num_tweets:,.0f}")

# number of tweets the model will see, given the batch size and step budget
num_training_tweets = TRAIN_BATCH_SIZE * TOTAL_NUM_STEPS
print(f"Based on the selected parameters, model will be trained on {num_training_tweets / num_tweets :.1%} of total Tweets ({num_training_tweets:,.0f})")

Total Number of Tweets: 8,167,178
Based on the selected parameters, model will be trained on 0.8% of total Tweets (64,000)


### Create Untrained BERT Model

### Create and Train BERT Tokenizer
https://huggingface.co/learn/nlp-course/en/chapter6/8#building-a-wordpiece-tokenizer-from-scratch

In [6]:
# Load the tokenizer from PROJECT_DIR if it was saved by a previous run;
# otherwise train a WordPiece tokenizer on the tweets and save it.
# Checking the full path directly avoids listing the whole project directory.
if os.path.exists(full_tokenizer_path):
    print(f"You've saved this tokenizer before at {full_tokenizer_path}.")
    print("Loading from disk...")

    wrapped_tokenizer = PreTrainedTokenizerFast.from_pretrained(full_tokenizer_path)
    print("Tokenizer loaded!")
else:
    print("You've never saved this tokenizer before. Creating a tokenizer from scratch...")
    # create a BERT tokenizer
    tokenizer = Tokenizer(model = models.WordPiece(unk_token = '[UNK]'))
    normalizer = normalizers.Sequence([
        normalizers.NFD(),            # Normalize characters to Unicode NFD
        # no Lowercase() normalizer: the tweets are already lowercase
        normalizers.StripAccents()    # Remove all accents from characters
    ])
    pre_tokenizer = pre_tokenizers.Whitespace()

    tokenizer.normalizer = normalizer
    tokenizer.pre_tokenizer = pre_tokenizer

    # create a WordPiece trainer
    special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    trainer = trainers.WordPieceTrainer(
        vocab_size = VOCAB_SIZE,
        special_tokens = special_tokens
    )
    print(f"Creating an iterator from tweets at {full_tweets_path}...")
    # iterate over the tweets instead of loading them all at once,
    # since they won't fit in memory
    tweets = TweetIterator(full_tweets_path)
    print("Training your tokenizer...")
    tokenizer.train_from_iterator(tweets, trainer=trainer, length=num_tweets)

    # wrap every encoded sequence as [CLS] ... [SEP], BERT-style
    post_processor = processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
        special_tokens=[("[CLS]", tokenizer.token_to_id('[CLS]')), ("[SEP]", tokenizer.token_to_id('[SEP]'))]
    )
    tokenizer.post_processor = post_processor

    decoder = decoders.WordPiece(prefix="##")
    tokenizer.decoder = decoder
    # wrap our tokenizer in a PreTrainedTokenizerFast object
    # so we can use it with a DataCollator and Trainer
    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        # tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )
    print('Saving tokenizer to disk...')
    wrapped_tokenizer.save_pretrained(full_tokenizer_path)
    print(f'Tokenizer saved at {full_tokenizer_path}')

You've saved this tokenizer before at /content/gdrive/My Drive/Colab Notebooks/w266_final_proj/birthyear.1950_1969.lowercase_tokenizer.
Loading from disk...
Tokenizer loaded!


## Create Train and Test Datasets from Iterator

In [7]:
# Collator that batches tokenized tweets for masked-language-model training,
# masking each token with probability 0.15 and returning TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(
    return_tensors="tf",
    mlm_probability=0.15,
    tokenizer=wrapped_tokenizer,
)

In [8]:
# Load the tokenized dataset from PROJECT_DIR if a previous run saved it;
# otherwise read the raw tweets, tokenize them and save the result.
# Checking the full path directly avoids listing the whole project directory.
if os.path.exists(full_dataset_path):
    print(f"You've saved this dataset before at {full_dataset_path}.")
    print("Loading from disk...")
    tweets_ds = load_from_disk(full_dataset_path)
    print("Dataset loaded!")
else:
    print(f"You've never saved this dataset before. Loading dataset from {full_tweets_path}...")
    tweets_ds = load_dataset(
        path = "text",
        data_files = full_tweets_path,
        split = "train", # load everything; the train/validation split happens later
    )
    print('Tokenizing dataset...')
    tweets_ds = tweets_ds.map(
        function = lambda x: wrapped_tokenizer(x['text']),
        batched = True
    )
    print('Saving dataset to disk...')
    tweets_ds.save_to_disk(full_dataset_path)
    print(f'Dataset saved at {full_dataset_path}')

You've saved this dataset before at /content/gdrive/My Drive/Colab Notebooks/w266_final_proj/birthyear.1950_1969.hf.
Loading from disk...
Dataset loaded!


In [9]:
# split the dataset into train and validation sets
# a fixed seed keeps the partition identical across kernel restarts, so a tweet
# never moves between the train and validation sets from one run to the next
tweets_ds_split = tweets_ds.train_test_split(test_size=TEST_PCT, seed=42)

## Initialize Untrained BERT Model



In [10]:
# Optimizer and learning-rate schedule.
# The original BERT model was trained for 1,000,000 steps, of which the first
# 1% (10,000 steps) were warm-up steps, followed by linear learning-rate decay.
# The same 1% warm-up fraction is applied to our step budget here.
num_warmup_steps = int(TOTAL_NUM_STEPS * 0.01)

# by default the linear learning-rate decay runs from init_lr to 0
# over the remaining number of steps
optimizer, lr_schedule = create_optimizer(
    INITIAL_LEARNING_RATE,
    TOTAL_NUM_STEPS,
    num_warmup_steps,
    weight_decay_rate=WEIGHT_DECAY,
)

In [11]:
# Build an untrained BERT masked-LM with the default BERT architecture.
# BertConfig defaults pad_token_id to 0, but our tokenizer lists [UNK] before
# [PAD] in its special tokens, so its [PAD] id is not necessarily 0. Take the
# id from the tokenizer so the saved config matches the vocabulary.
config = BertConfig(
    vocab_size = VOCAB_SIZE,
    pad_token_id = wrapped_tokenizer.pad_token_id
)
bert_model = TFBertForMaskedLM(config = config)
bert_model.build()
# no loss is passed to compile(); assumed to fall back to the model's built-in MLM loss (TODO confirm)
bert_model.compile(optimizer = optimizer)

bert_model.summary()

Model: "tf_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Prepare Dataset for Training Loop

In [13]:
# Wrap the train and validation splits as tf.data datasets; the collator
# applies the masking when batches are built.
tweets_ds_tf_train = bert_model.prepare_tf_dataset(
    dataset = tweets_ds_split["train"],
    collate_fn = data_collator,
    batch_size = TRAIN_BATCH_SIZE,
    shuffle=True
)

# The validation set is not shuffled: only a few validation batches are drawn
# per epoch, and shuffling would score a different subset of tweets each time,
# making validation loss incomparable across epochs.
tweets_ds_tf_test = bert_model.prepare_tf_dataset(
    dataset = tweets_ds_split["test"],
    collate_fn = data_collator,
    batch_size = TEST_BATCH_SIZE,
    shuffle=False
)

print(f"Total Train Dataset Size: {len(tweets_ds_tf_train):,.0f} steps, although the model will only be trained on {TOTAL_NUM_STEPS} steps.")
print(f"Total Test Dataset Size: {len(tweets_ds_tf_test):,.0f}")

Total Train Dataset Size: 114,850 steps, although the model will only be trained on 1000 steps.
Total Test Dataset Size: 25,522


In [14]:
# An "epoch" here is not a full pass over the training set; it only sets how
# often validation runs. The number of optimizer steps actually taken is
# NUM_EPOCHS * steps_per_epoch, which equals TOTAL_NUM_STEPS when it divides evenly.
NUM_EPOCHS = 5
steps_per_epoch = TOTAL_NUM_STEPS // NUM_EPOCHS
# validate on 1% as many batches as we train on per epoch, but never fewer
# than one (the integer division yields 0 when steps_per_epoch < 100)
validation_steps = max(1, steps_per_epoch // 100)

print(f'Training Steps per Epoch: {steps_per_epoch}')
print(f'Validation Steps per Epoch: {validation_steps}')

Training Steps per Epoch: 200
Validation Steps per Epoch: 2


## 🏃‍♂️‍➡️ Train!

In [None]:
# Train for NUM_EPOCHS rounds of `steps_per_epoch` batches each, scoring
# `validation_steps` validation batches after every round.
history = bert_model.fit(
    tweets_ds_tf_train,
    epochs=NUM_EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_data=tweets_ds_tf_test,
    validation_steps=validation_steps,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7ef716689180>

In [None]:
# Persist the trained model under the project directory and report where it went.
bert_model.save_pretrained(full_model_path)
print("Model saved to {}".format(full_model_path))

Model saved to /content/gdrive/My Drive/Colab Notebooks/w266_final_proj/birthyear.1950_1969.lowercase_64batch_size_1000steps
