In [1]:
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/#42-optimizer--learning-rate-scheduler

In [2]:
import sys
sys.executable
import os
import glob
from pathlib import Path

import tensorflow as tf
from torch.utils.data import DataLoader
import torch 
from tqdm import tqdm

from tokenizers import BertWordPieceTokenizer
from transformers import pipeline, BertTokenizerFast, BertTokenizer, BertForMaskedLM #AutoTokenizer, AutoModelWithLMHead
from nlp import load_dataset

import numpy as np
import time
import datetime

In [3]:
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home"

In [4]:
# %pip install --upgrade pip
# %pip install --upgrade torch
# %pip install --upgrade tensorflow
# %pip install --upgrade pathlib
# %pip install --upgrade --use-feature=2020-resolver tokenizers
# %pip install --upgrade --use-feature=2020-resolver transformers # resolve needed so no version conflicts
# %pip install --upgrade nlp

In [5]:
# read in the train/text IMDB reviews
def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir/label_dir).iterdir():
            texts.append(text_file.read_text())

    return texts

train_texts = read_imdb_split('./aclImdb/train')
test_texts = read_imdb_split('./aclImdb/test')

# and write these out to a single text file (new line for each review)
# write the list of review to text files
texts = train_texts + test_texts

# delete whatever is in the dataset folder
files = glob.glob('./aclImdbKate/*')
for f in files:
    os.remove(f)

# write each chunk of text to a single txt file 
for text in texts:
    with open('./aclImdbKate/text.txt', 'a+', encoding='utf-8') as file:
        file.write(text)
        file.write("\n") # go to new line at end of each body of text

In [6]:
# make my own tokenizer
paths = paths = [str(x) for x in Path("./aclImdbKate/").glob("**/*.txt")]

tokenizer = BertWordPieceTokenizer()

tokenizer.enable_truncation(max_length = 512)

tokenizer.train(files = paths, 
                vocab_size = 5_000, 
                min_frequency = 2,
                special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
               )

tokenizer.save_model("./aclImdbKateVocab/", name = 'imdb-bert-wordpiece')

# print(tokenizer.truncation)

['./aclImdbKateVocab/imdb-bert-wordpiece-vocab.txt']

In [7]:
# TODO: I need to find a way to separate each sentence to be on its own line...so sentences get tokenized correctly
# (start with [CLS] and end with [SEP])

# load my dataset
dataset = load_dataset(
    'text', 
    data_files={'train': ['./aclImdbKate/text.txt']}, 
    split='train')

print(type(dataset))
print(dataset.shape)
# print(dataset.column_names)
# print(dataset.features)
# print(dataset[0])
# print(dataset[1])

# 50K records are too many to process locally
dataset = dataset.filter(lambda e, i: i<1000, with_indices=True)
print(dataset.shape)

Using custom data configuration default


<class 'nlp.arrow_dataset.Dataset'>
(50000, 1)
(1000, 1)


In [8]:
# tell pytorch what kind of processor is available to it

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [9]:
# load up BERT tokenizer with the vocab I defined in another script
tokenizer = BertTokenizer(vocab_file = './aclImdbKateVocab/imdb-bert-wordpiece-vocab.txt')

In [10]:
# encode my dataset 
# (adds the ids for vocab words, indicated whether the end of the text chunk contains padding)

encoded_dataset = dataset.map(
    lambda examples: tokenizer(
        dataset['text'],
        truncation=True,
        max_length = 128, # too low for my real data
        padding=True,
        return_attention_mask = True
    ), 
    batch_size = 1000, # this is the max number of rows in your dataset that's allowed.
    batched=True)

# add a labels column, and that column needs to be set to the input_ids
encoded_dataset = encoded_dataset.map(
    lambda example: {'labels': example['input_ids']})

print(encoded_dataset.column_names)
print(encoded_dataset[0])
print(encoded_dataset['input_ids'][0])

print(tokenizer.decode(tokenizer(dataset[0]['text'])['input_ids'])) 
# note that [SEP] token doesn't appear at end of each sentence, that's what I want

['attention_mask', 'input_ids', 'labels', 'text', 'token_type_ids']
{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'input_ids': [2, 239, 43, 260, 225, 908, 388, 2076, 339, 919, 280, 43, 682, 199, 2484, 467, 4092, 1558, 195, 239, 224, 3126, 18, 2337, 43, 260, 583, 2585, 58, 3335, 311, 133, 207, 750, 734, 5, 301, 448, 190, 227, 446, 2479, 194, 207, 43, 637, 4450, 184, 18, 183, 1152, 194, 127, 553, 207, 242, 1310, 2865, 18, 389, 239, 4155, 6, 183, 4237, 520, 6, 1018, 124, 3992, 18, 241, 43, 1818, 61, 135, 131, 18, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [11]:
# convert the encoded data to a tensorflow type via DataLoader
encoded_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
dataloader = DataLoader(encoded_dataset, batch_size=32)

In [12]:
# dir(dataloader)
next(iter(dataloader))

  return function(data_struct)


{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'input_ids': tensor([[   2,  239,   43,  ...,    0,    0,    0],
         [   2, 2899,  819,  ...,  222,  183,    3],
         [   2,   43, 2897,  ...,    0,    0,    0],
         ...,
         [   2,  224,  207,  ...,   11,  914,    3],
         [   2,  224, 1468,  ...,   19, 1503,    3],
         [   2,   51,  240,  ...,  198,  389,    3]]),
 'labels': tensor([[   2,  239,   43,  ...,    0,    0,    0],
         [   2, 2899,  819,  ...,  222,  183,    3],
         [   2,   43, 2897,  ...,    0,    0,    0],
         ...,
         [   2,  224,  207,  ...,   11,  914,    3],
         [   2,  224, 1468,  ...,   19, 1503,    3],
         [   2,   51,  240,  ...,  198,  389,    3]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0,

In [13]:
# # start the steps needed to fine tune a model

# # this pulls the architecture of one of the existing models
# model = BertForMaskedLM.from_pretrained("bert-base-uncased")
# model.to(device) # recall this is cpu as defined in above cell
# model.train() # this changes the model's "mode", it does not perform the training

In [14]:
# # this does a pretty print-out of model architecture
# # TODO: confirm that nothing is being left out of this print-out
# params = list(model.named_parameters())

# print('The BERT model has {:} different named parameters.\n'.format(len(params)))
# print('==== Embedding Layer ====\n')
# for p in params[0:5]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
# print('\n==== First Transformer ====\n')
# for p in params[5:21]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
# print('\n==== Output Layer ====\n')
# for p in params[-4:]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    
# 30522 is the length of original vocab. BUT, my vocab is smaller. So, cannot fine-tune, need to retrain completely

In [15]:
# # Not entirely clear on this, looks like the manager that hold the tuning params/model architecture for training
# optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

# for epoch in range(2):
#     for i, batch in enumerate(tqdm(dataloader)):
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs[0]
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
#         if i % 10 == 0:
#             print(f"loss: {loss}")

In [16]:
# save my model with new weights - the weight have been re-fitted, right?
# model.save_pretrained('./aclImdbKateModel/') 

In [17]:
# fill_mask = pipeline(
#     "fill-mask", 
#     model = model, 
#     tokenizer = tokenizer
# )

# result = fill_mask('This is one of my favorite [MASK] .')
# print(result) # TODO: fill mask pipeline not performing as expected...

In [18]:
# from transformers import AutoTokenizer, AutoModelWithLMHead

# # change tokenizer and model to some pretrained...
# tokenizer1 = AutoTokenizer.from_pretrained("bert-base-uncased")
# model1 = AutoModelWithLMHead.from_pretrained("bert-base-uncased") 

# fill_mask = pipeline(
#     "fill-mask", 
#     model = model1, 
#     tokenizer = tokenizer1
# )

# result = fill_mask('This is one of my favorite [MASK] .')
# print(result)

In [19]:
# We might need to use Trainer and TrainingArguments to train FROM SCRATCH!
from transformers import Trainer, TrainingArguments
from transformers import BertConfig 

config = BertConfig(vocab_size = tokenizer.vocab_size)  
model = BertForMaskedLM(config)

# https://github.com/huggingface/transformers/blob/a75c64d80c76c3dc71f735d9197a4a601847e0cd/src/transformers/training_args.py
training_args = TrainingArguments(
    output_dir = './results',          # output directory
    overwrite_output_dir = True,       # whether the contents in the output dir should be overwritten
    do_train = True,                   # why does this default to False? What the heck are you running this for if not to train a new model??
    num_train_epochs = 2,              # total number of training epochs (2-4, rec)
    per_device_train_batch_size = 16,  # batch size per device during training
#     per_device_eval_batch_size = 64,   # batch size for evaluation
    warmup_steps = 0,                  # number of warmup steps for learning rate scheduler
    weight_decay = 0.01,               # strength of weight decay
    logging_dir = './logs',            # directory for storing logs
    seed = 123                         # random seed
)

tokenizer = BertTokenizer(vocab_file = './aclImdbKateVocab/imdb-bert-wordpiece-vocab.txt')

# https://github.com/huggingface/transformers/blob/a75c64d80c76c3dc71f735d9197a4a601847e0cd/src/transformers/trainer.py
trainer = Trainer(
    args = training_args,              # training arguments, defined above
    tokenizer = tokenizer,
    train_dataset=encoded_dataset,     # training dataset (torch.utils.data.dataset.Dataset or nlp.Dataset?)
    model = model     
)

trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=2.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=63.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=63.0, style=ProgressStyle(description_wid…





TrainOutput(global_step=126, training_loss=2.802249484592014)

In [20]:
# this does a pretty print-out of model architecture
# TODO: confirm that nothing is being left out of this print-out
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 204 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                   (5000, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [21]:
model.save_pretrained('./aclImdbKateModel/') 

In [22]:
fill_mask = pipeline(
    "fill-mask", 
    model = model, 
    tokenizer = tokenizer
)

result = fill_mask('This is one of my favorite [MASK] .')
print(result) # TODO: make it so the results of this pipeline are reproducible...do I need to "take model out of train mode?"

[{'sequence': '[CLS] this is one of my favorite br. [SEP]', 'score': 0.005493552889674902, 'token': 206, 'token_str': 'br'}, {'sequence': '[CLS] this is one of my favorite [MASK]. [SEP]', 'score': 0.004160965792834759, 'token': 4, 'token_str': '[MASK]'}, {'sequence': '[CLS] this is one of my favorite european. [SEP]', 'score': 0.0014475906500592828, 'token': 4434, 'token_str': 'european'}, {'sequence': '[CLS] this is one of my favorite ). [SEP]', 'score': 0.0013414985733106732, 'token': 13, 'token_str': ')'}, {'sequence': '[CLS] this is one of my favoriteir. [SEP]', 'score': 0.0013259300030767918, 'token': 258, 'token_str': '##ir'}]


In [None]:
# sequence = f"Hugging Face is a French company based in {tokenizer.mask_token}"

# input_ids = tokenizer.encode(sequence, return_tensors="pt")
# mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]

# token_logits = model(input_ids)[0]
# mask_token_logits = token_logits[0, mask_token_index, :]
# mask_token_logits = torch.softmax(mask_token_logits, dim=1)

# top_5 = torch.topk(mask_token_logits, 5, dim=1)
# top_5_tokens = zip(top_5.indices[0].tolist(), top_5.values[0].tolist())

# for token, score in top_5_tokens:
#     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])), f"(score: {score})")

# # Get the score of token_id
# sought_after_token = "London"
# sought_after_token_id = tokenizer.encode(sought_after_token, add_special_tokens=False, add_prefix_space=True)[0]  # 928

# token_score = mask_token_logits[:, sought_after_token_id]
# print(f"Score of {sought_after_token}: {mask_token_logits[:, sought_after_token_id]}")