# Checking Pretrained Model (Bert)

In [1]:
!pip install transformers
from transformers import TFAutoModelForMaskedLM

model_checkpoint = "bert-base-uncased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [2]:
model.summary()

Model: "tf_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109,514,298
Trainable params: 109,514,298
Non-trainable params: 0
_________________________________________________________________


In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [4]:
text = "And so Sally can wait. She knows it's too late. [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] ."
mask_count = 6

In [5]:
import numpy as np
import tensorflow as tf

inputs = tokenizer(text, return_tensors="np")
token_logits = model(**inputs).logits
for i in range(mask_count):
  # Find the location of [MASK] and extract its logits
  mask_token_index = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)[i, 1]
  mask_token_logits = token_logits[0, mask_token_index, :]
  # Pick the [MASK] candidates with the highest logits
  # We negate the array before argsort to get the largest, not the smallest, logits
  top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

  for token in top_5_tokens:
      print(i, tokenizer.decode([token]))
      #print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")
  print()

0 so
0 and
0 she
0 but
0 then

1 and
1 she
1 but
1 so
1 ,

2 will
2 can
2 is
2 and
2 ,

3 is
3 can
3 will
3 to
3 and

4 is
4 to
4 ,
4 and
4 it

5 again
5 now
5 anyway
5 then
5 together



In [6]:
tokenizer.model_max_length

512

# Make a Dataset

In [7]:
!pip install datasets
import datasets
from datasets import Dataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
with open('billboard_lyrics_1964-2015.txt', 'r') as file:
  data = file.read()

In [9]:
data = data.split("\n")

In [10]:
label_data = data[0]
data = data[1:]
label_data, data[:4], len(data)

('"Rank","Song","Artist","Year","Lyrics","Source"',
 ['1,"wooly bully","sam the sham and the pharaohs",1965,"sam the sham miscellaneous wooly bully wooly bully sam the sham  the pharaohs  domingo samudio uno dos one two tres quatro matty told hatty about a thing she saw had two big horns and a wooly jaw wooly bully wooly bully wooly bully wooly bully wooly bully hatty told matty lets dont take no chance lets not belseven come and learn to dance wooly bully wooly bully wooly bully wooly bully wooly bully matty told hatty thats the thing to do get you someone really to pull the wool with you wooly bully wooly bully wooly bully wooly bully wooly bully lseven  the letter l and the number 7 when typed they form a rough square l7 so the lyrics mean lets not be square",3',
  '2,"i cant help myself sugar pie honey bunch","four tops",1965," sugar pie honey bunch you know that i love you i cant help myself i love you and nobody elsein and out my life you come and you go leaving just your picture

In [11]:
for i in range(len(data)):
  data[i] = data[i].split(",")

In [12]:
max([len(x) for x in data])

6

In [13]:
import random
random.shuffle(data)

In [14]:
print(len(data))
splitted_data_1 = data[:4500]
splitted_data_2 = data[4500:]
print(len(splitted_data_1), len(splitted_data_2))

5100
4500 600


In [15]:
lyrics_1 = [sample[4] for sample in splitted_data_1]
lyrics_2 = [sample[4] for sample in splitted_data_2]

In [16]:
train_dataset = Dataset.from_dict({"Lyrics": lyrics_1})
test_dataset = Dataset.from_dict({"Lyrics": lyrics_2})

In [17]:
dataset = datasets.DatasetDict({"train":train_dataset,"test":test_dataset})
dataset

DatasetDict({
    train: Dataset({
        features: ['Lyrics'],
        num_rows: 4500
    })
    test: Dataset({
        features: ['Lyrics'],
        num_rows: 600
    })
})

In [18]:
dataset['train'][400]['Lyrics']

'" on the first part of the journey i was looking at all the life there were plants and birds and rocks and things there was sand and hills and rings the first thing i met was a fly with a buzz and the sky with no clouds the heat was hot and the ground was dry but the air was full of soundive been through the desert on a horse with no name it felt good to be out of the rain in the desert you can remember your name cause there aint no one for to give you no pain la laafter two days in the desert sun my skin began to turn red after three days in the desert fun i was looking at a river bed and the story it told of a river that flowed made me sad to think it was deadyou see ive been through the desert on a horse with no name it felt good to be out of the rain in the desert you can remember your name cause there aint no one for to give you no pain la laafter nine days i let the horse run free cause the desert had turned to sea there were plants and birds and rocks and things there was sand 

In [19]:
def tokenize_function(examples):
    result = tokenizer(examples["Lyrics"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


In [20]:
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["Lyrics"]
)
tokenized_datasets

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (781 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 4500
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 600
    })
})

In [21]:
tokenizer.decode(tokenized_datasets['test'][0]["input_ids"])

'[CLS] " if i were your woman and you were my man youd have no other woman youd be weak as a lamb if you had the strength to walk out that door my love would over rule my sense and id call you back for more if i were your woman if you were my woman if i were your woman if you were my woman and you were my man yeah yeah she tears you down darling says youre nothing at all but ill pick you up darling when she lets you fall youre like a diamond but she treats you like glass yet you make it hard to love you but me you dont ask if i were your woman if you were my woman if i were your woman if you were my woman if i were your woman if you were my woman heres what id do id never no no no stop loving you yeah yeah life is so crazy and love is unkind because she came first darling will she hang on your mind youre a part of me and you dont even know it im what you need but im too afraid to show it if i were your woman if you were my woman if i were your woman if you were my woman if i were your 

In [22]:
chunk_size = 128

In [23]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [24]:
chunked_datasets = tokenized_datasets.map(group_texts, batched=True)
chunked_datasets

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 12498
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1652
    })
})

In [25]:
print(tokenizer.decode(chunked_datasets['test'][2]["input_ids"]))
print(tokenizer.decode(chunked_datasets['test'][2]["labels"]))

if i were your woman heres what id do id never never never stop loving you if i were your woman youre sweet lovin woman if you were my woman if you were my woman what would you do if you were my woman if you were my woman what would you do " [SEP] [CLS] " boy boynow in the street there is violence an an a lots of work to be done no place to hang all our washing an an i cant blame all on the sunoh no we gonna rock down to electric avenue and then well take it higher oh we gonna rock down to electric avenue and then well take it higherworkin so hard
if i were your woman heres what id do id never never never stop loving you if i were your woman youre sweet lovin woman if you were my woman if you were my woman what would you do if you were my woman if you were my woman what would you do " [SEP] [CLS] " boy boynow in the street there is violence an an a lots of work to be done no place to hang all our washing an an i cant blame all on the sunoh no we gonna rock down to electric avenue and t

# Fine Tuning


In [26]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [27]:
chunked_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 12498
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1652
    })
})

In [28]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [29]:
tf_train_dataset = model.prepare_tf_dataset(
    chunked_datasets["train"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

tf_eval_dataset = model.prepare_tf_dataset(
    chunked_datasets["test"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [30]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

model_name = model_checkpoint.split("/")[-1]
callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned-billboard", tokenizer=tokenizer
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
Cloning https://huggingface.co/jumshim/bert-base-uncased-finetuned-billboard into local empty directory.


In [31]:
import math

eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 45.67


In [32]:
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback])



<keras.callbacks.History at 0x7fdb6d8aa380>

In [34]:
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 13.35


# Test

In [38]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="jumshim/bert-base-uncased-finetuned-billboard"
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/534M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at jumshim/bert-base-uncased-finetuned-billboard.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [49]:
text = "And so Sally can wait. She [MASK] its too late."

preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> and so sally can wait. she knows its too late.
>>> and so sally can wait. she thinks its too late.
>>> and so sally can wait. she says its too late.
>>> and so sally can wait. she feels its too late.
>>> and so sally can wait. she fears its too late.


In [50]:
preds

[{'score': 0.666015625,
  'token': 4282,
  'token_str': 'knows',
  'sequence': 'and so sally can wait. she knows its too late.'},
 {'score': 0.1849365234375,
  'token': 6732,
  'token_str': 'thinks',
  'sequence': 'and so sally can wait. she thinks its too late.'},
 {'score': 0.048980712890625,
  'token': 2758,
  'token_str': 'says',
  'sequence': 'and so sally can wait. she says its too late.'},
 {'score': 0.01554107666015625,
  'token': 5683,
  'token_str': 'feels',
  'sequence': 'and so sally can wait. she feels its too late.'},
 {'score': 0.01110076904296875,
  'token': 10069,
  'token_str': 'fears',
  'sequence': 'and so sally can wait. she fears its too late.'}]

In [67]:
text = "And so Sally can wait, she knows it's too late as we're walking on by Her soul slides away, [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] I heard you say"
text_count = 7

for i in range(text_count):
  preds = mask_filler(text)
  if i == text_count-1:
    text = text.replace("[MASK]", preds[0]['token_str'], 1)
    break
  text = text.replace("[MASK]", preds[0][0]['token_str'], 1)
  print(text)

text

And so Sally can wait, she knows it's too late as we're walking on by Her soul slides away, and [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] I heard you say
And so Sally can wait, she knows it's too late as we're walking on by Her soul slides away, and she [MASK] [MASK] [MASK] [MASK] [MASK] I heard you say
And so Sally can wait, she knows it's too late as we're walking on by Her soul slides away, and she can [MASK] [MASK] [MASK] [MASK] I heard you say
And so Sally can wait, she knows it's too late as we're walking on by Her soul slides away, and she can still [MASK] [MASK] [MASK] I heard you say
And so Sally can wait, she knows it's too late as we're walking on by Her soul slides away, and she can still see [MASK] [MASK] I heard you say
And so Sally can wait, she knows it's too late as we're walking on by Her soul slides away, and she can still see it [MASK] I heard you say


"And so Sally can wait, she knows it's too late as we're walking on by Her soul slides away, and she can still see it , I heard you say"