In [None]:
# Install `transformers` from master
%pip install git+https://github.com/huggingface/transformers
%pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.11.0
# tokenizers version at notebook update --- 0.8.0rc1

In [1]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "BPEtokenizer/vocab.json",
    "BPEtokenizer/merges.txt",
)

In [2]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [3]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=50_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [4]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("BPEtokenizer", max_len=512)

In [5]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [6]:
model.num_parameters()
# => 84 million parameters

81966416

In [7]:
from datasets import load_dataset
dataset = load_dataset("text", data_files="unlabeled_data_sentences.txt", split='train') #, split='train', streaming=True)
# dataset = load_dataset("text", data_files=r"..\ttmp\unlabeled_data_sentences.txt", split='train').shuffle()
dataset

Found cached dataset text (C:/Users/Jonathan Ipe/.cache/huggingface/datasets/text/default-9260811a8c434195/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


Dataset({
    features: ['text'],
    num_rows: 28046
})

In [None]:
dataset = dataset.train_test_split(test_size=0.2)
dataset

In [None]:
train_dataset = dataset['train']
val_dataset = dataset['test']

In [8]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding='max_length', max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/28046 [00:00<?, ? examples/s]

In [None]:
# Splitting the dataset
train_dataset_size = 1000  # Specify the number of training data points
val_dataset_size = 250    # Specify the number of validation data points

train_dataset = tokenized_datasets.take(train_dataset_size)
val_dataset = tokenized_datasets.skip(train_dataset_size).take(val_dataset_size)

In [None]:
# %%time
# from transformers import LineByLineTextDataset

# dataset = LineByLineTextDataset(
#     tokenizer=tokenizer,
#     file_path="unlabeled_data_sentences.txt",
#     block_size=128,
# )

In [9]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

In [10]:
import torch

In [11]:
torch.cuda.is_available()

True

In [None]:
import os
import wandb
os.environ["WANDB_PROJECT"]="my-awesome-project"
os.environ["WANDB_LOG_MODEL"]="true"
os.environ["WANDB_WATCH"]="false"

In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="BERT-Pretrained",
    # report_to="wandb",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=500,
    # save_total_limit=2,
    max_steps=100,  # specify the number of steps
    # logging_dir='./logs',  # directory for storing logs
    logging_steps=10,  # log training information every 50 steps
    # evaluation_strategy="steps",  # evaluate every `logging_steps` steps
    # load_best_model_at_end=True,  # load the best model at the end of training
    metric_for_best_model='loss',  # use accuracy to find the best model
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
    # eval_dataset=val_dataset,  # If you have a validation dataset
    # compute_metrics=compute_metrics
)

In [17]:
%%time
trainer.train()

  0%|          | 0/100 [00:00<?, ?it/s]

{'loss': 7.1612, 'learning_rate': 4.5e-05, 'epoch': 0.0}
{'loss': 6.7803, 'learning_rate': 4e-05, 'epoch': 0.01}
{'loss': 6.1896, 'learning_rate': 3.5e-05, 'epoch': 0.01}
{'loss': 5.878, 'learning_rate': 3e-05, 'epoch': 0.01}
{'loss': 5.4853, 'learning_rate': 2.5e-05, 'epoch': 0.01}
{'loss': 5.2184, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 5.0677, 'learning_rate': 1.5e-05, 'epoch': 0.02}
{'loss': 4.9044, 'learning_rate': 1e-05, 'epoch': 0.02}
{'loss': 4.8558, 'learning_rate': 5e-06, 'epoch': 0.03}
{'loss': 4.851, 'learning_rate': 0.0, 'epoch': 0.03}
{'train_runtime': 186.1136, 'train_samples_per_second': 4.298, 'train_steps_per_second': 0.537, 'train_loss': 5.639166145324707, 'epoch': 0.03}
CPU times: total: 3min 6s
Wall time: 3min 6s


TrainOutput(global_step=100, training_loss=5.639166145324707, metrics={'train_runtime': 186.1136, 'train_samples_per_second': 4.298, 'train_steps_per_second': 0.537, 'train_loss': 5.639166145324707, 'epoch': 0.03})

In [18]:
wandb.finish()

NameError: name 'wandb' is not defined

In [19]:
trainer.save_model("BERT-Pretrained")

In [21]:
dataset[5]

{'text': '¯¯·¯±¯¯¯ ¯¯¯¯ï¯¯¯ ¯į¯¯¯¯¯¯ ¯¯¯¯į¯¯¯ ¯¯¯¯ï¯¯¯ ¯¯±¯į¯¯¯ ¯¯¯¯¿¯¯¯ ¯¯³¯Ï¯¯¯ ¯¯¯¯¿¯¯¯ ¯¯±¯į¯¯¯ ¯¯¯¯¿¯¯¯ ¯¯°¯¿¯¯¯ ¯¯¯¯·¯¯¯ ¯ï¯¯¯°¯¯ ¯¯¯¯·¯¯¯ ¯į¯¯·¯¯¯ ¯¯¯¯³¯¯¯ ¯¯¯¯±¯¯¯ ¯Ï¯¯³¯¯¯ ¯¯¯¯·¯¯¯ ¯¯¯¯¿¯¯¯ ¯¯¯¯·¯¯¯ ¯¿¯¯¿¯¯¯ ¯¯·¯¯¯¯¯ ¯¯³¯¯¯¯¯ ¯¯·¯¯¯¯¯ ¯¯¯¯ï¯¯¯ ¯¯°¯Ï¯¯¯ ¯¯¯¯ï¯¯¯ ¯¯±¯į¯¯¯ ¯¯¯¯¯°¯¯ ¯¯°¯į¯¯¯ ¯¯¯¯Ï¯¯¯ ¯¯·¯ï¯¯¯ ¯¯¯¯į¯¯¯ ¯¯°¯ï¯¯¯ ¯¯¯¯¿¯¯¯ ¯¯·¯±¯¯¯ ¯¯°¯Ï¯¯¯ ¯¯¯¯ï¯¯¯ ¯į¯¯Ï¯¯¯ ¯¯¯¯¿¯¯¯ ¯¯¿¯·¯¯¯ ¯¯¯¯Ï¯¯¯ ¯į¯¯¿¯¯¯ ¯¯¯¯·¯¯¯ ¯į¯¯¯°¯¯ ¯ï¯¯¿¯¯¯ ¯Ï¯¯·¯¯¯ ¯ï¯¯¿¯¯¯ ¯¯¯¯ï¯¯¯ ¯į¯¯Ï¯¯¯ ¯¯¯¯į¯¯¯ ¯¯°¯ï¯¯¯ ¯¯¯¯į¯¯¯ ¯į¯¯¯°¯¯ ¯¯¯¯¯±¯¯ ¯¯°¯Ï¯¯¯ ¯°¯¯¯¯¯¯ ¯¯¯¯¿±¯¯ ¯¿¯¯¿¯¯¯ ¯¯¯¯¯·¯¯ ¯Ï³¯¯³¯¯ ¯ï¯¯¯·¯¯ ¯į¯¯¯¯¯¯ ¯Ï¯¯¯°¯¯ ¯ï¯¯¯¯¯¯ ¯į¯¯¯±¯¯ ¯¯°¯¯¯¯¯ ¯į¯¯¯°¯¯ ¯Ï¯¯¯¯¯¯ ¯ï¯¯¯·¯¯ ¯į¯¯¯¯¯¯ ¯ï¯¯¯°¯¯ ¯¿¯¯¯¯¯¯ ¯Ï¯¯¯°¯¯ ¯ï¯¯¯¯¯¯ ¯Ï¯¯į¯¯¯ ¯¿¯¯¯¯¯¯ ¯·¯¯¯¿¯¯ ¯Ï¯¯¯¯¯¯ ¯¿¯¯į¯¯¯ ¯·¯¯¯¯¯¯ ¯¯·¯±¯¯¯ ¯¿¯¯į¯¯¯ ¯¯·¯ï¯¯¯ ¯¯³¯Ï¯¯¯ ¯¯·¯ï¯¯¯ ¯¯¯¯į¯¯¯ ¯¯°¯ï¯¯¯ ¯¯¯¯·¯¯¯ ¯¯±¯¿¯¯¯ ¯¯¯°¯·¯¯ ¯¯į¯¯³¯¯ ¯¯¯°¯·¯¯ ¯¯¯±¯¯¯¯ ¯¯¯°¯°¯¯ ¯¯Ï¯¯¯¯¯ ¯¯ï¯¯±¯¯ ¯¯¯¯¯³¯¯ ¯¯Ï¯¯±¯¯ ¯¯¯¯į¯¯¯ ¯¯¿¯¯³¯¯ ¯¯¯¯į¯¯¯ ¯¯·¯¯·¯¯ ¯¯¯¯į¯¯¯ ¯¯³¯¯¿¯¯ ¯¯¯¯¯Ï¯¯ ¯·¯¯¯¿¯¯ ¯¯¯¯¯³¯¯ 

In [None]:
import pandas as pd
df = pd.DataFrame(trainer.state.log_history)

In [None]:
print(df)

In [None]:
df.plot(x='epoch', y='train_loss')