In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m98.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [1]:
import os
import torch
from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
class LargeTextDataset(torch.utils.data.IterableDataset):
    def __init__(self, file_path, tokenizer, block_size, stride):
        self.file_path = file_path
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.stride = stride

    def __iter__(self):
        with open(self.file_path, "r", encoding="utf-8") as file:
            text_buffer = ""
            for line in file:
                text_buffer += line
                while len(text_buffer) > self.block_size:
                    tokens = self.tokenizer(
                        text_buffer[:self.block_size],
                        padding="max_length",
                        truncation=True,
                        max_length=self.block_size,
                        return_tensors="pt"
                    )
                    text_buffer = text_buffer[self.stride:]
                    input_ids = tokens['input_ids'].squeeze(0)
                    attention_mask = tokens['attention_mask'].squeeze(0)
                    yield {"input_ids": input_ids, "attention_mask": attention_mask}


In [5]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

file_path = "/content/drive/MyDrive/Coursework/large_file.txt"
block_size = 512
stride = 512
dataset = LargeTextDataset(file_path, tokenizer, block_size, stride)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=1,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    gradient_checkpointing=False,
    position_embedding_type="absolute",
    use_cache=True,
)

model = BertForMaskedLM(config)


NameError: name 'tokenizer' is not defined

In [7]:
file_size = os.path.getsize(file_path)
approx_tokens_per_block = block_size // stride
approx_total_tokens = file_size * approx_tokens_per_block
approx_steps = approx_total_tokens // (512) 

training_args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    max_steps=approx_steps
)


In [8]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,3.8144
1000,3.3248
1500,3.3163
2000,3.3109
2500,3.317
3000,3.3084
3500,3.31
4000,3.3018
4500,3.3001
5000,3.2973


In [None]:
trainer.save_model("output")