https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=kD140sFjh0LQ

In [12]:
# Check that we have a GPU: `nvidia-smi` prints the driver/CUDA versions and the visible devices
!nvidia-smi

Thu Mar 25 12:44:58 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.39       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 1070    Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   55C    P3    22W /  N/A |    645MiB /  8119MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
# Check that PyTorch sees the GPU (the cell displays the boolean returned by
# `torch.cuda.is_available()`)
import torch
torch.cuda.is_available()

True

In [2]:
from transformers import RobertaConfig

# Model hyperparameters; only the fields listed here are set explicitly.
config = RobertaConfig(
    # Network depth / width of attention.
    num_hidden_layers=6,
    num_attention_heads=12,
    # Vocabulary and embedding tables.
    vocab_size=52_000,
    type_vocab_size=1,
    # NOTE(review): the tokenizer in this notebook is capped at 512 tokens;
    # presumably the extra 2 positions cover RoBERTa's position-id offset — confirm.
    max_position_embeddings=514,
)

In [3]:
# Now let's re-create our tokenizer in transformers, loading the files saved
# under ./kaz-tokenizer.

from transformers import RobertaTokenizerFast

# `model_max_length` is the documented keyword for the tokenizer's maximum
# sequence length; `max_len` is only kept by the library as a legacy alias.
tokenizer = RobertaTokenizerFast.from_pretrained("./kaz-tokenizer", model_max_length=512)

In [4]:
# As we are training from scratch, the model is built from `config` alone
# rather than loaded from an existing pretrained model or checkpoint.

from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [5]:
model.num_parameters()
# => ~83.5 million parameters (83,504,416 in the recorded output)

83504416

Now let's build our training Dataset

In [6]:
%%time
from datasets import load_dataset

from pathlib import Path

# Collect every .txt file under ./all-datasets/ (recursively). `Path.glob`
# yields entries in filesystem-dependent order, so sort them to hand
# `load_dataset` the same file list, in the same order, on every run.
paths = sorted(str(x) for x in Path("./all-datasets/").glob("**/*.txt"))

# Load all files with the generic 'text' loader.
dataset = load_dataset('text', data_files=paths)

Using custom data configuration default-b13449c91ebc259b


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/text/default-b13449c91ebc259b/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-b13449c91ebc259b/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.
CPU times: user 2.89 s, sys: 740 ms, total: 3.63 s
Wall time: 5.15 s


In [7]:
# Display the loaded DatasetDict (splits, features and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1300000
    })
})

In [8]:
def token_encode(examples):
    """Tokenize the ``'text'`` entry of a batch with the notebook's tokenizer.

    Calls the module-level ``tokenizer`` on ``examples['text']`` with
    truncation enabled, ``padding="max_length"``, special tokens added and
    ``max_length=128``, and returns the tokenizer's output unchanged.
    """
    return tokenizer(
        examples['text'],
        max_length=128,
        padding="max_length",
        truncation=True,
        add_special_tokens=True,
    )

# Apply `token_encode` to the DatasetDict in batched mode
# (batch_size=20000, num_proc=16) and rebind `dataset` to the result.
dataset = dataset.map(
    token_encode,
    batched=True,
    batch_size=20000,
    num_proc=16,
)

# Left disabled:
# dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])



















In [9]:
# Display the DatasetDict again to see the columns added by the tokenization step.
dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'text'],
        num_rows: 1300000
    })
})

In [10]:
# Small helper that batches individual samples of the dataset together for
# PyTorch; configured here for masked language modeling (mlm=True) with
# mlm_probability=0.15.
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    mlm=True,
    mlm_probability=0.15,
    tokenizer=tokenizer,
)

Initialize our Trainer

In [11]:
from transformers import Trainer, TrainingArguments

# Settings for the training run, grouped by concern.
training_args = TrainingArguments(
    # Output location.
    output_dir="./KazrBERTo",
    overwrite_output_dir=True,
    # Checkpoint saving.
    save_steps=10_000,
    save_total_limit=2,
    # Run length and reporting.
    num_train_epochs=1,  # 5
    prediction_loss_only=True,
    # Left disabled:
    # per_gpu_train_batch_size=64,
)

# Wire the model, the training arguments, the collator and the "train" split
# into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    data_collator=data_collator,
)

Start training

In [None]:
%%time
# Launch training with the Trainer built in the previous cell.
trainer.train()

Step,Training Loss
