https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=kD140sFjh0LQ

In [1]:
# Check that we have a GPU: the leading `!` hands `nvidia-smi` to the shell,
# and its report (driver, CUDA version, memory in use) is printed as the cell output.
!nvidia-smi

Fri Mar 26 17:33:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.39       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 1070    Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   58C    P0    33W /  N/A |    399MiB /  8119MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
# Check that PyTorch sees it: the cell's value is True only when PyTorch can use a CUDA device
import torch
torch.cuda.is_available()

True

In [3]:
from transformers import RobertaConfig

# Architecture hyper-parameters for the RoBERTa model that is built from
# scratch further down; anything not listed keeps the RobertaConfig default.
config = RobertaConfig(
    num_hidden_layers=6,
    num_attention_heads=12,
    type_vocab_size=1,
    # NOTE(review): presumably 512 usable positions plus RoBERTa's 2-slot
    # padding offset — confirm against the tokenizer's 512-token limit.
    max_position_embeddings=514,
    # NOTE(review): should match the vocabulary size of ./kaz-tokenizer — confirm.
    vocab_size=52_000,
)

In [4]:
# Now let's re-create our tokenizer in transformers

from transformers import RobertaTokenizerFast

# `model_max_length` is the documented keyword for the tokenizer's length
# limit; the older `max_len` spelling is only honoured as a legacy fallback.
tokenizer = RobertaTokenizerFast.from_pretrained("./kaz-tokenizer", model_max_length=512)

In [5]:
# As we are training from scratch, we only initialize from a config, not from an existing pretrained model
# or checkpoint: the model is built directly from `config`, and `from_pretrained` is deliberately not used here.

from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [6]:
# Total parameter count of the freshly built model
model.num_parameters()
# => roughly 84 million parameters (83,504,416 in the recorded output)

83504416

Now let's build our training Dataset

In [7]:
%%time
from datasets import load_dataset

from pathlib import Path

# Collect every .txt file under ./all-datasets/ (recursively). Directory
# traversal order depends on the filesystem, so the list is sorted to make the
# order in which files are handed to the loader reproducible from run to run.
paths = sorted(str(x) for x in Path("./all-datasets/").glob("**/*.txt"))

# Fail early with a readable message rather than an error raised deep inside
# the dataset loader when nothing matched.
if not paths:
    raise FileNotFoundError("No .txt files found under ./all-datasets/")

# Load all files through the generic 'text' builder; the recorded output shows
# the result as a DatasetDict with a single 'train' split and a 'text' column.
dataset = load_dataset('text', data_files=paths)

Using custom data configuration default-7f576d4dee36b10b


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/text/default-7f576d4dee36b10b/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-7f576d4dee36b10b/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.
CPU times: user 2.76 s, sys: 639 ms, total: 3.4 s
Wall time: 5.57 s


In [8]:
# Display the loaded DatasetDict: its splits, column names and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1299999
    })
})

In [9]:
def token_encode(examples):
    """Tokenize one batch of raw text rows.

    ``examples`` is indexed with ``'text'`` and that value is passed to the
    notebook-level ``tokenizer``. Special tokens are added, and every
    sequence is truncated or padded to ``max_length=128``.

    Returns whatever the tokenizer call returns, unchanged.
    """
    return tokenizer(
        examples['text'],
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Tokenize the whole DatasetDict: batches of 20000 rows are handed to
# token_encode, spread over 16 worker processes. The result is bound back to
# `dataset`, so later cells see the tokenizer columns next to 'text'.
dataset = dataset.map(
    token_encode,
    batched=True,
    num_proc=16,
    batch_size=20000,
)

# Left disabled in the original notebook:
# dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])



















In [10]:
# Display the dataset again to confirm that the map step added the tokenizer columns
dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'text'],
        num_rows: 1299999
    })
})

In [11]:
# Small helper that batches different samples of the dataset together into an object that PyTorch knows how to perform backprop on.
from transformers import DataCollatorForLanguageModeling

# Masked-language-modelling collator: with mlm=True, tokens are selected for
# masking with probability 0.15 each time a batch is assembled.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    mlm=True,
)

Initialize our Trainer

In [12]:
from transformers import Trainer, TrainingArguments

# Training hyper-parameters for the from-scratch run.
training_args = TrainingArguments(
    # Same directory the final model is saved to at the end of the notebook;
    # the earlier "./KazrBERTo" spelling sent checkpoints to a stray folder.
    output_dir="./KazBERTo",
    overwrite_output_dir=True,
    num_train_epochs=1,  # 5 was noted as an alternative value
    # Batch size is left at the library default. The disabled override below
    # uses the older per_gpu_train_batch_size name (per_device_train_batch_size
    # in current releases).
    # per_gpu_train_batch_size=64,
    save_steps=10_000,  # write a checkpoint every 10,000 update steps
    save_total_limit=2,  # keep at most two checkpoints on disk
    prediction_loss_only=True,
)

# Wire the model, the MLM collator and the tokenized 'train' split into the
# Trainer, driven by the arguments defined just above.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    data_collator=data_collator,
)

Start training

In [13]:
%%time
# Run the full training loop (recorded run: 162,500 steps in about 9 hours of wall time)
trainer.train()

Step,Training Loss
500,9.3451
1000,8.9162
1500,8.7893
2000,8.6565
2500,8.5437
3000,8.4511
3500,8.3179
4000,8.227
4500,8.2121
5000,8.1269


CPU times: user 9h 5min 10s, sys: 1min 55s, total: 9h 7min 6s
Wall time: 9h 4min 29s


TrainOutput(global_step=162500, training_loss=6.095179568810096, metrics={'train_runtime': 32669.0108, 'train_samples_per_second': 4.974, 'total_flos': 8.337074480300851e+16, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 57635, 'init_mem_gpu_alloc_delta': 334180352, 'init_mem_cpu_peaked_delta': 18258, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 586019, 'train_mem_gpu_alloc_delta': 1022767104, 'train_mem_cpu_peaked_delta': 207434372, 'train_mem_gpu_peaked_delta': 1310920704})

In [14]:
# Save the trained model to ./KazBERTo
trainer.save_model("./KazBERTo")