In [25]:
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification, AdamW, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict, concatenate_datasets
import pandas as pd

In [19]:
tokenizer = RobertaTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")
model = RobertaForMaskedLM.from_pretrained("pdelobelle/robbert-v2-dutch-base")

In [8]:
with open('../../input_data/train.sliding.full.txt') as f:
    train_lines = f.readlines()
train_lines_df = pd.DataFrame(train_lines)
train_lines_df = train_lines_df.rename(columns={0: 'text'})
train_dataset = Dataset.from_pandas(train_lines_df)

In [9]:
with open('../../input_data/eval.sliding.full.txt') as f:
    eval_lines = f.readlines()
eval_lines_df = pd.DataFrame(eval_lines)
eval_lines_df = eval_lines_df.rename(columns={0: 'text'})
eval_dataset = Dataset.from_pandas(eval_lines_df)

In [10]:
raw_datasets = concatenate_datasets([train_dataset, eval_dataset])

In [11]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [12]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/2155783 [00:00<?, ? examples/s]

In [22]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True)

In [20]:
model.train()


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(40000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [15]:
optimizer = AdamW(model.parameters(), lr=1e-5)



In [16]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

In [23]:
tokenized_datasets_split = tokenized_datasets.train_test_split(test_size=0.2)


In [26]:
args = TrainingArguments(
    output_dir="",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA devices.