In [13]:
from transformers import AutoModel, Seq2SeqTrainer, AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

In [14]:
checkpoint = 'roberta-large'
model = AutoModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
raw_dataset = load_dataset('scientific_papers','arxiv')

Found cached dataset scientific_papers (/home/jmunse/.cache/huggingface/datasets/scientific_papers/arxiv/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)
100%|██████████| 3/3 [00:00<00:00, 68.84it/s]


In [16]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6440
    })
})

In [17]:
coll = DataCollatorWithPadding(tokenizer=tokenizer)
def tokenize_function(seq):
    return tokenizer(seq["article"], seq["abstract"], truncation=True)

In [23]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6440
    })
})

In [18]:
data_tokenized = raw_dataset.map(tokenize_function)

Loading cached processed dataset at /home/jmunse/.cache/huggingface/datasets/scientific_papers/arxiv/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f/cache-56cfbae28a36ab33.arrow
100%|██████████| 6436/6436 [02:41<00:00, 39.90ex/s]
Loading cached processed dataset at /home/jmunse/.cache/huggingface/datasets/scientific_papers/arxiv/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f/cache-bd23dc4969746424.arrow


In [25]:
len(data_tokenized['train']['input_ids'][0])

512

In [26]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [27]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=data_tokenized["train"],
    eval_dataset=data_tokenized["validation"],
    data_collator=coll,
    tokenizer=tokenizer,
)

In [28]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaModel.forward` and have been ignored: abstract, article, section_names. If abstract, article, section_names are not expected by `RobertaModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 203037
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 19035
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: Caught RuntimeError in replica 2 on device 2.
Original Traceback (most recent call last):
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/transformers/models/roberta/modeling_roberta.py", line 852, in forward
    encoder_outputs = self.encoder(
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/transformers/models/roberta/modeling_roberta.py", line 528, in forward
    layer_outputs = layer_module(
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/transformers/models/roberta/modeling_roberta.py", line 413, in forward
    self_attention_outputs = self.attention(
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/transformers/models/roberta/modeling_roberta.py", line 340, in forward
    self_outputs = self.self(
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/transformers/models/roberta/modeling_roberta.py", line 270, in forward
    attention_probs = self.dropout(attention_probs)
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/torch/nn/modules/dropout.py", line 58, in forward
    return F.dropout(input, self.p, self.training, self.inplace)
  File "/home/jmunse/anaconda3/envs/test/lib/python3.9/site-packages/torch/nn/functional.py", line 1252, in dropout
    return _VF.dropout_(input, p, training) if inplace else _VF.dropout(input, p, training)
RuntimeError: CUDA out of memory. Tried to allocate 128.00 MiB (GPU 2; 31.75 GiB total capacity; 11.96 GiB already allocated; 41.00 MiB free; 12.11 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
