In [1]:
import torch
# Report whether PyTorch can see a CUDA device; the training arguments later in
# this notebook set fp16=True, which presumably needs a GPU — TODO confirm.
torch.cuda.is_available()

True

In [2]:
import re
from functools import partial

# Matches one or more consecutive whitespace characters (spaces, tabs, newlines).
# Written as a raw string: "\s" inside a plain string literal is an invalid
# escape sequence that newer Python versions warn about. The compiled pattern
# itself is unchanged.
REGEX_MULTI_SPACE = re.compile(r"\s+")


def preprocess_text(_re, _regex, s):
    """Build the ``text`` field of a single example from its title.

    Args:
        _re: Object exposing ``sub(pattern, repl, string)``; the notebook binds
            the standard ``re`` module here.
        _regex: Pattern passed as the first argument to ``_re.sub``.
        s: Mapping for one example; only ``s["title"]`` is read.

    Returns:
        A one-key dict ``{"text": ...}`` containing the title with every match
        of ``_regex`` replaced by a single space.
    """
    # Only the title contributes to the text; the abstract is not appended.
    normalized_title = _re.sub(_regex, " ", s["title"])
    return {"text": normalized_title}
    
# Pre-bind the `re` module and the compiled whitespace pattern so the resulting
# callable takes only the example mapping, which is how `dataset.map` calls it.
partial_preprocess_text = partial(preprocess_text, re, REGEX_MULTI_SPACE)

In [3]:
from datasets import load_dataset

# Load the arXiv papers dataset and add the whitespace-normalized "text" column
# to every example in one chained step.
dataset = load_dataset("aalksii/ml-arxiv-papers").map(partial_preprocess_text)
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'abstract', 'text'],
        num_rows: 105832
    })
    test: Dataset({
        features: ['title', 'abstract', 'text'],
        num_rows: 11760
    })
})

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer published with the "distilroberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

# Disabled helper, kept for reference: it would add only those new tokens that
# are not already present in the tokenizer vocabulary.
# def add_tokens(tokenizer, new_tokens):
    # new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())
    # tokenizer.add_tokens(list(new_tokens))
    # return tokenizer

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
def tokenize_function(tokenizer, examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``.
        examples: Mapping whose ``"text"`` entry holds the input to tokenize.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded



# Bind the tokenizer up front so `dataset.map` receives a one-argument callable.
partial_tokenize_function = partial(tokenize_function, tokenizer)

# The original columns are dropped after mapping, leaving only tokenizer output.
raw_column_names = dataset["train"].column_names
tokens = dataset.map(
    partial_tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=raw_column_names,
)

Map (num_proc=4):   0%|          | 0/105832 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/11760 [00:00<?, ? examples/s]

In [6]:
from transformers import DataCollatorForLanguageModeling

# RoBERTa tokenizers already define a dedicated "<pad>" token, so fall back to
# the EOS token only when the tokenizer has no pad token at all. Overwriting it
# unconditionally would pad batches with the EOS id instead of the pad id the
# checkpoint was trained with.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Collator for masked language modeling: pads each batch and selects tokens for
# masking with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm_probability=0.15
)

In [7]:
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer

# Load the pretrained "distilroberta-base" weights with a masked-LM head; this
# is the model that the Trainer fine-tunes below.
model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
from transformers import EarlyStoppingCallback

# Fine-tuning configuration: evaluate once per epoch for 3 epochs, write
# checkpoints to ./results_masked (every 1000 steps, assuming the default
# step-based save strategy), optimize with Adafactor in fp16 mixed precision.
training_args = TrainingArguments(
    output_dir="./results_masked",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    save_steps=1000,
    optim="adafactor",
    fp16=True,
    # load_best_model_at_end=True,
)

# NOTE: Early stopping is disabled for now; the failure was attributed to a
# HuggingFace bug and is still to be fixed. Until then, check manually that
# eval_loss does not worsen from one epoch to the next.
# NOTE(review): this may be a configuration issue rather than a library bug —
# EarlyStoppingCallback needs load_best_model_at_end=True, which in turn
# requires the save and evaluation strategies to match (here evaluation is per
# epoch while saving is step-based). TODO confirm before re-enabling.
# early_stopping = EarlyStoppingCallback(
#     early_stopping_patience=3, early_stopping_threshold=0.03
# )

# Train on the tokenized "train" split and evaluate on the tokenized "test" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokens["train"],
    eval_dataset=tokens["test"],
    data_collator=data_collator,
    # callbacks=[early_stopping],
)

# Run the full fine-tuning loop; the returned TrainOutput is displayed by the cell.
trainer.train()

  0%|          | 0/39687 [00:00<?, ?it/s]

{'loss': 3.2395, 'learning_rate': 1.975004409504372e-05, 'epoch': 0.04}
{'loss': 2.9825, 'learning_rate': 1.949807241666037e-05, 'epoch': 0.08}
{'loss': 2.7744, 'learning_rate': 1.924610073827702e-05, 'epoch': 0.11}
{'loss': 2.7988, 'learning_rate': 1.899412905989367e-05, 'epoch': 0.15}
{'loss': 2.762, 'learning_rate': 1.8742157381510318e-05, 'epoch': 0.19}
{'loss': 2.6762, 'learning_rate': 1.849018570312697e-05, 'epoch': 0.23}
{'loss': 2.7256, 'learning_rate': 1.823821402474362e-05, 'epoch': 0.26}
{'loss': 2.7004, 'learning_rate': 1.7986746289717036e-05, 'epoch': 0.3}
{'loss': 2.6525, 'learning_rate': 1.7734774611333687e-05, 'epoch': 0.34}
{'loss': 2.5652, 'learning_rate': 1.748280293295034e-05, 'epoch': 0.38}
{'loss': 2.5228, 'learning_rate': 1.723083125456699e-05, 'epoch': 0.42}
{'loss': 2.597, 'learning_rate': 1.6979363519540405e-05, 'epoch': 0.45}
{'loss': 2.475, 'learning_rate': 1.6727391841157053e-05, 'epoch': 0.49}
{'loss': 2.5657, 'learning_rate': 1.6475420162773704e-05, 'epoc

  0%|          | 0/1470 [00:00<?, ?it/s]

{'eval_loss': 2.2009363174438477, 'eval_runtime': 19.696, 'eval_samples_per_second': 597.077, 'eval_steps_per_second': 74.635, 'epoch': 1.0}
{'loss': 2.2936, 'learning_rate': 1.3200796230503692e-05, 'epoch': 1.02}
{'loss': 2.3287, 'learning_rate': 1.294932849547711e-05, 'epoch': 1.06}
{'loss': 2.2704, 'learning_rate': 1.269735681709376e-05, 'epoch': 1.1}
{'loss': 2.2588, 'learning_rate': 1.244538513871041e-05, 'epoch': 1.13}
{'loss': 2.331, 'learning_rate': 1.219341346032706e-05, 'epoch': 1.17}
{'loss': 2.2291, 'learning_rate': 1.194144178194371e-05, 'epoch': 1.21}
{'loss': 2.2719, 'learning_rate': 1.1689974046917128e-05, 'epoch': 1.25}
{'loss': 2.2463, 'learning_rate': 1.1438002368533778e-05, 'epoch': 1.29}
{'loss': 2.3335, 'learning_rate': 1.1186030690150429e-05, 'epoch': 1.32}
{'loss': 2.2488, 'learning_rate': 1.0934059011767077e-05, 'epoch': 1.36}
{'loss': 2.2094, 'learning_rate': 1.0682591276740494e-05, 'epoch': 1.4}
{'loss': 2.2814, 'learning_rate': 1.0430619598357145e-05, 'epoch

  0%|          | 0/1470 [00:00<?, ?it/s]

{'eval_loss': 2.022162914276123, 'eval_runtime': 19.8886, 'eval_samples_per_second': 591.294, 'eval_steps_per_second': 73.912, 'epoch': 2.0}
{'loss': 2.1199, 'learning_rate': 6.652556252677199e-06, 'epoch': 2.0}
{'loss': 2.14, 'learning_rate': 6.40058457429385e-06, 'epoch': 2.04}
{'loss': 2.1327, 'learning_rate': 6.148612895910501e-06, 'epoch': 2.08}
{'loss': 2.0482, 'learning_rate': 5.8966412175271504e-06, 'epoch': 2.12}
{'loss': 2.0838, 'learning_rate': 5.644669539143801e-06, 'epoch': 2.15}
{'loss': 2.1741, 'learning_rate': 5.393201804117217e-06, 'epoch': 2.19}
{'loss': 2.0584, 'learning_rate': 5.141230125733868e-06, 'epoch': 2.23}
{'loss': 2.0864, 'learning_rate': 4.889258447350518e-06, 'epoch': 2.27}
{'loss': 2.1298, 'learning_rate': 4.637286768967168e-06, 'epoch': 2.31}
{'loss': 2.0758, 'learning_rate': 4.385315090583819e-06, 'epoch': 2.34}
{'loss': 2.1086, 'learning_rate': 4.133343412200469e-06, 'epoch': 2.38}
{'loss': 2.044, 'learning_rate': 3.881371733817119e-06, 'epoch': 2.42}

  0%|          | 0/1470 [00:00<?, ?it/s]

{'eval_loss': 1.9867643117904663, 'eval_runtime': 19.498, 'eval_samples_per_second': 603.14, 'eval_steps_per_second': 75.393, 'epoch': 3.0}
{'train_runtime': 2924.7538, 'train_samples_per_second': 108.555, 'train_steps_per_second': 13.569, 'train_loss': 2.2956298002265676, 'epoch': 3.0}


TrainOutput(global_step=39687, training_loss=2.2956298002265676, metrics={'train_runtime': 2924.7538, 'train_samples_per_second': 108.555, 'train_steps_per_second': 13.569, 'train_loss': 2.2956298002265676, 'epoch': 3.0})