## Masked Language Modeling
Using masked language modeling (MLM), we train an adapter for each of the GLUE tasks. This adapts the pre-trained language model to the language corpus specific to each GLUE task.

In [1]:
from mlm import masked_language_modeling
from mlm_utils import DomainModelArguments, DomainDataTrainingArguments
from transformers import TrainingArguments, MultiLingAdapterArguments

In [2]:
# GLUE task configurations to run, in execution order.
# "mrpc" is currently disabled (it was commented out of this list).
glue_tasks = ["cola", "mnli", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]

In [3]:
# Arguments naming the pre-trained checkpoint used for every task.
model = DomainModelArguments(model_name_or_path="roberta-base")

# Adapter arguments shared by every task: adapter training is enabled and
# the "pfeiffer+inv" adapter configuration is requested.
adapter = MultiLingAdapterArguments(train_adapter=True, adapter_config="pfeiffer+inv")

In [4]:
# %%capture
# Uncomment the line above to suppress this cell's (very verbose) trainer output.
#
# Run masked language modeling once per GLUE task in `glue_tasks` and collect
# the two values returned by `masked_language_modeling` for each task in
# `results`, keyed by task name.
results = {}
for dataset in glue_tasks:
    # Select the task as a configuration of the "glue" dataset.
    data = DomainDataTrainingArguments(
        dataset_name="glue",
        dataset_config_name=dataset,
    )

    training = TrainingArguments(
        learning_rate=1e-4,
        overwrite_output_dir=True,
        # One output directory per task so runs do not overwrite each other.
        output_dir=f"./adapter/mlm/{dataset}",
        do_train=True,
        do_eval=True,
        num_train_epochs=10,
        # Spell out the current default ("all" installed integrations). This
        # keeps today's behaviour while silencing the per-task notice that the
        # `report_to` default will change in transformers v5.
        report_to="all",
    )

    train_stats, eval_stats = masked_language_modeling(
        model_args=model, data_args=data, training_args=training, adapter_args=adapter
    )

    results[dataset] = {"training": train_stats, "eval": eval_stats}


07/26/2021 12:41:49 - INFO - mlm -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0001,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=./adapter/mlm/cola/runs/Jul26_12-41-49_alienware-r12,
logging_first_step=False,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler

[INFO|configuration_utils.py:531] 2021-07-26 12:41:50,365 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jason/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:569] 2021-07-26 12:41:50,368 >> Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "t



[INFO|trainer.py:546] 2021-07-26 12:41:54,411 >> The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:1199] 2021-07-26 12:41:54,416 >> ***** Running training *****
[INFO|trainer.py:1200] 2021-07-26 12:41:54,416 >>   Num examples = 185
[INFO|trainer.py:1201] 2021-07-26 12:41:54,416 >>   Num Epochs = 10
[INFO|trainer.py:1202] 2021-07-26 12:41:54,416 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:1203] 2021-07-26 12:41:54,416 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1204] 2021-07-26 12:41:54,417 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1205] 2021-07-26 12:41:54,417 >>   Total optimization steps = 240


Step,Training Loss


[INFO|trainer.py:1403] 2021-07-26 12:42:31,384 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|trainer.py:1989] 2021-07-26 12:42:31,385 >> Saving model checkpoint to ./adapter/mlm/cola
[INFO|loading.py:59] 2021-07-26 12:42:31,386 >> Configuration saved in ./adapter/mlm/cola/glue/adapter_config.json
[INFO|loading.py:72] 2021-07-26 12:42:31,393 >> Module weights saved in ./adapter/mlm/cola/glue/pytorch_adapter.bin
[INFO|loading.py:59] 2021-07-26 12:42:31,394 >> Configuration saved in ./adapter/mlm/cola/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 12:42:31,560 >> Module weights saved in ./adapter/mlm/cola/glue/pytorch_model_head.bin
[INFO|loading.py:59] 2021-07-26 12:42:31,561 >> Configuration saved in ./adapter/mlm/cola/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 12:42:31,765 >> Module weights saved in ./adapter/mlm/cola/glue/pytorch_model_head.bin
[INFO|tokenization_utils_base.py:1948] 2021-07-26 12:42:31,765 >> tokenizer

***** train metrics *****
  epoch                    =       10.0
  total_flos               =   666311GF
  train_loss               =       1.25
  train_runtime            = 0:00:36.96
  train_samples            =        185
  train_samples_per_second =     50.044
  train_steps_per_second   =      6.492
07/26/2021 12:42:31 - INFO - mlm -   *** Evaluate ***


[INFO|trainer.py:546] 2021-07-26 12:42:31,820 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:2239] 2021-07-26 12:42:31,821 >> ***** Running Evaluation *****
[INFO|trainer.py:2241] 2021-07-26 12:42:31,821 >>   Num examples = 22
[INFO|trainer.py:2244] 2021-07-26 12:42:31,821 >>   Batch size = 8


***** eval metrics *****
  epoch                   =       10.0
  eval_loss               =     1.6797
  eval_runtime            = 0:00:00.19
  eval_samples            =         22
  eval_samples_per_second =    112.484
  eval_steps_per_second   =     15.339
  perplexity              =      5.364


[INFO|training_args.py:784] 2021-07-26 12:42:32,021 >> PyTorch: setting up devices
[INFO|training_args.py:680] 2021-07-26 12:42:32,022 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


07/26/2021 12:42:32 - INFO - mlm -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0001,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=./adapter/mlm/mnli/runs/Jul26_12-42-32_alienware-r12,
logging_first_step=False,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler

[INFO|configuration_utils.py:531] 2021-07-26 12:42:34,166 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jason/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:569] 2021-07-26 12:42:34,168 >> Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "t



[INFO|trainer.py:546] 2021-07-26 12:42:36,179 >> The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:1199] 2021-07-26 12:42:36,184 >> ***** Running training *****
[INFO|trainer.py:1200] 2021-07-26 12:42:36,184 >>   Num examples = 19060
[INFO|trainer.py:1201] 2021-07-26 12:42:36,185 >>   Num Epochs = 10
[INFO|trainer.py:1202] 2021-07-26 12:42:36,185 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:1203] 2021-07-26 12:42:36,185 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1204] 2021-07-26 12:42:36,185 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1205] 2021-07-26 12:42:36,185 >>   Total optimization steps = 23830


Step,Training Loss
500,2.2983
1000,2.1864
1500,2.1431
2000,2.1071
2500,2.0998
3000,2.0833
3500,2.0727
4000,2.0604
4500,2.0541
5000,2.0474


[INFO|trainer.py:1989] 2021-07-26 12:43:56,367 >> Saving model checkpoint to ./adapter/mlm/mnli/checkpoint-500
[INFO|loading.py:59] 2021-07-26 12:43:56,368 >> Configuration saved in ./adapter/mlm/mnli/checkpoint-500/glue/adapter_config.json
[INFO|loading.py:72] 2021-07-26 12:43:56,375 >> Module weights saved in ./adapter/mlm/mnli/checkpoint-500/glue/pytorch_adapter.bin
[INFO|loading.py:59] 2021-07-26 12:43:56,375 >> Configuration saved in ./adapter/mlm/mnli/checkpoint-500/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 12:43:56,536 >> Module weights saved in ./adapter/mlm/mnli/checkpoint-500/glue/pytorch_model_head.bin
[INFO|loading.py:59] 2021-07-26 12:43:56,537 >> Configuration saved in ./adapter/mlm/mnli/checkpoint-500/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 12:43:56,731 >> Module weights saved in ./adapter/mlm/mnli/checkpoint-500/glue/pytorch_model_head.bin
[INFO|tokenization_utils_base.py:1948] 2021-07-26 12:43:56,732 >> tokenizer config file saved in ./adapter

***** train metrics *****
  epoch                    =       10.0
  total_flos               = 68648121GF
  train_loss               =     2.0062
  train_runtime            = 1:03:11.47
  train_samples            =      19060
  train_samples_per_second =     50.271
  train_steps_per_second   =      6.285
07/26/2021 13:45:48 - INFO - mlm -   *** Evaluate ***


[INFO|trainer.py:546] 2021-07-26 13:45:48,102 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:2239] 2021-07-26 13:45:48,104 >> ***** Running Evaluation *****
[INFO|trainer.py:2241] 2021-07-26 13:45:48,104 >>   Num examples = 994
[INFO|trainer.py:2244] 2021-07-26 13:45:48,104 >>   Batch size = 8


[INFO|training_args.py:784] 2021-07-26 13:45:56,520 >> PyTorch: setting up devices
[INFO|training_args.py:680] 2021-07-26 13:45:56,521 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


***** eval metrics *****
  epoch                   =       10.0
  eval_loss               =     1.8546
  eval_runtime            = 0:00:08.40
  eval_samples            =        994
  eval_samples_per_second =    118.286
  eval_steps_per_second   =     14.875
  perplexity              =     6.3893
07/26/2021 13:45:56 - INFO - mlm -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.000

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=10627589.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset glue downloaded and prepared to /home/jason/.cache/huggingface/datasets/glue/qnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


[INFO|configuration_utils.py:531] 2021-07-26 13:46:01,473 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jason/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:569] 2021-07-26 13:46:01,475 >> Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "t

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=105.0, s…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=6.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=6.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=105.0, style=Progre…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=6.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=6.0, style=Progress…

[INFO|trainer.py:546] 2021-07-26 13:46:10,411 >> The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:1199] 2021-07-26 13:46:10,416 >> ***** Running training *****
[INFO|trainer.py:1200] 2021-07-26 13:46:10,416 >>   Num examples = 2857
[INFO|trainer.py:1201] 2021-07-26 13:46:10,416 >>   Num Epochs = 10
[INFO|trainer.py:1202] 2021-07-26 13:46:10,416 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:1203] 2021-07-26 13:46:10,416 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1204] 2021-07-26 13:46:10,417 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1205] 2021-07-26 13:46:10,417 >>   Total optimization steps = 3580





Step,Training Loss
500,2.3429
1000,2.2076
1500,2.152
2000,2.1262
2500,2.1021
3000,2.0842
3500,2.0796


[INFO|trainer.py:1989] 2021-07-26 13:47:30,184 >> Saving model checkpoint to ./adapter/mlm/qnli/checkpoint-500
[INFO|loading.py:59] 2021-07-26 13:47:30,184 >> Configuration saved in ./adapter/mlm/qnli/checkpoint-500/glue/adapter_config.json
[INFO|loading.py:72] 2021-07-26 13:47:30,192 >> Module weights saved in ./adapter/mlm/qnli/checkpoint-500/glue/pytorch_adapter.bin
[INFO|loading.py:59] 2021-07-26 13:47:30,192 >> Configuration saved in ./adapter/mlm/qnli/checkpoint-500/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 13:47:30,359 >> Module weights saved in ./adapter/mlm/qnli/checkpoint-500/glue/pytorch_model_head.bin
[INFO|loading.py:59] 2021-07-26 13:47:30,359 >> Configuration saved in ./adapter/mlm/qnli/checkpoint-500/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 13:47:30,552 >> Module weights saved in ./adapter/mlm/qnli/checkpoint-500/glue/pytorch_model_head.bin
[INFO|tokenization_utils_base.py:1948] 2021-07-26 13:47:30,553 >> tokenizer config file saved in ./adapter

***** train metrics *****
  epoch                    =       10.0
  total_flos               = 10290014GF
  train_loss               =     2.1544
  train_runtime            = 0:09:37.26
  train_samples            =       2857
  train_samples_per_second =     49.492
  train_steps_per_second   =      6.202
07/26/2021 13:55:48 - INFO - mlm -   *** Evaluate ***


[INFO|trainer.py:546] 2021-07-26 13:55:48,130 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:2239] 2021-07-26 13:55:48,132 >> ***** Running Evaluation *****
[INFO|trainer.py:2241] 2021-07-26 13:55:48,132 >>   Num examples = 150
[INFO|trainer.py:2244] 2021-07-26 13:55:48,132 >>   Batch size = 8


[INFO|training_args.py:784] 2021-07-26 13:55:49,404 >> PyTorch: setting up devices
[INFO|training_args.py:680] 2021-07-26 13:55:49,405 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


***** eval metrics *****
  epoch                   =       10.0
  eval_loss               =     2.0639
  eval_runtime            = 0:00:01.26
  eval_samples            =        150
  eval_samples_per_second =    118.551
  eval_steps_per_second   =     15.016
  perplexity              =     7.8769
07/26/2021 13:55:49 - INFO - mlm -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.000

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=41696084.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset glue downloaded and prepared to /home/jason/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


[INFO|configuration_utils.py:531] 2021-07-26 13:56:04,732 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jason/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:569] 2021-07-26 13:56:04,735 >> Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "t

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=364.0, s…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=41.0, st…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=391.0, s…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=364.0, style=Progre…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=41.0, style=Progres…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=391.0, style=Progre…

[INFO|trainer.py:546] 2021-07-26 13:56:55,801 >> The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:1199] 2021-07-26 13:56:55,807 >> ***** Running training *****
[INFO|trainer.py:1200] 2021-07-26 13:56:55,808 >>   Num examples = 10788
[INFO|trainer.py:1201] 2021-07-26 13:56:55,808 >>   Num Epochs = 10
[INFO|trainer.py:1202] 2021-07-26 13:56:55,809 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:1203] 2021-07-26 13:56:55,809 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1204] 2021-07-26 13:56:55,809 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1205] 2021-07-26 13:56:55,809 >>   Total optimization steps = 13490





Step,Training Loss
500,1.9574
1000,1.8587
1500,1.8097
2000,1.779
2500,1.7664
3000,1.7412
3500,1.7363
4000,1.7233
4500,1.7143
5000,1.7056


[INFO|trainer.py:1989] 2021-07-26 13:58:15,888 >> Saving model checkpoint to ./adapter/mlm/qqp/checkpoint-500
[INFO|loading.py:59] 2021-07-26 13:58:15,888 >> Configuration saved in ./adapter/mlm/qqp/checkpoint-500/glue/adapter_config.json
[INFO|loading.py:72] 2021-07-26 13:58:15,895 >> Module weights saved in ./adapter/mlm/qqp/checkpoint-500/glue/pytorch_adapter.bin
[INFO|loading.py:59] 2021-07-26 13:58:15,896 >> Configuration saved in ./adapter/mlm/qqp/checkpoint-500/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 13:58:16,082 >> Module weights saved in ./adapter/mlm/qqp/checkpoint-500/glue/pytorch_model_head.bin
[INFO|loading.py:59] 2021-07-26 13:58:16,082 >> Configuration saved in ./adapter/mlm/qqp/checkpoint-500/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 13:58:16,279 >> Module weights saved in ./adapter/mlm/qqp/checkpoint-500/glue/pytorch_model_head.bin
[INFO|tokenization_utils_base.py:1948] 2021-07-26 13:58:16,280 >> tokenizer config file saved in ./adapter/mlm/qq

***** train metrics *****
  epoch                    =       10.0
  total_flos               = 38854980GF
  train_loss               =     1.7051
  train_runtime            = 0:36:12.19
  train_samples            =      10788
  train_samples_per_second =     49.664
  train_steps_per_second   =       6.21
07/26/2021 14:33:08 - INFO - mlm -   *** Evaluate ***


[INFO|trainer.py:546] 2021-07-26 14:33:08,453 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:2239] 2021-07-26 14:33:08,455 >> ***** Running Evaluation *****
[INFO|trainer.py:2241] 2021-07-26 14:33:08,455 >>   Num examples = 1199
[INFO|trainer.py:2244] 2021-07-26 14:33:08,455 >>   Batch size = 8


[INFO|training_args.py:784] 2021-07-26 14:33:18,494 >> PyTorch: setting up devices
[INFO|training_args.py:680] 2021-07-26 14:33:18,495 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


***** eval metrics *****
  epoch                   =       10.0
  eval_loss               =     1.5639
  eval_runtime            = 0:00:10.02
  eval_samples            =       1199
  eval_samples_per_second =    119.549
  eval_steps_per_second   =     14.956
  perplexity              =     4.7774
07/26/2021 14:33:18 - INFO - mlm -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.000

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=697150.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset glue downloaded and prepared to /home/jason/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


[INFO|configuration_utils.py:531] 2021-07-26 14:33:21,233 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jason/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:569] 2021-07-26 14:33:21,236 >> Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "t

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=3.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=1.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=3.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=3.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=1.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=3.0, style=Progress…

[INFO|trainer.py:546] 2021-07-26 14:33:24,374 >> The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:1199] 2021-07-26 14:33:24,378 >> ***** Running training *****
[INFO|trainer.py:1200] 2021-07-26 14:33:24,379 >>   Num examples = 278
[INFO|trainer.py:1201] 2021-07-26 14:33:24,379 >>   Num Epochs = 10
[INFO|trainer.py:1202] 2021-07-26 14:33:24,379 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:1203] 2021-07-26 14:33:24,379 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1204] 2021-07-26 14:33:24,380 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1205] 2021-07-26 14:33:24,380 >>   Total optimization steps = 350





Step,Training Loss


[INFO|trainer.py:1403] 2021-07-26 14:34:19,987 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|trainer.py:1989] 2021-07-26 14:34:19,988 >> Saving model checkpoint to ./adapter/mlm/rte
[INFO|loading.py:59] 2021-07-26 14:34:19,989 >> Configuration saved in ./adapter/mlm/rte/glue/adapter_config.json
[INFO|loading.py:72] 2021-07-26 14:34:19,998 >> Module weights saved in ./adapter/mlm/rte/glue/pytorch_adapter.bin
[INFO|loading.py:59] 2021-07-26 14:34:19,999 >> Configuration saved in ./adapter/mlm/rte/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 14:34:20,223 >> Module weights saved in ./adapter/mlm/rte/glue/pytorch_model_head.bin
[INFO|loading.py:59] 2021-07-26 14:34:20,224 >> Configuration saved in ./adapter/mlm/rte/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 14:34:20,421 >> Module weights saved in ./adapter/mlm/rte/glue/pytorch_model_head.bin
[INFO|tokenization_utils_base.py:1948] 2021-07-26 14:34:20,422 >> tokenizer config

***** train metrics *****
  epoch                    =       10.0
  total_flos               =  1001268GF
  train_loss               =     1.5585
  train_runtime            = 0:00:55.60
  train_samples            =        278
  train_samples_per_second =     49.993
  train_steps_per_second   =      6.294
07/26/2021 14:34:20 - INFO - mlm -   *** Evaluate ***


[INFO|trainer.py:546] 2021-07-26 14:34:20,474 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:2239] 2021-07-26 14:34:20,476 >> ***** Running Evaluation *****
[INFO|trainer.py:2241] 2021-07-26 14:34:20,476 >>   Num examples = 30
[INFO|trainer.py:2244] 2021-07-26 14:34:20,476 >>   Batch size = 8


[INFO|training_args.py:784] 2021-07-26 14:34:20,743 >> PyTorch: setting up devices
[INFO|training_args.py:680] 2021-07-26 14:34:20,743 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


***** eval metrics *****
  epoch                   =       10.0
  eval_loss               =     1.4394
  eval_runtime            = 0:00:00.26
  eval_samples            =         30
  eval_samples_per_second =    115.337
  eval_steps_per_second   =     15.378
  perplexity              =     4.2183
07/26/2021 14:34:20 - INFO - mlm -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.000

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=7439277.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

[INFO|configuration_utils.py:531] 2021-07-26 14:34:24,464 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jason/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:569] 2021-07-26 14:34:24,467 >> Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "t

Dataset glue downloaded and prepared to /home/jason/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


[INFO|tokenization_auto.py:427] 2021-07-26 14:34:24,583 >> Could not locate the tokenizer configuration file, will try to use the model config instead.
[INFO|configuration_utils.py:531] 2021-07-26 14:34:24,714 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jason/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:569] 2021-07-26 14:34:24,716 >> Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 51

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=68.0, st…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=1.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=2.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=68.0, style=Progres…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=1.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=2.0, style=Progress…




[INFO|trainer.py:546] 2021-07-26 14:34:30,623 >> The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:1199] 2021-07-26 14:34:30,628 >> ***** Running training *****
[INFO|trainer.py:1200] 2021-07-26 14:34:30,628 >>   Num examples = 1853
[INFO|trainer.py:1201] 2021-07-26 14:34:30,628 >>   Num Epochs = 10
[INFO|trainer.py:1202] 2021-07-26 14:34:30,628 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:1203] 2021-07-26 14:34:30,629 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1204] 2021-07-26 14:34:30,629 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1205] 2021-07-26 14:34:30,629 >>   Total optimization steps = 2320


Step,Training Loss
500,2.4203
1000,2.2347
1500,2.1732
2000,2.1285


[INFO|trainer.py:1989] 2021-07-26 14:35:49,733 >> Saving model checkpoint to ./adapter/mlm/sst2/checkpoint-500
[INFO|loading.py:59] 2021-07-26 14:35:49,734 >> Configuration saved in ./adapter/mlm/sst2/checkpoint-500/glue/adapter_config.json
[INFO|loading.py:72] 2021-07-26 14:35:49,740 >> Module weights saved in ./adapter/mlm/sst2/checkpoint-500/glue/pytorch_adapter.bin
[INFO|loading.py:59] 2021-07-26 14:35:49,741 >> Configuration saved in ./adapter/mlm/sst2/checkpoint-500/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 14:35:49,900 >> Module weights saved in ./adapter/mlm/sst2/checkpoint-500/glue/pytorch_model_head.bin
[INFO|loading.py:59] 2021-07-26 14:35:49,901 >> Configuration saved in ./adapter/mlm/sst2/checkpoint-500/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 14:35:50,081 >> Module weights saved in ./adapter/mlm/sst2/checkpoint-500/glue/pytorch_model_head.bin
[INFO|tokenization_utils_base.py:1948] 2021-07-26 14:35:50,082 >> tokenizer config file saved in ./adapter

***** train metrics *****
  epoch                    =       10.0
  total_flos               =  6673922GF
  train_loss               =     2.2219
  train_runtime            = 0:06:11.51
  train_samples            =       1853
  train_samples_per_second =     49.877
  train_steps_per_second   =      6.245
07/26/2021 14:40:42 - INFO - mlm -   *** Evaluate ***


[INFO|trainer.py:546] 2021-07-26 14:40:42,556 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:2239] 2021-07-26 14:40:42,558 >> ***** Running Evaluation *****
[INFO|trainer.py:2241] 2021-07-26 14:40:42,558 >>   Num examples = 44
[INFO|trainer.py:2244] 2021-07-26 14:40:42,558 >>   Batch size = 8


[INFO|training_args.py:784] 2021-07-26 14:40:42,938 >> PyTorch: setting up devices
[INFO|training_args.py:680] 2021-07-26 14:40:42,939 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


***** eval metrics *****
  epoch                   =       10.0
  eval_loss               =     1.8202
  eval_runtime            = 0:00:00.37
  eval_samples            =         44
  eval_samples_per_second =    117.726
  eval_steps_per_second   =     16.054
  perplexity              =     6.1728
07/26/2021 14:40:42 - INFO - mlm -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.000

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=802872.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset glue downloaded and prepared to /home/jason/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


[INFO|configuration_utils.py:531] 2021-07-26 14:40:45,577 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jason/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:569] 2021-07-26 14:40:45,579 >> Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "t

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=6.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=2.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=2.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=6.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=2.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=2.0, style=Progress…




[INFO|trainer.py:546] 2021-07-26 14:40:48,058 >> The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:1199] 2021-07-26 14:40:48,063 >> ***** Running training *****
[INFO|trainer.py:1200] 2021-07-26 14:40:48,063 >>   Num examples = 159
[INFO|trainer.py:1201] 2021-07-26 14:40:48,063 >>   Num Epochs = 10
[INFO|trainer.py:1202] 2021-07-26 14:40:48,063 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:1203] 2021-07-26 14:40:48,064 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1204] 2021-07-26 14:40:48,064 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1205] 2021-07-26 14:40:48,064 >>   Total optimization steps = 200


Step,Training Loss


[INFO|trainer.py:1403] 2021-07-26 14:41:19,709 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|trainer.py:1989] 2021-07-26 14:41:19,710 >> Saving model checkpoint to ./adapter/mlm/stsb
[INFO|loading.py:59] 2021-07-26 14:41:19,711 >> Configuration saved in ./adapter/mlm/stsb/glue/adapter_config.json
[INFO|loading.py:72] 2021-07-26 14:41:19,718 >> Module weights saved in ./adapter/mlm/stsb/glue/pytorch_adapter.bin
[INFO|loading.py:59] 2021-07-26 14:41:19,719 >> Configuration saved in ./adapter/mlm/stsb/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 14:41:19,899 >> Module weights saved in ./adapter/mlm/stsb/glue/pytorch_model_head.bin
[INFO|loading.py:59] 2021-07-26 14:41:19,900 >> Configuration saved in ./adapter/mlm/stsb/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 14:41:20,115 >> Module weights saved in ./adapter/mlm/stsb/glue/pytorch_model_head.bin
[INFO|tokenization_utils_base.py:1948] 2021-07-26 14:41:20,115 >> tokenizer

***** train metrics *****
  epoch                    =       10.0
  total_flos               =   572667GF
  train_loss               =     2.0214
  train_runtime            = 0:00:31.64
  train_samples            =        159
  train_samples_per_second =     50.245
  train_steps_per_second   =       6.32
07/26/2021 14:41:20 - INFO - mlm -   *** Evaluate ***


[INFO|trainer.py:546] 2021-07-26 14:41:20,166 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:2239] 2021-07-26 14:41:20,168 >> ***** Running Evaluation *****
[INFO|trainer.py:2241] 2021-07-26 14:41:20,168 >>   Num examples = 47
[INFO|trainer.py:2244] 2021-07-26 14:41:20,168 >>   Batch size = 8


[INFO|training_args.py:784] 2021-07-26 14:41:20,573 >> PyTorch: setting up devices
[INFO|training_args.py:680] 2021-07-26 14:41:20,573 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


***** eval metrics *****
  epoch                   =       10.0
  eval_loss               =     1.7703
  eval_runtime            = 0:00:00.39
  eval_samples            =         47
  eval_samples_per_second =    117.834
  eval_steps_per_second   =     15.043
  perplexity              =     5.8725
07/26/2021 14:41:20 - INFO - mlm -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.000

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28999.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

[INFO|configuration_utils.py:531] 2021-07-26 14:41:22,471 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jason/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:569] 2021-07-26 14:41:22,474 >> Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "t

Dataset glue downloaded and prepared to /home/jason/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


[INFO|tokenization_auto.py:427] 2021-07-26 14:41:22,599 >> Could not locate the tokenizer configuration file, will try to use the model config instead.
[INFO|configuration_utils.py:531] 2021-07-26 14:41:22,707 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jason/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:569] 2021-07-26 14:41:22,709 >> Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 51

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=1.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=1.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on every text in dataset', max=1.0, sty…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=1.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=1.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 512', max=1.0, style=Progress…




[INFO|trainer.py:546] 2021-07-26 14:41:24,543 >> The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:1199] 2021-07-26 14:41:24,547 >> ***** Running training *****
[INFO|trainer.py:1200] 2021-07-26 14:41:24,547 >>   Num examples = 31
[INFO|trainer.py:1201] 2021-07-26 14:41:24,548 >>   Num Epochs = 10
[INFO|trainer.py:1202] 2021-07-26 14:41:24,548 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:1203] 2021-07-26 14:41:24,548 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1204] 2021-07-26 14:41:24,548 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1205] 2021-07-26 14:41:24,548 >>   Total optimization steps = 40


Step,Training Loss


[INFO|trainer.py:1403] 2021-07-26 14:41:30,664 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|trainer.py:1989] 2021-07-26 14:41:30,665 >> Saving model checkpoint to ./adapter/mlm/wnli
[INFO|loading.py:59] 2021-07-26 14:41:30,666 >> Configuration saved in ./adapter/mlm/wnli/glue/adapter_config.json
[INFO|loading.py:72] 2021-07-26 14:41:30,673 >> Module weights saved in ./adapter/mlm/wnli/glue/pytorch_adapter.bin
[INFO|loading.py:59] 2021-07-26 14:41:30,674 >> Configuration saved in ./adapter/mlm/wnli/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 14:41:30,824 >> Module weights saved in ./adapter/mlm/wnli/glue/pytorch_model_head.bin
[INFO|loading.py:59] 2021-07-26 14:41:30,825 >> Configuration saved in ./adapter/mlm/wnli/glue/head_config.json
[INFO|loading.py:72] 2021-07-26 14:41:31,018 >> Module weights saved in ./adapter/mlm/wnli/glue/pytorch_model_head.bin
[INFO|tokenization_utils_base.py:1948] 2021-07-26 14:41:31,018 >> tokenizer

***** train metrics *****
  epoch                    =       10.0
  total_flos               =   111652GF
  train_loss               =     1.6329
  train_runtime            = 0:00:06.11
  train_samples            =         31
  train_samples_per_second =     50.686
  train_steps_per_second   =       6.54
07/26/2021 14:41:31 - INFO - mlm -   *** Evaluate ***


[INFO|trainer.py:546] 2021-07-26 14:41:31,069 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:2239] 2021-07-26 14:41:31,071 >> ***** Running Evaluation *****
[INFO|trainer.py:2241] 2021-07-26 14:41:31,071 >>   Num examples = 3
[INFO|trainer.py:2244] 2021-07-26 14:41:31,071 >>   Batch size = 8


***** eval metrics *****
  epoch                   =       10.0
  eval_loss               =     1.6839
  eval_runtime            = 0:00:00.02
  eval_samples            =          3
  eval_samples_per_second =    105.691
  eval_steps_per_second   =      35.23
  perplexity              =     5.3866


In [5]:
# Display the metrics gathered by the training loop above: `results` maps each
# GLUE task name to {"training": train_stats, "eval": eval_stats}. pprint is
# used so the nested dicts are laid out one entry per line.
from pprint import pprint
pprint(results)

{'cola': {'eval': {'epoch': 10.0,
                   'eval_loss': 1.679701566696167,
                   'eval_runtime': 0.1956,
                   'eval_samples': 22,
                   'eval_samples_per_second': 112.484,
                   'eval_steps_per_second': 15.339,
                   'perplexity': 5.3639549494375895},
          'training': {'epoch': 10.0,
                       'total_flos': 715446823680000.0,
                       'train_loss': 1.250021235148112,
                       'train_runtime': 36.9675,
                       'train_samples': 185,
                       'train_samples_per_second': 50.044,
                       'train_steps_per_second': 6.492}},
 'mnli': {'eval': {'epoch': 10.0,
                   'eval_loss': 1.854617953300476,
                   'eval_runtime': 8.4033,
                   'eval_samples': 994,
                   'eval_samples_per_second': 118.286,
                   'eval_steps_per_second': 14.875,
                   'perplexity': 6.3