## Masked Language Modeling
Using masked language modeling (MLM), we train an adapter for each of the GLUE tasks. This adapts the pre-trained language model to the text corpus specific to that GLUE task.

In [1]:
# !pip install -Uqq adapter-transformers datasets
%load_ext autoreload
%autoreload 2

from mlm import masked_language_modeling
from mlm_utils import DomainModelArguments, DomainDataTrainingArguments
from transformers import TrainingArguments, MultiLingAdapterArguments

In [2]:
# GLUE tasks to run MLM adapter training on.
# Currently disabled: "mnli", "mrpc", "qqp", "stsb".
glue_tasks = ["cola", "qnli", "rte", "sst2", "wnli"]

In [3]:
# Model arguments: the base checkpoint that the MLM adapter training starts from.
model = DomainModelArguments(model_name_or_path="roberta-base")

# Adapter arguments: enable adapter training with the "pfeiffer+inv" configuration.
adapter = MultiLingAdapterArguments(adapter_config="pfeiffer+inv", train_adapter=True)

In [4]:
# %%capture
# Run MLM adapter training per GLUE task and collect the returned statistics.
# The `[:1]` slice restricts this run to the first task in `glue_tasks`.
results = {}
for dataset in glue_tasks[:1]:
    # Data arguments: the GLUE benchmark, with the task name as its configuration.
    data = DomainDataTrainingArguments(dataset_config_name=dataset, dataset_name="glue")

    # Train and evaluate, writing to a per-task output directory that is
    # overwritten on every run.
    training = TrainingArguments(
        output_dir=f"./adapter/test-mlm/{dataset}",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        learning_rate=1e-4,
        num_train_epochs=10,
    )

    train_stats, eval_stats = masked_language_modeling(
        model_args=model,
        data_args=data,
        training_args=training,
        adapter_args=adapter,
    )

    # Keyed by task name so the summary cell can print everything at once.
    results[dataset] = dict(training=train_stats, eval=eval_stats)


08/01/2021 15:22:33 - INFO - mlm -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0001,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=./adapter/test-mlm/cola/runs/Aug01_15-22-33_alienware-r12,
logging_first_step=False,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_sche

[ERROR|configuration_utils.py:512] 2021-08-01 15:22:34,004 >> 403 Client Error: Forbidden for url: https://huggingface.co/roberta-base/resolve/main/config.json


OSError: Can't load config for 'roberta-base'. Make sure that:

- 'roberta-base' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'roberta-base' is the correct path to a directory containing a config.json file



In [None]:
from pprint import pprint
# Pretty-print the collected statistics: `results` maps each task name to a
# dict with "training" and "eval" entries filled in by the training loop.
pprint(results)

## Run eval on the glue tasks with the MLM adapted model
This establishes whether the MLM training improved the model's performance on each task:
1. Load the model
1. Load the adapter
1. Loop over the tasks and capture output

In [2]:
# Tasks to evaluate. Note: this rebinds `glue_tasks` with a different order
# than the list defined at the top of the notebook.
glue_tasks = ["sst2", "cola", "wnli", "rte", "qnli"]

# Final per-task hyper-parameters, forwarded as keyword arguments to the
# task training/evaluation routine. Every task has its own dict object.
final_params = {
    "sst2": dict(
        learning_rate=5e-4,
        max_seq_length=64,
        per_device_train_batch_size=32,
        adam_epsilon=1e-7,
        num_train_epochs=1,
    ),
    "cola": dict(
        learning_rate=5e-4,
        max_seq_length=64,
        per_device_train_batch_size=32,
        adam_epsilon=1e-7,
        num_train_epochs=1,
    ),
    "wnli": dict(
        learning_rate=1e-5,
        max_seq_length=256,
        per_device_train_batch_size=16,
        adam_epsilon=1e-6,
        num_train_epochs=1,
    ),
    "rte": dict(
        learning_rate=5e-4,
        max_seq_length=256,
        per_device_train_batch_size=16,
        adam_epsilon=1e-7,
        num_train_epochs=1,
    ),
    "qnli": dict(
        learning_rate=5e-4,
        max_seq_length=64,
        per_device_train_batch_size=32,
        adam_epsilon=1e-7,
        num_train_epochs=1,
    ),
}

In [39]:
# Load the pre-trained model, attach the MLM (language) adapter and the RTE task
# adapter, and activate them as a stack (language adapter first, task adapter on top).
import os

from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    AutoModelForMaskedLM,
    RobertaModelWithHeads,
    AutoModelWithHeads,
    AdapterConfig,
)
from transformers.adapters.composition import Stack

# Local copy of the roberta-base checkpoint. Set the ROBERTA_BASE_DIR environment
# variable to point elsewhere instead of editing this machine-specific default.
roberta_base_dir = os.environ.get("ROBERTA_BASE_DIR", "/home/jason/git/roberta-base/")

# NOTE(review): the model is loaded with a masked-LM head. The recorded run of the
# evaluation cell that consumes `model` ended in
# "ValueError: Expected input batch_size (2048) to match target batch_size (8)",
# which presumably comes from feeding sequence-classification labels to the MLM
# head -- TODO confirm, and load a model with a classification head instead
# (an earlier experiment here tried `model.add_classification_head("rte", ...)`).
model = AutoModelForMaskedLM.from_pretrained(roberta_base_dir)

lang_adapter_config = AdapterConfig.load("pfeiffer+inv")
mlm_adapter = model.load_adapter("./adapter/mlm/rte/glue", config=lang_adapter_config)

task_adapter_config = AdapterConfig.load("pfeiffer")
task_adapter = model.load_adapter("./adapter/task/final_rte/rte", config=task_adapter_config)

# `load_adapter` returns the name under which each adapter was registered, so build
# the stack from those names rather than hard-coding them. The previous, duplicated
# `set_active_adapters(task_adapter)` calls were dropped: this assignment replaced
# whatever they had activated.
model.active_adapters = Stack(mlm_adapter, task_adapter)

loading configuration file /home/jason/git/roberta-base/config.json
Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "2.1.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file /home/jason/git/roberta-base/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForMaskedLM.

All the weights of RobertaForMaskedLM were initialized from the model checkpoint at

In [29]:
from task import final_training

# Evaluation-only pass (do_train=False) over the selected slice of `glue_tasks`.
# With the task list defined next to `final_params`, `[3:4]` selects "rte".
for task in glue_tasks[3:4]:
    # Banner showing the task name and the hyper-parameters about to be used.
    print(f"\n\n##### START TASK: {task} #####\n{final_params.get(task)}\n\n")

    # The task's hyper-parameters are forwarded as keyword arguments.
    final_training(
        pre_trained_model=model,
        task=task,
        prefix="mlm_test_",
        do_train=False,
        **final_params.get(task)
    )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
07/31/2021 15:59:01 - INFO - task -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-07,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0005



##### START TASK: rte #####
{'learning_rate': 0.0005, 'max_seq_length': 256, 'per_device_train_batch_size': 16, 'adam_epsilon': 1e-07, 'num_train_epochs': 1}




loading configuration file /home/jason/git/roberta-base/config.json
Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": "rte",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "2.1.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file /home/jason/git/roberta-base/config.json
Model config RobertaConfig {
  "adapters

ValueError: Expected input batch_size (2048) to match target batch_size (8).

In [7]:
# Diagnostic cell: request the model id "fake-model" (presumably non-existent on purpose).
# The recorded output shows the same "403 Client Error: Forbidden" that the
# "roberta-base" request produced earlier in this notebook.
# NOTE(review): looks like a hub-connectivity check -- this cell raises OSError in the
# recorded run; consider removing it or catching the error before sharing the notebook.
from transformers import AutoModelWithHeads
test_model = AutoModelWithHeads.from_pretrained("fake-model")

[ERROR|configuration_utils.py:512] 2021-08-01 15:33:07,515 >> 403 Client Error: Forbidden for url: https://huggingface.co/fake-model/resolve/main/config.json


OSError: Can't load config for 'fake-model'. Make sure that:

- 'fake-model' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'fake-model' is the correct path to a directory containing a config.json file

