In [1]:
# !pip install -Uqq adapter-transformers datasets

%load_ext autoreload
%autoreload 1
%aimport adapter_utils
%aimport mlm
from adapter_utils import get_model, get_tokenizer, adapt_model, get_test_data
from mlm import masked_language_modeling

### Load and Adapt a Model
I've written helper functions that abstract away the loading of the model and the tokenizer.

In [None]:
# Load the tokenizer through the project helper imported from adapter_utils.
tokenizer = get_tokenizer()

In [None]:
# Load the model through the project helper imported from adapter_utils.
model = get_model()

In [None]:
# Apply the "qa/squad1@ukp" adapter to the model, requesting the "houlsby" architecture.
# NOTE(review): presumably adapt_model downloads and activates the adapter in place
# (its return value is not used here) — confirm in adapter_utils.
adapt_model(model=model, adapter_name="qa/squad1@ukp", adapter_arch="houlsby")

### Test that it's working
Test that things are working by running the Q&A example from Adapter Hub's [sample notebook](https://github.com/Adapter-Hub/adapter-transformers/blob/master/notebooks/02_Adapter_Inference.ipynb).

In [None]:
from transformers import QuestionAnsweringPipeline

# Question-answering pipeline built from the adapted model and its tokenizer.
qa = QuestionAnsweringPipeline(model=model, tokenizer=tokenizer)

# Passage the questions are answered against; read as a notebook-level global
# by the answer_questions helper defined further down.
context = """
The current modus operandi in NLP involves downloading and fine-tuning pre-trained models consisting of millions or billions of parameters.
Storing and sharing such large trained models is expensive, slow, and time-consuming, which impedes progress towards more general and versatile NLP methods that learn from and for many tasks.
Adapters -- small learnt bottleneck layers inserted within each layer of a pre-trained model -- ameliorate this issue by avoiding full fine-tuning of the entire model.
However, sharing and integrating adapter layers is not straightforward.
We propose AdapterHub, a framework that allows dynamic "stitching-in" of pre-trained adapters for different tasks and languages.
The framework, built on top of the popular HuggingFace Transformers library, enables extremely easy and quick adaptations of state-of-the-art pre-trained models (e.g., BERT, RoBERTa, XLM-R) across tasks and languages.
Downloading, sharing, and training adapters is as seamless as possible using minimal changes to the training scripts and a specialized infrastructure.
Our framework enables scalable and easy access to sharing of task-specific models, particularly in low-resource scenarios.
AdapterHub includes all recent adapter architectures and can be found at AdapterHub.ml.
"""

In [None]:
# Ignore all FutureWarnings so they do not clutter the cell outputs.
# The filter is installed process-wide, so it also affects every later cell.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [None]:
def answer_questions(questions):
    """Print the QA pipeline's answer to each question.

    :param questions: iterable of question strings; each one is sent to the
        notebook-level ``qa`` pipeline together with the notebook-level ``context``.

    Nothing is returned: each question and its answer are printed on their own
    lines, followed by a blank line.
    """
    for query in questions:
        prediction = qa(question=query, context=context)
        print("❔", query)
        # end="\n\n" emits the answer line plus the blank separator line.
        print("💡", prediction["answer"], end="\n\n")

# Sanity-check the adapted model with questions whose answers appear in `context`.
answer_questions([
    "What are Adapters?",
    "What do Adapters avoid?",
    "What is proposed?",
    "What does AdapterHub allow?",
    "Where can I find AdapterHub?",
])

### List the datasets that exist at HuggingFace
HuggingFace makes it easy to access public NLP datasets.

In [None]:
import datasets

# Fetch every dataset identifier, then print only those mentioning "glue"
# (case-insensitive match).
hf_data = datasets.list_datasets()
for data in hf_data:
    if "glue" not in data.lower():
        continue
    print(data)

### Test some data
Test things with the Rotten Tomatoes example data.

In [None]:
# Load the example data through the project helper imported from adapter_utils.
# The Trainer cell below reads its "train" and "validation" entries.
dataset = get_test_data()

### Test training
Try the training procedure from Adapter Hub's [training notebook](https://github.com/Adapter-Hub/adapter-transformers/blob/master/notebooks/01_Adapter_Training.ipynb).

In [None]:
from transformers import RobertaConfig, RobertaModelWithHeads

# Build a fresh roberta-base model configured for two output labels.
# NOTE: this rebinds `model`, replacing the QA-adapted model created earlier in the notebook.
config = RobertaConfig.from_pretrained(
    "roberta-base",
    num_labels=2,
)
model = RobertaModelWithHeads.from_pretrained(
    "roberta-base",
    config=config,
)

# Add a new, untrained adapter named "rotten_tomatoes"
model.add_adapter("rotten_tomatoes")
# Add a matching two-class classification head under the same name,
# mapping class ids 0/1 to thumbs-down/thumbs-up labels
model.add_classification_head(
    "rotten_tomatoes",
    num_labels=2,
    id2label={ 0: "👎", 1: "👍"}
  )
# Activate the adapter for training
# NOTE(review): per the adapter-transformers docs this also freezes the pre-trained
# weights so only the adapter and head are updated — confirm for the installed version.
model.train_adapter("rotten_tomatoes")

In [None]:
import numpy as np
from transformers import TrainingArguments, Trainer, EvalPrediction

# Hyper-parameters for the adapter training run: 6 epochs, batch size 32 for both
# training and evaluation, logging every 200 steps, and checkpoints written to
# ./training_output (overwriting any previous contents).
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

def compute_accuracy(p: EvalPrediction):
    """Compute classification accuracy for the Trainer's evaluation loop.

    :param p: evaluation result exposing ``predictions`` (per-class scores, one row
        per example) and ``label_ids`` (the reference class ids)
    :return: dict with a single key ``"acc"`` holding the fraction of examples whose
        highest-scoring class equals the reference label
    """
    # The highest-scoring class along axis 1 is the predicted label for each row.
    predicted_labels = np.argmax(p.predictions, axis=1)
    matches = predicted_labels == p.label_ids
    return {"acc": np.mean(matches)}

# Wire the model, hyper-parameters and data splits together; compute_accuracy is
# used to report accuracy during evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_accuracy,
)

In [None]:
# Run training with the settings in `training_args` (6 epochs).
trainer.train()

In [None]:
# Evaluate on dataset["validation"]; the displayed metrics include the accuracy
# produced by compute_accuracy.
trainer.evaluate()

In [None]:
from transformers import TextClassificationPipeline

# The pipeline's `device` argument is an integer: -1 selects the CPU and n >= 0
# selects cuda:n. `training_args.device.index` is None for a CPU device (and for a
# CUDA device created without an explicit index), which the pipeline cannot compare
# against 0, so translate the training device into a valid integer id first.
train_device = training_args.device
if train_device.type == "cuda":
    # An index-less CUDA device means the default GPU, i.e. cuda:0.
    pipeline_device = train_device.index or 0
else:
    pipeline_device = -1

classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=pipeline_device)

classifier("This is awesome!")

In [None]:
# Save the trained "rotten_tomatoes" adapter into ./final_adapter.
model.save_adapter("./final_adapter", "rotten_tomatoes")

# List the saved files with human-readable sizes.
!ls -lh final_adapter

### The arXiv dataset is going to take more prep work
You have to manually download and extract this dataset, and even then I haven't been able to get it to work yet. There's some nuance in how you have to tell HuggingFace to split the data, and I haven't gotten that part working yet.

In [None]:
from datasets import load_dataset, SplitInfo

# `split` takes a split name (or a `datasets.Split`), not a `SplitInfo`: SplitInfo is
# only the metadata record describing a split. The arxiv_dataset loader exposes a
# single "train" split, built from the manually downloaded files in `data_dir`.
arvix_data = load_dataset("arxiv_dataset", data_dir="./arvix_data/", split="train")

### Training a GLUE Adapter

Using `run_glue_alt.py` from the Adapter Transformers repo, we can easily create an adapter for one of the GLUE tasks. Here's an example from their documentation:

```
export TASK_NAME=mrpc

python run_glue_alt.py \
  --model_name_or_path bert-base-uncased \
  --task_name $TASK_NAME \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --per_device_train_batch_size 32 \
  --learning_rate 1e-4 \
  --num_train_epochs 10.0 \
  --output_dir /tmp/$TASK_NAME \
  --overwrite_output_dir \
  --train_adapter \
  --adapter_config pfeiffer
```

For convenience, I've created a shell script called `cola_adapter.sh` that launches the training process to create an adapter for the CoLA task. Its contents are:

```
#!/bin/bash
export TASK_NAME=cola

python run_glue_alt.py \
  --model_name_or_path roberta-base \
  --task_name $TASK_NAME \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --per_device_train_batch_size 32 \
  --learning_rate 1e-5 \
  --num_train_epochs 10.0 \
  --output_dir ./adapter/$TASK_NAME \
  --overwrite_output_dir \
  --train_adapter \
  --adapter_config pfeiffer
```

You can run the script by running the next cell.

In [None]:
# Launch CoLA adapter training; the script writes its results to ./adapter/cola.
!sh ./cola_adapter.sh

### Train an adapter for The Stanford Sentiment Treebank dataset
1. Create `sst_adapter.sh` using `cola_adapter.sh` as an example, but train for the SST task.
1. Push the newly created `sst_adapter.sh` file into GitLab
1. Run the script to train the adapter. (in the cell below)
1. Download the trained adapter from ./adapter/sst and store it in our Google Drive

In [None]:
# Requires sst_adapter.sh to have been created first (step 1 of the list above).
!sh ./sst_adapter.sh

### Exploring Unmasking with Vanilla RoBERTa

In [24]:
from transformers import pipeline
# Fill-mask pipeline backed by the stock roberta-base checkpoint.
unmasker = pipeline('fill-mask', model='roberta-base')
# Alternative checkpoint to experiment with.
# NOTE(review): BERT's mask token is [MASK], so the "<mask>" inputs used below would
# need to change if this line is enabled — confirm before switching.
# unmasker = pipeline('fill-mask', model='bert-base-uncased')

In [21]:
from random import randint

In [22]:
def random_mask(sentence: str) -> "tuple[str, int, str]":
    """
    Randomly mask one word in a sentence.

    :param sentence: sentence without punctuation; it is split on single spaces
    :return: tuple ``(masked, idx, orig_word)`` where ``masked`` is the sentence with
        the chosen word replaced by ``<mask>``, ``idx`` is the position of that word
        in the split sentence and ``orig_word`` is the word that was replaced
    """
    words = sentence.split(" ")
    # randint is inclusive on both ends, hence the "- 1".
    idx = randint(0, len(words) - 1)
    orig_word = words[idx]
    words[idx] = "<mask>"
    # Re-join with single spaces. Unlike a concatenation loop, this adds no stray
    # leading space, so the result differs from the input only in the masked word.
    masked = " ".join(words)
    return masked, idx, orig_word

In [31]:
# Ask the fill-mask pipeline to complete a hand-written masked sentence and print
# each candidate with its score as a percentage.
# `sample_sentence` is only needed by the random_mask call, which is currently
# commented out in favour of the fixed arithmetic prompt.
sample_sentence = "Don't forget to tip the waiter!"
# masked_sentence, index, orignal = random_mask(sample_sentence)
masked_sentence = "17 plus 4 equals <mask>."
predictions = unmasker(masked_sentence)

print(f"Sentence:\t{masked_sentence}\n")
for prediction in predictions:
    print(f"Prediction:\t{prediction.get('sequence')}\nScore:\t\t{prediction.get('score') * 100:.2f}%\n")
# Disabled correctness check; it relies on the word returned by random_mask above.
#     if prediction.get("token_str") == orignal:
#         print("CORRECT")
#     else:
#         print("INCORRECT")

Sentence:	17 plus 4 equals <mask>.

Prediction:	17 plus 4 equals 17.
Score:		15.90%

Prediction:	17 plus 4 equals 18.
Score:		7.56%

Prediction:	17 plus 4 equals 16.
Score:		4.89%

Prediction:	17 plus 4 equals 24.
Score:		3.18%

Prediction:	17 plus 4 equals 20.
Score:		3.09%



### Testing the refactored MLM script

In [6]:
from mlm import masked_language_modeling
from mlm_utils import ModelArguments, DataTrainingArguments
from transformers import TrainingArguments, MultiLingAdapterArguments

In [7]:
# Argument objects for the refactored MLM routine.
# NOTE: `dataset` and `model` rebind names that earlier cells used for the example
# dataset (get_test_data) and the RoBERTa model, so those earlier cells cannot be
# re-run meaningfully after this one without re-creating their objects.
dataset = "cola"

# Which pre-trained checkpoint to start from.
model = ModelArguments(
    model_name_or_path="roberta-base",
)

# Which dataset to train on: the `dataset` configuration of the "glue" benchmark.
data = DataTrainingArguments(
    dataset_name="glue",
    dataset_config_name=dataset,
)

# Train and evaluate for 10 epochs, writing outputs under ./adapter/mlm/<dataset>.
training = TrainingArguments(
    learning_rate=1e-4,
    overwrite_output_dir=True,
    output_dir=f"./adapter/mlm/{dataset}",
    do_train=True,
    do_eval=True,
    num_train_epochs=10,
)

# Train an adapter using the "pfeiffer+inv" adapter configuration.
adapter = MultiLingAdapterArguments(
    train_adapter=True,
    adapter_config="pfeiffer+inv",
)

[INFO|training_args.py:784] 2021-07-23 16:24:43,711 >> PyTorch: setting up devices
[INFO|training_args.py:680] 2021-07-23 16:24:43,712 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [8]:
# Run the refactored MLM routine with the four argument objects built in the previous
# cell; it returns the training metrics and the evaluation metrics.
train_stats, eval_stats = masked_language_modeling(
    model_args=model, data_args=data, training_args=training, adapter_args=adapter
)

07/23/2021 16:24:43 - INFO - mlm -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0001,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=./adapter/mlm/cola/runs/Jul23_16-24-43_alienware-r12,
logging_first_step=False,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler

[INFO|configuration_utils.py:531] 2021-07-23 16:24:44,856 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/jason/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:569] 2021-07-23 16:24:44,856 >> Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "t



[INFO|trainer.py:546] 2021-07-23 16:24:46,712 >> The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:1199] 2021-07-23 16:24:46,717 >> ***** Running training *****
[INFO|trainer.py:1200] 2021-07-23 16:24:46,718 >>   Num examples = 185
[INFO|trainer.py:1201] 2021-07-23 16:24:46,718 >>   Num Epochs = 10
[INFO|trainer.py:1202] 2021-07-23 16:24:46,718 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:1203] 2021-07-23 16:24:46,718 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1204] 2021-07-23 16:24:46,718 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1205] 2021-07-23 16:24:46,719 >>   Total optimization steps = 240


Step,Training Loss


[INFO|trainer.py:1403] 2021-07-23 16:25:25,758 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


[INFO|trainer.py:1989] 2021-07-23 16:25:25,759 >> Saving model checkpoint to ./adapter/mlm/cola
[INFO|loading.py:59] 2021-07-23 16:25:25,760 >> Configuration saved in ./adapter/mlm/cola/glue/adapter_config.json
[INFO|loading.py:72] 2021-07-23 16:25:25,799 >> Module weights saved in ./adapter/mlm/cola/glue/pytorch_adapter.bin
[INFO|loading.py:59] 2021-07-23 16:25:25,800 >> Configuration saved in ./adapter/mlm/cola/glue/head_config.json
[INFO|loading.py:72] 2021-07-23 16:25:26,016 >> Module weights saved in ./adapter/mlm/cola/glue/pytorch_model_head.bin
[INFO|loading.py:59] 2021-07-23 16:25:26,017 >> Configuration saved in ./adapter/mlm/cola/glue/head_config.json
[INFO|loading.py:72] 2021-07-23 16:25:26,228 >> Module weights saved in ./adapter/mlm/cola/glue/pytorch_model_head.bin
[INFO|tokenization_utils_base.py:1948] 2021-07-23 16:25:26,229 >> tokenizer

***** train metrics *****
  epoch                    =       10.0
  total_flos               =   666311GF
  train_loss               =       1.25
  train_runtime            = 0:00:39.03
  train_samples            =        185
  train_samples_per_second =     47.388
  train_steps_per_second   =      6.148
07/23/2021 16:25:26 - INFO - mlm -   *** Evaluate ***


[INFO|trainer.py:546] 2021-07-23 16:25:26,292 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask.
[INFO|trainer.py:2239] 2021-07-23 16:25:26,294 >> ***** Running Evaluation *****
[INFO|trainer.py:2241] 2021-07-23 16:25:26,294 >>   Num examples = 22
[INFO|trainer.py:2244] 2021-07-23 16:25:26,294 >>   Batch size = 8


***** eval metrics *****
  epoch                   =       10.0
  eval_loss               =     1.6797
  eval_runtime            = 0:00:00.20
  eval_samples            =         22
  eval_samples_per_second =    105.802
  eval_steps_per_second   =     14.428
  perplexity              =      5.364


In [9]:
# Display the training metrics returned by masked_language_modeling.
train_stats

{'train_runtime': 39.0394,
 'train_samples_per_second': 47.388,
 'train_steps_per_second': 6.148,
 'total_flos': 715446823680000.0,
 'train_loss': 1.250021235148112,
 'epoch': 10.0,
 'train_samples': 185}

In [10]:
# Display the evaluation metrics returned by masked_language_modeling.
eval_stats

{'eval_loss': 1.679701566696167,
 'eval_runtime': 0.2079,
 'eval_samples_per_second': 105.802,
 'eval_steps_per_second': 14.428,
 'epoch': 10.0,
 'eval_samples': 22,
 'perplexity': 5.3639549494375895}