## Training Task Adapters
Using randomized search, we identify optimal hyperparameters to train task-specific adapters on GLUE tasks.

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# cd drive/MyDrive/cs7643-deep-learning-summer-2021/

In [3]:
# !pip install -Uqq adapter-transformers datasets

%load_ext autoreload
%autoreload 2

import torch
from time import time
from typing import Dict, List
from task_utils import TaskModelArguments, TaskDataTrainingArguments
from task import train_task_adapter
from transformers import (
    MultiLingAdapterArguments,
    TrainingArguments,
)

### CoLA Task

In [4]:
import random
import itertools
import subprocess
import pandas as pd

def getParams(dictionary, limit):
    """Return a randomly ordered list of hyperparameter combinations.

    Every combination in the Cartesian product of the value lists in
    ``dictionary`` is materialised as its own ``{name: value}`` dict, the
    list is shuffled with the global ``random`` state, and at most ``limit``
    entries are kept.

    Args:
        dictionary: Mapping of hyperparameter name -> list of candidate values.
        limit: Maximum number of combinations to return. ``False`` or ``None``
            disables the cap and returns every combination.

    Returns:
        A list of dicts, one per sampled combination, whose keys follow the
        key order of ``dictionary``.

    Raises:
        ValueError: If ``limit`` is negative (a negative slice bound would
            otherwise silently drop combinations from the end).
    """
    names = list(dictionary)
    paramsList = [dict(zip(names, combo)) for combo in itertools.product(*dictionary.values())]
    random.shuffle(paramsList)

    # Both sentinels mean "no cap"; `None` used to crash inside min().
    if limit is None or limit is False:
        return paramsList

    if limit < 0:
        raise ValueError(f"limit must be non-negative, False or None; got {limit!r}")

    # Slicing already clamps to the list length, so no explicit min() is needed.
    return paramsList[:limit]

def initParse(dictionary: Dict):
    """Build the four argument objects for one training trial.

    Args:
        dictionary: One hyperparameter combination (as produced by
            ``getParams``). Values are looked up with ``dict.get``, so a
            missing key is forwarded as ``None``.

    Returns:
        A ``(model, data, training, adapter)`` tuple of
        ``TaskModelArguments``, ``TaskDataTrainingArguments``,
        ``TrainingArguments`` and ``MultiLingAdapterArguments``.
    """
    task_name = dictionary.get('task_name')

    model = TaskModelArguments(
        model_name_or_path=dictionary.get('model_name_or_path')
    )

    data = TaskDataTrainingArguments(
        task_name=task_name,
        max_seq_length=dictionary.get('max_seq_length'),
        pad_to_max_length=dictionary.get('pad_to_max_length')
    )

    training = TrainingArguments(
        adam_beta1=dictionary.get('adam_beta1'),
        adam_beta2=dictionary.get('adam_beta2'),
        adam_epsilon=dictionary.get('adam_epsilon'),
        learning_rate=dictionary.get('learning_rate'),
        fp16=dictionary.get('fp16'),
        warmup_ratio=dictionary.get('warmup_ratio'),
        warmup_steps=dictionary.get('warmup_steps'),
        weight_decay=dictionary.get('weight_decay'),
        do_train=dictionary.get('do_train'),
        # Read the dedicated 'do_eval' key; this previously reused 'do_train'.
        do_eval=dictionary.get('do_eval'),
        per_device_train_batch_size=dictionary.get('per_device_train_batch_size'),
        num_train_epochs=dictionary.get('num_train_epochs'), # CHANGE ME
        overwrite_output_dir=dictionary.get('overwrite_output_dir'),
        # Every trial of a task writes to the same per-task directory.
        output_dir=f"./adapter/task/{task_name}",
    )

    adapter = MultiLingAdapterArguments(
        train_adapter=True,
        # Honour the search-space entry; fall back to the former hard-coded value.
        adapter_config=dictionary.get('adapter_config', "pfeiffer"),
    )

    return model, data, training, adapter

def train(params: Dict) -> pd.DataFrame:
    """Run one training trial and return its settings and metrics as one row.

    Args:
        params: One hyperparameter combination; forwarded to ``initParse``.

    Returns:
        A single-row ``pd.DataFrame`` whose columns are, in order, the keys of
        ``params``, the keys of the training stats and the keys of the
        evaluation stats returned by ``train_task_adapter``. A key that occurs
        in more than one of these mappings appears as a repeated column name.
    """
    model, data, training, adapter = initParse(params)

    train_stats, eval_stats = train_task_adapter(
        model_args=model,
        adapter_args=adapter,
        training_args=training,
        data_args=data,
    )

    # Keys and values are gathered in the same order so they line up by position.
    header = [*params.keys(), *train_stats.keys(), *eval_stats.keys()]
    row = [*params.values(), *train_stats.values(), *eval_stats.values()]

    # The argument objects are locals and are released when the function
    # returns, so no explicit `del` is required here.
    return pd.DataFrame([row], columns=header)

**Define Dictionary of Hyperparameters**

In [None]:
# GLUE task names that the search loop iterates over, one hyperparameter
# search per entry. Commented-out entries are skipped.
glue_tasks = [
    #"cola",
    #"mnli",
    #"mrpc",
    #"qnli",
    #"qqp",
    "rte",
    "sst2",
    "stsb",
    "wnli",
]

In [14]:
# Initial task name only: the search loop overwrites paramDictionary["task_name"]
# with each entry of glue_tasks before sampling.
task = 'cola'
# Hyperparameter search space. Each key maps to a list of candidate values and
# getParams() takes the Cartesian product of those lists. Only four keys hold
# more than one candidate, so the full grid is 3 * 3 * 3 * 5 = 135 combinations.
paramDictionary = {
    'task_name':[task],
    'model_name_or_path':['roberta-base'],
    'max_seq_length':[64, 128, 256],  # varied
    'pad_to_max_length':[True],
    'per_device_train_batch_size':[16, 32, 64],  # varied
    'adam_beta1':[.9],
    'adam_beta2':[.999],
    'adam_epsilon':[1e-8,1e-7,1e-6],  # varied
    'fp16':[True],
    'learning_rate':[1e-5,5e-5,1e-4,5e-4,1e-3],  # varied
    'warmup_ratio':[0.0],
    'warmup_steps':[0],
    'weight_decay':[0.0],
    'do_train':[True],
    'do_eval':[True],
    'num_train_epochs':[10],
    'overwrite_output_dir':[True],
    'adapter_config':['pfeiffer']
}

**Begin Looping**

In [None]:
limit = 15 #Numerical or False for no limit

# Run one randomized hyperparameter search per GLUE task and write one CSV per task.
for data_set in glue_tasks:
    # Point the shared search space at the current task before sampling.
    paramDictionary["task_name"] = [data_set]
    paramsList = getParams(paramDictionary, limit)

    # Collect the single-row frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row copies it on every iteration.
    trial_frames = []
    for p in paramsList:
        trial_frames.append(train(p))

        # Release cached CUDA memory between trials.
        torch.cuda.empty_cache()

    if not trial_frames:
        # Nothing was sampled (e.g. limit == 0); there is nothing to save.
        results = None
        print(f"No trials were run for task {data_set!r}; skipping CSV export.")
        continue

    # ignore_index gives each trial its own row label instead of repeating 0.
    results = pd.concat(trial_frames, ignore_index=True)
    results.to_csv(f"./adapter/task/{data_set}_hp_search.{time():.0f}.csv")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
07/27/2021 09:39:38 - INFO - task -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-07,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0005,

HBox(children=(FloatProgress(value=0.0, max=105.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

07/27/2021 09:39:44 - INFO - task -   Sample 83810 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'idx': 83810, 'input_ids': [0, 2264, 8893, 9, 5, 221, 9788, 58, 382, 8, 248, 9335, 3517, 45, 2460, 7, 3679, 116, 2, 2, 4148, 974, 759, 23, 5, 2238, 4580, 760, 6, 10, 121, 4, 104, 4, 262, 212, 35614, 2925, 6304, 40955, 30456, 2711, 36, 246, 6, 151, 3878, 43, 8, 5, 121, 4, 104, 4, 112, 620, 6144, 2925, 36, 1092, 6, 151, 2383, 2], 'label': 0, 'question': 'What tactics of the PVA were US and ROK troops not prepared to handle?', 'sentence': "On 27 November at the Korean eastern front, a U.S. 7th Infantry Division Regimental Combat Team (3,000 soldiers) and the U.S. 1st Marine Division (12,000–15,000 marines) were unprepared for the PVA 9th Army Group's three-pronged encirclement tactics at the Battle of Chosin 




Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `RobertaModelWithHeads.forward` and have been ignored: sentence, idx, question.
***** Running training *****
  Num examples = 104743
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 32740


Step,Training Loss
500,0.4877
1000,0.3759
1500,0.3482
2000,0.3281
2500,0.325
3000,0.3087
3500,0.2983
4000,0.2842
4500,0.2728
5000,0.2778


  nn.utils.clip_grad_norm_(
Saving model checkpoint to ./adapter/task/qnli/checkpoint-500
Configuration saved in ./adapter/task/qnli/checkpoint-500/qnli/adapter_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-500/qnli/pytorch_adapter.bin
Configuration saved in ./adapter/task/qnli/checkpoint-500/qnli/head_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-500/qnli/pytorch_model_head.bin
Configuration saved in ./adapter/task/qnli/checkpoint-500/qnli/head_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-500/qnli/pytorch_model_head.bin
tokenizer config file saved in ./adapter/task/qnli/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./adapter/task/qnli/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./adapter/task/qnli/checkpoint-1000
Configuration saved in ./adapter/task/qnli/checkpoint-1000/qnli/adapter_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-1000/qnli/pytorch_adapter.bin
C

07/27/2021 10:01:53 - INFO - /home/jason/.local/share/virtualenvs/cs7643-deep-learning-summer-2021-YD_jfhWv/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/jason/.cache/huggingface/metrics/glue/qnli/default_experiment-1-0.arrow
07/27/2021 10:01:53 - INFO - task -   ***** Eval results qnli *****
07/27/2021 10:01:53 - INFO - task -     eval_loss = 0.3384419083595276
07/27/2021 10:01:53 - INFO - task -     eval_accuracy = 0.9114039904814205
07/27/2021 10:01:53 - INFO - task -     eval_runtime = 5.2842
07/27/2021 10:01:53 - INFO - task -     eval_samples_per_second = 1033.831
07/27/2021 10:01:53 - INFO - task -     eval_steps_per_second = 129.253
07/27/2021 10:01:53 - INFO - task -     epoch = 10.0
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this in

HBox(children=(FloatProgress(value=0.0, max=105.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

07/27/2021 10:01:59 - INFO - task -   Sample 83810 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], 'idx': 83810, 'input_ids': [0, 2264, 8893, 9, 5, 221, 9788, 58, 382, 8, 248, 9335, 3517, 45, 2460, 7, 3679, 116, 2, 2, 4148, 974, 759, 23, 5, 2238, 4580, 760, 6, 10, 121, 4, 104, 4, 262, 212, 35614, 2925, 6304, 40955, 30456, 2711, 36, 246, 6, 151, 3878, 43, 8, 5, 121, 4, 104, 4, 112, 620, 6144, 2925, 36, 1092, 6, 151, 2383, 996, 6, 151, 42669, 43, 58, 35578, 13, 5, 221, 9788, 361, 212, 2938, 826, 18, 130, 12, 4862, 1657, 196, 9689, 21163, 13767, 8893, 23, 5, 9846, 9, 732, 366, 179, 23895, 13878, 6, 53, 51, 2312, 7, 51




Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `RobertaModelWithHeads.forward` and have been ignored: sentence, idx, question.
***** Running training *****
  Num examples = 104743
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 32740


Step,Training Loss
500,0.4512
1000,0.3497
1500,0.3291
2000,0.3142
2500,0.3095
3000,0.2912
3500,0.2807
4000,0.2687
4500,0.2643
5000,0.2659


  nn.utils.clip_grad_norm_(
Saving model checkpoint to ./adapter/task/qnli/checkpoint-500
Configuration saved in ./adapter/task/qnli/checkpoint-500/qnli/adapter_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-500/qnli/pytorch_adapter.bin
Configuration saved in ./adapter/task/qnli/checkpoint-500/qnli/head_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-500/qnli/pytorch_model_head.bin
Configuration saved in ./adapter/task/qnli/checkpoint-500/qnli/head_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-500/qnli/pytorch_model_head.bin
tokenizer config file saved in ./adapter/task/qnli/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./adapter/task/qnli/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./adapter/task/qnli/checkpoint-1000
Configuration saved in ./adapter/task/qnli/checkpoint-1000/qnli/adapter_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-1000/qnli/pytorch_adapter.bin
C

07/27/2021 10:41:01 - INFO - /home/jason/.local/share/virtualenvs/cs7643-deep-learning-summer-2021-YD_jfhWv/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/jason/.cache/huggingface/metrics/glue/qnli/default_experiment-1-0.arrow
07/27/2021 10:41:01 - INFO - task -   ***** Eval results qnli *****
07/27/2021 10:41:01 - INFO - task -     eval_loss = 0.27380824089050293
07/27/2021 10:41:01 - INFO - task -     eval_accuracy = 0.9229361156873512
07/27/2021 10:41:01 - INFO - task -     eval_runtime = 8.5349
07/27/2021 10:41:01 - INFO - task -     eval_samples_per_second = 640.079
07/27/2021 10:41:01 - INFO - task -     eval_steps_per_second = 80.024
07/27/2021 10:41:01 - INFO - task -     epoch = 10.0
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this inf

HBox(children=(FloatProgress(value=0.0, max=105.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

07/27/2021 10:41:07 - INFO - task -   Sample 83810 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'idx': 83810, 'input_ids': [0, 2264, 8893, 9, 5, 221, 9788, 58, 382, 8, 248, 9335, 3517, 45, 2460, 7, 3679, 116, 2, 2, 4148, 974, 759, 23, 




Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `RobertaModelWithHeads.forward` and have been ignored: sentence, idx, question.
***** Running training *****
  Num examples = 104743
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 16370


Step,Training Loss
500,0.4215
1000,0.3238
1500,0.3018
2000,0.2776
2500,0.2616
3000,0.2525
3500,0.2365
4000,0.226
4500,0.2251
5000,0.2177


Saving model checkpoint to ./adapter/task/qnli/checkpoint-500
Configuration saved in ./adapter/task/qnli/checkpoint-500/qnli/adapter_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-500/qnli/pytorch_adapter.bin
Configuration saved in ./adapter/task/qnli/checkpoint-500/qnli/head_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-500/qnli/pytorch_model_head.bin
Configuration saved in ./adapter/task/qnli/checkpoint-500/qnli/head_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-500/qnli/pytorch_model_head.bin
tokenizer config file saved in ./adapter/task/qnli/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./adapter/task/qnli/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./adapter/task/qnli/checkpoint-1000
Configuration saved in ./adapter/task/qnli/checkpoint-1000/qnli/adapter_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-1000/qnli/pytorch_adapter.bin
Configuration saved in ./adap

07/27/2021 11:57:48 - INFO - /home/jason/.local/share/virtualenvs/cs7643-deep-learning-summer-2021-YD_jfhWv/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/jason/.cache/huggingface/metrics/glue/qnli/default_experiment-1-0.arrow
07/27/2021 11:57:48 - INFO - task -   ***** Eval results qnli *****
07/27/2021 11:57:48 - INFO - task -     eval_loss = 0.2628665268421173
07/27/2021 11:57:48 - INFO - task -     eval_accuracy = 0.9234852645066813
07/27/2021 11:57:48 - INFO - task -     eval_runtime = 16.9653
07/27/2021 11:57:48 - INFO - task -     eval_samples_per_second = 322.01
07/27/2021 11:57:48 - INFO - task -     eval_steps_per_second = 40.259
07/27/2021 11:57:48 - INFO - task -     epoch = 10.0
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

07/27/2021 11:57:51 - INFO - task -   Sample 83810 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'idx': 83810, 'input_ids': [0, 2264, 8893, 9, 5, 221, 9788, 58, 382, 8, 248, 9335, 3517, 45, 2460, 7, 3679, 116, 2, 2, 4148, 974, 759, 23, 5, 2238, 4580, 760, 6, 10, 121, 4, 104, 4, 262, 212, 35614, 2925, 6304, 40955, 30456, 2711, 36, 246, 6, 151, 3878, 43, 8, 5, 121, 4, 104, 4, 112, 620, 6144, 2925, 36, 1092, 6, 151, 2383, 2], 'label': 0, 'question': 'What tactics of the PVA were US and ROK troops not prepared to handle?', 'sentence': "On 27 November at the Korean eastern front, a U.S. 7th Infantry Division Regimental Combat Team (3,000 soldiers) and the U.S. 1st Marine Division (12,000–15,000 marines) were unprepared for the PVA 9th Army Group's three-pronged encirclement tactics at the Battle of Chosin 




Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `RobertaModelWithHeads.forward` and have been ignored: sentence, idx, question.
***** Running training *****
  Num examples = 104743
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 65470


Step,Training Loss
500,0.5335
1000,0.4385
1500,0.4183
2000,0.4076
2500,0.3844
3000,0.3925
3500,0.3716
4000,0.3547
4500,0.3872
5000,0.3616


Saving model checkpoint to ./adapter/task/qnli/checkpoint-500
Configuration saved in ./adapter/task/qnli/checkpoint-500/qnli/adapter_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-500/qnli/pytorch_adapter.bin
Configuration saved in ./adapter/task/qnli/checkpoint-500/qnli/head_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-500/qnli/pytorch_model_head.bin
Configuration saved in ./adapter/task/qnli/checkpoint-500/qnli/head_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-500/qnli/pytorch_model_head.bin
tokenizer config file saved in ./adapter/task/qnli/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./adapter/task/qnli/checkpoint-500/special_tokens_map.json
  nn.utils.clip_grad_norm_(
Saving model checkpoint to ./adapter/task/qnli/checkpoint-1000
Configuration saved in ./adapter/task/qnli/checkpoint-1000/qnli/adapter_config.json
Module weights saved in ./adapter/task/qnli/checkpoint-1000/qnli/pytorch_adapter.bin
C

In [None]:
# Display the value left in `results` by the search loop. It is reassigned for
# every task, so it only reflects the most recent loop iteration.
results

In [12]:
# Manually write the current `results` frame to a new timestamped CSV. The file
# name uses the `data_set` value left over from the search loop.
results.to_csv(f"./adapter/task/{data_set}_hp_search.{time():.0f}.csv")