## Training Task Adapters
Using randomized search, we identify optimal hyperparameters to train task-specific adapters on GLUE tasks.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# cd drive/MyDrive/cs7643-deep-learning-summer-2021/

In [1]:
# !pip install -Uqq adapter-transformers datasets

%load_ext autoreload
%autoreload 2

import torch
from time import time
from typing import Dict, List
from task_utils import TaskModelArguments, TaskDataTrainingArguments
from task import train_task_adapter
from transformers import (
    MultiLingAdapterArguments,
    TrainingArguments,
)

### Utility Functions

In [2]:
import random
import itertools
import subprocess
import pandas as pd

def getParams(dictionary, limit):
    """Expand a hyperparameter grid into a shuffled list of concrete settings.

    Args:
        dictionary: Maps each hyperparameter name to the list of candidate
            values to try.
        limit: Maximum number of combinations to return, or ``False`` to
            return every combination.

    Returns:
        A list of dicts, one per selected combination, each mapping every
        hyperparameter name to a single value. The order is random.
    """
    names = list(dictionary)
    combos = []
    for values in itertools.product(*dictionary.values()):
        combos.append(dict(zip(names, values)))

    # Shuffle first, truncate second: this is what turns the exhaustive grid
    # into a random sample of it.
    random.shuffle(combos)

    if limit is False:
        return combos
    return combos[:min(limit, len(combos))]

def initParse(dictionary: Dict, output_prefix = ""):
    """Build the four argument objects needed for one adapter-training run.

    Args:
        dictionary: One concrete hyperparameter setting (name -> value), such
            as an element of the list returned by ``getParams``. Keys that are
            absent are passed on as ``None``.
        output_prefix: Text placed before the task name in the output
            directory, i.e. ``./adapter/task/{output_prefix}{task_name}``.

    Returns:
        A ``(model, data, training, adapter)`` tuple holding a
        ``TaskModelArguments``, ``TaskDataTrainingArguments``,
        ``TrainingArguments`` and ``MultiLingAdapterArguments`` instance.
    """
    model = TaskModelArguments(
        model_name_or_path=dictionary.get('model_name_or_path')
    )

    data = TaskDataTrainingArguments(
        task_name=dictionary.get('task_name'),
        max_seq_length=dictionary.get('max_seq_length'),
        pad_to_max_length=dictionary.get('pad_to_max_length')
    )

    do_train = dictionary.get('do_train')

    training = TrainingArguments(
        adam_beta1=dictionary.get('adam_beta1'),
        adam_beta2=dictionary.get('adam_beta2'),
        adam_epsilon=dictionary.get('adam_epsilon'),
        learning_rate=dictionary.get('learning_rate'),
        fp16=dictionary.get('fp16'),
        warmup_ratio=dictionary.get('warmup_ratio'),
        warmup_steps=dictionary.get('warmup_steps'),
        weight_decay=dictionary.get('weight_decay'),
        do_train=do_train,
        # Honour the 'do_eval' entry. Falling back to 'do_train' keeps the
        # previous behaviour for dictionaries that only define 'do_train'.
        do_eval=dictionary.get('do_eval', do_train),
        per_device_train_batch_size=dictionary.get('per_device_train_batch_size'),
        num_train_epochs=dictionary.get('num_train_epochs'),
        overwrite_output_dir=dictionary.get('overwrite_output_dir'),
        output_dir=f"./adapter/task/{output_prefix}{dictionary.get('task_name')}",
    )

    adapter = MultiLingAdapterArguments(
        train_adapter=True,
        # Use the configured adapter architecture; "pfeiffer" remains the
        # default when the dictionary does not name one.
        adapter_config=dictionary.get('adapter_config', 'pfeiffer'),
    )

    return model, data, training, adapter

def train(params: Dict, output_prefix = "") -> pd.DataFrame:
    """Train and evaluate one task adapter and report the run as a table row.

    Args:
        params: One concrete hyperparameter setting (name -> value); forwarded
            to ``initParse``.
        output_prefix: Forwarded to ``initParse``, which places it before the
            task name in the output directory.

    Returns:
        A one-row ``pd.DataFrame`` whose columns are the keys of ``params``,
        followed by the keys of the training statistics and then the keys of
        the evaluation statistics returned by ``train_task_adapter``.
    """
    model, data, training, adapter = initParse(params, output_prefix)

    train_stats, eval_stats = train_task_adapter(
        model_args=model,
        adapter_args=adapter,
        training_args=training,
        data_args=data
    )

    # Keep names and values as parallel lists instead of merging the mappings:
    # a key present in more than one mapping then keeps one column per
    # occurrence rather than being silently overwritten.
    sources = (params, train_stats, eval_stats)
    header = [key for source in sources for key in source.keys()]
    row = [value for source in sources for value in source.values()]

    return pd.DataFrame([row], columns=header)

## Random Grid Search for Hyperparameter Tuning

**Define Dictionary of Hyperparameters**

In [3]:
# GLUE tasks to run the hyperparameter search on.
# Uncomment an entry to include that task in the search loop.
glue_tasks = [
    "cola",
    #"mnli",
    #"mrpc",
    #"qnli",
    #"qqp",
    #"rte",
    #"sst2",
    #"stsb",
    #"wnli",
]

In [4]:
# Placeholder task name; the search loop replaces 'task_name' for each task it runs.
task = 'cola'

# Search space: every hyperparameter maps to the list of values to try.
# Single-element lists are held fixed across all trials.
paramDictionary = dict(
    # Task and model
    task_name=[task],
    model_name_or_path=['roberta-base'],
    # Tokenisation and batching
    max_seq_length=[64, 128, 256],
    pad_to_max_length=[True],
    per_device_train_batch_size=[16, 32, 64],
    # Optimiser
    adam_beta1=[0.9],
    adam_beta2=[0.999],
    adam_epsilon=[1e-8, 1e-7, 1e-6],
    fp16=[True],
    learning_rate=[1e-5, 5e-5, 1e-4, 5e-4, 1e-3],
    warmup_ratio=[0.0],
    warmup_steps=[0],
    weight_decay=[0.0],
    # Run control
    do_train=[True],
    do_eval=[True],
    num_train_epochs=[10],
    overwrite_output_dir=[True],
    # Adapter
    adapter_config=['pfeiffer'],
)

**Begin Looping**

In [None]:
limit = 15 #Numerical or False for no limit

# Randomized search: for every task, train up to `limit` randomly chosen
# hyperparameter combinations and write all trial rows to one CSV.
for data_set in glue_tasks:
    paramDictionary["task_name"] = [data_set]
    paramsList = getParams(paramDictionary, limit)

    # Collect the one-row frames and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # table on every trial.
    trial_frames = []
    for p in paramsList:
        trial_frames.append(train(p))

        # Release cached GPU memory before the next trial.
        torch.cuda.empty_cache()

    if not trial_frames:
        # Nothing was trained (e.g. limit == 0), so there is nothing to save.
        results = None
        continue

    # Default index handling matches what repeated .append() produced.
    results = pd.concat(trial_frames)
    results.to_csv(f"./adapter/task/{data_set}_hp_search.{time():.0f}.csv")

In [None]:
# Show the trial table; `results` is reassigned per task, so this is the last task searched.
results

In [None]:
# Manual re-save of the current `results` under a fresh timestamp.
# The search loop above already writes one CSV per task, so this produces an additional copy for the last task.
results.to_csv(f"./adapter/task/{data_set}_hp_search.{time():.0f}.csv")

## Final Training
Each adapter is trained again with the optimal settings discovered through the random search.

In [3]:
from pathlib import Path
from shutil import copyfile
def final_training(
    task, 
    learning_rate, 
    max_seq_length, 
    per_device_train_batch_size, 
    adam_epsilon,
    num_train_epochs,
    model_dir=None
    ):
    """Train one task adapter with fixed hyperparameters and save the run as CSV.

    The adapter files of ``task`` are first copied next to the local base
    model via ``copy_adapter_config``; then a single run is executed through
    ``train`` with the output prefix ``"final_"``.

    Args:
        task: Task name, e.g. ``"cola"``.
        learning_rate: Learning rate for the run.
        max_seq_length: Maximum sequence length for the run.
        per_device_train_batch_size: Per-device training batch size.
        adam_epsilon: Adam epsilon.
        num_train_epochs: Number of training epochs.
        model_dir: Directory of the locally downloaded base model. Defaults to
            ``~/git/roberta-base`` when ``None``.

    Returns:
        None. The one-row result table is written to
        ``./adapter/task/final_{task}_hp_search.<unix time>.csv``.
    """
    if model_dir is None:
        model_dir = f"{str(Path.home())}/git/roberta-base"

    copy_adapter_config(task_name=task, model_dir=model_dir)

    # A single, already-concrete setting: plain values are passed straight to
    # train(), so no one-element grid has to be expanded first. The name
    # differs from the notebook-level `final_params` on purpose to avoid
    # shadowing it.
    run_params = {
        'task_name': task,
        'model_name_or_path': model_dir,
        'max_seq_length': max_seq_length,
        'pad_to_max_length': True,
        'per_device_train_batch_size': per_device_train_batch_size,
        'adam_beta1': .9,
        'adam_beta2': .999,
        'adam_epsilon': adam_epsilon,
        'fp16': True,
        'learning_rate': learning_rate,
        'warmup_ratio': 0.0,
        'warmup_steps': 0,
        'weight_decay': 0.0,
        'do_train': True,
        'do_eval': True,
        'num_train_epochs': num_train_epochs,
        'overwrite_output_dir': True,
        'adapter_config': "pfeiffer",
    }

    prefix = "final_"
    result = train(params=run_params, output_prefix=prefix)
    result.to_csv(f"./adapter/task/{prefix}{task}_hp_search.{time():.0f}.csv")
    
def copy_adapter_config(task_name:str, model_dir:str):
    """Copy a task's saved adapter files into the local model directory.

    Reads ``adapter_config.json`` and ``pytorch_adapter.bin`` from
    ``./adapter/task/{task_name}/{task_name}`` and writes them, under the same
    file names, to ``{model_dir}/.git``.
    """
    source_dir = f"./adapter/task/{task_name}/{task_name}"
    # NOTE(review): the destination is the model checkout's `.git` folder --
    # confirm this is the intended location.
    target_dir = f"{model_dir}/.git"

    for filename in ("adapter_config.json", "pytorch_adapter.bin"):
        copyfile(src=f"{source_dir}/{filename}", dst=f"{target_dir}/{filename}")

In [5]:
# Tasks to retrain, in the order they are run.
glue_tasks = ["sst2", "cola", "wnli", "rte", "qnli"]

# Per-task settings for the final run; each inner dict is passed to
# final_training() as keyword arguments.
final_params = {
    "sst2": dict(
        learning_rate=5e-4,
        max_seq_length=64,
        per_device_train_batch_size=32,
        adam_epsilon=1e-7,
        num_train_epochs=10,
    ),
    "cola": dict(
        learning_rate=1e-3,
        max_seq_length=256,
        per_device_train_batch_size=64,
        adam_epsilon=1e-7,
        num_train_epochs=10,
    ),
    "wnli": dict(
        learning_rate=1e-5,
        max_seq_length=256,
        per_device_train_batch_size=16,
        adam_epsilon=1e-6,
        num_train_epochs=10,
    ),
    "rte": dict(
        learning_rate=5e-4,
        max_seq_length=256,
        per_device_train_batch_size=16,
        adam_epsilon=1e-8,
        num_train_epochs=10,
    ),
    "qnli": dict(
        learning_rate=1e-3,
        max_seq_length=128,
        per_device_train_batch_size=64,
        adam_epsilon=1e-7,
        num_train_epochs=10,
    ),
}

In [None]:
# Retrain every listed task with its tuned settings.
for task in glue_tasks:
    
    # Banner so each task's section is easy to find in the long training log.
    print(f"\n\n##### START TASK: {task} #####\n{final_params.get(task)}\n\n")
    
    # The task's settings dict is unpacked into keyword arguments.
    # A task with no entry in final_params yields None here, which cannot be unpacked.
    final_training(task=task,
                   **final_params.get(task))



##### START TASK: sst2 #####
{'learning_rate': 0.0005, 'max_seq_length': 64, 'per_device_train_batch_size': 32, 'adam_epsilon': 1e-07, 'num_train_epochs': 10}




07/31/2021 15:44:44 - INFO - task -   Training/evaluation parameters TrainingArguments(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-07,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0005,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=./adapter/task/final_sst2/runs/Jul31_15-44-41_ip-172-16-1-120,
logging_first_step=False,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_

Step,Training Loss
500,0.2159
1000,0.1724
1500,0.1444
2000,0.1237
2500,0.1082
3000,0.0969
3500,0.0854
4000,0.0754
4500,0.066
5000,0.061


Saving model checkpoint to ./adapter/task/final_sst2/checkpoint-500
Configuration saved in ./adapter/task/final_sst2/checkpoint-500/sst2/adapter_config.json
Module weights saved in ./adapter/task/final_sst2/checkpoint-500/sst2/pytorch_adapter.bin
Configuration saved in ./adapter/task/final_sst2/checkpoint-500/sst2/head_config.json
Module weights saved in ./adapter/task/final_sst2/checkpoint-500/sst2/pytorch_model_head.bin
Configuration saved in ./adapter/task/final_sst2/checkpoint-500/sst2/head_config.json
Module weights saved in ./adapter/task/final_sst2/checkpoint-500/sst2/pytorch_model_head.bin
tokenizer config file saved in ./adapter/task/final_sst2/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./adapter/task/final_sst2/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./adapter/task/final_sst2/checkpoint-1000
Configuration saved in ./adapter/task/final_sst2/checkpoint-1000/sst2/adapter_config.json
Module weights saved in ./adapter/task/final_sst

07/31/2021 16:11:33 - INFO - task -   ***** Eval results sst2 *****
07/31/2021 16:11:33 - INFO - task -     eval_loss = 0.215029776096344
07/31/2021 16:11:33 - INFO - task -     eval_accuracy = 0.9495412844036697
07/31/2021 16:11:33 - INFO - task -     eval_runtime = 3.4394
07/31/2021 16:11:33 - INFO - task -     eval_samples_per_second = 253.53
07/31/2021 16:11:33 - INFO - task -     eval_steps_per_second = 8.141
07/31/2021 16:11:33 - INFO - task -     epoch = 10.0
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
07/31/2021 16:11:33 - INFO - task -   Training/evaluation parameters TrainingArguments(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-07,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader



##### START TASK: cola #####
{'learning_rate': 0.001, 'max_seq_length': 256, 'per_device_train_batch_size': 64, 'adam_epsilon': 1e-07, 'num_train_epochs': 10}




loading configuration file /home/ubuntu/git/roberta-base/config.json
Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": "cola",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "2.1.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file /home/ubuntu/git/roberta-base/config.json
Model config RobertaConfig {
  "adapt

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

07/31/2021 16:11:35 - INFO - task -   Sample 1824 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'idx': 1824, 'input_ids': [0, 100, 5055, 14, 127, 1150, 6, 37, 21, 3229, 25, 41, 37323, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./adapter/task/final_cola
Configuration saved in ./adapter/task/final_cola/cola/adapter_config.json
Module weights saved in ./adapter/task/final_cola/cola/pytorch_adapter.bin
Configuration saved in ./adapter/task/final_cola/cola/head_config.json
Module weights saved in ./adapter/task/final_cola/cola/pytorch_model_head.bin
Configuration saved in ./adapter/task/final_cola/cola/head_config.json
Module weights saved in ./adapter/task/final_cola/cola/pytorch_model_head.bin
tokenizer config file saved in ./adapter/task/final_cola/tokenizer_config.json
Special tokens file saved in ./adapter/task/final_cola/special_tokens_map.json
07/31/2021 16:18:03 - INFO - task -   *** Evaluate ***
The following columns in the evaluation set  don't have a corresponding argument in `RobertaModelWithHeads.forward` and have been ignored: idx, sentence.
***** Running Evaluation *****
  Num examples =

07/31/2021 16:18:10 - INFO - task -   ***** Eval results cola *****
07/31/2021 16:18:10 - INFO - task -     eval_loss = 0.5718619227409363
07/31/2021 16:18:10 - INFO - task -     eval_matthews_correlation = 0.5879831868448624
07/31/2021 16:18:10 - INFO - task -     eval_runtime = 7.3667
07/31/2021 16:18:10 - INFO - task -     eval_samples_per_second = 141.583
07/31/2021 16:18:10 - INFO - task -     eval_steps_per_second = 4.48
07/31/2021 16:18:10 - INFO - task -     epoch = 10.0
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
07/31/2021 16:18:10 - INFO - task -   Training/evaluation parameters TrainingArguments(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-06,
dataloader_drop_last=False,
dataloader_num_workers=



##### START TASK: wnli #####
{'learning_rate': 1e-05, 'max_seq_length': 256, 'per_device_train_batch_size': 16, 'adam_epsilon': 1e-06, 'num_train_epochs': 10}


Downloading and preparing dataset glue/wnli (download: 28.32 KiB, generated: 154.03 KiB, post-processed: Unknown size, total: 182.35 KiB) to /home/ubuntu/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading:   0%|          | 0.00/29.0k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

loading configuration file /home/ubuntu/git/roberta-base/config.json
Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": "wnli",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "2.1.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file /home/ubuntu/git/roberta-base/config.json
Model config RobertaConfig {
  "adapt

Dataset glue downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


Some weights of the model checkpoint at /home/ubuntu/git/roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at /home/ubuntu/git/roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

07/31/2021 16:18:13 - INFO - task -   Sample 114 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'idx': 114, 'input_ids': [0, 1121, 550, 6, 5911, 90, 29465, 2348, 2998, 997, 15, 19810, 7046, 330, 4, 1773, 19810, 7046, 330, 18, 3835, 21, 2

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./adapter/task/final_wnli
Configuration saved in ./adapter/task/final_wnli/wnli/adapter_config.json
Module weights saved in ./adapter/task/final_wnli/wnli/pytorch_adapter.bin
Configuration saved in ./adapter/task/final_wnli/wnli/head_config.json
Module weights saved in ./adapter/task/final_wnli/wnli/pytorch_model_head.bin
Configuration saved in ./adapter/task/final_wnli/wnli/head_config.json
Module weights saved in ./adapter/task/final_wnli/wnli/pytorch_model_head.bin
tokenizer config file saved in ./adapter/task/final_wnli/tokenizer_config.json
Special tokens file saved in ./adapter/task/final_wnli/special_tokens_map.json
07/31/2021 16:18:57 - INFO - task -   *** Evaluate ***
The following columns in the evaluation set  don't have a corresponding argument in `RobertaModelWithHeads.forward` and have been ignored: sentence1, sentence2, idx.
***** Running Evaluation *****
  Nu

07/31/2021 16:18:58 - INFO - task -   ***** Eval results wnli *****
07/31/2021 16:18:58 - INFO - task -     eval_loss = 0.6883140206336975
07/31/2021 16:18:58 - INFO - task -     eval_accuracy = 0.5633802816901409
07/31/2021 16:18:58 - INFO - task -     eval_runtime = 0.5812
07/31/2021 16:18:58 - INFO - task -     eval_samples_per_second = 122.155
07/31/2021 16:18:58 - INFO - task -     eval_steps_per_second = 5.161
07/31/2021 16:18:58 - INFO - task -     epoch = 10.0
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
07/31/2021 16:18:58 - INFO - task -   Training/evaluation parameters TrainingArguments(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataload



##### START TASK: rte #####
{'learning_rate': 0.0005, 'max_seq_length': 256, 'per_device_train_batch_size': 16, 'adam_epsilon': 1e-08, 'num_train_epochs': 10}


Downloading and preparing dataset glue/rte (download: 680.81 KiB, generated: 1.83 MiB, post-processed: Unknown size, total: 2.49 MiB) to /home/ubuntu/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading:   0%|          | 0.00/697k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

loading configuration file /home/ubuntu/git/roberta-base/config.json
Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": "rte",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "2.1.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file /home/ubuntu/git/roberta-base/config.json
Model config RobertaConfig {
  "adapte

Dataset glue downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


Some weights of the model checkpoint at /home/ubuntu/git/roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at /home/ubuntu/git/roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

07/31/2021 16:19:02 - INFO - task -   Sample 456 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'idx': 456, 'input_ids': [0, 250, 3034, 467, 2988, 1367, 159, 458, 1446, 23, 5, 5308, 3412, 3080, 13, 144, 9, 2350, 6, 5, 2373, 10044, 7, 124

Step,Training Loss


  nn.utils.clip_grad_norm_(


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./adapter/task/final_rte
Configuration saved in ./adapter/task/final_rte/rte/adapter_config.json
Module weights saved in ./adapter/task/final_rte/rte/pytorch_adapter.bin
Configuration saved in ./adapter/task/final_rte/rte/head_config.json
Module weights saved in ./adapter/task/final_rte/rte/pytorch_model_head.bin
Configuration saved in ./adapter/task/final_rte/rte/head_config.json
Module weights saved in ./adapter/task/final_rte/rte/pytorch_model_head.bin
tokenizer config file saved in ./adapter/task/final_rte/tokenizer_config.json
Special tokens file saved in ./adapter/task/final_rte/special_tokens_map.json
07/31/2021 16:21:53 - INFO - task -   *** Evaluate ***
The following columns in the evaluation set  don't have a corresponding argument in `RobertaModelWithHeads.forward` and have been ignored: sentence1, sentence2, idx.
***** Running Evaluati

07/31/2021 16:21:55 - INFO - task -   ***** Eval results rte *****
07/31/2021 16:21:55 - INFO - task -     eval_loss = 0.7079261541366577
07/31/2021 16:21:55 - INFO - task -     eval_accuracy = 0.7581227436823105
07/31/2021 16:21:55 - INFO - task -     eval_runtime = 2.0077
07/31/2021 16:21:55 - INFO - task -     eval_samples_per_second = 137.972
07/31/2021 16:21:55 - INFO - task -     eval_steps_per_second = 4.483
07/31/2021 16:21:55 - INFO - task -     epoch = 10.0
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
07/31/2021 16:21:55 - INFO - task -   Training/evaluation parameters TrainingArguments(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-07,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloade



##### START TASK: qnli #####
{'learning_rate': 0.001, 'max_seq_length': 128, 'per_device_train_batch_size': 64, 'adam_epsilon': 1e-07, 'num_train_epochs': 10}


Downloading and preparing dataset glue/qnli (download: 10.14 MiB, generated: 27.11 MiB, post-processed: Unknown size, total: 37.24 MiB) to /home/ubuntu/.cache/huggingface/datasets/glue/qnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading:   0%|          | 0.00/10.6M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

loading configuration file /home/ubuntu/git/roberta-base/config.json
Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": "qnli",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "2.1.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file /home/ubuntu/git/roberta-base/config.json
Model config RobertaConfig {
  "adapt

Dataset glue downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/glue/qnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


Some weights of the model checkpoint at /home/ubuntu/git/roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at /home/ubuntu/git/roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to

  0%|          | 0/105 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

07/31/2021 16:22:07 - INFO - task -   Sample 83810 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], 'idx': 83810, 'input_ids': [0, 2264, 8893, 9, 5, 221, 9788, 58, 382, 8, 248, 9335, 3517, 45, 2460, 7, 3679, 116, 2, 2, 4148, 974, 759, 23, 5, 2238, 4580, 760, 6, 10, 121, 4, 104, 4, 262, 212, 35614, 2925, 6304, 40955, 30456, 2711, 36, 246, 6, 151, 3878, 43, 8, 5, 121, 4, 104, 4, 112, 620, 6144, 2925, 36, 1092, 6, 151, 2383, 996, 6, 151, 42669, 43, 58, 35578, 13, 5, 221, 9788, 361, 212, 2938, 826, 18, 130, 12, 4862, 1657, 196, 9689, 21163, 13767, 8893, 23, 5, 9846, 9, 732, 366, 179, 23895, 13878, 6, 53, 51, 2312, 7, 51

Step,Training Loss
500,0.3429
1000,0.2509
1500,0.2147
2000,0.1885
2500,0.1636


Saving model checkpoint to ./adapter/task/final_qnli/checkpoint-500
Configuration saved in ./adapter/task/final_qnli/checkpoint-500/qnli/adapter_config.json
Module weights saved in ./adapter/task/final_qnli/checkpoint-500/qnli/pytorch_adapter.bin
Configuration saved in ./adapter/task/final_qnli/checkpoint-500/qnli/head_config.json
Module weights saved in ./adapter/task/final_qnli/checkpoint-500/qnli/pytorch_model_head.bin
Configuration saved in ./adapter/task/final_qnli/checkpoint-500/qnli/head_config.json
Module weights saved in ./adapter/task/final_qnli/checkpoint-500/qnli/pytorch_model_head.bin
tokenizer config file saved in ./adapter/task/final_qnli/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./adapter/task/final_qnli/checkpoint-500/special_tokens_map.json
  nn.utils.clip_grad_norm_(
Saving model checkpoint to ./adapter/task/final_qnli/checkpoint-1000
Configuration saved in ./adapter/task/final_qnli/checkpoint-1000/qnli/adapter_config.json
Module weights saved