## Training Task Adapters
Using randomized search, we identify optimal hyperparameters to train task-specific adapters on GLUE tasks.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# cd drive/MyDrive/cs7643-deep-learning-summer-2021/

In [1]:
# !pip install -Uqq adapter-transformers datasets

%load_ext autoreload
%autoreload 2

import torch
from time import time
from typing import Dict, List
from task_utils import TaskModelArguments, TaskDataTrainingArguments
from task import train_task_adapter
from transformers import (
    MultiLingAdapterArguments,
    TrainingArguments,
)

### Utility Functions

In [2]:
import random
import itertools
import subprocess
import pandas as pd

def getParams(dictionary, limit):
    """Expand a hyperparameter grid into a shuffled list of trial configurations.

    Args:
        dictionary: Maps each hyperparameter name to the list of candidate
            values to try for it.
        limit: Maximum number of configurations to return. Pass ``False`` or
            ``None`` to return every combination.

    Returns:
        A list of dicts, one per selected combination, each mapping every key
        of ``dictionary`` to a single value. The list is shuffled with
        ``random.shuffle`` before truncation, so a limited result is a random
        subset of the full grid.
    """
    keys = list(dictionary)
    paramsList = [dict(zip(keys, combo)) for combo in itertools.product(*dictionary.values())]
    random.shuffle(paramsList)

    # `None` means "no cap", same as `False`; without this check it would reach
    # `min(None, ...)` and raise a TypeError.
    if limit is not False and limit is not None:
        paramsList = paramsList[:min(limit, len(paramsList))]

    return paramsList

def initParse(dictionary: Dict, output_prefix = ""):
    """Build the four argument objects for one training run from a flat parameter dict.

    Args:
        dictionary: One trial configuration mapping a hyperparameter name to a
            single value (e.g. one element of the list returned by
            ``getParams``). Keys that are absent are looked up with
            ``dict.get`` and therefore passed on as ``None``.
        output_prefix: String inserted before the task name in the output
            directory, i.e. ``./adapter/task/{output_prefix}{task_name}``.

    Returns:
        A ``(model, data, training, adapter)`` tuple of
        ``TaskModelArguments``, ``TaskDataTrainingArguments``,
        ``TrainingArguments`` and ``MultiLingAdapterArguments``.
    """
    model = TaskModelArguments(
        model_name_or_path=dictionary.get('model_name_or_path')
    )

    data = TaskDataTrainingArguments(
        task_name=dictionary.get('task_name'),
        max_seq_length=dictionary.get('max_seq_length'),
        pad_to_max_length=dictionary.get('pad_to_max_length')
    )

    training = TrainingArguments(
        adam_beta1=dictionary.get('adam_beta1'),
        adam_beta2=dictionary.get('adam_beta2'),
        adam_epsilon=dictionary.get('adam_epsilon'),
        learning_rate=dictionary.get('learning_rate'),
        fp16=dictionary.get('fp16'),
        warmup_ratio=dictionary.get('warmup_ratio'),
        warmup_steps=dictionary.get('warmup_steps'),
        weight_decay=dictionary.get('weight_decay'),
        do_train=dictionary.get('do_train'),
        # Read the 'do_eval' key; this previously reused 'do_train', so the
        # evaluation flag could not be set independently of training.
        do_eval=dictionary.get('do_eval'),
        per_device_train_batch_size=dictionary.get('per_device_train_batch_size'),
        num_train_epochs=dictionary.get('num_train_epochs'), # CHANGE ME
        overwrite_output_dir=dictionary.get('overwrite_output_dir'),
        output_dir=f"./adapter/task/{output_prefix}{dictionary.get('task_name')}",
    )

    adapter = MultiLingAdapterArguments(
        train_adapter=True,
        # Honour the 'adapter_config' entry of the configuration; fall back to
        # the previously hard-coded "pfeiffer" when it is missing or None.
        adapter_config=dictionary.get('adapter_config') or "pfeiffer",
    )

    return model, data, training, adapter

def train(params: Dict, output_prefix = "") -> pd.DataFrame:
    """Train one task adapter and return its configuration and metrics as one row.

    Args:
        params: A single trial configuration (hyperparameter name -> value),
            forwarded to ``initParse``.
        output_prefix: Prefix for the run's output directory, forwarded to
            ``initParse``.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are the keys of
        ``params``, followed by the keys of the training statistics and then
        the keys of the evaluation statistics returned by
        ``train_task_adapter``; the row holds the matching values.
    """
    model, data, training, adapter = initParse(params, output_prefix)

    train_stats, eval_stats = train_task_adapter(
        model_args=model,
        adapter_args=adapter,
        training_args=training,
        data_args=data
    )

    # Columns and values are concatenated in the same order (params, train, eval)
    # so that each value lines up with its header.
    sources = (params, train_stats, eval_stats)
    header = [key for source in sources for key in source.keys()]
    row = [value for source in sources for value in source.values()]

    return pd.DataFrame([row], columns=header)

## Random Grid Search for Hyperparameter Tuning

**Define Dictionary of Hyperparameters**

In [None]:
# GLUE tasks included in the hyperparameter sweep. Only the names listed in
# `glue_tasks` are searched; add a name from the line below to enable it.
# Currently disabled: cola, mnli, mrpc, qnli, qqp, rte, stsb, wnli
glue_tasks = ["sst2"]

In [None]:
# Search space for the random hyperparameter sweep: every key maps to the list
# of candidate values that may be combined into a trial configuration.
task = 'cola'
paramDictionary = dict(
    task_name=[task],
    model_name_or_path=['roberta-base'],
    max_seq_length=[64, 128, 256],
    pad_to_max_length=[True],
    per_device_train_batch_size=[16, 32, 64],
    adam_beta1=[.9],
    adam_beta2=[.999],
    adam_epsilon=[1e-8, 1e-7, 1e-6],
    fp16=[True],
    learning_rate=[1e-5, 5e-5, 1e-4, 5e-4, 1e-3],
    warmup_ratio=[0.0],
    warmup_steps=[0],
    weight_decay=[0.0],
    do_train=[True],
    do_eval=[True],
    num_train_epochs=[10],
    overwrite_output_dir=[True],
    adapter_config=['pfeiffer'],
)

**Begin Looping**

In [None]:
# Random hyperparameter search: for every task in `glue_tasks`, train up to
# `limit` randomly chosen configurations from `paramDictionary` and save the
# per-trial rows to a timestamped CSV.
limit = 15 #Numerical or False for no limit

for data_set in glue_tasks:
    paramDictionary["task_name"] = [data_set]
    paramsList = getParams(paramDictionary, limit)

    trial_frames = []
    results = None
    for p in paramsList:
        trial_frames.append(train(p))

        # `DataFrame.append` was removed in pandas 2.0, so the table is rebuilt
        # with `pd.concat`. It is refreshed after every trial so that an
        # interrupted sweep still leaves the completed trials in `results`.
        results = pd.concat(trial_frames)

        torch.cuda.empty_cache()

    # Nothing to save when no configuration was produced (e.g. limit == 0).
    if results is not None:
        results.to_csv(f"./adapter/task/{data_set}_hp_search.{time():.0f}.csv")

In [None]:
# Display the trial table left in `results` by the search loop; it is reset for
# each task, so this shows the trials of the last task processed.
results

In [None]:
# Writes `results` to a new timestamped CSV for the task in `data_set`. Both names
# are set by the search loop cell, so this only works after that cell has run.
# NOTE(review): the loop already saves the same table when a task finishes —
# presumably this is a manual re-save (e.g. after an interrupted loop); confirm
# it is still needed.
results.to_csv(f"./adapter/task/{data_set}_hp_search.{time():.0f}.csv")

## Final Training
Training each adapter again with the optimal settings discovered through the random search

In [3]:
from pathlib import Path
def final_training(
    task, 
    learning_rate, 
    max_seq_length, 
    per_device_train_batch_size, 
    adam_epsilon,
    num_train_epochs,
    model_name_or_path=None
    ):
    """Train a task adapter once with fixed hyperparameters and save the metrics.

    Args:
        task: Task name, passed on as ``task_name``.
        learning_rate: Optimizer learning rate.
        max_seq_length: Maximum sequence length passed to the data arguments.
        per_device_train_batch_size: Training batch size per device.
        adam_epsilon: Adam epsilon.
        num_train_epochs: Number of training epochs.
        model_name_or_path: Model identifier or path to train from. Defaults
            to ``<home>/git/roberta-base``, the location that was previously
            hard-coded here.

    Side effects:
        Runs ``train`` with the output prefix ``"final_"`` and writes the
        resulting one-row table to
        ``./adapter/task/final_{task}_hp_search.<unix time>.csv``.
    """
    if model_name_or_path is None:
        home = str(Path.home())
        model_name_or_path = f'{home}/git/roberta-base'

    # A single flat configuration. The earlier version wrapped every value in a
    # one-element list and unpacked it again through getParams(..., 1), which
    # produced this same dict.
    final_params = {
        'task_name': task,
        'model_name_or_path': model_name_or_path,
        'max_seq_length': max_seq_length,
        'pad_to_max_length': True,
        'per_device_train_batch_size': per_device_train_batch_size,
        'adam_beta1': .9,
        'adam_beta2': .999,
        'adam_epsilon': adam_epsilon,
        'fp16': True,
        'learning_rate': learning_rate,
        'warmup_ratio': 0.0,
        'warmup_steps': 0,
        'weight_decay': 0.0,
        'do_train': True,
        'do_eval': True,
        'num_train_epochs': num_train_epochs,
        'overwrite_output_dir': True,
        'adapter_config': "pfeiffer"
    }

    prefix = "final_"
    result = train(params=final_params, output_prefix=prefix)
    result.to_csv(f"./adapter/task/{prefix}{task}_hp_search.{time():.0f}.csv")

In [4]:
# Final training run for sst2 with fixed hyperparameters.
# NOTE(review): the captured output of this cell ends in a RuntimeError (a module
# parameter found on cpu while cuda:0 was expected), so this run did not complete
# as recorded.
task = "sst2"
# NOTE(review): copies everything under ./adapter/task/sst2/sst2/ into the `.git`
# directory of ~/git/roberta-base before training. The reason is not evident from
# this notebook — confirm that this destination is intended.
! cp ./adapter/task/sst2/sst2/* ~/git/roberta-base/.git
final_training(task=task,
               learning_rate=5e-4,
               max_seq_length=64,
               per_device_train_batch_size=32,
               adam_epsilon=1e-7,
               num_train_epochs=20)

07/30/2021 19:51:17 - INFO - task -   Training/evaluation parameters TrainingArguments(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-07,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.NO,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=None,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0005,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=./adapter/task/final_sst2/runs/Jul30_19-51-15_ip-172-16-1-120,
logging_first_step=False,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_

RuntimeError: module must have its parameters and buffers on device cuda:0 (device_ids[0]) but found one of them on device: cpu