## Training Task Adapters
Using randomized search, we identify optimal hyperparameters to train task-specific adapters on GLUE tasks.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# cd drive/MyDrive/cs7643-deep-learning-summer-2021/

In [None]:
# !pip install -Uqq adapter-transformers datasets

%load_ext autoreload
%autoreload 2

import torch
from time import time
from typing import Dict, List
from utils.task_utils import TaskModelArguments, TaskDataTrainingArguments
from utils.task import train_task_adapter
from transformers import (
    MultiLingAdapterArguments,
    TrainingArguments,
)

### Utility Functions

In [None]:
import random
import itertools
import subprocess
import pandas as pd

def getParams(dictionary, limit):
    """Build a randomly ordered list of hyperparameter combinations.

    Every combination in the Cartesian product of the value lists in
    ``dictionary`` is generated, shuffled with ``random.shuffle`` and then
    optionally truncated to ``limit`` entries.

    Args:
        dictionary: Mapping of parameter name to a list of candidate values.
        limit: Maximum number of combinations to return. ``False`` or
            ``None`` disables the cap.

    Returns:
        A list of dicts, each mapping every parameter name to one of its
        candidate values.

    Raises:
        ValueError: If ``limit`` is negative.
    """
    paramsList = [dict(zip(dictionary, v)) for v in itertools.product(*dictionary.values())]
    random.shuffle(paramsList)

    # Accept None as "no limit" too; previously it crashed inside min().
    if limit is False or limit is None:
        return paramsList

    # A negative limit used to slice [0:-n] and silently drop trailing
    # combinations; fail loudly instead.
    if limit < 0:
        raise ValueError(f"limit must be non-negative, False or None; got {limit!r}")

    # Slicing past the end is safe, so no explicit min() is required.
    return paramsList[:limit]

def initParse(dictionary: Dict, output_prefix = ""):
    """Translate one flat hyperparameter dict into the four argument objects.

    Args:
        dictionary: One hyperparameter combination (parameter name -> value).
            Keys that are absent are passed on as ``None``.
        output_prefix: String inserted before the task name in the output
            directory ``./adapter/task/{output_prefix}{task_name}``.

    Returns:
        A ``(model, data, training, adapter)`` tuple of
        ``TaskModelArguments``, ``TaskDataTrainingArguments``,
        ``TrainingArguments`` and ``MultiLingAdapterArguments``.
    """
    model = TaskModelArguments(
        model_name_or_path=dictionary.get('model_name_or_path')
    )

    data = TaskDataTrainingArguments(
        task_name=dictionary.get('task_name'),
        max_seq_length=dictionary.get('max_seq_length'),
        pad_to_max_length=dictionary.get('pad_to_max_length')
    )

    training = TrainingArguments(
        adam_beta1=dictionary.get('adam_beta1'),
        adam_beta2=dictionary.get('adam_beta2'),
        adam_epsilon=dictionary.get('adam_epsilon'),
        learning_rate=dictionary.get('learning_rate'),
        fp16=dictionary.get('fp16'),
        warmup_ratio=dictionary.get('warmup_ratio'),
        warmup_steps=dictionary.get('warmup_steps'),
        weight_decay=dictionary.get('weight_decay'),
        do_train=dictionary.get('do_train'),
        # Read the 'do_eval' key (it used to be copied from 'do_train').
        # Fall back to 'do_train' so dicts without 'do_eval' behave as before.
        do_eval=dictionary.get('do_eval', dictionary.get('do_train')),
        per_device_train_batch_size=dictionary.get('per_device_train_batch_size'),
        num_train_epochs=dictionary.get('num_train_epochs'), # CHANGE ME
        overwrite_output_dir=dictionary.get('overwrite_output_dir'),
        output_dir=f"./adapter/task/{output_prefix}{dictionary.get('task_name')}",
    )

    adapter = MultiLingAdapterArguments(
        train_adapter=True,
        # Honour an 'adapter_config' entry when present; default to the
        # previously hard-coded "pfeiffer" architecture otherwise.
        adapter_config=dictionary.get('adapter_config') or "pfeiffer",
    )

    return model, data, training, adapter

def train(params: Dict, output_prefix = "") -> pd.DataFrame:
    """Run one training trial and return its results as a one-row table.

    Args:
        params: One hyperparameter combination (parameter name -> value).
        output_prefix: Forwarded to ``initParse`` to prefix the output
            directory name.

    Returns:
        A single-row ``pandas.DataFrame`` whose columns are, in order, the
        keys of ``params``, then the keys of the training statistics, then
        the keys of the evaluation statistics returned by
        ``train_task_adapter`` (both are consumed as mappings here).
    """
    model, data, training, adapter = initParse(params, output_prefix)

    train_stats, eval_stats = train_task_adapter(
        model_args=model,
        adapter_args=adapter,
        training_args=training,
        data_args=data
    )

    # Keys and values are concatenated positionally (not merged into one
    # dict) so a name shared between the three mappings keeps every column.
    header = [*params.keys(), *train_stats.keys(), *eval_stats.keys()]
    row = [*params.values(), *train_stats.values(), *eval_stats.values()]

    # The argument objects are plain locals and are released on return, so
    # no explicit `del` is needed.
    return pd.DataFrame([row], columns=header)

## Random Grid Search for Hyperparameter Tuning

**Define Dictionary of Hyperparameters**

In [None]:
# GLUE tasks to include in the hyperparameter search; uncomment an entry to
# search over that task as well.
glue_tasks = [
    "cola",
    #"mnli",
    #"mrpc",
    #"qnli",
    #"qqp",
    #"rte",
    #"sst2",
    #"stsb",
    #"wnli",
]

In [None]:
# Search space: every key maps to a LIST of candidate values, and getParams
# enumerates the Cartesian product of those lists. With the values below that
# is 3 (max_seq_length) x 3 (batch size) x 3 (adam_epsilon) x 5 (learning_rate)
# = 135 combinations before any limit is applied.
# The key order here is also the column order of the per-trial results row.
task = 'cola'
paramDictionary = {
    'task_name':[task],  # the search loop substitutes each entry of glue_tasks
    'model_name_or_path':['roberta-base'],
    'max_seq_length':[64, 128, 256],
    'pad_to_max_length':[True],
    'per_device_train_batch_size':[16, 32, 64],
    'adam_beta1':[.9],
    'adam_beta2':[.999],
    'adam_epsilon':[1e-8,1e-7,1e-6],
    'fp16':[True],
    'learning_rate':[1e-5,5e-5,1e-4,5e-4,1e-3],
    'warmup_ratio':[0.0],
    'warmup_steps':[0],
    'weight_decay':[0.0],
    'do_train':[True],
    'do_eval':[True],
    'num_train_epochs':[10],
    'overwrite_output_dir':[True],
    'adapter_config':['pfeiffer']
}

**Begin Looping**

In [None]:
limit = 15 #Numerical or False for no limit

# Randomized search: for each task, sample up to `limit` combinations from the
# search space, train one adapter per combination and write all trial rows to
# a timestamped CSV.
for data_set in glue_tasks:
    # Work on a copy so the search-space cell is not mutated across cells.
    search_space = {**paramDictionary, "task_name": [data_set]}
    paramsList = getParams(search_space, limit)

    results = None
    trials = []
    for p in paramsList:
        trials.append(train(p))

        # Release cached GPU memory between trials.
        torch.cuda.empty_cache()

    if not trials:
        # Nothing was sampled (e.g. limit = 0); skip instead of failing on
        # None.to_csv.
        print(f"No hyperparameter combinations to try for task '{data_set}'; skipping.")
        continue

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and re-copies the frame on every call. ignore_index numbers the trials
    # 0..n-1 rather than repeating index 0 on every row.
    results = pd.concat(trials, ignore_index=True)
    results.to_csv(f"./adapter/task/{data_set}_hp_search.{time():.0f}.csv")

In [None]:
# Display the trial table left in `results` by the search loop (only the last
# task processed is still held in this variable).
results

In [None]:
# NOTE(review): the search loop already writes this CSV for every task; running
# this cell writes another copy (new timestamp) for the last task only.
results.to_csv(f"./adapter/task/{data_set}_hp_search.{time():.0f}.csv")

## Final Training
Training each adapter again with the optimal settings discovered through the random search

In [None]:
from utils.task import final_training

In [None]:
# Tasks to retrain, in the order they will be run.
glue_tasks = ["sst2", "cola", "wnli", "rte", "qnli"]

# Per-task hyperparameters for the final training run. Each inner dict is
# unpacked as keyword arguments to final_training, so its keys must match that
# function's parameter names.
final_params = {
    "sst2" : {
        "learning_rate": 5e-4,
        "max_seq_length": 64,
        "per_device_train_batch_size": 32,
        "adam_epsilon": 1e-7,
        "num_train_epochs": 10,
    },
    "cola" : {
        "learning_rate": 1e-3,
        "max_seq_length": 256,
        "per_device_train_batch_size": 64,
        "adam_epsilon": 1e-7,
        "num_train_epochs": 10,
    },
    "wnli" : {
        "learning_rate": 1e-5,
        "max_seq_length": 256,
        "per_device_train_batch_size": 16,
        "adam_epsilon": 1e-6,
        "num_train_epochs": 10,
    },
    "rte" : {
        "learning_rate": 5e-4,
        "max_seq_length": 256,
        "per_device_train_batch_size": 16,
        "adam_epsilon": 1e-8,
        "num_train_epochs": 10,
    },
    "qnli" : {
        "learning_rate": 1e-3,
        "max_seq_length": 128,
        "per_device_train_batch_size": 64,
        "adam_epsilon": 1e-7,
        "num_train_epochs": 10,
    },
}

In [None]:
# Retrain one adapter per task using that task's entry in final_params.
for task in glue_tasks:
    task_params = final_params.get(task)

    # Banner marking where this task's output starts in the cell log.
    banner = f"\n\n##### START TASK: {task} #####\n{task_params}\n\n"
    print(banner)

    final_training(task=task, **task_params)

## Run eval on the glue tasks with the task adapted model
This establishes whether the task specific training improved the model's performance on each task
1. Load the model
1. Loop over the tasks, loading the appropriate adapter and running inference

In [None]:
# Load the pre-trained base model once; the task adapters are attached to it
# in the evaluation loop that follows.
glue_tasks = ["sst2", "cola", "wnli", "rte", "qnli"]

from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoModelForMaskedLM, RobertaModelWithHeads, AutoModelWithHeads, AdapterConfig
from transformers.adapters.composition import Stack

model = AutoModelWithHeads.from_pretrained('roberta-base')

# Commented-out code: would load an adapter from ./adapter/mlm/rte/glue with
# the "pfeiffer+inv" config and stack it with the "rte" adapter.
# lang_adapter_config = AdapterConfig.load("pfeiffer+inv")
# mlm_adapter = model.load_adapter("./adapter/mlm/rte/glue", config=lang_adapter_config)


# model.active_adapters = Stack("glue", "rte")

In [None]:
# Run evaluation on each task with that task's adapter active on the shared
# base model.
from utils.task import final_training

# The adapter architecture is identical for every task, so resolve the config
# once rather than on every iteration.
task_adapter_config = AdapterConfig.load("pfeiffer")

for task in glue_tasks:

    # Load the adapter stored under ./adapter/task/final_<task>/<task> and make
    # it the active adapter for the forward pass.
    model.load_adapter(f"./adapter/task/final_{task}/{task}", config=task_adapter_config)
    model.active_adapters = task

    # NOTE(review): the banner prints the final-training hyperparameters even
    # though none of them are passed to final_training in this cell.
    print(f"\n\n##### START TASK: {task} #####\n{final_params.get(task)}\n\n")

    # do_train=False: no training is requested, only the evaluation path runs.
    final_training(pre_trained_model=model,
                   task=task,
                   prefix="mlm_test_",
                   do_train=False)