## Problem 5

In [None]:
!pip install git+https://github.com/adapter-hub/adapter-transformers.git
!git clone https://github.com/huggingface/transformers
!python transformers/utils/download_glue_data.py --tasks RTE

import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, Optional

import numpy as np

import torch
from transformers import AutoTokenizer, EvalPrediction, GlueDataset, GlueDataTrainingArguments, AutoModelWithHeads, AdapterType, AutoConfig, AutoModelForSequenceClassification
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import (
    Trainer,
    TrainingArguments,
    glue_compute_metrics,
    glue_tasks_num_labels,
    set_seed,
)

model_name = "bert-base-uncased"


# Refer to the notebook for training an adapter to write these. Set the number of epochs to 3, and learning rate to 5e-5. Rest of the hyperparameters can stay the same. 

data_args = GlueDataTrainingArguments(task_name="rte", data_dir="./glue_data/RTE")

training_args = TrainingArguments(
    logging_steps=50, 
    per_device_train_batch_size=32, 
    per_device_eval_batch_size=64, 
    save_steps=1000,
    evaluate_during_training=True,
    output_dir="./models/rte",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    learning_rate=5e-5,
    num_train_epochs=3,
)


Collecting git+https://github.com/adapter-hub/adapter-transformers.git
  Cloning https://github.com/adapter-hub/adapter-transformers.git to /tmp/pip-req-build-_qnizs4v
  Running command git clone -q https://github.com/adapter-hub/adapter-transformers.git /tmp/pip-req-build-_qnizs4v
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 8.7MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 31



##42

In [None]:
# TODO: Change this seed when re-running your code to report the mean and std dev
set_seed(42)
num_labels = glue_tasks_num_labels[data_args.task_name]


config = AutoConfig.from_pretrained(
        model_name,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=".",
    )

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=".",
)

model = AutoModelWithHeads.from_pretrained(model_name, config=config)

# First, load the pre-trained adapters we want to fuse from Hub
from transformers.adapter_config import PfeifferConfig

model.load_adapter("nli/rte@ukp", "text_task", config=PfeifferConfig(), load_as='rte', with_head=False)
model.load_adapter("nli/scitail@ukp", "text_task", config=PfeifferConfig(), load_as='scitail', with_head=False)
model.load_adapter("nli/sick@ukp", "text_task", config=PfeifferConfig(), load_as='sick', with_head=False)
model.load_adapter("nli/multinli@ukp", "text_task", config=PfeifferConfig(), load_as='mnli', with_head=False)

# Add a fusion layer and tell the model to train fusion (freezes the rest of the weights) (here can either add the actual atsk adapter or not)
model.add_fusion([
        "rte",
        "scitail",
        "sick",
        "mnli"
    ])

# Add a classification head for our target task
model.add_classification_head("rte", num_labels=num_labels)

adapter_setup = [
                 [
        "rte",
        "scitail",
        "sick",
        "mnli" 
    ]
]
model.train_fusion(adapter_setup)
print(training_args)

train_dataset = GlueDataset(data_args, tokenizer=tokenizer)
eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        do_save_full_model=False,
        do_save_adapter_fusion=True,
    )

trainer.train()
trainer.evaluate()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=692.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=7182460.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3327637.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3327294.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=7192265.0, style=ProgressStyle(descript…


TrainingArguments(output_dir='./models/rte', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=True, evaluate_during_training=True, evaluation_strategy=<EvaluationStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=32, per_device_eval_batch_size=64, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3, max_steps=-1, warmup_steps=0, logging_dir='runs/Nov18_22-04-02_42083763637d', logging_first_step=False, logging_steps=50, save_steps=1000, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=50, dataloader_num_workers=0, past_index=-1, run_name=None, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_en

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=78.0, style=ProgressStyle(description_wid…

{'loss': 0.4245258712768555, 'learning_rate': 3.931623931623932e-05, 'epoch': 0.6410256410256411, 'total_flos': 165770745446400, 'step': 50}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 0.7715781331062317, 'eval_acc': 0.7328519855595668, 'epoch': 0.6410256410256411, 'total_flos': 165770745446400, 'step': 50}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=78.0, style=ProgressStyle(description_wid…

{'loss': 0.2325020980834961, 'learning_rate': 2.863247863247863e-05, 'epoch': 1.282051282051282, 'total_flos': 330919850597376, 'step': 100}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 0.9112586827699889, 'eval_acc': 0.703971119133574, 'epoch': 1.282051282051282, 'total_flos': 330919850597376, 'step': 100}
{'loss': 0.18040390014648439, 'learning_rate': 1.794871794871795e-05, 'epoch': 1.9230769230769231, 'total_flos': 496690596043776, 'step': 150}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 1.0126082073049856, 'eval_acc': 0.7256317689530686, 'epoch': 1.9230769230769231, 'total_flos': 496690596043776, 'step': 150}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=78.0, style=ProgressStyle(description_wid…

{'loss': 0.15813812255859375, 'learning_rate': 7.264957264957266e-06, 'epoch': 2.564102564102564, 'total_flos': 661839701194752, 'step': 200}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 1.0618355726076807, 'eval_acc': 0.7148014440433214, 'epoch': 2.564102564102564, 'total_flos': 661839701194752, 'step': 200}




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 1.0513012757800546, 'eval_acc': 0.703971119133574, 'epoch': 3.0, 'total_flos': 773942167802880, 'step': 234}


{'epoch': 3.0,
 'eval_acc': 0.703971119133574,
 'eval_loss': 1.0513012757800546,
 'total_flos': 773942167802880}

## 24

In [None]:
# TODO: Change this seed when re-running your code to report the mean and std dev
set_seed(24)
num_labels = glue_tasks_num_labels[data_args.task_name]


config = AutoConfig.from_pretrained(
        model_name,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=".",
    )

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=".",
)

model = AutoModelWithHeads.from_pretrained(model_name, config=config)

# First, load the pre-trained adapters we want to fuse from Hub
from transformers.adapter_config import PfeifferConfig

model.load_adapter("nli/rte@ukp", "text_task", config=PfeifferConfig(), load_as='rte', with_head=False)
model.load_adapter("nli/scitail@ukp", "text_task", config=PfeifferConfig(), load_as='scitail', with_head=False)
model.load_adapter("nli/sick@ukp", "text_task", config=PfeifferConfig(), load_as='sick', with_head=False)
model.load_adapter("nli/multinli@ukp", "text_task", config=PfeifferConfig(), load_as='mnli', with_head=False)

# Add a fusion layer and tell the model to train fusion (freezes the rest of the weights) (here can either add the actual atsk adapter or not)
model.add_fusion([
        "rte",
        "scitail",
        "sick",
        "mnli"
    ])

# Add a classification head for our target task
model.add_classification_head("rte", num_labels=num_labels)

adapter_setup = [
                 [
        "rte",
        "scitail",
        "sick",
        "mnli" 
    ]
]
model.train_fusion(adapter_setup)
print(training_args)

train_dataset = GlueDataset(data_args, tokenizer=tokenizer)
eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        do_save_full_model=False,
        do_save_adapter_fusion=True,
    )

trainer.train()
trainer.evaluate()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TrainingArguments(output_dir='./models/rte', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=True, evaluate_during_training=True, evaluation_strategy=<EvaluationStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=32, per_device_eval_batch_size=64, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3, max_steps=234, warmup_steps=0, logging_dir='runs/Nov18_22-04-02_42083763637d', logging_first_step=False, logging_steps=50, save_steps=1000, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=50, dataloader_num_workers=0, past_index=-1, run_name=None, disable_tqdm=False, remove_unused_columns=True, label_names=['labels'], load_best_model

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=78.0, style=ProgressStyle(description_wid…

{'loss': 0.539609031677246, 'learning_rate': 3.931623931623932e-05, 'epoch': 0.6410256410256411, 'total_flos': 165770745446400, 'step': 50}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 0.559556812155548, 'eval_acc': 0.7653429602888087, 'epoch': 0.6410256410256411, 'total_flos': 165770745446400, 'step': 50}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=78.0, style=ProgressStyle(description_wid…

{'loss': 0.361682243347168, 'learning_rate': 2.863247863247863e-05, 'epoch': 1.282051282051282, 'total_flos': 330919850597376, 'step': 100}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 0.7277662255918936, 'eval_acc': 0.7184115523465704, 'epoch': 1.282051282051282, 'total_flos': 330919850597376, 'step': 100}
{'loss': 0.22066841125488282, 'learning_rate': 1.794871794871795e-05, 'epoch': 1.9230769230769231, 'total_flos': 496690596043776, 'step': 150}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 0.8827093897743776, 'eval_acc': 0.7292418772563177, 'epoch': 1.9230769230769231, 'total_flos': 496690596043776, 'step': 150}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=78.0, style=ProgressStyle(description_wid…

{'loss': 0.1830804443359375, 'learning_rate': 7.264957264957266e-06, 'epoch': 2.564102564102564, 'total_flos': 661839701194752, 'step': 200}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 0.9395371483981825, 'eval_acc': 0.7220216606498195, 'epoch': 2.564102564102564, 'total_flos': 661839701194752, 'step': 200}




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 0.9391087452857503, 'eval_acc': 0.7220216606498195, 'epoch': 3.0, 'total_flos': 773942167802880, 'step': 234}


{'epoch': 3.0,
 'eval_acc': 0.7220216606498195,
 'eval_loss': 0.9391087452857503,
 'total_flos': 773942167802880}

##8

In [None]:
# TODO: Change this seed when re-running your code to report the mean and std dev
set_seed(8)
num_labels = glue_tasks_num_labels[data_args.task_name]


config = AutoConfig.from_pretrained(
        model_name,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=".",
    )

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=".",
)

model = AutoModelWithHeads.from_pretrained(model_name, config=config)

# First, load the pre-trained adapters we want to fuse from Hub
from transformers.adapter_config import PfeifferConfig

model.load_adapter("nli/rte@ukp", "text_task", config=PfeifferConfig(), load_as='rte', with_head=False)
model.load_adapter("nli/scitail@ukp", "text_task", config=PfeifferConfig(), load_as='scitail', with_head=False)
model.load_adapter("nli/sick@ukp", "text_task", config=PfeifferConfig(), load_as='sick', with_head=False)
model.load_adapter("nli/multinli@ukp", "text_task", config=PfeifferConfig(), load_as='mnli', with_head=False)

# Add a fusion layer and tell the model to train fusion (freezes the rest of the weights) (here can either add the actual atsk adapter or not)
model.add_fusion([
        "rte",
        "scitail",
        "sick",
        "mnli"
    ])

# Add a classification head for our target task
model.add_classification_head("rte", num_labels=num_labels)

adapter_setup = [
                 [
        "rte",
        "scitail",
        "sick",
        "mnli" 
    ]
]
model.train_fusion(adapter_setup)
print(training_args)

train_dataset = GlueDataset(data_args, tokenizer=tokenizer)
eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        do_save_full_model=False,
        do_save_adapter_fusion=True,
    )

trainer.train()
trainer.evaluate()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TrainingArguments(output_dir='./models/rte', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=True, evaluate_during_training=True, evaluation_strategy=<EvaluationStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=32, per_device_eval_batch_size=64, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3, max_steps=234, warmup_steps=0, logging_dir='runs/Nov18_22-04-02_42083763637d', logging_first_step=False, logging_steps=50, save_steps=1000, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=50, dataloader_num_workers=0, past_index=-1, run_name=None, disable_tqdm=False, remove_unused_columns=True, label_names=['labels'], load_best_model

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=78.0, style=ProgressStyle(description_wid…

{'loss': 0.49638824462890624, 'learning_rate': 3.931623931623932e-05, 'epoch': 0.6410256410256411, 'total_flos': 165770745446400, 'step': 50}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 0.6400528409205619, 'eval_acc': 0.7184115523465704, 'epoch': 0.6410256410256411, 'total_flos': 165770745446400, 'step': 50}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=78.0, style=ProgressStyle(description_wid…

{'loss': 0.27100242614746095, 'learning_rate': 2.863247863247863e-05, 'epoch': 1.282051282051282, 'total_flos': 330919850597376, 'step': 100}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 0.8182979854435697, 'eval_acc': 0.7220216606498195, 'epoch': 1.282051282051282, 'total_flos': 330919850597376, 'step': 100}
{'loss': 0.19478981018066407, 'learning_rate': 1.794871794871795e-05, 'epoch': 1.9230769230769231, 'total_flos': 496690596043776, 'step': 150}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 0.9411013575021971, 'eval_acc': 0.7328519855595668, 'epoch': 1.9230769230769231, 'total_flos': 496690596043776, 'step': 150}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=78.0, style=ProgressStyle(description_wid…

{'loss': 0.16704620361328126, 'learning_rate': 7.264957264957266e-06, 'epoch': 2.564102564102564, 'total_flos': 661839701194752, 'step': 200}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 0.9949881630253705, 'eval_acc': 0.7328519855595668, 'epoch': 2.564102564102564, 'total_flos': 661839701194752, 'step': 200}




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{'eval_loss': 0.9721590493560268, 'eval_acc': 0.7364620938628159, 'epoch': 3.0, 'total_flos': 773942167802880, 'step': 234}


{'epoch': 3.0,
 'eval_acc': 0.7364620938628159,
 'eval_loss': 0.9721590493560268,
 'total_flos': 773942167802880}

## mean and std

In [1]:
torch.mean(torch.Tensor([0.7328519855595668, 0.7653429602888087, 0.7364620938628159]))

tensor(0.7449)

In [2]:
torch.std(torch.Tensor([0.7328519855595668, 0.7653429602888087, 0.7364620938628159]))

tensor(0.0178)