In [1]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers.adapters.composition import Stack
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_data(lang, path="data/train.tsv"):
    """Load the rows of the training TSV that belong to one language.

    Args:
        lang: Value matched against the ``language`` column (e.g. ``"en"``).
        path: Tab-separated file to read. Defaults to ``data/train.tsv``.

    Returns:
        A new, independent ``pandas.DataFrame`` containing only the rows whose
        ``language`` equals ``lang``. The row index of the file is preserved
        (it is not reset).
    """
    data = pd.read_csv(path, sep="\t")
    # Return an explicit copy: callers overwrite columns on the result, and
    # doing that on a boolean-filtered slice triggers SettingWithCopyWarning.
    return data[data["language"] == lang].copy()

In [3]:
def get_languages(path="data/train.tsv"):
    """Return the distinct values of the ``language`` column of the training TSV.

    Args:
        path: Tab-separated file to read. Defaults to ``data/train.tsv``.

    Returns:
        The result of ``Series.unique()`` on the ``language`` column, i.e. the
        distinct language codes in order of first appearance.
    """
    # Only the language column is needed, so avoid parsing the text columns.
    language_column = pd.read_csv(path, sep="\t", usecols=["language"])["language"]
    return language_column.unique()

# Distinct language codes found in the `language` column of data/train.tsv.
languages = get_languages()

In [4]:
languages

array(['en', 'vi', 'de', 'ar', 'es', 'bg', 'el', 'th', 'ru', 'tr', 'sw',
       'ur', 'zh', 'hi', 'fr'], dtype=object)

In [5]:
# English subset of the training data (rows whose `language` column is "en").
en_data = get_data("en")

In [6]:
en_data.head()

Unnamed: 0,gold_label,premise,hypothesis,language
0,neutral,"At ground level, the asymmetrical cathedral is...",It's hard to find a dramatic view of the cathe...,en
1,contradiction,Hanuman is a beneficent deity predating classi...,Hanuman declared that all the lemurs here need...,en
2,contradiction,All other spending as well as federal revenue ...,None of the federal spending is assumed to grow,en
3,neutral,uh-huh that's interesting well it sounds as th...,That information about graduation rates is int...,en
4,neutral,Some kind of instant recognition on his father...,Did his father recognize him?,en


In [7]:
# Map the string NLI labels to the integer class ids used for training.
LABEL_TO_ID = {"entailment": 0, "neutral": 1, "contradiction": 2}

# Fail loudly on anything outside the three known labels. A catch-all "else 2"
# would silently turn typos, missing values, or already-encoded integers
# (e.g. when this cell is run a second time) into class 2.
unknown_labels = set(en_data["gold_label"].unique()) - set(LABEL_TO_ID)
if unknown_labels:
    raise ValueError(
        "Unexpected gold_label values (already encoded, or dirty data?): "
        + ", ".join(sorted(repr(label) for label in unknown_labels))
    )

labels = [LABEL_TO_ID[label] for label in en_data["gold_label"]]
# assign() builds a new frame instead of writing into the filtered frame that
# get_data() returned, which avoids pandas' SettingWithCopyWarning.
en_data = en_data.assign(gold_label=labels)

In [8]:
# Convert the pandas frame into a `datasets.Dataset`. The frame's index is
# carried over as the `__index_level_0__` column (see the repr in the next cell).
# NOTE(review): `en_data` is rebound from a DataFrame to a Dataset here, so this
# cell and the label-encoding cell before it are presumably not safe to re-run
# without re-running `get_data("en")` first — confirm.
en_data = Dataset.from_pandas(en_data)

In [9]:
en_data

Dataset({
    features: ['gold_label', 'premise', 'hypothesis', 'language', '__index_level_0__'],
    num_rows: 100993
})

In [10]:
from transformers import AutoTokenizer

# Tokenizer for the "xlm-roberta-base" checkpoint; used by encode_batch further down.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

Downloading config.json: 100%|██████████| 615/615 [00:00<00:00, 131kB/s]
Downloading sentencepiece.bpe.model: 100%|██████████| 4.83M/4.83M [00:02<00:00, 1.79MB/s]
Downloading tokenizer.json: 100%|██████████| 8.68M/8.68M [00:11<00:00, 758kB/s] 


In [11]:
from transformers import AutoConfig, AutoAdapterModel

# Fetch the configuration of the base checkpoint, then build the
# adapter-capable model from the same checkpoint with that configuration.
config = AutoConfig.from_pretrained("xlm-roberta-base")
model = AutoAdapterModel.from_pretrained("xlm-roberta-base", config=config)

Downloading pytorch_model.bin: 100%|██████████| 1.04G/1.04G [04:13<00:00, 4.40MB/s]
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaAdapterModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaAdapterModel were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should

In [12]:
from transformers import AdapterConfig

# Load the pre-trained language adapters for English and German.
# Both are requested with the "pfeiffer" adapter configuration and reduction_factor=2.
# NOTE(review): only an adapter named "en" is referenced later in this section
# (in the Stack(...) call); "de" is presumably loaded for a later cross-lingual
# evaluation step — confirm it is actually used.
lang_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=2)
model.load_adapter("en/wiki@ukp", config=lang_adapter_config)
model.load_adapter("de/wiki@ukp", config=lang_adapter_config)

# Add a new, untrained task adapter named "nli"
model.add_adapter("nli")

# Add a prediction head for the target task. Note this is a multiple-choice
# head with 3 choices, not a plain classification head, which is why
# encode_batch produces three encoded sequences per example.
# NOTE(review): NLI is a 3-way classification problem, so a classification head
# with 3 labels may be a better fit; switching would also require changing
# encode_batch to emit one sequence per example — confirm the intent.
model.add_multiple_choice_head("nli", num_choices=3)

Downloading xlm-roberta-base.json: 10.1kB [00:00, 4.99MB/s]                 
Downloading en_relu_2.zip: 100%|██████████| 28.2M/28.2M [00:09<00:00, 3.01MB/s]
Downloading de_relu_2.zip: 100%|██████████| 28.2M/28.2M [00:07<00:00, 3.99MB/s]


In [70]:
def encode_batch(examples, max_length=60, choices=("0", "1", "2")):
  """Encode a batch of premise/hypothesis pairs, one sequence per choice.

  For every example the premise and hypothesis are joined with a single space
  and that text is paired with each entry of `choices`, giving
  `len(choices)` encoded sequences per example.

  Args:
    examples: Batch mapping with "premise" and "hypothesis" sequences of
      equal length (values are converted with `str`).
    max_length: Length every encoded sequence is truncated/padded to.
    choices: Text used as the second segment of each encoded sequence.
      NOTE(review): the number of choices presumably has to match the
      `num_choices` of the model's multiple-choice head — confirm.

  Returns:
    Dict with "input_ids" and "attention_mask"; each value holds one entry per
    example, and each entry is a list of `len(choices)` token-level lists.

  Raises:
    ValueError: If `choices` is empty.
  """
  choice_texts = list(choices)
  if not choice_texts:
    raise ValueError("choices must contain at least one entry")
  per_example = len(choice_texts)

  texts = [
      str(premise) + " " + str(hypothesis)
      for premise, hypothesis in zip(examples["premise"], examples["hypothesis"])
  ]
  all_encoded = {"input_ids": [], "attention_mask": []}
  if not texts:
    # Nothing to encode; keep the same empty structure as a non-empty batch.
    return all_encoded

  # Tokenize the whole batch in a single call instead of once per example.
  # Every sequence is padded/truncated to max_length, so the result for each
  # sequence does not depend on what else is in the call.
  repeated_texts = [text for text in texts for _ in choice_texts]
  repeated_choices = choice_texts * len(texts)
  encoded = tokenizer(
      repeated_texts,
      repeated_choices,
      max_length=max_length,
      truncation=True,
      padding="max_length",
  )

  # Regroup the flat output into consecutive chunks of one chunk per example.
  for key in all_encoded:
    rows = encoded[key]
    all_encoded[key] = [
        rows[start:start + per_example]
        for start in range(0, len(rows), per_example)
    ]
  return all_encoded

def preprocess_dataset(dataset):
  """Tokenize `dataset` and expose only the columns the model consumes.

  Args:
    dataset: Object providing `map`, `rename_column` and `set_format`, with
      "premise", "hypothesis" and "gold_label" columns.

  Returns:
    The encoded dataset with "gold_label" renamed to "labels" and its output
    format restricted to "input_ids", "attention_mask" and "labels".
  """
  # Encode batch-wise, then rename the target column: the transformers model
  # expects the target class column to be called "labels".
  prepared = dataset.map(encode_batch, batched=True).rename_column("gold_label", "labels")
  # Restrict the visible columns to what the model needs. No `type=` is passed,
  # so items are returned as plain Python values rather than tensors.
  prepared.set_format(columns=["input_ids", "attention_mask", "labels"])
  return prepared

# Tokenize the English split; only input_ids / attention_mask / labels stay visible.
dataset_en = preprocess_dataset(en_data)

  0%|          | 0/100993 [10:34<?, ?it/s]
100%|██████████| 101/101 [00:35<00:00,  2.86ba/s]


In [71]:
# Peek at the first item yielded by dataset_en and stop.
# Iterating the dataset yields one example at a time, so `batch` here is a
# single example (one label, three encoded sequences), not a training batch.
for batch in dataset_en:
    print(batch)
    break

{'labels': 1, 'input_ids': [[0, 1913, 61585, 17366, 4, 70, 10, 230612, 6827, 7515, 5252, 7263, 83, 2360, 4806, 23, 390, 199656, 25, 7, 6, 29786, 3055, 111, 4420, 48800, 7, 4, 20662, 442, 34844, 47, 19069, 31949, 100, 10, 6, 129980, 538, 155034, 21455, 5, 1650, 25, 7, 7941, 47, 7413, 10, 155034, 21455, 111, 70, 7515, 5252, 7263, 2, 2, 757, 2], [0, 1913, 61585, 17366, 4, 70, 10, 230612, 6827, 7515, 5252, 7263, 83, 2360, 4806, 23, 390, 199656, 25, 7, 6, 29786, 3055, 111, 4420, 48800, 7, 4, 20662, 442, 34844, 47, 19069, 31949, 100, 10, 6, 129980, 538, 155034, 21455, 5, 1650, 25, 7, 7941, 47, 7413, 10, 155034, 21455, 111, 70, 7515, 5252, 7263, 2, 2, 106, 2], [0, 1913, 61585, 17366, 4, 70, 10, 230612, 6827, 7515, 5252, 7263, 83, 2360, 4806, 23, 390, 199656, 25, 7, 6, 29786, 3055, 111, 4420, 48800, 7, 4, 20662, 442, 34844, 47, 19069, 31949, 100, 10, 6, 129980, 538, 155034, 21455, 5, 1650, 25, 7, 7941, 47, 7413, 10, 155034, 21455, 111, 70, 7515, 5252, 7263, 2, 2, 116, 2]], 'attention_mask': [[

In [72]:
# Drop the raw text and bookkeeping columns that the model does not consume.
# Only columns that are still present are removed, so re-running this cell is a
# no-op instead of an error about missing columns.
columns_to_drop = ["language", "premise", "hypothesis", "__index_level_0__"]
present_columns = [
    column for column in columns_to_drop if column in dataset_en.column_names
]
if present_columns:
    dataset_en = dataset_en.remove_columns(present_columns)

In [73]:
# Peek at the first item yielded by dataset_en again, to check what an example
# looks like at this point. `batch` is a single example, not a training batch.
for batch in dataset_en:
    print(batch)
    break

{'labels': 1, 'input_ids': [[0, 1913, 61585, 17366, 4, 70, 10, 230612, 6827, 7515, 5252, 7263, 83, 2360, 4806, 23, 390, 199656, 25, 7, 6, 29786, 3055, 111, 4420, 48800, 7, 4, 20662, 442, 34844, 47, 19069, 31949, 100, 10, 6, 129980, 538, 155034, 21455, 5, 1650, 25, 7, 7941, 47, 7413, 10, 155034, 21455, 111, 70, 7515, 5252, 7263, 2, 2, 757, 2], [0, 1913, 61585, 17366, 4, 70, 10, 230612, 6827, 7515, 5252, 7263, 83, 2360, 4806, 23, 390, 199656, 25, 7, 6, 29786, 3055, 111, 4420, 48800, 7, 4, 20662, 442, 34844, 47, 19069, 31949, 100, 10, 6, 129980, 538, 155034, 21455, 5, 1650, 25, 7, 7941, 47, 7413, 10, 155034, 21455, 111, 70, 7515, 5252, 7263, 2, 2, 106, 2], [0, 1913, 61585, 17366, 4, 70, 10, 230612, 6827, 7515, 5252, 7263, 83, 2360, 4806, 23, 390, 199656, 25, 7, 6, 29786, 3055, 111, 4420, 48800, 7, 4, 20662, 442, 34844, 47, 19069, 31949, 100, 10, 6, 129980, 538, 155034, 21455, 5, 1650, 25, 7, 7941, 47, 7413, 10, 155034, 21455, 111, 70, 7515, 5252, 7263, 2, 2, 116, 2]], 'attention_mask': [[

In [74]:
# Select the "nli" task adapter as the one to be trained.
# NOTE(review): per the adapter-transformers docs this is expected to freeze the
# base model weights so only the adapter (and head) are updated — confirm.
model.train_adapter(["nli"])

In [75]:
# Use the "en" adapter and the "nli" task adapter together as a stack.
# NOTE(review): assumes the adapter loaded from "en/wiki@ukp" is registered
# under the name "en" — confirm against the load_adapter return value.
model.active_adapters = Stack("en", "nli")

In [76]:
from transformers import TrainingArguments, AdapterTrainer
from datasets import concatenate_datasets

# Hyper-parameters for a single pass over the English training split.
training_args = TrainingArguments(
    # Where training output is written; reuse of an existing directory is allowed.
    output_dir="./training_output",
    overwrite_output_dir=True,
    # Optimisation schedule.
    num_train_epochs=1,
    learning_rate=1e-4,
    # One example per step on each device, for training and evaluation alike.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Report the loss every 100 steps.
    logging_steps=100,
    # Keep all dataset columns: this is important to ensure the dataset labels
    # are properly passed to the model.
    remove_unused_columns=False,
)

# Adapter-aware trainer over the preprocessed English data (no eval set is passed).
trainer = AdapterTrainer(model=model, args=training_args, train_dataset=dataset_en)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [77]:
# Start training. With a batch size of 1 this is one optimisation step per
# example (100,993 steps in the recorded log), so a full run takes hours; the
# recorded run was stopped manually (KeyboardInterrupt).
trainer.train()

***** Running training *****
  Num examples = 100993
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 100993

  0%|          | 28/100993 [04:53<7:09:03,  3.92it/s] 

{'loss': 1.1525, 'learning_rate': 9.990098323646194e-05, 'epoch': 0.0}



  0%|          | 28/100993 [05:18<7:09:03,  3.92it/s] 

{'loss': 1.1643, 'learning_rate': 9.980196647292387e-05, 'epoch': 0.0}




KeyboardInterrupt: 