In [33]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers.adapters.composition import Stack


In [48]:
def get_data(lang):
    data = pd.read_csv("data/train.tsv",sep="\t")
    data = data[data["language"] == lang]
    return data

In [49]:
def get_languages():
    data = pd.read_csv("data/train.tsv",sep="\t")
    return data["language"].unique()

languages = get_languages()

In [50]:
languages

array(['en', 'vi', 'de', 'ar', 'es', 'bg', 'el', 'th', 'ru', 'tr', 'sw',
       'ur', 'zh', 'hi', 'fr'], dtype=object)

In [63]:
en_data = get_data("en")

In [64]:
en_data.head()

Unnamed: 0,gold_label,premise,hypothesis,language
0,neutral,"At ground level, the asymmetrical cathedral is...",It's hard to find a dramatic view of the cathe...,en
1,contradiction,Hanuman is a beneficent deity predating classi...,Hanuman declared that all the lemurs here need...,en
2,contradiction,All other spending as well as federal revenue ...,None of the federal spending is assumed to grow,en
3,neutral,uh-huh that's interesting well it sounds as th...,That information about graduation rates is int...,en
4,neutral,Some kind of instant recognition on his father...,Did his father recognize him?,en


In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [11]:
from transformers import AutoConfig, AutoAdapterModel

config = AutoConfig.from_pretrained(
    "xlm-roberta-base",
)
model = AutoAdapterModel.from_pretrained(
    "xlm-roberta-base",
    config=config,
)

Downloading pytorch_model.bin:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaAdapterModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaAdapterModel were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for prediction

In [16]:
from transformers import AdapterConfig

# Load the language adapters
lang_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=2)
model.load_adapter("en/wiki@ukp", config=lang_adapter_config)
model.load_adapter("de/wiki@ukp", config=lang_adapter_config)

# Add a new task adapter
model.add_adapter("nli")

# Add a classification head for our target task
model.add_multiple_choice_head("nli", num_choices=3)

Downloading xlm-roberta-base.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading en_relu_2.zip:   0%|          | 0.00/28.2M [00:00<?, ?B/s]

Downloading de_relu_2.zip:   0%|          | 0.00/28.2M [00:00<?, ?B/s]

In [65]:
def preprocess_dataset(dataset):
  # Encode the input data
  all_encoded = {"input_ids": [], "attention_mask": []}
  for premise, hypothesis in tqdm(zip(dataset["premise"], dataset["hypothesis"])):
    premise = str(premise)
    hypothesis = str(hypothesis)
    encoded = tokenizer(
        premise,
        hypothesis,
        max_length=60,
        truncation=True,
        padding="max_length",
    )
    all_encoded["input_ids"].append(encoded["input_ids"])
    all_encoded["attention_mask"].append(encoded["attention_mask"])
  
  dataset["input_ids"] = all_encoded["input_ids"]
  dataset["attention_mask"] = all_encoded["attention_mask"]

  # Encode the labels
  labels = dataset["gold_label"].values
  labels = [0 if label == "entailment" else 1 if label == "neutral" else 2 for label in labels]
  dataset["gold_label"] = labels

  # The transformers model expects the target class column to be named "labels"
  dataset.rename(columns={"gold_label": "labels"}, inplace=True)
  # Transform to pytorch tensors and only output the required columns
  dataset = dataset[["input_ids", "attention_mask", "labels"]]
  return dataset

en_data = preprocess_dataset(en_data)

100993it [00:13, 7231.67it/s]


In [66]:
en_data.head()

Unnamed: 0,input_ids,attention_mask,labels
0,"[0, 1913, 61585, 17366, 4, 70, 10, 230612, 682...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,"[0, 2548, 38782, 83, 10, 137909, 2517, 8, 2481...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
2,"[0, 3164, 3789, 140533, 237, 5299, 237, 30361,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
3,"[0, 38074, 9, 1132, 127, 450, 25, 7, 49041, 52...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
4,"[0, 31384, 8562, 111, 34648, 230466, 98, 1919,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1


In [72]:
# make the dataset a pytorch dataset
en_dataset = torch.utils.data.TensorDataset(
    torch.tensor(en_data["input_ids"].values.tolist()),
    torch.tensor(en_data["attention_mask"].values.tolist()),
    torch.tensor(en_data["labels"].values.tolist()),
)

In [32]:
model.train_adapter(["nli"])

In [34]:
model.active_adapters = Stack("en", "nli")

In [69]:
from transformers import TrainingArguments, AdapterTrainer

training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_steps=100,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=en_dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [70]:
trainer.train()

***** Running training *****
  Num examples = 100993
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 25249


  0%|          | 0/25249 [00:00<?, ?it/s]

TypeError: vars() argument must have __dict__ attribute