In [1]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers.adapters.composition import Stack
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_data(lang, path="data/train.tsv", frac=0.1, random_state=42):
    """Return the rows of one language from a seeded random subset of the training TSV.

    Args:
        lang: Value to match against the ``language`` column (e.g. ``"en"``).
        path: Tab-separated file to read. Defaults to ``"data/train.tsv"``.
        frac: Fraction of the *whole* file to sample before filtering.
            Defaults to ``0.1`` (10%).
        random_state: Seed passed to ``DataFrame.sample`` so the subset is
            reproducible. Defaults to ``42``.

    Returns:
        pandas.DataFrame with the sampled rows whose ``language`` equals
        ``lang``; empty if the language does not occur in the sample.
    """
    data = pd.read_csv(path, sep="\t")
    # Sample first and filter second: this order decides which rows are drawn,
    # so it is kept to reproduce the subset used with the default arguments.
    sampled = data.sample(frac=frac, random_state=random_state)
    return sampled[sampled["language"] == lang]

In [3]:
def get_languages(path="data/train.tsv"):
    """Return the distinct language codes found in the training TSV.

    Only the ``language`` column is parsed, so the text columns of the file
    are not loaded just to list the languages.

    Args:
        path: Tab-separated file to read. Defaults to ``"data/train.tsv"``.

    Returns:
        numpy.ndarray of the unique ``language`` values, in order of first
        appearance in the file.

    Raises:
        ValueError: Raised by ``pandas.read_csv`` if the file has no
            ``language`` column.
    """
    language_column = pd.read_csv(path, sep="\t", usecols=["language"])["language"]
    return language_column.unique()

# Distinct values of the "language" column in data/train.tsv.
languages = get_languages()

In [4]:
languages

array(['en', 'vi', 'de', 'ar', 'es', 'bg', 'el', 'th', 'ru', 'tr', 'sw',
       'ur', 'zh', 'hi', 'fr'], dtype=object)

In [5]:
# English rows taken from the seeded 10% sample of data/train.tsv.
en_data = get_data("en")

In [6]:
en_data.head()

Unnamed: 0,gold_label,premise,hypothesis,language
64432,neutral,Not to agree to an unsettling and impossible t...,The purpose might be different to what is expe...,en
85813,neutral,He wants to find the secret rational formula t...,"AMC Movie Theaters knows the formula, that is ...",en
92520,entailment,What is reputed to be Gen.,Gen's reputation is a mystery.,en
55356,neutral,it has worked out so he's got a real nice bene...,There are no downsides to being with a big com...,en
68875,contradiction,"Henceforth, this column will refer to the show...",The current name of the show is permanent.,en


In [7]:
# Explicit vocabulary for the three NLI classes and their integer ids.
NLI_LABEL_TO_ID = {"entailment": 0, "neutral": 1, "contradiction": 2}


def encode_nli_labels(raw_labels):
    """Map NLI label strings to integer class ids.

    Values that are already valid ids (0, 1 or 2) pass through unchanged, so
    re-running this cell on an encoded column does not rewrite every label.
    Anything else raises instead of being silently counted as class 2.

    Args:
        raw_labels: Iterable of label strings and/or already-encoded ids.

    Returns:
        list[int] with one class id per input value.

    Raises:
        ValueError: If a value is neither a known label nor a known id.
    """
    known_ids = set(NLI_LABEL_TO_ID.values())
    encoded = []
    for raw in raw_labels:
        if raw in NLI_LABEL_TO_ID:
            encoded.append(NLI_LABEL_TO_ID[raw])
        elif raw in known_ids:
            # Already encoded (e.g. the cell was executed twice).
            encoded.append(int(raw))
        else:
            raise ValueError(
                f"Unexpected gold_label {raw!r}; expected one of {sorted(NLI_LABEL_TO_ID)}"
            )
    return encoded


labels = encode_nli_labels(en_data["gold_label"].values)
en_data["gold_label"] = labels

In [8]:
# Convert the pandas frame into a datasets.Dataset. The DataFrame index is carried
# along as the `__index_level_0__` column. This rebinds `en_data`, so the cell
# expects a DataFrame and is meant to run once per kernel session.
en_data = Dataset.from_pandas(en_data)

In [9]:
en_data

Dataset({
    features: ['gold_label', 'premise', 'hypothesis', 'language', '__index_level_0__'],
    num_rows: 10162
})

In [10]:
from transformers import AutoTokenizer

# Tokenizer for the "xlm-roberta-base" checkpoint (the same checkpoint name the
# model cell loads); encode_batch reads this module-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [11]:
from transformers import AutoConfig, AutoAdapterModel

# Load the configuration for the "xlm-roberta-base" checkpoint, then build the
# adapter-capable model from the same checkpoint with that configuration.
config = AutoConfig.from_pretrained("xlm-roberta-base")
model = AutoAdapterModel.from_pretrained("xlm-roberta-base", config=config)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaAdapterModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaAdapterModel were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for prediction

In [12]:
from transformers import AdapterConfig

# Load the language adapters (English and German wiki adapters from the "ukp"
# source), both with a "pfeiffer" adapter config using reduction_factor=2.
# NOTE(review): only "en" is activated in the visible cells; "de" is presumably
# kept for a later cross-lingual evaluation — confirm or drop the load.
lang_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=2)
model.load_adapter("en/wiki@ukp", config=lang_adapter_config)
model.load_adapter("de/wiki@ukp", config=lang_adapter_config)

# Add a new task adapter named "nli"
model.add_adapter("nli")

# Add a prediction head for our target task.
# NOTE(review): this is a multiple-choice head with 3 choices, not a
# classification head. encode_batch matches it by emitting every example three
# times, paired with the choice strings "0", "1" and "2". NLI is normally
# modelled as single-label 3-way classification; switching to a classification
# head would also require changing encode_batch, so the two must change together.
model.add_multiple_choice_head("nli", num_choices=3)

In [14]:
def encode_batch(examples, max_length=60, choices=("0", "1", "2")):
  """Encode a batch of premise/hypothesis rows in multiple-choice layout.

  For every example the premise and hypothesis are joined with a single space
  and that text is paired with each entry of ``choices``, giving one encoded
  sequence per choice. Uses the module-level ``tokenizer``.

  Args:
    examples: Batch mapping with parallel ``"premise"`` and ``"hypothesis"``
      sequences (the shape ``Dataset.map(..., batched=True)`` passes in).
    max_length: Length every sequence is truncated/padded to. Defaults to 60.
    choices: Text used as the second segment for each choice. Defaults to
      ``("0", "1", "2")``.

  Returns:
    dict with ``"input_ids"`` and ``"attention_mask"``; each holds one entry
    per example, and each entry holds one encoded sequence per choice.
  """
  # Identical for every example, so build it once outside the loop.
  choice_texts = list(choices)
  all_encoded = {"input_ids": [], "attention_mask": []}
  for premise, hypothesis in zip(examples["premise"], examples["hypothesis"]):
    # str() keeps non-string cells from breaking the concatenation
    # (presumably missing values read from the TSV — confirm).
    context = str(premise) + " " + str(hypothesis)
    encoded = tokenizer(
        [context] * len(choice_texts),
        choice_texts,
        max_length=max_length,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
    )
    all_encoded["input_ids"].append(encoded["input_ids"])
    all_encoded["attention_mask"].append(encoded["attention_mask"])
  return all_encoded

def preprocess_dataset(dataset):
  """Tokenize ``dataset`` and expose only the columns used for training.

  Runs ``encode_batch`` over the dataset in batches, renames ``gold_label`` to
  ``labels``, and restricts the output format to ``input_ids``,
  ``attention_mask`` and ``labels``.

  Args:
    dataset: Object providing ``map``, ``rename_column`` and ``set_format``
      with ``premise``, ``hypothesis`` and ``gold_label`` columns.

  Returns:
    The encoded dataset with the renamed label column and restricted format.
  """
  model_columns = ["input_ids", "attention_mask", "labels"]
  # Encode, then give the target column the name "labels".
  prepared = dataset.map(encode_batch, batched=True).rename_column("gold_label", "labels")
  # No `type` is passed, so this call only limits which columns are returned;
  # it does not itself request conversion to PyTorch tensors.
  prepared.set_format(columns=model_columns)
  return prepared

# Tokenize the English split and rename its gold_label column to labels.
dataset_en = preprocess_dataset(en_data)

100%|██████████| 11/11 [00:05<00:00,  2.03ba/s]


In [15]:
# Drop the raw text and bookkeeping columns, leaving input_ids, attention_mask
# and labels. The trainer below is configured with remove_unused_columns=False,
# so it is not asked to drop unused columns itself.
dataset_en = dataset_en.remove_columns(["language", "premise", "hypothesis", "__index_level_0__"])

In [16]:
# Put the model into adapter-training mode for the "nli" task adapter.
# NOTE(review): per the adapter-transformers docs this freezes the pretrained
# weights so only the named adapter is trained — confirm for the pinned version.
model.train_adapter(["nli"])

In [17]:
# Activate the English language adapter stacked with the "nli" task adapter.
# Set after train_adapter(), which presumably resets the active setup — confirm.
model.active_adapters = Stack("en", "nli")

In [18]:
from transformers import TrainingArguments, AdapterTrainer
from datasets import concatenate_datasets

training_args = TrainingArguments(
    # Output location; an existing directory is overwritten.
    output_dir="./training_output",
    overwrite_output_dir=True,
    # Schedule: one epoch, batches of 64 per device, a log entry every 100 steps.
    num_train_epochs=1,
    learning_rate=1e-4,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    logging_steps=100,
    # Important: keep the dataset columns as they are so the labels are
    # properly passed to the model.
    remove_unused_columns=False,
)

# Only a training set is supplied; no evaluation dataset is configured here.
trainer = AdapterTrainer(model=model, args=training_args, train_dataset=dataset_en)

In [None]:
# Run training on the English split with the arguments configured above (1 epoch).
trainer.train()