In [1]:
!pip install -Uq apache_beam mwparserfromhell

In [22]:
from itertools import chain

import datasets
from datasets import load_dataset, load_metric

import transformers
import transformers.adapters.composition as ac
from transformers import (
    AdapterTrainer,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    MultiLingAdapterArguments,
    TrainingArguments,
    set_seed,
)
from transformers.adapters.configuration import AdapterConfig
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version

import gc
import sys

In [23]:
set_seed(42069)

In [24]:
valid_percentage = 20
language_code = "am"
task_name = language_code + "_mlm"
date = "20221120"

training_args = TrainingArguments(language_code + '_output',
    learning_rate=1e-4,
    num_train_epochs=10.0, report_to="all", 
    gradient_accumulation_steps=4,
    per_device_train_batch_size=2, per_device_eval_batch_size=4)

In [25]:
raw_datasets = load_dataset(
    "wikipedia", language=language_code, date=date, beam_runner='DirectRunner'
)
raw_datasets["validation"] = load_dataset(
    "wikipedia", language=language_code, date=date, 
    split=f"train[:{valid_percentage}%]", beam_runner='DirectRunner'
)
raw_datasets["train"] = load_dataset(
    "wikipedia", language=language_code, date=date, 
    split=f"train[{valid_percentage}%:]", beam_runner='DirectRunner'
)
len(raw_datasets["validation"]), len(raw_datasets["train"]), len(raw_datasets["validation"]) + len(raw_datasets["train"])

Using custom data configuration 20221120.am-date=20221120,language=am
Found cached dataset wikipedia (C:/Users/Thomas/.cache/huggingface/datasets/wikipedia/20221120.am-date=20221120,language=am/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration 20221120.am-date=20221120,language=am
Found cached dataset wikipedia (C:/Users/Thomas/.cache/huggingface/datasets/wikipedia/20221120.am-date=20221120,language=am/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)
Using custom data configuration 20221120.am-date=20221120,language=am
Found cached dataset wikipedia (C:/Users/Thomas/.cache/huggingface/datasets/wikipedia/20221120.am-date=20221120,language=am/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


(2740, 10960, 13700)

In [26]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")
# model.resize_token_embeddings(len(tokenizer))

In [27]:
adapter_config = AdapterConfig.load("pfeiffer") # AdapterConfig.load("pfeiffer+inv")
model.add_adapter(task_name, config=adapter_config)

In [28]:
model.train_adapter([task_name])
model.set_active_adapters(task_name)

In [29]:
column_names = raw_datasets["train"].column_names
text_column_name = "text"
max_seq_length = tokenizer.model_max_length

In [30]:
def tokenize_function(examples):
    return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=column_names
)

Loading cached processed dataset at C:/Users/Thomas/.cache/huggingface/datasets/wikipedia/20221120.am-date=20221120,language=am/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559\cache-c997e72a580614a1.arrow
Loading cached processed dataset at C:/Users/Thomas/.cache/huggingface/datasets/wikipedia/20221120.am-date=20221120,language=am/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559\cache-15ca0dcbcc2c756a.arrow


In [31]:
def group_texts(examples):
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= max_seq_length:
        total_length = (total_length // max_seq_length) * max_seq_length
    result = {
        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated_examples.items()
    }
    return result

tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True
)

Loading cached processed dataset at C:/Users/Thomas/.cache/huggingface/datasets/wikipedia/20221120.am-date=20221120,language=am/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559\cache-83306ecabf824b2c.arrow
Loading cached processed dataset at C:/Users/Thomas/.cache/huggingface/datasets/wikipedia/20221120.am-date=20221120,language=am/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559\cache-191f54503cc493bf.arrow


In [32]:
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]
def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)

metric = load_metric("accuracy")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    labels = labels.reshape(-1)
    preds = preds.reshape(-1)
    mask = labels != -100
    labels = labels[mask]
    preds = preds[mask]
    return metric.compute(predictions=preds, references=labels)

  metric = load_metric("accuracy")


In [33]:
data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=0.15)

In [None]:
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)


In [None]:
train_result = trainer.train()
trainer.save_model()

metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)

trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
metrics = trainer.evaluate()

metrics["eval_samples"] = len(eval_dataset)

try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    perplexity = float("inf")
    
metrics["perplexity"] = perplexity

trainer.save_metrics("eval", metrics)

In [35]:
perplexity=1.1111

In [42]:
model.save_adapter(language_code + f'_output/lm_{language_code}_{perplexity:.3f}', task_name)

adapter_loc = f'{language_code}_output/lm_{language_code}_{perplexity:.3f}'

In [46]:
this = sys.modules[__name__]
for n in dir():
    if n[0]!='_' and n not in ['this', 'adapter_loc', 'gc']: delattr(this, n)
del n
del this
gc.collect()

NameError: name 'gc' is not defined

In [48]:
import pandas as pd
from transformers import AutoAdapterModel, AdapterConfig, AutoTokenizer, AutoConfig
import torch
import numpy as np
from sklearn.metrics import f1_score
from transformers.adapters.composition import Stack

In [49]:
en_train = pd.read_csv('translated_train_all.csv')
en_test = pd.read_csv('translated_test_all.csv')

am_train = pd.read_csv('am_train_translated.csv')
am_train, am_dev, am_test = np.split(
    am_train.sample(frac=1, random_state=42), [int(.7*len(am_train)), int(.8*len(am_train))])

In [50]:
combined_train = pd.concat([
    en_train[['text', 'labels']]
])

combined_test = pd.concat([
    am_test[['eng_translated', 'label']].rename(columns={'eng_translated':'text', 'label':'labels'}),
    am_dev[['eng_translated', 'label']].rename(columns={'eng_translated':'text', 'label':'labels'}),
    en_test[['text', 'labels']]
])
test_split_lengths = [('am_test', len(am_test)), ('am_dev', len(am_dev)), ('en_test', len(en_test))]

In [51]:

label2id = {"positive":0, "neutral":1, 'negative':2}
id2label = {0:"positive", 1:"neutral", 2:'negative'}

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def encode_batch(row):
    text = ' '.join(filter(lambda x:x[0]!='@', row.text.split() if type(row.text)==str else []))
    out = tokenizer(text, max_length=100, truncation=True, padding="max_length", return_tensors='pt')
    out['labels'] = torch.LongTensor([label2id[row.labels]])[0]
    return out

train = combined_train.apply(encode_batch, axis=1).reset_index()[0]
test = combined_test.apply(encode_batch, axis=1).reset_index()[0]

len(train), len(test)

(9583, 4934)

In [57]:
config = AutoConfig.from_pretrained(
    "xlm-roberta-base",
)
model = AutoAdapterModel.from_pretrained(
    "xlm-roberta-base",
    config=config,
)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaAdapterModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaAdapterModel were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for prediction

In [58]:
lang_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=2)
en = model.load_adapter("en/wiki@ukp", config=lang_adapter_config)
task_adapter = model.load_adapter(adapter_loc, config=lang_adapter_config)

model.add_adapter("sa")
model.train_adapter("sa")

model.add_classification_head("sa", num_labels=3)
model.set_active_adapters("sa")

model.active_adapters = Stack(en, "sa")

In [59]:
import numpy as np
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction

training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=7,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True
#     # The next line is important to ensure the dataset labels are properly passed to the model
#     remove_unused_columns=False,
)

def compute_scores(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    i, output = 0, dict()
    for name, split_length in test_split_lengths:
        s = np.s_[i:i+split_length]
        split_preds = preds[s]
        split_labels = p.label_ids[s]
        output[f'{name}_acc'] = (split_preds==split_labels).mean()
        output[f'{name}_weighted_f1'] = f1_score(split_labels, split_preds, average='weighted')
        i += split_length
    return output

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    compute_metrics=compute_scores,
)

In [60]:
trainer.train()

***** Running training *****
  Num examples = 9583
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2100


Step,Training Loss
200,0.8615
400,0.627
422,0.627


KeyboardInterrupt: 

In [61]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 4934
  Batch size = 32


{'eval_loss': 0.8290937542915344,
 'eval_am_test_acc': 0.48705096073517123,
 'eval_am_test_weighted_f1': 0.4924214692738381,
 'eval_am_dev_acc': 0.5091819699499165,
 'eval_am_dev_weighted_f1': 0.5092245327146802,
 'eval_en_test_acc': 0.7673677501593371,
 'eval_en_test_weighted_f1': 0.7665843056520154}

In [62]:
del trainer
gc.collect()

NameError: name 'gc' is not defined

In [63]:
model.active_adapters = Stack(task_adapter, "sa")

In [64]:
combined_test = pd.concat([
    am_test[['tweet', 'label']].rename(columns={'tweet':'text', 'label':'labels'}),
    am_dev[['tweet', 'label']].rename(columns={'tweet':'text', 'label':'labels'}),
    en_test[['translated', 'labels']].rename(columns={'translated':'text'})
])
test = combined_test.apply(encode_batch, axis=1).reset_index()[0]

In [65]:
eval_trainer = AdapterTrainer(
    model=model,
    args=TrainingArguments(output_dir="./eval_output", remove_unused_columns=False,),
    eval_dataset=test,
    compute_metrics=compute_scores
)
eval_trainer.evaluate()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Evaluation *****
  Num examples = 4934
  Batch size = 8


{'eval_loss': 1.113443374633789,
 'eval_am_test_acc': 0.5639097744360902,
 'eval_am_test_weighted_f1': 0.5625499693255975,
 'eval_am_dev_acc': 0.5575959933222037,
 'eval_am_dev_weighted_f1': 0.556561055231311,
 'eval_en_test_acc': 0.6124920331421287,
 'eval_en_test_weighted_f1': 0.5802936844466843,
 'eval_runtime': 18.0254,
 'eval_samples_per_second': 273.724,
 'eval_steps_per_second': 34.229}