In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "5"

In [1]:
from transformers import BertTokenizerFast
from transformers import (
    Seq2SeqAdapterTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EncoderDecoderModel,
    AdapterConfig,
    AutoTokenizer
)
import datasets
from transformers.adapters.training import AdapterArguments, setup_adapter_training

In [2]:
dataset = datasets.load_dataset("../src/data/newsela")

Repo card metadata block was not found. Setting CardData to empty.


In [3]:
dataset["train"] = dataset["train"].select([*range(0, 10, 1)])
dataset["validation"] = dataset["validation"].select([*range(0, 10, 1)])
dataset["test"] = dataset["test"].select([*range(0, 10, 1)])

In [4]:
print(dataset['train']['src'][0])
print(dataset['train']['tgt'][0])

It found that eight of the 60 countries with particularly high child mortality had lowered their rates by more than two-thirds since 1990: Malawi, Bangladesh, Liberia, Tanzania, Ethiopia, East Timor, Niger and Eritrea.
It found that eight of the 60 countries with particularly high child death rates had lowered their rates by more than two-thirds since 1990: Malawi, Bangladesh, Liberia, Tanzania, Ethiopia, East Timor, Niger and Eritrea.


In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

In [6]:
tokenized_inputs = dataset["train"].map(
    lambda x: tokenizer(x["src"], truncation=True),
    batched=True,
    remove_columns=["src", "tgt"],
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])

In [7]:
tokenized_targets = dataset["train"].map(
    lambda x: tokenizer(x["tgt"], truncation=True),
    batched=True,
    remove_columns=["src", "tgt"],
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])

In [8]:
print(max_source_length)
print(max_target_length)

51
63


In [9]:
def preprocess_function(examples, padding="max_length"):
    inputs = examples["src"]
    targets = examples["tgt"]
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
    labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
    
    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [10]:
tokenized_dataset = dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=["src", "tgt"]
)

# EncoderDecoder Model

In [None]:
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-multilingual-cased",
    "bert-base-multilingual-cased",
    tie_encoder_decoder=True
)

model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

# This is new
bad_words = ['[CLS]']
bad_words_ids = [tokenizer.vocab[token] for token in bad_words]
model.config.bad_words_ids = [bad_words_ids]

#model.model_name = "bert2bert"
#model.config.hidden_size = 768

In [None]:
model

# BertGeneration (Alternative)

In [11]:
from transformers import BertGenerationEncoder, BertGenerationDecoder

In [12]:
encoder = BertGenerationEncoder.from_pretrained(
    "bert-base-multilingual-cased",
    bos_token_id=101,
    eos_token_id=102
)
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained(
    "bert-base-multilingual-cased",
    add_cross_attention=True,
    is_decoder=True,
    bos_token_id=101,
    eos_token_id=102
)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size
#model.model_name = "bert2bert"
#model.config.hidden_size = 768

You are using a model of type bert to instantiate a model of type bert-generation. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertGenerationEncoder: ['bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'bert.embeddings.token_type_embeddings.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertGenerationEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertGenerationEncoder from the checkpoint of a model that you expect t

# Adapters

In [13]:
from transformers.adapters.training import AdapterArguments, setup_adapter_training
from transformers.adapters.configuration import AdapterConfig
from transformers.adapters import Stack

## TA Setup using setup_adapter_training

In [14]:
# Without language adapter
setup_adapter_training(
    model=model,
    adapter_args=AdapterArguments(train_adapter=True),
    adapter_name="simplification",
    adapter_config_kwargs={"reduction_factor": 2}
)

('simplification', None)

In [15]:
model.has_adapters()

True

## Manual TA setup

In [None]:
adapter_config = AdapterConfig.load(
    "pfeiffer",
    reduction_factor=2,
)

# Add a new adapter
model.add_adapter("simplification", config=adapter_config, set_active=True)

#model.add_causal_lm_head("simplification", overwrite_ok=True)

# Activate the adapter
model.train_adapter(["simplification"])

In [None]:
model

## Manual TA setup with LA adapter

In [None]:
lang_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=2)
lang_adapter = model.load_adapter(
    "en/wiki@ukp",
    config=lang_adapter_config,
    model_name="bert-base-multilingual-cased"
)

adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=2)
model.add_adapter("simplification", config=adapter_config)

model.train_adapter(["simplification"])
model.active_adapters = Stack(lang_adapter, "simplification")

## TA Setup using setup_adapter_training and LA

In [None]:
setup_adapter_training(
    model=model,
    adapter_args=AdapterArguments(
        train_adapter=True,
        load_lang_adapter="en/wiki@ukp",
        lang_adapter_config="pfeiffer"
    ),
    adapter_name="simplification",
    adapter_config_kwargs={"reduction_factor": 2},
    #adapter_load_kwargs={"model_name": "bert-base-multilingual-cased"}
)

# Training

In [None]:
# Custom optimizer
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer=AdamW(model.parameters(), lr=0.00025)
scheduler=get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=2000,
    num_training_steps=3000)
optimizers = optimizer, scheduler

In [16]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8
)

In [17]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="no",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    #fp16=True, 
    output_dir="./bert2bert/",
    max_steps=3000,
    logging_steps=50,
    save_strategy="no",
    # eval_steps=50,
    learning_rate=3e-4,
    warmup_ratio=0.1,
    optim="adamw_torch"
)

trainer = Seq2SeqAdapterTrainer(
    tokenizer=tokenizer,
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    #optimizers=(optimizers)
)
train_results = trainer.train()

max_steps is given, it will override any value given in num_train_epochs
The following columns in the training set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10
  Num Epochs = 3000
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3000
  Number of trainable parameters = 0
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
trainer.save_model()

metrics = train_results.metrics
trainer.log_metrics(split="train", metrics=metrics)
trainer.save_metrics(split="train", metrics=metrics)
trainer.save_state()
model.save_adapter("./bert2bert/simplification", "simplification")

## Predictions

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
model = EncoderDecoderModel.from_pretrained("./bert2bert/")

In [None]:
predict_results = trainer.predict(tokenized_dataset['train'].select(range(5)), max_length=50, num_beams=5)
predictions = tokenizer.batch_decode(
    predict_results.predictions,
    # Should set to true
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True
)

In [None]:
for x, y in zip(dataset['train'].select(range(10)), predictions):
    print('IN:', x['src'])
    print('TGT:', x['tgt'])
    print('OUT:', y)
    print('='*10)