In [None]:
%%capture
!pip install adapters datasets

# Required imports

In [None]:
import math
import os
from itertools import chain

import torch
from adapters import AdapterTrainer, AdapterConfig, init
from adapters.composition import Stack
from datasets import load_dataset
from huggingface_hub import login
from transformers import set_seed, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, TrainingArguments

# Training the task adapter

In [None]:
# --- Run configuration: everything a reader might want to tune lives here ---

# Seed shared by every seeded operation in the notebook.
SEED = 42

# Hugging Face Hub access token. Read it from the HF_TOKEN environment variable
# so the secret never has to be written into the notebook; the placeholder is
# only a fallback and must be replaced (or the variable set) before logging in.
HF_KEY = os.environ.get("HF_TOKEN", "hf_xxx")

# Hub / filesystem locations (placeholders to be filled in by the user).
hf_dataset_path = "xxx"          # dataset repository to train on
output_dir = "./output_dir"      # where the trainer writes its outputs
hf_domain_adapter_path = "xxx"   # pre-trained domain adapter to load
adapter_save_name = "xxx"        # adapter name used when pushing to the Hub
hf_adapter_upload_path = "xxx"   # Hub repository the adapter is pushed to

# Adapter architecture and optimisation hyper-parameters.
adapter_type = "pfeiffer"
lr = 1e-4
num_epochs = 5
r_factor = 16                    # passed as the adapter's reduction_factor

In [None]:
# Seed the random number generators with the configured SEED.
set_seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when CUDA reports one as available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Authenticate against the Hugging Face Hub with the configured token.
login(HF_KEY)

# Name of the pretrained checkpoint used for both tokenizer and model.
checkpoint = "gpt2-medium"

In [None]:
# Load the "train" split of the configured dataset and shuffle it with SEED.
qa_dataset = load_dataset(hf_dataset_path, split="train")
qa_dataset = qa_dataset.shuffle(seed=SEED)

# Hold out 15% of the rows as the "test" split. The seed is passed explicitly
# so the partition does not depend on whatever state the global RNG happens to
# be in when this cell is (re-)run.
qa_dataset = qa_dataset.train_test_split(test_size=0.15, seed=SEED)

In [None]:
def preprocess(examples):
  """Add a "text" entry that joins the record's question and answer.

  The record is modified in place and the same object is returned. The text
  is "Question: " + question + "Answer: " + answer.

  NOTE(review): no space or newline separates the question from "Answer: ";
  confirm this is the intended prompt format before changing it.
  """
  pieces = ["Question: ", examples["question"], "Answer: ", examples["answer"]]
  examples["text"] = "".join(pieces)
  return examples
# Run preprocess over the dataset so every record gains the combined "text" field.
qa_dataset = qa_dataset.map(preprocess)

In [None]:
# Load the tokenizer belonging to the configured model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def encode_batch(batch):
  """Tokenize the "text" entries of a batch with the notebook's tokenizer.

  Truncation is enabled with a maximum length of 256; the tokenizer's output
  is returned unchanged.
  """
  return tokenizer(batch["text"], truncation=True, max_length=256)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize in batches and drop every original column of the train split,
# so only the tokenizer's outputs remain.
column_names = qa_dataset["train"].column_names
qa_data = qa_dataset.map(
  encode_batch,
  batched=True,
  remove_columns=column_names,
)

In [None]:
# Number of tokens in every training chunk produced by group_texts.
block_size = 64


def group_texts(examples):
  """Concatenate a batch of tokenized sequences and cut it into fixed-size chunks.

  Args:
    examples: mapping from column name to a list of per-example lists.
      All columns are expected to have the same total length; it must
      contain "input_ids".

  Returns:
    A dict with the same keys, each holding a list of chunks of exactly
    ``block_size`` items, plus a "labels" key that is a copy of the
    "input_ids" chunk list. The tail that does not fill a whole chunk is
    dropped.
  """
  # Flatten each column in linear time (sum(lists, []) re-copies the growing
  # list on every addition and is quadratic in the batch size).
  concatenated_examples = {
    key: list(chain.from_iterable(examples[key])) for key in examples.keys()
  }
  first_key = list(examples.keys())[0]
  total_length = len(concatenated_examples[first_key])
  # Round down to a multiple of block_size; the remainder is discarded.
  total_length = (total_length // block_size) * block_size
  result = {
    key: [tokens[start : start + block_size] for start in range(0, total_length, block_size)]
    for key, tokens in concatenated_examples.items()
  }
  # Labels mirror the input ids for language modelling.
  result["labels"] = result["input_ids"].copy()
  return result

# Regroup the tokenized data into block_size chunks, batch by batch.
qa_data = qa_data.map(group_texts, batched=True)

# Return the three model inputs as torch tensors.
qa_data.set_format(
  type="torch",
  columns=["input_ids", "attention_mask", "labels"],
)

In [None]:
# Load the base causal language model and enable adapter support on it.
model = AutoModelForCausalLM.from_pretrained(checkpoint)
init(model)

# Load the pre-trained domain adapter under the name "domain", without a head.
# The path comes from hf_domain_adapter_path in the configuration cell
# (the previously referenced name hf_adapter_path was never defined).
model.load_adapter(hf_domain_adapter_path, load_as="domain", with_head=False)

In [None]:
# Create a new adapter named "task" with the configured architecture and
# reduction factor, and select it for training.
adapter_config = AdapterConfig.load(adapter_type, reduction_factor=r_factor)
model.add_adapter("task", config=adapter_config)
model.train_adapter("task")

# Activate the domain adapter and the task adapter as a stack
# (Stack is imported from adapters.composition).
model.active_adapters = Stack("domain", "task")

# Move the whole model, adapters included, to the selected device.
# adapter_to() expects an adapter *name* as its first argument, so passing
# the device there did not move anything.
model.to(device)
# print(model.adapter_summary())

In [None]:
# Training configuration. The seed is passed explicitly so the trainer uses the
# notebook's SEED rather than its own default when it re-seeds on construction.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    do_train=True,
    # Keep all dataset columns; the grouped data only holds model inputs.
    remove_unused_columns=False,
    learning_rate=lr,
    num_train_epochs=num_epochs,
    seed=SEED,
    report_to="none",
)

# Trainer over the train/test splits; mlm=False selects causal
# (non-masked) language-modelling collation.
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=qa_data["train"],
    eval_dataset=qa_data["test"],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# Run training with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on the held-out split and report perplexity = exp(eval loss).
eval_result = trainer.evaluate()
eval_loss = eval_result["eval_loss"]
perplexity = math.exp(eval_loss)
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# Upload the adapter to the Hub repository given by hf_adapter_upload_path.
# NOTE(review): adapter_name presumably has to match an adapter present on the
# model (this notebook only creates "domain" and "task") - confirm that
# adapter_save_name is set accordingly.
model.push_adapter_to_hub(hf_adapter_upload_path, adapter_name=adapter_save_name)
# model.save_adapter("legal_domain", "domain")