In [6]:
!pip install peft evaluate >> /dev/null

This guide shows you how to train a `roberta-large` model (you can also use any of the GPT, OPT, or BLOOM models) with p-tuning on the MRPC configuration of the GLUE benchmark.

In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptEncoderConfig,
)
from datasets import load_dataset
import evaluate
import torch

# Base checkpoint and GLUE configuration used by the cells below.
model_name_or_path, task = "roberta-large", "mrpc"

# Optimisation hyper-parameters.
# NOTE(review): the TrainingArguments cell further down hard-codes
# num_train_epochs=2 instead of reading num_epochs -- confirm which is intended.
num_epochs, lr = 5, 1e-3
batch_size = 32

In [2]:
# Load GLUE by its hub name and the configured task instead of an absolute path
# into one user's cache directory, so the notebook runs on any machine.
# All splits are loaded because later cells index the result by split name.
dataset = load_dataset("glue", task)

# Peek at one training example to see the available fields.
dataset["train"][0]

Found cached dataset arrow (/home/kamal/.cache/huggingface/datasets/arrow/mrpc-04ea723e68ea8cd6/0.0.0/74f69db2c14c2860059d39860b1f400a03d11bf7fb5a8258ca38c501c878c137)


  0%|          | 0/3 [00:00<?, ?it/s]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [3]:
# Metric object for the selected GLUE task; used by compute_metrics below.
metric = evaluate.load("glue", config_name=task)

In [4]:
# Copied
import numpy as np

def compute_metrics(eval_pred):
    """Score a batch of model outputs with the module-level ``metric``.

    Args:
        eval_pred: A two-item iterable ``(predictions, labels)``. The
            predictions are reduced with an argmax over axis 1, so they are
            presumably per-class scores of shape (n_examples, n_classes).

    Returns:
        Whatever ``metric.compute`` returns for the predicted class indices
        and the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [5]:
# GPT / OPT / BLOOM checkpoints are padded on the left; every other checkpoint
# name falls back to right padding.
padding_side = (
    "left"
    if any(family in model_name_or_path for family in ("gpt", "opt", "bloom"))
    else "right"
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)

In [6]:
# If the tokenizer defines no padding token, reuse the end-of-sequence token id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def tokenize_function(examples):
    """Tokenize a batch of sentence pairs with the module-level ``tokenizer``.

    Args:
        examples: Mapping with ``"sentence1"`` and ``"sentence2"`` entries,
            passed to the tokenizer as the first and second text arguments.

    Returns:
        The tokenizer's output for the pairs, with truncation enabled.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    # max_length=None => use the model max length (it's actually the default)
    return tokenizer(first_sentences, second_sentences, truncation=True, max_length=None)

In [7]:
# Tokenize every split in batches, drop the raw text / index columns, and
# rename "label" to "labels" in one chained expression.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
).rename_column("label", "labels")

Loading cached processed dataset at /home/kamal/.cache/huggingface/datasets/arrow/mrpc-04ea723e68ea8cd6/0.0.0/74f69db2c14c2860059d39860b1f400a03d11bf7fb5a8258ca38c501c878c137/cache-4009905aefe5f717.arrow


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Loading cached processed dataset at /home/kamal/.cache/huggingface/datasets/arrow/mrpc-04ea723e68ea8cd6/0.0.0/74f69db2c14c2860059d39860b1f400a03d11bf7fb5a8258ca38c501c878c137/cache-5c895c56b9867ec8.arrow


In [8]:
# Collator built with padding="longest"; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest",
)

P-tuning uses a prompt encoder to optimize the prompt parameters, so you’ll need to initialize the PromptEncoderConfig with several arguments:

**task_type:** the type of task you’re training on, in this case it is sequence classification or SEQ_CLS

**num_virtual_tokens:** the number of virtual tokens to use, or in other words, the length of the learned prompt

**encoder_hidden_size:** the hidden size of the encoder used to optimize the prompt parameters

In [9]:
# P-tuning configuration: sequence-classification task, 20 virtual tokens,
# and a prompt encoder with hidden size 128.
peft_config = PromptEncoderConfig(
    task_type="SEQ_CLS",
    num_virtual_tokens=20,
    encoder_hidden_size=128,
)

In [10]:
# Base sequence-classification model that the PEFT wrapper is applied to later.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    return_dict=True,
    resume_download=True,
)

Downloading model.safetensors:  91%|#########1| 1.30G/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_int8_training, get_peft_config, prepare_model_for_kbit_training

# 4-bit NF4 quantization settings with bfloat16 compute.
# NOTE(review): quant_config is not passed to any model in the cells visible
# here -- confirm whether it should be given to from_pretrained or removed.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Display the resulting configuration as the cell output.
quant_config

BitsAndBytesConfig {
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": false,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}

In [12]:
# Wrap the base model according to peft_config and report how many parameters
# are trainable.
# NOTE(review): this rebinds `model`, so re-running the cell passes the already
# wrapped model back into get_peft_model -- reload the base model first.
model = get_peft_model(model=model, peft_config=peft_config)
model.print_trainable_parameters()

trainable params: 2,403,588 || all params: 356,713,732 || trainable%: 0.6738142617957864


In [13]:
# Learning rate and batch sizes are read from the configuration cell so each
# hyper-parameter is defined in exactly one place.
training_args = TrainingArguments(
    output_dir="/home/kamal/training_files/roberta-large-peft-p-tuning",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # NOTE(review): the configuration cell sets num_epochs = 5, but the recorded
    # run used 2 epochs; kept at 2 to match the recorded outputs and the
    # checkpoint referenced later -- confirm which value is intended.
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and save once per epoch so the best checkpoint can be restored
    # when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [14]:
# Assemble the Trainer from the PEFT-wrapped model, the tokenized splits and
# the helpers defined in earlier cells.
# NOTE(review): the "test" split is used for per-epoch evaluation, and
# load_best_model_at_end=True selects the checkpoint on it -- consider the
# "validation" split for model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [15]:
# Run training; the returned value is displayed as the cell output.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.619769,0.686377,0.802771
2,No log,0.599194,0.689855,0.805808


TrainOutput(global_step=230, training_loss=0.6460639621900476, metrics={'train_runtime': 88.2541, 'train_samples_per_second': 83.124, 'train_steps_per_second': 2.606, 'total_flos': 1074939532117632.0, 'train_loss': 0.6460639621900476, 'epoch': 2.0})

In [17]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint directory under the TrainingArguments output_dir; 230 matches the
# global_step reported by trainer.train() above.
peft_model_id = "/home/kamal/training_files/roberta-large-peft-p-tuning/checkpoint-230/"
# NOTE(review): `config` is not used by any cell visible here; the inference
# cells call the in-memory `model` -- confirm whether a reload step is missing.
config = PeftConfig.from_pretrained(peft_model_id)

In [20]:
# Human-readable names for the two class indices, in index order.
classes = ["not equivalent", "equivalent"]

sentence1 = "Coast redwood trees are the tallest trees on the planet and can grow over 300 feet tall."
sentence2 = "The coast redwood trees, which can attain a height of over 300 feet, are the tallest trees on earth."

# Encode the pair and move the tensors to the device that holds the model's
# parameters, rather than assuming a CUDA device is present.
inputs = tokenizer(
    sentence1,
    sentence2,
    truncation=True,
    padding="longest",
    return_tensors="pt",
).to(next(model.parameters()).device)

In [21]:
# Put the model in evaluation mode so train-only layers such as dropout are
# disabled and the prediction does not depend on the mode training left it in.
model.eval()

# No gradients are needed for a forward pass used only for prediction.
with torch.no_grad():
    outputs = model(**inputs).logits
    print(outputs)

# Turn the logits of the single input pair into per-class probabilities.
paraphrased_text = torch.softmax(outputs, dim=1).tolist()[0]
for i, class_name in enumerate(classes):
    print(f"{class_name}: {int(round(paraphrased_text[i] * 100))}%")

tensor([[-0.5868, -0.3539]], device='cuda:0')
not equivalent: 44%
equivalent: 56%
