# Using VB-LoRA for sequence classification

In this example, we fine-tune RoBERTa (`roberta-large`) on a sequence classification task (the GLUE RTE task) using VB-LoRA.

## Imports

In [1]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import (
    get_peft_model,
    VBLoRAConfig,
    PeftType,
)

import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Parameters

In [2]:
# Data / training budget.
batch_size = 32
num_epochs = 40
max_length = 128  # truncation length passed to the tokenizer in `tokenize_function`

# Base checkpoint and GLUE task to fine-tune on.
model_name_or_path = "roberta-large"
task = "rte"

# PEFT method identifier (not referenced by the other cells shown in this notebook).
peft_type = PeftType.VBLORA

# VB-LoRA hyper-parameters forwarded to `VBLoRAConfig` below.
rank = 4
num_vectors = 90
vector_length = 256

# Use the GPU when one is visible; otherwise fall back to the CPU so the notebook
# still runs (slowly) instead of failing at `model.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed PyTorch's global RNG. Kept as the last expression so the cell output is unchanged.
torch.manual_seed(0)

<torch._C.Generator at 0x7f8f2e458750>

In [3]:
# Learning rates for the parameter groups assembled in the optimizer cell:
# the VB-LoRA logits, the VB-LoRA vector bank, and every remaining parameter
# (named `head_lr`; presumably this is mostly the classification head).
logits_lr = 1e-2
vector_bank_lr = 1e-3
head_lr = 2e-3

# Module names that VB-LoRA is applied to.
vblora_target_modules = ["key", "value", "query", "output.dense", "intermediate.dense"]

# Adapter configuration; rank, bank size and vector length come from the parameters cell.
peft_config = VBLoRAConfig(
    task_type="SEQ_CLS",
    r=rank,
    topk=1,
    target_modules=vblora_target_modules,
    num_vectors=num_vectors,
    vector_length=vector_length,
    save_only_topk_weights=True,
    vblora_dropout=0.0,
)

## Loading data

In [4]:
# Checkpoints whose name contains "gpt", "opt" or "bloom" are padded on the left,
# every other checkpoint on the right.
pads_on_left = any(family in model_name_or_path for family in ("gpt", "opt", "bloom"))
padding_side = "left" if pads_on_left else "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)

# Reuse the EOS token for padding when the tokenizer does not define a pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [5]:
# Load the GLUE dataset and the matching GLUE metric for the configured `task`.
datasets = load_dataset(path="glue", name=task)
metric = evaluate.load(path="glue", config_name=task)

In [6]:
def tokenize_function(examples):
    """Tokenize a batch of sentence pairs.

    Encodes `examples["sentence1"]` together with `examples["sentence2"]` and
    truncates each pair to the notebook-level `max_length`. Padding is not
    requested here; batches are padded later in `collate_fn`.
    """
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        max_length=max_length,
    )


# Tokenize in batches and drop the raw text / index columns, then rename the
# 'label' column to 'labels', which is the name the transformers models expect.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
).rename_column("label", "labels")

In [7]:
def collate_fn(examples):
    """Collate tokenized examples into a single batch of PyTorch tensors.

    Delegates to `tokenizer.pad`, padding to the longest sequence in the batch.
    """
    padded_batch = tokenizer.pad(examples, padding="longest", return_tensors="pt")
    return padded_batch


# Build the train / validation dataloaders. Both share the collate function and
# batch size; only the training split is shuffled.
loader_kwargs = {"collate_fn": collate_fn, "batch_size": batch_size}
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, **loader_kwargs)

## Preparing the VB-LoRA model

In [8]:
# Load the pretrained sequence-classification model, then wrap it with the
# VB-LoRA adapter described by `peft_config`.
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    return_dict=True,
    max_length=None,
)
model = get_peft_model(base_model, peft_config)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:

from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.trainer_pt_utils import get_parameter_names

# Names eligible for weight decay: get_parameter_names() is asked to skip
# ALL_LAYERNORM_LAYERS, and any name containing "bias" is filtered out as well.
decay_parameters = [
    name for name in get_parameter_names(model, ALL_LAYERNORM_LAYERS) if "bias" not in name
]
# VB-LoRA specific parameters, identified by substrings of their names.
vector_bank_parameters = [name for name, _ in model.named_parameters() if "vector_bank" in name]
logits_parameters = [name for name, _ in model.named_parameters() if "logits" in name]

# Distribute every parameter over the four optimizer groups in a single pass,
# keeping the model's parameter order inside each group.
head_decay_params = []
head_no_decay_params = []
vector_bank_params = []
logits_params = []
for param_name, param in model.named_parameters():
    in_vector_bank = param_name in vector_bank_parameters
    in_logits = param_name in logits_parameters
    if in_vector_bank:
        vector_bank_params.append(param)
    if in_logits:
        logits_params.append(param)
    if not (in_vector_bank or in_logits):
        if param_name in decay_parameters:
            head_decay_params.append(param)
        else:
            head_no_decay_params.append(param)

optimizer_grouped_parameters = [
    # Remaining parameters that receive weight decay.
    {"params": head_decay_params, "weight_decay": 0.1, "lr": head_lr},
    # Remaining parameters excluded from weight decay.
    {"params": head_no_decay_params, "weight_decay": 0.0, "lr": head_lr},
    # Vector bank: own learning rate, no weight decay.
    {"params": vector_bank_params, "lr": vector_bank_lr, "weight_decay": 0.0},
    # Logits: own learning rate, no weight decay.
    {"params": logits_params, "lr": logits_lr, "weight_decay": 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters)

# Linear schedule with warmup over the first 6% of all optimizer steps.
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * num_training_steps,
    num_training_steps=num_training_steps,
)

## Training

In [10]:
model.to(device)

for epoch in range(num_epochs):
    # ---- training pass: one optimizer and scheduler step per batch ----
    model.train()
    for batch in tqdm(train_dataloader):
        # The result is not reassigned, so this relies on `.to` updating the batch in place.
        batch.to(device)
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- evaluation pass: accumulate predictions into the GLUE metric ----
    model.eval()
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            batch.to(device)
            logits = model(**batch).logits
            metric.add_batch(
                predictions=logits.argmax(dim=-1),
                references=batch["labels"],
            )

    eval_metric = metric.compute()
    print(f"epoch {epoch}:", eval_metric)

  0%|          | 0/78 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.20it/s]


epoch 0: {'accuracy': 0.48014440433212996}


100%|██████████| 78/78 [00:34<00:00,  2.27it/s]
100%|██████████| 9/9 [00:01<00:00,  5.16it/s]


epoch 1: {'accuracy': 0.5270758122743683}


100%|██████████| 78/78 [00:34<00:00,  2.25it/s]
100%|██████████| 9/9 [00:01<00:00,  5.13it/s]


epoch 2: {'accuracy': 0.5523465703971119}


100%|██████████| 78/78 [00:34<00:00,  2.25it/s]
100%|██████████| 9/9 [00:01<00:00,  5.11it/s]


epoch 3: {'accuracy': 0.5379061371841155}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.11it/s]


epoch 4: {'accuracy': 0.6389891696750902}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 5: {'accuracy': 0.776173285198556}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 6: {'accuracy': 0.7870036101083032}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 7: {'accuracy': 0.8411552346570397}


100%|██████████| 78/78 [00:34<00:00,  2.23it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 8: {'accuracy': 0.8231046931407943}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 9: {'accuracy': 0.8122743682310469}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 10: {'accuracy': 0.8483754512635379}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 11: {'accuracy': 0.8483754512635379}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 12: {'accuracy': 0.8736462093862816}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 13: {'accuracy': 0.8592057761732852}


100%|██████████| 78/78 [00:35<00:00,  2.23it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 14: {'accuracy': 0.8339350180505415}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.11it/s]


epoch 15: {'accuracy': 0.8447653429602888}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 16: {'accuracy': 0.8592057761732852}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 17: {'accuracy': 0.8375451263537906}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 18: {'accuracy': 0.8736462093862816}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 19: {'accuracy': 0.851985559566787}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 20: {'accuracy': 0.8592057761732852}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 21: {'accuracy': 0.855595667870036}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 22: {'accuracy': 0.8267148014440433}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 23: {'accuracy': 0.8411552346570397}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 24: {'accuracy': 0.8375451263537906}


100%|██████████| 78/78 [00:35<00:00,  2.23it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 25: {'accuracy': 0.8375451263537906}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 26: {'accuracy': 0.8447653429602888}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 27: {'accuracy': 0.8339350180505415}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 28: {'accuracy': 0.8628158844765343}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 29: {'accuracy': 0.8592057761732852}


100%|██████████| 78/78 [00:34<00:00,  2.23it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 30: {'accuracy': 0.8483754512635379}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 31: {'accuracy': 0.8447653429602888}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 32: {'accuracy': 0.8447653429602888}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 33: {'accuracy': 0.855595667870036}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 34: {'accuracy': 0.8447653429602888}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


epoch 35: {'accuracy': 0.8483754512635379}


100%|██████████| 78/78 [00:34<00:00,  2.24it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 36: {'accuracy': 0.8628158844765343}


100%|██████████| 78/78 [00:34<00:00,  2.23it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 37: {'accuracy': 0.8483754512635379}


100%|██████████| 78/78 [00:34<00:00,  2.23it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]


epoch 38: {'accuracy': 0.8483754512635379}


100%|██████████| 78/78 [00:34<00:00,  2.23it/s]
100%|██████████| 9/9 [00:01<00:00,  5.09it/s]

epoch 39: {'accuracy': 0.851985559566787}





## Share adapters on the 🤗 Hub

In [11]:
# Placeholder: replace `...` with your own account ID before running the cells below.
account_id = ...  # your Hugging Face Hub account ID

In [None]:
# Upload the trained adapter to the Hub under the account set in `account_id`.
hub_repo_id = f"{account_id}/roberta-large-peft-vblora"
model.push_to_hub(hub_repo_id)

## Load adapters from the Hub

You can also directly load adapters from the Hub using the commands below:

In [14]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoTokenizer

In [15]:
peft_model_id = f"{account_id}/roberta-large-peft-vblora"
config = PeftConfig.from_pretrained(peft_model_id)

# Load the base model and tokenizer from the checkpoint named in the adapter config.
base_checkpoint = config.base_model_name_or_path
inference_model = AutoModelForSequenceClassification.from_pretrained(base_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Load the adapter stored under `peft_model_id` on top of the base model loaded above.
# NOTE(review): this rebinds `inference_model`, so re-running only this cell would pass the
# already wrapped model back in; re-run the previous cell first.
inference_model = PeftModel.from_pretrained(inference_model, peft_model_id)

In [17]:
inference_model.to(device)
inference_model.eval()

# Score the reloaded adapter on the validation dataloader without tracking gradients.
with torch.no_grad():
    for batch in tqdm(eval_dataloader):
        # The result is not reassigned, so this relies on `.to` updating the batch in place.
        batch.to(device)
        logits = inference_model(**batch).logits
        metric.add_batch(
            predictions=logits.argmax(dim=-1),
            references=batch["labels"],
        )

eval_metric = metric.compute()
print(eval_metric)

  0%|          | 0/9 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 9/9 [00:01<00:00,  5.32it/s]

{'accuracy': 0.851985559566787}



