# Lightweight Fine-Tuning a BERT foundation model

* Foundation model: distilbert-bert-uncased
* PEFT technique: LoRA
* Fine-tuning dataset: https://huggingface.co/datasets/dair-ai/emotion

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset, Dataset
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import numpy as np
import json

### Load BERT model and tokenizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=6,
    id2label={0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise"},  
    label2id={"sadness": 0, "joy": 1, "love": 2, "anger": 3, "fear": 4, "surprise": 5}
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Load and prepare dataset

In [3]:
splits = ["train", "test"]
ds = {split: ds for split, ds in zip(splits, load_dataset("emotion", split=splits))}

dataset_train = ds["train"].shuffle(seed=42).select(range(2000))
dataset_test = ds["test"].select(range(500))

In [4]:
dataset_train[1]

{'text': 'i had pocket qq and was feeling pretty confident lol', 'label': 1}

### Save and load test dataset to be sure to use the same data for comparing base model and lora model later

In [7]:
with open("../data/dataset_test.json", 'w') as f:
    json.dump(dataset_test.to_dict(), f)

In [16]:
testdata_dict = json.load(open("../data/dataset_test.json"))
dataset_test = Dataset.from_dict(testdata_dict)

### Tokenize datasets

In [5]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length")

In [6]:
tokenized_dataset_train = dataset_train.map(preprocess_function, batched=True)
tokenized_dataset_test = dataset_test.map(preprocess_function, batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [7]:
tokenized_dataset_train[1]

{'text': 'i had pocket qq and was feeling pretty confident lol',
 'label': 1,
 'input_ids': [101,
  1045,
  2018,
  4979,
  1053,
  4160,
  1998,
  2001,
  3110,
  3492,
  9657,
  8840,
  2140,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [24]:
tokenized_ds_train_lora = tokenized_dataset_train.remove_columns(['text'])
tokenized_ds_test_lora = tokenized_dataset_test.remove_columns(['text'])

In [43]:
tokenized_ds_train_lora[0]

{'label': 4,
 'input_ids': [101,
  2096,
  9670,
  1999,
  1996,
  2406,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


### Define metrics and evaluate foundation model

In [13]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

In [15]:
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./data/emotion_classification",
        learning_rate=2e-3,
        per_device_train_batch_size=10,
        per_device_eval_batch_size=10,
        num_train_epochs=3,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

eval_results_base_model = trainer.evaluate()

  trainer = Trainer(


In [16]:
eval_results_base_model["eval_accuracy"]

0.154

### LoRA Fine-Tuning

#### Define and train LoRA model

In [8]:
from peft import LoraConfig, get_peft_model

In [19]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_lin", "k_lin", "v_lin"],  # layers to train
    task_type='SEQ_CLS',
    lora_dropout=0.1,
    bias="none",
    modules_to_save=[],
)

In [20]:
lora_model = get_peft_model(model, lora_config)

In [21]:
lora_model.print_trainable_parameters()

trainable params: 1,037,574 || all params: 67,995,660 || trainable%: 1.5259


In [25]:
lora_trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./data/emotion_classification",
        learning_rate=2e-3,
        per_device_train_batch_size=10,
        per_device_eval_batch_size=10,
        num_train_epochs=3,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        label_names=["labels"],
        remove_unused_columns=False,
    ),
    train_dataset=tokenized_ds_train_lora,
    eval_dataset=tokenized_ds_test_lora,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

lora_trainer.train()

  lora_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.57696,0.78
2,No log,0.394855,0.874
3,0.614200,0.420071,0.862




TrainOutput(global_step=600, training_loss=0.5514509995778402, metrics={'train_runtime': 369.7358, 'train_samples_per_second': 16.228, 'train_steps_per_second': 1.623, 'total_flos': 813985652736000.0, 'train_loss': 0.5514509995778402, 'epoch': 3.0})

In [27]:
lora_trainer.evaluate()



{'eval_loss': 0.3948548138141632,
 'eval_accuracy': 0.874,
 'eval_runtime': 11.6977,
 'eval_samples_per_second': 42.744,
 'eval_steps_per_second': 4.274,
 'epoch': 3.0}

### Save LoRA model

In [28]:
lora_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

In [29]:
lora_model.save_pretrained("../models/lora_distil-bert")

### Load saved LoRA model

In [25]:
from peft import AutoPeftModelForSequenceClassification

In [26]:
lora_model_saved = AutoPeftModelForSequenceClassification.from_pretrained("../models/lora_distil-bert", num_labels=6)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
lora_model_saved

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

### Evaluate saved LoRA model

In [34]:
lora_trainer_saved = Trainer(
    model=lora_model_saved,
    args=TrainingArguments(
        output_dir="./data/emotion_classification",
        learning_rate=2e-3,
        per_device_train_batch_size=10,
        per_device_eval_batch_size=10,
        num_train_epochs=3,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        label_names=["labels"],
        remove_unused_columns=False,
    ),
    train_dataset=tokenized_ds_train_lora,
    eval_dataset=tokenized_ds_test_lora,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

eval_results_lora_model = lora_trainer_saved.evaluate()

  lora_trainer_saved = Trainer(


### Model performance comparison

In [35]:
print("Base model: ", eval_results_base_model["eval_accuracy"])
print("LoRA model: ", eval_results_lora_model["eval_accuracy"])

Base model:  0.154
LoRA model:  0.874
