In [25]:
import os
# Expose only physical GPU 1 to this process. This is set before the deep-learning
# libraries are imported so that it is in place before CUDA is initialised.
# The single visible GPU is then addressed as cuda:0 (cf. "Device set to use cuda:0"
# in the evaluation outputs further down).
os.environ["CUDA_VISIBLE_DEVICES"]="1"

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, set_seed
from datasets import load_dataset
import evaluate
import numpy as np
from peft import get_peft_model, TaskType, LoraConfig, PeftModel, AutoPeftModelForSequenceClassification, AutoPeftModel

In [26]:
# Hub identifiers of the checkpoint to fine-tune and the dataset to train on
MODEL = "distilbert/distilbert-base-uncased"
DATASET = "dair-ai/emotion"

# Class index <-> emotion name.
# NOTE(review): assumed to match the dataset's own integer label encoding — confirm
# against data["train"].features["label"].
id2label = dict(enumerate(["sadness", "joy", "love", "anger", "fear", "surprise"]))
label2id = {name: idx for idx, name in id2label.items()}
NUM_LABELS = len(id2label)

# Metric name and evaluator task used by the evaluation cells
METRIC = "accuracy"
TASK = "text-classification"

In [27]:
# Fetch every split of the dataset from the Hugging Face Hub;
# the trailing expression displays the resulting DatasetDict.
data = load_dataset(path=DATASET)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [28]:
# Load the tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL)


def _tokenize_batch(batch):
    """Tokenize the "text" column of a batch, truncating over-long inputs.

    No padding is applied here; batches are padded later by the data collator.
    """
    return tokenizer(batch["text"], truncation=True)


# Tokenize all splits in batches using 8 worker processes
tokenized_data = data.map(_tokenize_batch, batched=True, num_proc=8)
tokenized_data["train"]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 16000
})

In [29]:
# Batch collation, task-level evaluator and the metric used during training
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
task_evaluator = evaluate.evaluator(TASK)
accuracy = evaluate.load(METRIC)


def compute_metrics(eval_pred):
    """Turn the Trainer's evaluation output into a metric dict.

    Args:
        eval_pred: pair ``(predictions, labels)``; the predictions are reduced to
            class ids by taking the arg-max along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for those predictions and labels.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [30]:
# load pretrained HF model
# The checkpoint ships without a classification head, so `pre_classifier` and
# `classifier` are randomly initialised (see the warning printed by this cell).
# Seed the RNGs first so that initialisation — and every number derived from it —
# is reproducible. 42 is also the default `seed` of TrainingArguments.
set_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Evaluate the pretrained model before any fine-tuning (baseline).
# `metric` is passed by keyword on purpose: the third positional parameter of
# `compute` is `subset`, not `metric`, so a positional METRIC would be ignored
# with a "`data` is a preloaded Dataset! Ignoring `subset` and `split`." warning.
task_evaluator.compute(
    model_or_pipeline=model,
    data=data["validation"],
    metric=METRIC,
    tokenizer=tokenizer,
    label_mapping=label2id,
)

`data` is a preloaded Dataset! Ignoring `subset` and `split`.
Device set to use cuda:0


{'accuracy': 0.1175,
 'total_time_in_seconds': 4.693019475000028,
 'samples_per_second': 426.16486265486634,
 'latency_in_seconds': 0.0023465097375000143}

In [32]:
# Baseline run: fine-tune all weights of the model (no PEFT)
training_args = TrainingArguments(
    output_dir="temp",
    # optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    bf16=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and checkpoint once per epoch so the best checkpoint can be
    # restored when training finishes
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=tokenized_data["train"],
    # NOTE(review): the "test" split drives per-epoch evaluation and best-checkpoint
    # selection here, while the evaluation cells report on "validation". The roles
    # are swapped relative to the usual convention — keep the two in sync if changed.
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2492,0.21469,0.9225
2,0.1423,0.169093,0.927


TrainOutput(global_step=2000, training_loss=0.3226890068054199, metrics={'train_runtime': 42.8717, 'train_samples_per_second': 746.414, 'train_steps_per_second': 46.651, 'total_flos': 389287358125632.0, 'train_loss': 0.3226890068054199, 'epoch': 2.0})

In [33]:
# Evaluate the model trained without PEFT.
# `metric` is passed by keyword on purpose: the third positional parameter of
# `compute` is `subset`, not `metric`, so a positional METRIC would be ignored
# with a "`data` is a preloaded Dataset! Ignoring `subset` and `split`." warning.
task_evaluator.compute(
    model_or_pipeline=model,
    data=data["validation"],
    metric=METRIC,
    tokenizer=tokenizer,
    label_mapping=label2id,
)

`data` is a preloaded Dataset! Ignoring `subset` and `split`.
Device set to use cuda:0


{'accuracy': 0.9375,
 'total_time_in_seconds': 6.5266354670000055,
 'samples_per_second': 306.43660276606624,
 'latency_in_seconds': 0.003263317733500003}

In [34]:
# Create a PEFT (LoRA) model from a fresh copy of the original pretrained checkpoint.
# Seed first: the classification head of the fresh copy is randomly initialised
# (see the warning printed by this cell). Re-seeding with the value used for the
# fully fine-tuned model makes this cell reproducible and is intended to give both
# experiments the same starting head.
set_seed(42)

# Rank-8 LoRA adapters on the q_lin / k_lin / v_lin modules
peft_config = LoraConfig(
    r=8,
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_lin", "k_lin", "v_lin"],
)
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)
model1 = get_peft_model(base_model, peft_config)

# Report how many parameters remain trainable after wrapping
model1.print_trainable_parameters()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 816,390 || all params: 67,774,476 || trainable%: 1.2046


In [35]:
# Train the PEFT (LoRA) model
training_args = TrainingArguments(
    # Separate directory from the full fine-tuning run ("temp") so that adapter
    # checkpoints are not written into the same checkpoint-N folders as the
    # full-model checkpoints of that run.
    output_dir="temp_lora",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    bf16=True,
    # The Trainer warns that it cannot infer the label inputs of a PeftModel wrapper
    # ("No label_names provided for model class `PeftModelForSequenceClassification`"),
    # so name them explicitly.
    label_names=["labels"],
)

trainer = Trainer(
    model=model1,
    args=training_args,
    train_dataset=tokenized_data["train"],
    # NOTE(review): the "test" split drives per-epoch evaluation and best-checkpoint
    # selection here, while the evaluation cells report on "validation". The roles
    # are swapped relative to the usual convention — keep the two in sync if changed.
    eval_dataset=tokenized_data["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0347,0.834933,0.702
2,0.7039,0.622181,0.7835
3,0.5993,0.536248,0.808
4,0.5285,0.484937,0.8255
5,0.4923,0.446033,0.834
6,0.4582,0.420857,0.8455
7,0.428,0.401469,0.853
8,0.4144,0.3883,0.8575
9,0.4168,0.380471,0.861
10,0.4157,0.378105,0.861


TrainOutput(global_step=10000, training_loss=0.582420980834961, metrics={'train_runtime': 180.6776, 'train_samples_per_second': 885.555, 'train_steps_per_second': 55.347, 'total_flos': 1982445802331904.0, 'train_loss': 0.582420980834961, 'epoch': 10.0})

In [42]:
# Write the PEFT model to the "lora_weights" directory (adapter config + adapter
# weights only, as the directory listing in the next cell shows)
model1.save_pretrained(save_directory="lora_weights")

In [43]:
# check directory contents: human-readable disk usage of each file saved in lora_weights/
!du -sh lora_weights/*

4.0K	lora_weights/adapter_config.json
3.2M	lora_weights/adapter_model.safetensors
8.0K	lora_weights/README.md


In [44]:
# Rebuild the fine-tuned model from disk: load the saved adapter from ./lora_weights
# on top of its base checkpoint, then fold the LoRA weights into the base layers so
# that a plain (non-PEFT) model is returned.
# NOTE(review): the "newly initialized" warning printed here comes from loading the
# base checkpoint; presumably the saved adapter then supplies the trained head,
# which is consistent with the accuracy reported by the next cell — confirm.
lora_model = AutoPeftModelForSequenceClassification.from_pretrained(
    "./lora_weights",
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)
model2 = lora_model.merge_and_unload()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
# Evaluate the LoRA fine-tuned model (adapter merged into the base weights).
# `metric` is passed by keyword on purpose: the third positional parameter of
# `compute` is `subset`, not `metric`, so a positional METRIC would be ignored
# with a "`data` is a preloaded Dataset! Ignoring `subset` and `split`." warning.
task_evaluator.compute(
    model_or_pipeline=model2,
    data=data["validation"],
    metric=METRIC,
    tokenizer=tokenizer,
    label_mapping=label2id,
)

`data` is a preloaded Dataset! Ignoring `subset` and `split`.
Device set to use cuda:0


{'accuracy': 0.8745,
 'total_time_in_seconds': 4.847098949999918,
 'samples_per_second': 412.6179433576519,
 'latency_in_seconds': 0.0024235494749999587}