# QLoRA Training

First let's get the necessary imports. The difference with the LoRA is ```bitsandbytes```.

In [None]:
!pip install -q -U datasets
!pip install -q -U transformers
!pip install -q accelerate
!pip install -q bitsandbytes
!pip install -q peft
!pip install -q evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m491.5/491.5 kB[0m [31m20.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m491.5/491.5 kB[0m [31m20.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m491.5/491.5 kB[0m [31m20.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is t

## Imports

Let us first import them!

New imports:

* ```BitsAndBytesConfig```: Core component of QLoRA! This sets up quantization parameters (e.g., 4-bit quantization with bnb_4bit), enabling efficient fine-tuning on large models with limited hardware (like a single GPU).

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EvalPrediction,
    BitsAndBytesConfig
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

## 💾 Dataset Preparation

For this tutorial, we will be using a hate-speech classification dataset.

In [None]:
from google.colab import userdata
import os

os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')

In [None]:
!kaggle datasets download waalbannyantudre/hate-speech-detection-curated-dataset
!unzip hate-speech-detection-curated-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/waalbannyantudre/hate-speech-detection-curated-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading hate-speech-detection-curated-dataset.zip to /content
 62% 71.0M/114M [00:00<00:00, 339MB/s]
100% 114M/114M [00:00<00:00, 419MB/s] 
Archive:  hate-speech-detection-curated-dataset.zip
  inflating: HateSpeechDataset.csv   
  inflating: HateSpeechDatasetBalanced.csv  


### 🔍 Observation

The size is pretty big, about 720k data. In order to save time, I decided to only use 0.0015 or the original data, which leaves us with 1089. Next we follow up with the same preprocessing steps, similar to what we did with LoRA.

In [None]:
import pandas as pd

df = pd.read_csv('/content/HateSpeechDatasetBalanced.csv')
print(df.shape)
df.head(10)

(726119, 2)


Unnamed: 0,Content,Label
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1
3,thought of a real badass mongol style declarat...,1
4,afro american basho,1
5,yeah retard haha,1
6,the ching chong chung stuff,1
7,the dead what a slut still warm when she tweet...,1
8,let your tweets be harmless it will not affect...,1
9,these latinos who have a problem with immigrat...,1


In [None]:
df, _ = train_test_split(df, train_size=0.0015, stratify=df["Label"], random_state=1)
print(df.shape)

(1089, 2)


In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["Content"].tolist(),
    df["Label"].tolist(),
    test_size=0.1,
    random_state=1
)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.11,
    random_state=1
)

In [None]:
print("Train size:", len(train_texts))
print("Validation size:", len(val_texts))
print("Test size:", len(test_texts))

Train size: 872
Validation size: 108
Test size: 109


In [None]:
train_data = Dataset.from_dict({"text": train_texts, "labels": train_labels})
val_data = Dataset.from_dict({"text": val_texts, "labels": val_labels})

## 🤔 Without Quantization vs With Quantization?

Let's see what happens if we directly load the model without quantization.



In [None]:
model_name = "meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    # device_map="auto"
)
model.config.pad_token_id = model.config.eos_token_id
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### 🔍 Obseravtions

It crashed... 😞

On a different GPU (22.5 GB), I found that Llama-3-8B occupied 18.4GB, exceeding our current GPU's memory (15.0GB).

Now let's see how we can load the model WITH quantization. 🤩

### Quantization with BnB (bits and bytes)

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

In [None]:
model_name = "meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    quantization_config=bnb_config,
    device_map="auto"
)
model.config.pad_token_id = model.config.eos_token_id
model = prepare_model_for_kbit_training(model)

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 🔍 Observations

Now we've saved more than half of memory, only using 7GBs instead of 18GBs

In [None]:
!nvidia-smi

Wed May 28 01:39:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   55C    P0             30W /   70W |    7106MiB /  15360MiB |     15%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Training Setup

Like our previous LoRA tutorial.

In [None]:
print(model)

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Lla

In [None]:
def tokenize_func(example):
    return tokenizer(
        example['text'],
        truncation=True,
        padding=False
    )

train_data = train_data.map(tokenize_func, batched=True)
val_data = val_data.map(tokenize_func, batched=True)

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

In [None]:
print(train_data)

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 872
})


In [None]:
train_data = train_data.map(lambda x: {"labels": int(x["labels"])})
val_data = val_data.map(lambda x: {"labels": int(x["labels"])})

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

In [None]:
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    acc = accuracy_score(p.label_ids, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average="binary")
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [None]:
print(train_data)

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 872
})


## LoRA

QLoRA still has LoRA in it, so let's set it up. Pretty similar to our LoRA tutorial.

In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 6,823,936 || all params: 7,511,756,800 || trainable%: 0.0908


In [None]:
print(model)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
  

## Training

In [None]:
training_args = TrainingArguments(
    output_dir="./hate_speech_cls",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_steps=10,
    logging_dir="./lora_logs",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to="none",
    fp16=True,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
test_data = Dataset.from_dict({"text": test_texts, "labels": test_labels})
test_data = test_data.map(tokenize_func, batched=True)
test_data = test_data.map(lambda x: {"labels": int(x["labels"])})
test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/109 [00:00<?, ? examples/s]

Map:   0%|          | 0/109 [00:00<?, ? examples/s]

### Hold up ❗

Before we begin, why don't we see how our model performs without the fine-tuning...

In [None]:
from sklearn.metrics import accuracy_score, classification_report

output = trainer.predict(test_data)
preds = output.predictions.argmax(axis=-1)
labels = output.label_ids

accuracy = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

print("Fine-tuned Model Performance:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")

print("Classification Report:")
print(classification_report(labels, preds, target_names=["non-hate", "hate"]))

Fine-tuned Model Performance:
Accuracy : 0.5138
Precision: 0.6471
Recall   : 0.3492
F1 Score : 0.4536
Classification Report:
              precision    recall  f1-score   support

    non-hate       0.45      0.74      0.56        46
        hate       0.65      0.35      0.45        63

    accuracy                           0.51       109
   macro avg       0.55      0.54      0.51       109
weighted avg       0.57      0.51      0.50       109



Just as we expected, it performed pretty bad... No worries, let's go on and continue with our training!

In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9619,1.005256,0.564815,0.54717,0.557692,0.552381
2,0.6076,0.788133,0.685185,0.6875,0.634615,0.66
3,0.4827,0.711864,0.731481,0.767442,0.634615,0.694737
4,0.2196,0.663491,0.75,0.777778,0.673077,0.721649
5,0.2431,0.680435,0.740741,0.772727,0.653846,0.708333


TrainOutput(global_step=275, training_loss=0.5849493533914739, metrics={'train_runtime': 2304.4455, 'train_samples_per_second': 1.892, 'train_steps_per_second': 0.119, 'total_flos': 1.6480015155265536e+16, 'train_loss': 0.5849493533914739, 'epoch': 5.0})

In [None]:
!nvidia-smi

Tue May 27 09:12:06 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   81C    P0             39W /   70W |    7246MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
print(f"[Peak Allocated]: {torch.cuda.max_memory_allocated() / 1024**2:.2f} MB")
print(f"[Peak Reserved]:  {torch.cuda.max_memory_reserved() / 1024**2:.2f} MB")

[Peak Allocated]: 6959.82 MB
[Peak Reserved]:  7120.00 MB


In [None]:
from sklearn.metrics import accuracy_score, classification_report
output = trainer.predict(test_data)
preds = output.predictions.argmax(axis=-1)
labels = output.label_ids

accuracy = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

print("Fine-tuned Model Performance:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")

print("Classification Report:")
print(classification_report(labels, preds, target_names=["non-hate", "hate"]))

Fine-tuned Model Performance:
Accuracy : 0.7890
Precision: 0.7857
Recall   : 0.8730
F1 Score : 0.8271
Classification Report:
              precision    recall  f1-score   support

    non-hate       0.79      0.67      0.73        46
        hate       0.79      0.87      0.83        63

    accuracy                           0.79       109
   macro avg       0.79      0.77      0.78       109
weighted avg       0.79      0.79      0.79       109

