# QLoRA Implementation - Aaryaman Bajaj



In [1]:
# QLoRA Implementation - Aaryaman Bajaj
# Install required packages for QLoRA (4-bit quantization)
!pip install -q transformers datasets accelerate peft bitsandbytes scikit-learn

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import time
import random
import bitsandbytes as bnb

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
set_seed(42)

In [3]:
# Configuration
MODEL_NAME = "distilbert-base-uncased"
BATCH_SIZE = 16
LEARNING_RATE = 3e-4
NUM_EPOCHS = 3
MAX_LENGTH = 128
WARMUP_RATIO = 0.1

# LoRA configuration
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.1
TARGET_MODULES = ["q_lin", "v_lin"]  # DistilBERT attention layers

# Choose quantization mode: '4bit' for true QLoRA, '8bit' for 8-bit, 'none' for no quantization
QUANTIZATION_MODE = '4bit'  # Options: '4bit', '8bit', 'none'

In [4]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

PyTorch version: 2.9.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-80GB
Memory: 85.17 GB


In [5]:
print(f"Using device: {DEVICE}")
initial_memory = 0.0
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    initial_memory = torch.cuda.memory_allocated() / 1024**2
    print(f"Initial GPU memory: {initial_memory:.2f} MB")

Using device: cuda
Initial GPU memory: 0.00 MB


In [6]:
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Loading SST-2 dataset...")
dataset = load_dataset("glue", "sst2")
train_dataset = dataset["train"]
val_dataset = dataset["validation"]

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Loading SST-2 dataset...


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Training samples: 67349
Validation samples: 872


In [7]:
# Tokenize datasets
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=MAX_LENGTH)

print("Tokenizing datasets...")
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Tokenizing datasets...


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [8]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")

Training batches: 4210
Validation batches: 55


In [9]:
# Configure quantization based on mode
# FIX: Skip classification head modules that cause issues with quantization
# The pre_classifier and classifier are dense layers in DistilBERT's classification head
# that don't work well with bitsandbytes quantization

model_load_start = time.time()

if QUANTIZATION_MODE == '4bit':
    print("Loading 4-bit quantized model (True QLoRA)...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        # FIX: Skip the classification head layers from quantization
        llm_int8_skip_modules=["pre_classifier", "classifier"]
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        num_labels=2,
        device_map="auto"
    )
    USE_DEVICE_MAP = True

elif QUANTIZATION_MODE == '8bit':
    print("Loading 8-bit quantized model...")
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        # FIX: Skip the classification head layers from quantization
        llm_int8_skip_modules=["pre_classifier", "classifier"]
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        num_labels=2,
        device_map="auto"
    )
    USE_DEVICE_MAP = True

else:  # No quantization
    print("Loading model without quantization (standard LoRA)...")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=2
    )
    model = model.to(DEVICE)
    USE_DEVICE_MAP = False

model_load_time = time.time() - model_load_start
print(f"Model loaded in {model_load_time:.2f} seconds")

if torch.cuda.is_available():
    memory_after_model = torch.cuda.memory_allocated() / 1024**2
    print(f"Memory after loading model: {memory_after_model:.2f} MB")
    print(f"Memory used by model: {memory_after_model - initial_memory:.2f} MB")

Loading 4-bit quantized model (True QLoRA)...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded in 2.43 seconds
Memory after loading model: 67.85 MB
Memory used by model: 67.85 MB


In [10]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters before LoRA: {total_params:,}")
print(f"Trainable parameters before LoRA: {trainable_params:,}")

Total parameters before LoRA: 45,721,346
Trainable parameters before LoRA: 24,446,210


In [11]:
# Prepare model for k-bit training (only needed for quantized models)
if QUANTIZATION_MODE in ['4bit', '8bit']:
    model = prepare_model_for_kbit_training(model)
    print("Model prepared for quantized training")
else:
    print("Skipping k-bit preparation (not using quantization)")

Model prepared for quantized training


In [12]:
# Configure LoRA
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type=TaskType.SEQ_CLS,
)

print(f"Applying LoRA with rank={LORA_R}, alpha={LORA_ALPHA}")

Applying LoRA with rank=8, alpha=16


In [13]:
# Apply LoRA to the model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Count parameters after LoRA
total_params_after = sum(p.numel() for p in model.parameters())
trainable_params_after = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters after LoRA: {total_params_after:,}")
print(f"Trainable parameters after LoRA: {trainable_params_after:,}")
print(f"Percentage of trainable parameters: {100 * trainable_params_after / total_params_after:.2f}%")

if torch.cuda.is_available():
    memory_after_lora = torch.cuda.memory_allocated() / 1024**2
    print(f"\nMemory after applying LoRA: {memory_after_lora:.2f} MB")

trainable params: 739,586 || all params: 67,694,596 || trainable%: 1.0925

Total parameters after LoRA: 46,460,932
Trainable parameters after LoRA: 739,586
Percentage of trainable parameters: 1.59%

Memory after applying LoRA: 117.96 MB


In [14]:
# FIX: Only pass trainable parameters to optimizer
optimizer = optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=LEARNING_RATE
)

# Calculate training steps
num_training_steps = NUM_EPOCHS * len(train_loader)
num_warmup_steps = int(WARMUP_RATIO * num_training_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

print(f"Total training steps: {num_training_steps}")
print(f"Warmup steps: {num_warmup_steps}")

Total training steps: 12630
Warmup steps: 1263


In [15]:
# Initialize metrics storage
metrics = {
    "train_loss": [],
    "train_accuracy": [],
    "val_loss": [],
    "val_accuracy": [],
    "epoch_time": [],
    "throughput": [],
    "memory_usage": []
}

In [16]:
# FIX: Training function with proper device handling for quantized models
def train_epoch(model, dataloader, optimizer, scheduler, device, use_device_map=False):
    model.train()
    total_loss = 0
    predictions = []
    true_labels = []

    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        # FIX: When using device_map='auto', tensors should go to the model's device
        if use_device_map:
            # Get the device of the model's first parameter
            model_device = next(model.parameters()).device
            input_ids = batch["input_ids"].to(model_device)
            attention_mask = batch["attention_mask"].to(model_device)
            labels = batch["label"].to(model_device)
        else:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        preds = torch.argmax(outputs.logits, dim=-1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

        progress_bar.set_postfix({"loss": loss.item()})

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)

    return avg_loss, accuracy

In [17]:
# FIX: Evaluation function with proper device handling
def evaluate(model, dataloader, device, use_device_map=False):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            if use_device_map:
                model_device = next(model.parameters()).device
                input_ids = batch["input_ids"].to(model_device)
                attention_mask = batch["attention_mask"].to(model_device)
                labels = batch["label"].to(model_device)
            else:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()

            preds = torch.argmax(outputs.logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)

    return avg_loss, accuracy

In [18]:
# Main training loop
quant_mode_str = "QLoRA (4-bit)" if QUANTIZATION_MODE == '4bit' else "8-bit LoRA" if QUANTIZATION_MODE == '8bit' else "Standard LoRA"
print("\n" + "="*50)
print(f"Starting {quant_mode_str} Training")
print("="*50)

best_val_accuracy = 0

for epoch in range(NUM_EPOCHS):
    print(f"\n\nEpoch {epoch + 1}/{NUM_EPOCHS}")
    print("-" * 30)

    # Track time and memory
    epoch_start = time.time()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    # Training - FIX: Pass USE_DEVICE_MAP flag
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, DEVICE, USE_DEVICE_MAP)

    # Evaluation - FIX: Pass USE_DEVICE_MAP flag
    val_loss, val_acc = evaluate(model, val_loader, DEVICE, USE_DEVICE_MAP)

    # Calculate metrics
    epoch_time = time.time() - epoch_start
    throughput = len(train_dataset) / epoch_time

    if torch.cuda.is_available():
        peak_memory = torch.cuda.max_memory_allocated() / 1024**2
    else:
        peak_memory = 0

    # Store metrics
    metrics["train_loss"].append(train_loss)
    metrics["train_accuracy"].append(train_acc)
    metrics["val_loss"].append(val_loss)
    metrics["val_accuracy"].append(val_acc)
    metrics["epoch_time"].append(epoch_time)
    metrics["throughput"].append(throughput)
    metrics["memory_usage"].append(peak_memory)

    # Update best accuracy
    if val_acc > best_val_accuracy:
        best_val_accuracy = val_acc

    # Print results
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
    print(f"Epoch Time: {epoch_time:.2f}s | Throughput: {throughput:.1f} samples/s")
    print(f"Peak Memory: {peak_memory:.2f} MB")

print("\n" + "="*50)
print(f"Training Complete! Best Validation Accuracy: {best_val_accuracy:.4f}")
print("="*50)


Starting QLoRA (4-bit) Training


Epoch 1/3
------------------------------


  return fn(*args, **kwargs)
Training: 100%|██████████| 4210/4210 [04:33<00:00, 15.37it/s, loss=0.0955]
Evaluating: 100%|██████████| 55/55 [00:01<00:00, 49.42it/s]


Train Loss: 0.2995 | Train Acc: 0.8712
Val Loss: 0.3080 | Val Acc: 0.8842
Epoch Time: 275.08s | Throughput: 244.8 samples/s
Peak Memory: 331.02 MB


Epoch 2/3
------------------------------


  return fn(*args, **kwargs)
Training: 100%|██████████| 4210/4210 [04:32<00:00, 15.45it/s, loss=0.102]
Evaluating: 100%|██████████| 55/55 [00:01<00:00, 50.36it/s]


Train Loss: 0.2127 | Train Acc: 0.9182
Val Loss: 0.3506 | Val Acc: 0.8968
Epoch Time: 273.69s | Throughput: 246.1 samples/s
Peak Memory: 331.02 MB


Epoch 3/3
------------------------------


  return fn(*args, **kwargs)
Training: 100%|██████████| 4210/4210 [04:34<00:00, 15.33it/s, loss=0.0749]
Evaluating: 100%|██████████| 55/55 [00:01<00:00, 50.89it/s]

Train Loss: 0.1784 | Train Acc: 0.9331
Val Loss: 0.3439 | Val Acc: 0.8899
Epoch Time: 275.70s | Throughput: 244.3 samples/s
Peak Memory: 331.02 MB

Training Complete! Best Validation Accuracy: 0.8968





In [19]:
# Final Results Summary
quant_str = "4-bit NF4 (QLoRA)" if QUANTIZATION_MODE == '4bit' else "8-bit" if QUANTIZATION_MODE == '8bit' else "None (FP32)"
print("\n" + "="*60)
print(f"{quant_mode_str.upper()} TRAINING FINAL RESULTS")
print("="*60)
print(f"\nBest Validation Accuracy: {best_val_accuracy:.4f}")
print(f"Average Epoch Time: {np.mean(metrics['epoch_time']):.2f} seconds")
print(f"Average Throughput: {np.mean(metrics['throughput']):.1f} samples/second")
print(f"Peak GPU Memory Usage: {max(metrics['memory_usage']):.2f} MB")
print(f"\nModel Configuration:")
print(f"  Quantization: {quant_str}")
print(f"  LoRA Rank: {LORA_R}")
print(f"  LoRA Alpha: {LORA_ALPHA}")
print(f"  Batch Size: {BATCH_SIZE}")
print(f"  Learning Rate: {LEARNING_RATE}")
print(f"  Total Trainable Parameters: {trainable_params_after:,}")
print(f"  Percentage Trainable: {100 * trainable_params_after / total_params_after:.2f}%")


QLORA (4-BIT) TRAINING FINAL RESULTS

Best Validation Accuracy: 0.8968
Average Epoch Time: 274.82 seconds
Average Throughput: 245.1 samples/second
Peak GPU Memory Usage: 331.02 MB

Model Configuration:
  Quantization: 4-bit NF4 (QLoRA)
  LoRA Rank: 8
  LoRA Alpha: 16
  Batch Size: 16
  Learning Rate: 0.0003
  Total Trainable Parameters: 739,586
  Percentage Trainable: 1.59%


In [20]:
# Generate Benchmark Table for Comparison
import pandas as pd

method_name = "QLoRA" if QUANTIZATION_MODE == '4bit' else "8bit-LoRA" if QUANTIZATION_MODE == '8bit' else "LoRA"

benchmark_data = {
    "Method": method_name,
    "Rank": LORA_R,
    "Accuracy": f"{best_val_accuracy:.4f}",
    "GPU_Mem_MB": f"{max(metrics['memory_usage']):.2f}",
    "Latency_s": f"{np.mean(metrics['epoch_time']):.2f}",
    "Throughput_samples/s": f"{np.mean(metrics['throughput']):.1f}",
    "Trainable_Params": trainable_params_after,
    "Total_Params": total_params_after
}

benchmark_df = pd.DataFrame([benchmark_data])
print("\nBenchmark Results (for comparison with other methods):")
print(benchmark_df.to_string(index=False))

# Save results
filename = f"{method_name.lower()}_r{LORA_R}_results.csv"
benchmark_df.to_csv(filename, index=False)
print(f"\nResults saved to '{filename}'")


Benchmark Results (for comparison with other methods):
Method  Rank Accuracy GPU_Mem_MB Latency_s Throughput_samples/s  Trainable_Params  Total_Params
 QLoRA     8   0.8968     331.02    274.82                245.1            739586      46460932

Results saved to 'qlora_r8_results.csv'


In [21]:
# Save the model (optional - uncomment if needed)
# model.save_pretrained(f"./{method_name.lower()}_distilbert_sst2")
# tokenizer.save_pretrained(f"./{method_name.lower()}_distilbert_sst2")
# print(f"Model and tokenizer saved to './{method_name.lower()}_distilbert_sst2'")

In [22]:
# Inference Latency Test
def measure_inference_latency(model, tokenizer, text="This movie was absolutely fantastic!", num_runs=100):
    model.eval()

    # Get the correct device
    if USE_DEVICE_MAP:
        model_device = next(model.parameters()).device
    else:
        model_device = DEVICE

    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH)
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    # Warmup
    for _ in range(10):
        with torch.no_grad():
            _ = model(**inputs)

    # Measure
    latencies = []
    for _ in range(num_runs):
        start = time.time()
        with torch.no_grad():
            _ = model(**inputs)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        latencies.append((time.time() - start) * 1000)  # ms

    latencies = np.array(latencies)
    print(f"\nInference Latency ({num_runs} runs):")
    print(f"  Mean: {np.mean(latencies):.2f} ms")
    print(f"  Median: {np.median(latencies):.2f} ms")
    print(f"  P95: {np.percentile(latencies, 95):.2f} ms")

    return latencies

latencies = measure_inference_latency(model, tokenizer)


Inference Latency (100 runs):
  Mean: 18.43 ms
  Median: 18.30 ms
  P95: 18.98 ms


In [23]:
# Memory comparison estimate
if QUANTIZATION_MODE == '4bit':
    # 4-bit is roughly 8x smaller than FP32
    estimated_fp32_memory = max(metrics['memory_usage']) * 8
    savings = 100 - (max(metrics['memory_usage']) / estimated_fp32_memory * 100)
    print(f"\nEstimated memory savings vs FP32: ~{savings:.1f}%")
elif QUANTIZATION_MODE == '8bit':
    # 8-bit is roughly 4x smaller than FP32
    estimated_fp32_memory = max(metrics['memory_usage']) * 4
    savings = 100 - (max(metrics['memory_usage']) / estimated_fp32_memory * 100)
    print(f"\nEstimated memory savings vs FP32: ~{savings:.1f}%")
else:
    print("\nNo quantization applied - using FP32 precision")


Estimated memory savings vs FP32: ~87.5%
