In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
import time
import psutil
import os

In [None]:
# Fix the global random seed (via transformers.set_seed) before any data or model work.
set_seed(42)

In [None]:
# Configuration
# Every tunable value of the experiment lives here; later cells only read these names.

# https://huggingface.co/datasets/stanfordnlp/sst2

MODEL_NAME = "distilbert-base-uncased"  # checkpoint used for both the tokenizer and the classifier
DATASET_NAME = "sst2"  # passed as the config name to load_dataset("glue", ...)
LORA_RANK = 8  # forwarded to LoraConfig(r=...)
LORA_ALPHA = 16  # forwarded to LoraConfig(lora_alpha=...)
LORA_DROPOUT = 0.1  # forwarded to LoraConfig(lora_dropout=...)
BATCH_SIZE = 16  # batch size of both the train and validation DataLoaders
LEARNING_RATE = 3e-4  # AdamW learning rate passed to the optimizer
NUM_EPOCHS = 3  # number of passes over the training set; also sizes the LR schedule
MAX_LENGTH = 128  # sequences are truncated/padded to this many tokens

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE  # bare expression so the notebook displays the selected device

device(type='cuda')

In [None]:
# Report the selected device; on CUDA also print the name of GPU 0 and the
# memory currently allocated on it (bytes / 1024**2).
print(f"Using device: {DEVICE}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Initial GPU Memory: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")

Using device: cuda
GPU: Tesla T4
Initial GPU Memory: 0.00 MB


In [None]:
# Tokenizer that matches the pretrained checkpoint
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# GLUE task data; keep handles to the train and validation splits
print("Loading dataset...")
dataset = load_dataset("glue", DATASET_NAME)
train_dataset, val_dataset = dataset["train"], dataset["validation"]

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``"sentence"`` field of a batch of examples.

    Every sequence is truncated and padded to exactly ``MAX_LENGTH`` tokens,
    so all encoded examples share the same length.

    Args:
        examples: Mapping with a ``"sentence"`` entry (a batch of texts when
            used with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    tokenizer_kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    sentences = examples["sentence"]
    return tokenizer(sentences, **tokenizer_kwargs)


Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Loading dataset...


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [None]:
# Encode both splits in batches with the shared tokenization function
print("Tokenizing datasets...")
map_kwargs = {"batched": True}
train_dataset = train_dataset.map(tokenize_function, **map_kwargs)
val_dataset = val_dataset.map(tokenize_function, **map_kwargs)

# Expose only the model inputs and the label, as PyTorch tensors
torch_columns = ("input_ids", "attention_mask", "label")
for split in (train_dataset, val_dataset):
    split.set_format(type="torch", columns=list(torch_columns))


Tokenizing datasets...


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [None]:
# Batch both splits with BATCH_SIZE; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [None]:
# Load the pretrained checkpoint with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Count every parameter of the base model, before the LoRA adapters are attached.
total_params = sum(p.numel() for p in model.parameters())
print(f"Original model parameters: {total_params:,}")

Original model parameters: 66,955,010


In [None]:
# https://huggingface.co/docs/peft/en/package_reference/lora

# LoRA adapter settings for sequence classification, driven by the
# configuration constants defined earlier in the notebook.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    # Adapters are attached only to modules named q_lin / v_lin
    # (presumably DistilBERT's attention query and value projections — confirm against the model repr).
    target_modules=["q_lin", "v_lin"],
    bias="none"
)

In [None]:
# Apply LoRA
# Rebinds `model` to the PEFT-wrapped model, then prints the
# trainable-vs-total parameter summary.
# NOTE(review): re-running this cell without reloading the base model would wrap
# an already-wrapped model — run it once per fresh `model`.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 739,586 || all params: 67,694,596 || trainable%: 1.0925


In [None]:
# Move the model to the selected device. As the cell's last expression, the
# returned module is also displayed, which prints the full module tree.
model.to(DEVICE)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

In [None]:
# The schedule advances once per batch, so its horizon is batches-per-epoch
# times the epoch count; the first 10% of those steps are warmup.
total_steps = len(train_loader) * NUM_EPOCHS
warmup_steps = int(0.1 * total_steps)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
# Per-epoch history: one independent, initially empty list per tracked metric.
training_metrics = {
    metric_name: []
    for metric_name in (
        "epoch",
        "train_loss",
        "train_accuracy",
        "val_loss",
        "val_accuracy",
        "epoch_time",
        "gpu_memory_mb",
        "throughput_samples_per_sec",
    )
}

In [None]:
def train_epoch(model, loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``loader`` and report epoch statistics.

    For every batch the gradients are reset, the model is called with
    ``input_ids``, ``attention_mask`` and ``labels``, the loss is
    back-propagated, gradients are clipped to a max norm of 1.0, and both the
    optimizer and the scheduler are stepped.

    The reported loss is the *sample-weighted* mean: each batch loss is
    weighted by its batch size, so a shorter final batch does not count as
    much as a full one. This assumes ``outputs.loss`` is the mean over the
    batch (the usual reduction for Hugging Face classification heads).

    Args:
        model: Module whose forward returns an object with ``.loss`` and ``.logits``.
        loader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, epoch_time, throughput)`` where
        ``epoch_time`` is in seconds and ``throughput`` in samples per second.
        For an empty ``loader`` the loss, accuracy and throughput are ``0.0``.
    """
    model.train()
    loss_sum = 0.0  # running sum of per-sample losses
    correct = 0
    total = 0

    # perf_counter is monotonic, so the interval cannot be skewed by clock changes
    epoch_start = time.perf_counter()

    progress_bar = tqdm(loader, desc="Training")
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        batch_size = labels.size(0)
        batch_loss = loss.item()
        # Weight by batch size so the epoch average is per sample, not per batch
        loss_sum += batch_loss * batch_size
        predictions = torch.argmax(logits, dim=-1)
        correct += (predictions == labels).sum().item()
        total += batch_size

        progress_bar.set_postfix({"loss": batch_loss, "acc": correct / total})

    epoch_time = time.perf_counter() - epoch_start

    # Nothing was processed: report zeros instead of dividing by zero
    if total == 0:
        return 0.0, 0.0, epoch_time, 0.0

    avg_loss = loss_sum / total
    accuracy = correct / total
    throughput = total / epoch_time if epoch_time > 0 else 0.0

    return avg_loss, accuracy, epoch_time, throughput

In [None]:
def evaluate(model, loader, device):
    """Score ``model`` on ``loader`` without tracking gradients.

    The model is switched to eval mode and left in that mode on return.
    The reported loss is the *sample-weighted* mean: each batch loss is
    weighted by its batch size, so a shorter final batch does not count as
    much as a full one. This assumes ``outputs.loss`` is the mean over the
    batch (the usual reduction for Hugging Face classification heads).

    Args:
        model: Module whose forward returns an object with ``.loss`` and ``.logits``.
        loader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy)``; ``(0.0, 0.0)`` for an empty ``loader``.
    """
    model.eval()
    loss_sum = 0.0  # running sum of per-sample losses
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            batch_size = labels.size(0)
            # Weight by batch size so the average is per sample, not per batch
            loss_sum += outputs.loss.item() * batch_size
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == labels).sum().item()
            total += batch_size

    # Nothing was processed: report zeros instead of dividing by zero
    if total == 0:
        return 0.0, 0.0

    avg_loss = loss_sum / total
    accuracy = correct / total

    return avg_loss, accuracy

In [None]:
print("\nStarting training...")
for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")

    # One pass over the training data, then score the validation split
    epoch_stats = train_epoch(model, train_loader, optimizer, scheduler, DEVICE)
    train_loss, train_acc, epoch_time, throughput = epoch_stats
    val_loss, val_acc = evaluate(model, val_loader, DEVICE)

    # Peak allocation since the previous reset (training + evaluation of this
    # epoch), in MB; the counter is reset so each epoch is measured separately.
    gpu_memory = 0
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.max_memory_allocated(0) / 1024**2
        torch.cuda.reset_peak_memory_stats(0)

    # Append this epoch's numbers to the matching history lists
    epoch_record = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "train_accuracy": train_acc,
        "val_loss": val_loss,
        "val_accuracy": val_acc,
        "epoch_time": epoch_time,
        "gpu_memory_mb": gpu_memory,
        "throughput_samples_per_sec": throughput,
    }
    for metric_name, metric_value in epoch_record.items():
        training_metrics[metric_name].append(metric_value)

    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
    print(f"Epoch Time: {epoch_time:.2f}s | Throughput: {throughput:.2f} samples/s")
    print(f"GPU Memory: {gpu_memory:.2f} MB")


Starting training...

Epoch 1/3


Training: 100%|██████████| 4210/4210 [07:52<00:00,  8.91it/s, loss=0.205, acc=0.87]
Evaluating: 100%|██████████| 55/55 [00:03<00:00, 17.81it/s]


Train Loss: 0.2987 | Train Acc: 0.8703
Val Loss: 0.3059 | Val Acc: 0.8911
Epoch Time: 472.57s | Throughput: 142.52 samples/s
GPU Memory: 795.99 MB

Epoch 2/3


Training: 100%|██████████| 4210/4210 [07:56<00:00,  8.83it/s, loss=0.0468, acc=0.919]
Evaluating: 100%|██████████| 55/55 [00:03<00:00, 17.84it/s]


Train Loss: 0.2106 | Train Acc: 0.9192
Val Loss: 0.3464 | Val Acc: 0.8922
Epoch Time: 476.77s | Throughput: 141.26 samples/s
GPU Memory: 795.99 MB

Epoch 3/3


Training: 100%|██████████| 4210/4210 [07:56<00:00,  8.83it/s, loss=0.0221, acc=0.934]
Evaluating: 100%|██████████| 55/55 [00:03<00:00, 17.81it/s]

Train Loss: 0.1756 | Train Acc: 0.9343
Val Loss: 0.3338 | Val Acc: 0.8933
Epoch Time: 476.94s | Throughput: 141.21 samples/s
GPU Memory: 795.99 MB





In [None]:
# Final results: headline numbers aggregated over all recorded epochs
divider = "="*50
summary_lines = (
    "\n" + divider,
    "TRAINING COMPLETED",
    divider,
    "\nFinal Metrics:",
    f"Best Validation Accuracy: {max(training_metrics['val_accuracy']):.4f}",
    f"Average Epoch Time: {np.mean(training_metrics['epoch_time']):.2f}s",
    f"Average Throughput: {np.mean(training_metrics['throughput_samples_per_sec']):.2f} samples/s",
    f"Peak GPU Memory: {max(training_metrics['gpu_memory_mb']):.2f} MB",
)
for line in summary_lines:
    print(line)


TRAINING COMPLETED

Final Metrics:
Best Validation Accuracy: 0.8933
Average Epoch Time: 475.43s
Average Throughput: 141.66 samples/s
Peak GPU Memory: 795.99 MB


In [None]:
# Benchmark table: one row of headline numbers for this LoRA run.
# Accuracy is taken from the last epoch; memory is the maximum over epochs;
# time and throughput are per-epoch means.
divider = "="*50
benchmark_rows = [
    "\n" + divider,
    "BENCHMARK TABLE",
    divider,
    f"Method: LoRA",
    f"Rank: {LORA_RANK}",
    f"Final Validation Accuracy: {training_metrics['val_accuracy'][-1]:.4f}",
    f"Peak GPU Memory (MB): {max(training_metrics['gpu_memory_mb']):.2f}",
    f"Avg Epoch Time (s): {np.mean(training_metrics['epoch_time']):.2f}",
    f"Avg Throughput (samples/s): {np.mean(training_metrics['throughput_samples_per_sec']):.2f}",
]
print(*benchmark_rows, sep="\n")



BENCHMARK TABLE
Method: LoRA
Rank: 8
Final Validation Accuracy: 0.8933
Peak GPU Memory (MB): 795.99
Avg Epoch Time (s): 475.43
Avg Throughput (samples/s): 141.66


In [None]:
# Save model
# Disabled on purpose: nothing is written to disk by this cell as it stands.
# Uncomment the lines below to store the model and tokenizer in the
# "lora_model" directory.
# print("\nSaving model...")
# model.save_pretrained("lora_model")
# tokenizer.save_pretrained("lora_model")
# print("Model saved to 'lora_model' directory")

In [None]:
# Single-example inference latency.
# CUDA kernels are launched asynchronously, so the host clock must only be
# read after torch.cuda.synchronize(); otherwise the loop can measure launch
# time rather than execution time. A few untimed warm-up calls keep one-off
# start-up costs out of the statistics.
print("\n" + "="*50)
print("INFERENCE LATENCY TEST")
print("="*50)
model.eval()
test_batch = next(iter(val_loader))
# Keep only the first example of the batch (batch size 1); it is still padded
# to MAX_LENGTH tokens by the tokenization step.
input_ids = test_batch["input_ids"][:1].to(DEVICE)
attention_mask = test_batch["attention_mask"][:1].to(DEVICE)

NUM_WARMUP_RUNS = 10
NUM_TIMED_RUNS = 100
sync_cuda = DEVICE.type == "cuda"

latencies = []
with torch.no_grad():
    # Warm-up: results are discarded and not timed
    for _ in range(NUM_WARMUP_RUNS):
        model(input_ids=input_ids, attention_mask=attention_mask)
    if sync_cuda:
        torch.cuda.synchronize()

    for _ in range(NUM_TIMED_RUNS):
        start = time.perf_counter()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        if sync_cuda:
            # Wait for the GPU to finish before stopping the clock
            torch.cuda.synchronize()
        latencies.append(time.perf_counter() - start)

print(f"Average Inference Latency: {np.mean(latencies)*1000:.2f} ms")
print(f"Median Inference Latency: {np.median(latencies)*1000:.2f} ms")
print(f"P95 Inference Latency: {np.percentile(latencies, 95)*1000:.2f} ms")


INFERENCE LATENCY TEST
Average Inference Latency: 8.85 ms
Median Inference Latency: 8.39 ms
P95 Inference Latency: 11.03 ms
