In [1]:
import os
# Expose only GPU index 0 to this process (the `torch.cuda.device_count()` output
# further down reports 1). Presumably this has to run before CUDA is initialised,
# i.e. before the torch import in the next cell — keep the cell order. TODO confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, BitsAndBytesConfig
from datasets import load_dataset
from peft import UILinLoRAConfig, get_peft_model, PeftModel
import numpy as np
import torch
import evaluate

In [3]:
# Never abbreviate tensors when printing: show every element.
torch.set_printoptions(threshold=torch.inf)

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Last expression of the cell, so the notebook displays the number of visible GPUs.
torch.cuda.device_count()

1

In [4]:
BASE_ID = "roberta-base"
# Seed for torch's global RNG. The checkpoint ships without a classification head
# (the loader reports the `classifier.*` weights as newly initialized), so without
# a fixed seed every kernel restart would start from a different random head.
SEED = 42

tok = AutoTokenizer.from_pretrained(BASE_ID, use_fast=True)

# 4-bit settings kept at hand for quantised runs; currently unused because the
# `quantization_config` argument below is commented out.
quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True)
# quant_config = BitsAndBytesConfig(load_in_8bit=True)

# Must run before the model is built so the random head initialisation is repeatable.
torch.manual_seed(SEED)
base_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_ID,
    num_labels=2,
    # quantization_config=quant_config,
    device_map="auto")

# Wrap only the classifier's dense layer with a rank-128 UILinLoRA adapter.
uilinlora_cfg = UILinLoRAConfig(
        target_modules=["classifier.dense"],
        uilinlora_alpha=1.0,
        uilinlora_dropout=0.0,
        fan_in_fan_out=False,
        rank=128,
        bias="none")
model = get_peft_model(base_model, uilinlora_cfg)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ids shape: torch.Size([128])


In [5]:
# Dump the module tree of the adapter-wrapped model to inspect how it is nested.
print(model)

PeftModel(
  (base_model): UILinLoRAModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)
                  (value): Linear(in_features=768, out_features=768, bias=True)
                  (dropout): Dropout(p=0.1, inplace=False)
                )
  

In [6]:
# Report the shapes of the tensors held by every UILinLoRA-wrapped layer.
for name, module in model.named_modules():
    # Layers without a `uilinlora_sigma` container are not adapter-wrapped: skip them.
    if not hasattr(module, "uilinlora_sigma"):
        continue

    print(f"\nAdapter module: {name}")
    for adapter_name in module.uilinlora_sigma.keys():
        print(f"  Adapter name: {adapter_name}")
        # Σ, D and E live in per-adapter containers indexed by the adapter name ...
        print(f"    Σ shape: {module.uilinlora_sigma[adapter_name].shape}")
        print(f"    D shape: {module.uilinlora_D[adapter_name].shape}")
        print(f"    E shape: {module.uilinlora_E[adapter_name].shape}")
        # ... whereas U and V are attributes named "<adapter>_U" / "<adapter>_V".
        print(f"    U shape: {getattr(module, f'{adapter_name}_U').shape}")
        print(f"    V shape: {getattr(module, f'{adapter_name}_V').shape}")



Adapter module: base_model.model.classifier.dense
  Adapter name: default
    Σ shape: torch.Size([128])
    D shape: torch.Size([768])
    E shape: torch.Size([768])
    U shape: torch.Size([768, 128])
    V shape: torch.Size([128, 768])


In [7]:
# List every parameter that is still trainable (requires_grad=True) with its shape.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue  # frozen weight, not of interest here
    print(f"{name}: {param.shape}")


base_model.model.classifier.dense.uilinlora_sigma.default: torch.Size([128])
base_model.model.classifier.dense.uilinlora_D.default: torch.Size([768])
base_model.model.classifier.dense.uilinlora_E.default: torch.Size([768])
base_model.model.classifier.dense.uilinlora_bias.default: torch.Size([768])


In [8]:
# Buffers are listed separately from parameters. Per the output below, the adapter's
# `default_U` / `default_V` tensors show up here rather than among the trainable
# parameters printed by the previous cell.
for name, buf in model.named_buffers():
    print(f"{name}: {buf.shape}")


base_model.model.roberta.embeddings.position_ids: torch.Size([1, 514])
base_model.model.roberta.embeddings.token_type_ids: torch.Size([1, 514])
base_model.model.classifier.dense.default_U: torch.Size([768, 128])
base_model.model.classifier.dense.default_V: torch.Size([128, 768])


In [9]:
def print_trainable_params(model):
    """Print how many of ``model``'s parameters are trainable.

    Args:
        model: Any object exposing ``parameters()`` whose items provide
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Prints three lines: the trainable element count, the total element count
    and the trainable share as a percentage. A model without any parameters is
    reported as 0% instead of raising ``ZeroDivisionError``.
    """
    total = 0
    trainable = 0
    # Single pass, so `parameters()` is consumed only once.
    for p in model.parameters():
        count = p.numel()
        total += count
        if p.requires_grad:
            trainable += count

    # Guard the ratio: an empty model has nothing to divide by.
    percent = 100 * trainable / total if total else 0.0
    print(f"Trainable parameters: {trainable:,}")
    print(f"Total parameters: {total:,}")
    print(f"Trainable %: {percent:.8f}%")

# Report the parameter budget of the adapter-wrapped model.
print_trainable_params(model)


Trainable parameters: 2,432
Total parameters: 125,240,194
Trainable %: 0.00194187%


In [10]:
# stop

In [11]:
# Keep the model config's padding id in sync with the tokenizer's.
model.config.pad_token_id = tok.pad_token_id

# ---------- data ----------
# GLUE SST-2; this copy feeds the sentence-length statistics computed below.
raw_ds = load_dataset("glue", "sst2")

def tokenize(batch):
    """Encode a batch of sentences to a fixed width and record their raw lengths.

    ``batch["sentence"]`` is encoded twice: first with no padding or truncation
    arguments, only to count the tokens of each sentence (special tokens
    included), then truncated/padded to 128 tokens. The fixed-width encoding is
    returned with an extra ``"real_length"`` entry holding those counts.
    """
    sentences = batch["sentence"]

    # Pass 1: plain encoding, used solely to measure each sentence.
    unpadded_ids = tok(sentences, add_special_tokens=True)["input_ids"]

    # Pass 2: the encoding that is actually returned.
    encoded = tok(sentences, truncation=True, padding="max_length", max_length=128)
    encoded["real_length"] = list(map(len, unpadded_ids))
    return encoded

# Tokenize every split in batches, dropping the raw text and index columns,
# and expose the target column under the name `labels`.
tokenized_ds = raw_ds.map(
    tokenize, batched=True, remove_columns=["sentence", "idx"]
).rename_column("label", "labels")

# Hand back torch tensors for exactly these four columns.
tokenized_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels", "real_length"])

# ---------- stats ----------
# Longest unpadded sentence in the training split, measured in tokens.
max_len = max(tokenized_ds["train"]["real_length"])
print(f"Longest raw sentence: {max_len} tokens")


# ---------- data ----------
# Second, independent copy of SST-2: this is the one handed to the Trainer.
raw_datasets = load_dataset("glue", "sst2")


def tokenize_function(example):
    """Encode ``example["sentence"]`` truncated/padded to exactly 100 tokens."""
    return tok(
        example["sentence"],
        max_length=100,
        padding="max_length",
        truncation=True,
    )


# Tokenize the entire dataset in batches and rename the target column to `labels`.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True).rename_column("label", "labels")
del raw_datasets  # the untokenized copy is no longer needed

# Return torch tensors for the model inputs and the labels only.
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Longest raw sentence: 67 tokens


In [12]:
# ---------- trainer ----------
# One epoch over the training split, evaluation at the end of the epoch,
# a log entry every 50 steps, and no checkpoints written.
args = TrainingArguments(
    output_dir="uilinlora-sst2",
    num_train_epochs=1,
    learning_rate=3e-3,
    per_device_train_batch_size=32,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="no",
)

# Train on the SST-2 training split, evaluate on its validation split.
trainer = Trainer(
    args=args,
    model=model,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# Launch training with the arguments and datasets configured in the previous cell.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so running this
# cell raises NameError — presumably a deliberate tripwire so that "Run All" halts
# after training. Confirm before removing.
stop

In [None]:
predictions = trainer.predict(tokenized_datasets["validation"])

# `predictions.predictions` is a bare logits array when the model yields a single
# output, but a tuple when it yields several (this notebook originally read index 1).
# Indexing a bare array with [1] would pick one example's row instead of the logits.
raw_outputs = predictions.predictions
logits = raw_outputs[1] if isinstance(raw_outputs, tuple) else raw_outputs
preds = np.argmax(logits, axis=-1)

metric = evaluate.load("accuracy")
references = tokenized_datasets["validation"]["labels"]
# Last expression of the cell, so the accuracy dict is displayed.
metric.compute(predictions=preds, references=references)

In [None]:
# after trainer.train()
# Write the adapter and the tokenizer side by side into one directory.
adapter_dir = "diag_adapter"
model.save_pretrained(adapter_dir, safe_serialization=True)  # adapter only
tok.save_pretrained(adapter_dir)                             # optional, for easy reload

In [None]:
# Rebuild the base model, then attach the adapter written by `model.save_pretrained(...)`.
# NOTE(review): the loader output earlier in the notebook lists `classifier.out_proj` as
# newly initialised, and the save cell is annotated "adapter only" — verify that the
# reloaded classification head matches the trained one before trusting the accuracy.
# NOTE(review): the model was trained with `quantization_config` commented out but is
# reloaded here in 4-bit — confirm this mismatch is intended.
base = AutoModelForSequenceClassification.from_pretrained(
    BASE_ID,
    num_labels=2,  # same head size as the model that was trained
    # Passing `load_in_4bit=True` directly is deprecated; use a config object instead.
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)

# Must be the directory the adapter was saved to ("diag_adapter"); the previous
# "uilinlora_adapter" path is never written by this notebook.
model = PeftModel.from_pretrained(base, "diag_adapter").to(device)

In [None]:
# `trainer` still wraps the model object it was constructed with, so predicting through
# it would not exercise the reloaded adapter. Build a fresh Trainer around the current
# `model`. `eval_dataset` is passed because `args` requests per-epoch evaluation.
reload_trainer = Trainer(
    model=model,
    args=args,
    eval_dataset=tokenized_datasets["validation"],
)
predictions = reload_trainer.predict(tokenized_datasets["validation"])

# `predictions.predictions` is a bare logits array when the model yields a single
# output, but a tuple when it yields several (this notebook originally read index 1).
raw_outputs = predictions.predictions
logits = raw_outputs[1] if isinstance(raw_outputs, tuple) else raw_outputs
preds = np.argmax(logits, axis=-1)

metric = evaluate.load("accuracy")
references = tokenized_datasets["validation"]["labels"]
# Last expression of the cell, so the accuracy dict is displayed.
metric.compute(predictions=preds, references=references)

In [None]:
# # # Debugging things

# # core_model   = model.get_base_model()        # → LlamaForSequenceClassification
# # llama_blocks = core_model.model.layers       # → ModuleList of decoder layers
# # qproj_0      = llama_blocks[0].self_attn.q_proj

# # print(type(qproj_0))          # should be your Linear4bit / Linear8bitLt
# # print(qproj_0.weight.shape)   # should be (out, in)  e.g.  (4096, 2048)

# model.train()
# with torch.amp.autocast("cuda"):
#     batch = tok(["hello"], return_tensors="pt").to(0)
#     out = model(**batch, labels=torch.tensor([1]).to(0))

# loss = out.loss.to(torch.float32)
# loss.backward()


# # Check grads manually
# for name, param in model.named_parameters():
#     if param.requires_grad and param.grad is not None:
#         print(f"{name} has non-zero grad: {param.grad.abs().mean().item():.6f}")

# model.train()
# batch = tok(["hello"], return_tensors="pt").to(0)
# out = model(**batch, labels=torch.tensor([1]).to(0))

# print("loss requires grad?", out.loss.requires_grad)
# print("loss grad_fn?", out.loss.grad_fn)

# for n, p in model.named_parameters():
#     if p.requires_grad:
#         print(n, p.shape)

#         # model.train()
# # batch = tok(["hello"], return_tensors="pt").to(0)
# # out = model(**batch, labels=torch.tensor([1]).to(0))
# # print(out.loss.grad_fn)  # should NOT be None



In [None]:
# Accuracy recorded with the prediction cells above:
# - untrained model:                accuracy 0.4919
# - r=128, one epoch,  lr 3e-3:     accuracy 0.932
# - r=128, two epochs, lr 3e-3:     accuracy 0.939