In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, BitsAndBytesConfig
from datasets import load_dataset
from peft import UILinLoRAConfig, get_peft_model, PeftModel
import numpy as np
import torch
import evaluate

In [3]:
torch.set_printoptions(threshold=torch.inf)  # Display all elements
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.device_count()

1

In [4]:
BASE_ID = "roberta-base"
tok  = AutoTokenizer.from_pretrained(BASE_ID, use_fast=True)

quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True)
# quant_config = BitsAndBytesConfig(load_in_8bit=True)
base_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_ID,
    num_labels=2,
    # quantization_config=quant_config,
    device_map="auto")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
uilinlora_cfg = UILinLoRAConfig(
        target_modules=["query", "value"],
        uilinlora_alpha=1.0,
        uilinlora_dropout=0.0,
        fan_in_fan_out=False,
        rank=128)
model = get_peft_model(base_model, uilinlora_cfg)

In [6]:
model.classifier.requires_grad_(True)

RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
)

In [7]:
for name, param in model.named_parameters():
    if "classifier" in name:
        print(name)
        print(param.shape)
        param.requires_grad = True

base_model.model.classifier.dense.weight
torch.Size([768, 768])
base_model.model.classifier.dense.bias
torch.Size([768])
base_model.model.classifier.out_proj.weight
torch.Size([2, 768])
base_model.model.classifier.out_proj.bias
torch.Size([2])


In [6]:
# model.train()
# batch = tok(["hello"], return_tensors="pt").to(0)
# out = model(**batch, labels=torch.tensor([1]).to(0))

# print("loss requires grad?", out.loss.requires_grad)
# print("loss grad_fn?", out.loss.grad_fn)



loss requires grad? True
loss grad_fn? <NllLossBackward0 object at 0x7f1ac7dceb60>




In [7]:
stop

NameError: name 'stop' is not defined

In [7]:
print(model)

PeftModel(
  (base_model): UILinLoRAModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): UILinLoRALayer(Linear(in_features=768, out_features=768, bias=True))
                  (key): Linear(in_features=768, out_features=768, bias=True)
                  (value): UILinLoRALayer(Linear(in_features=768, out_features=768, bias=True))
                  (dropout): Dropout(p=0.1, inp

In [8]:
for name, module in model.named_modules():
    if hasattr(module, "uilinlora_sigma"):
        print(f"\nAdapter module: {name}")
        for adapter_name in module.uilinlora_sigma.keys():
            print(f"  Adapter name: {adapter_name}")
            print(f"    Σ shape: {module.uilinlora_sigma[adapter_name].shape}")
            print(f"    D shape: {module.uilinlora_D[adapter_name].shape}")
            print(f"    E shape: {module.uilinlora_E[adapter_name].shape}")
            print(f"    U shape: {getattr(module, f'{adapter_name}_U').shape}")
            print(f"    V shape: {getattr(module, f'{adapter_name}_V').shape}")



Adapter module: base_model.model.roberta.encoder.layer.0.attention.self.query
  Adapter name: default
    Σ shape: torch.Size([128])
    D shape: torch.Size([768])
    E shape: torch.Size([768])
    U shape: torch.Size([768, 128])
    V shape: torch.Size([128, 768])

Adapter module: base_model.model.roberta.encoder.layer.0.attention.self.value
  Adapter name: default
    Σ shape: torch.Size([128])
    D shape: torch.Size([768])
    E shape: torch.Size([768])
    U shape: torch.Size([768, 128])
    V shape: torch.Size([128, 768])

Adapter module: base_model.model.roberta.encoder.layer.1.attention.self.query
  Adapter name: default
    Σ shape: torch.Size([128])
    D shape: torch.Size([768])
    E shape: torch.Size([768])
    U shape: torch.Size([768, 128])
    V shape: torch.Size([128, 768])

Adapter module: base_model.model.roberta.encoder.layer.1.attention.self.value
  Adapter name: default
    Σ shape: torch.Size([128])
    D shape: torch.Size([768])
    E shape: torch.Size([768])


In [7]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.shape}")

base_model.model.roberta.encoder.layer.0.attention.self.query.uilinlora_sigma.default: torch.Size([128])
base_model.model.roberta.encoder.layer.0.attention.self.query.uilinlora_D.default: torch.Size([768])
base_model.model.roberta.encoder.layer.0.attention.self.query.uilinlora_E.default: torch.Size([768])
base_model.model.roberta.encoder.layer.0.attention.self.value.uilinlora_sigma.default: torch.Size([128])
base_model.model.roberta.encoder.layer.0.attention.self.value.uilinlora_D.default: torch.Size([768])
base_model.model.roberta.encoder.layer.0.attention.self.value.uilinlora_E.default: torch.Size([768])
base_model.model.roberta.encoder.layer.1.attention.self.query.uilinlora_sigma.default: torch.Size([128])
base_model.model.roberta.encoder.layer.1.attention.self.query.uilinlora_D.default: torch.Size([768])
base_model.model.roberta.encoder.layer.1.attention.self.query.uilinlora_E.default: torch.Size([768])
base_model.model.roberta.encoder.layer.1.attention.self.value.uilinlora_sigma.d

In [1]:
for name, buf in model.named_buffers():
    print(f"{name}: {buf.shape}")


NameError: name 'model' is not defined

In [9]:
def print_trainable_params(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable parameters: {trainable:,}")
    print(f"Total parameters: {total:,}")
    print(f"Trainable %: {100 * trainable / total:.8f}%")

print_trainable_params(model)


Trainable parameters: 632,066
Total parameters: 124,687,106
Trainable %: 0.50692170%


In [13]:
# stop

In [10]:
model.config.pad_token_id = tok.pad_token_id

# ---------- data ----------
raw_ds = load_dataset("glue", "sst2")

def tokenize(batch):
    natural   = tok(batch["sentence"], add_special_tokens=True)
    true_lens = [len(ids) for ids in natural["input_ids"]]

    padded = tok(
        batch["sentence"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    padded["real_length"] = true_lens
    return padded

tokenized_ds = raw_ds.map(
    tokenize,
    batched=True,
    remove_columns=["sentence", "idx"]
)

# rename + set Torch format
tokenized_ds = tokenized_ds.rename_column("label", "labels")
tokenized_ds.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels", "real_length"],
)

# ---------- stats ----------
max_len = max(tokenized_ds["train"]["real_length"])
print(f"Longest raw sentence: {max_len} tokens")


# ---------- data ----------
raw_datasets = load_dataset("glue", "sst2")
def tokenize_function(example):
    return tok(example["sentence"], truncation=True, padding="max_length", max_length=100)

# Tokenize the entire dataset
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
del raw_datasets
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Longest raw sentence: 67 tokens


In [11]:
# ---------- trainer ----------
args = TrainingArguments(
        output_dir="uilinlora-sst2",
        per_device_train_batch_size=32,
        num_train_epochs=1,
        learning_rate=3e-3,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=50)


trainer = Trainer(model=model,
                  args=args,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["validation"])

In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
stop

In [None]:
predictions = trainer.predict(tokenized_datasets["validation"])
logits = predictions.predictions[1]
preds = np.argmax(logits, axis=-1)
metric = evaluate.load("accuracy")

tokenized_datasets["validation"]["labels"]
metric.compute(predictions=preds, references=tokenized_datasets["validation"]["labels"])

In [None]:
# # after trainer.train()
# adapter_dir = "uilinlora_adapter"
# model.save_pretrained(adapter_dir, safe_serialization=True)  # adapter only
# tok.save_pretrained(adapter_dir)                             # optional, for easy reload

In [None]:
# BASE_ID = "roberta-base"
# base = AutoModelForSequenceClassification.from_pretrained(
#            BASE_ID, num_labels=2, device_map="auto")

# model = PeftModel.from_pretrained(base, "uilinlora_adapter").to(device)
# tokenizer = AutoTokenizer.from_pretrained("uilinlora_adapter", use_fast=True)

In [None]:
# predictions = trainer.predict(tokenized_datasets["validation"])
# logits = predictions.predictions[1]
# preds = np.argmax(logits, axis=-1)
# metric = evaluate.load("accuracy")

# tokenized_datasets["validation"]["labels"]
# metric.compute(predictions=preds, references=tokenized_datasets["validation"]["labels"])

In [None]:
# # # Debugging things

# # core_model   = model.get_base_model()        # → LlamaForSequenceClassification
# # llama_blocks = core_model.model.layers       # → ModuleList of decoder layers
# # qproj_0      = llama_blocks[0].self_attn.q_proj

# # print(type(qproj_0))          # should be your Linear4bit / Linear8bitLt
# # print(qproj_0.weight.shape)   # should be (out, in)  e.g.  (4096, 2048)

# model.train()
# with torch.amp.autocast("cuda"):
#     batch = tok(["hello"], return_tensors="pt").to(0)
#     out = model(**batch, labels=torch.tensor([1]).to(0))

# loss = out.loss.to(torch.float32)
# loss.backward()


# # Check grads manually
# for name, param in model.named_parameters():
#     if param.requires_grad and param.grad is not None:
#         print(f"{name} has non-zero grad: {param.grad.abs().mean().item():.6f}")

# model.train()
# batch = tok(["hello"], return_tensors="pt").to(0)
# out = model(**batch, labels=torch.tensor([1]).to(0))

# print("loss requires grad?", out.loss.requires_grad)
# print("loss grad_fn?", out.loss.grad_fn)

# for n, p in model.named_parameters():
#     if p.requires_grad:
#         print(n, p.shape)

#         # model.train()
# # batch = tok(["hello"], return_tensors="pt").to(0)
# # out = model(**batch, labels=torch.tensor([1]).to(0))
# # print(out.loss.grad_fn)  # should NOT be None



In [None]:
# For non trained model accuracy 0.4919
# For r=128 one epoch lr 3e-3 accuracy 0.932
# For r=128 two epochs lr 3e-3 accuracy 0.939

In [23]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch, evaluate
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, BitsAndBytesConfig, Trainer
)
from peft import UILinLoRAConfig, get_peft_model, LoraConfig, TaskType

In [2]:
torch.set_printoptions(threshold=float("inf"))
accuracy = evaluate.load("accuracy")

In [17]:
# ---------------------------  custom trainer  --------------------------- #
class UILinLoRATrainer(Trainer):
    def __init__(self, *args, head_lr=1e-3, adapter_lr=4e-3, **kw):
        super().__init__(*args, **kw)
        self.head_lr, self.adapter_lr = head_lr, adapter_lr

    def create_optimizer(self):                       # two learning rates
        if self.optimizer is None:
            head, adapter = [], []
            for n, p in self.model.named_parameters():
                if p.requires_grad:
                    (head if "classifier" in n else adapter).append(p)
            groups = [{"params": head,    "lr": self.head_lr},
                      {"params": adapter, "lr": self.adapter_lr}]
            self.optimizer = torch.optim.AdamW(groups)
        return self.optimizer

# ---------------------------  helpers  --------------------------- #
def prepare_sst2_dataset(tokenizer, max_len=128):
    ds = load_dataset("glue", "sst2")
    ds = ds.map(
        lambda ex: tokenizer(ex["sentence"],
                             truncation=True,
                             padding="max_length",
                             max_length=max_len),
        batched=True)
    ds = ds.rename_column("label", "labels")
    ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return ds





In [29]:
base_id = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(base_id, use_fast=True)

base = AutoModelForSequenceClassification.from_pretrained(
    base_id, num_labels=2, device_map="auto"
)

uilinlora_cfg = UILinLoRAConfig(
        target_modules=["query", "value"],
        rank=128,
        uilinlora_alpha=1.0,
        uilinlora_dropout=0.0,
        fan_in_fan_out=False,
        init_uilinlora_weights=True,
        task_type=TaskType.SEQ_CLS)
# model = get_peft_model(base, uilinlora_cfg)

# uilinlora_cfg = UILinLoRAConfig(
#     r=8,                              # LoRA rank
#     lora_alpha=16,                   # Scaling factor
#     target_modules=["query", "value"],  # Module names to inject LoRA into (for RoBERTa/BERT)
#     lora_dropout=0.1,                # Dropout applied to LoRA layers during training
#     bias="none",                     # "none", "all", or "lora_only"
#     task_type=TaskType.SEQ_CLS      # Task type: SEQ_CLS = sequence classification
# )

model = get_peft_model(base, uilinlora_cfg)
model.classifier.requires_grad_(True)   # make head trainable
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
data = prepare_sst2_dataset(tokenizer)

In [30]:
train_args = TrainingArguments(
    output_dir="uilinlora-roberta-base-sst2",
    per_device_train_batch_size=64,
    num_train_epochs=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    warmup_ratio=0.06,
    lr_scheduler_type="linear",
    logging_steps=50,
    save_total_limit=1,
    seed=42,
)


def compute_metrics(eval_pred):
    print(">>> compute_metrics called <<<")

    output, labels = eval_pred

    print("type(output):", type(output))
    print("output shape:", output.shape if hasattr(output, "shape") else "no shape")
    print("labels type:", type(labels))
    print("labels shape:", labels.shape if hasattr(labels, "shape") else "no shape")

    return {"eval_accuracy": 0.123456}  # dummy value to ensure metric is returned

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=data["train"].select(range(500)),
    eval_dataset=data["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # head_lr=1e-3,
    # adapter_lr=4e-3,
)

trainer.train()
print("Best-epoch accuracy:",
        trainer.evaluate()["eval_accuracy"])



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.693457,0.123456
2,No log,0.692987,0.123456


>>> compute_metrics called <<<
type(output): <class 'numpy.ndarray'>
output shape: (872, 2)
labels type: <class 'numpy.ndarray'>
labels shape: (872,)
>>> compute_metrics called <<<
type(output): <class 'numpy.ndarray'>
output shape: (872, 2)
labels type: <class 'numpy.ndarray'>
labels shape: (872,)


>>> compute_metrics called <<<
type(output): <class 'numpy.ndarray'>
output shape: (872, 2)
labels type: <class 'numpy.ndarray'>
labels shape: (872,)
Best-epoch accuracy: 0.123456


In [15]:
data["validation"]

Dataset({
    features: ['sentence', 'labels', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 872
})

In [7]:
print(trainer.evaluate().keys())


dict_keys([])


In [13]:
metrics = trainer.evaluate()
print(metrics)


{}


In [16]:
output = trainer.predict(data["validation"])
print("Predictions:", output)


Predictions: PredictionOutput(predictions=(array([0.54974294, 0.5569541 , 0.5817625 , 0.58381367, 0.56774193,
       0.5726411 , 0.53230274, 0.5903519 , 0.6318204 , 0.5852609 ,
       0.6440132 , 0.6711867 , 0.5880575 , 0.5995588 , 0.66562164,
       0.54330385, 0.54622316, 0.6644735 , 0.5960767 , 0.6308874 ,
       0.6160513 , 0.62608516, 0.6061301 , 0.61669475, 0.6227592 ,
       0.6202483 , 0.63846993, 0.61098   , 0.6022206 , 0.5886936 ,
       0.5375526 , 0.6374905 , 0.5742527 , 0.59934634, 0.62058896,
       0.6051544 , 0.62632877, 0.57997006, 0.52677023, 0.54231435,
       0.62219393, 0.63591254, 0.56213444, 0.5444094 , 0.5546863 ,
       0.55685973, 0.57151675, 0.60551995, 0.57812214, 0.59635806,
       0.64679545, 0.56645155, 0.61685014, 0.55208635, 0.53913295,
       0.5945992 , 0.5503514 , 0.56540185, 0.57217705, 0.579527  ,
       0.56046504, 0.60466975, 0.54691017, 0.55767715, 0.654204  ,
       0.6263826 , 0.55782646, 0.60765135, 0.64373773, 0.5305957 ,
       0.58095646, 