In [None]:
#====Check CUDA Device Availability===#

import torch
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device:', device)

#===Run this if multiple GPUs are available===#

# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"


# import torch
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print('Device:', device)
# print('Current cuda device: ', torch.cuda.current_device())
# print('Count of using GPUs:', torch.cuda.device_count())

In [None]:
#===Import Libraries===#


import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, RobertaTokenizerFast
from datasets import load_dataset
import numpy as np
from datasets import load_dataset
from transformers import RobertaTokenizerFast
import math
from pathlib import Path
import random
import numpy as np
from safetensors.torch import load_file

In [None]:
#===Set Seed for Reproducibility===#

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), then configures cuDNN for repeatable kernel selection.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)  # NumPy is imported and used in this notebook, so seed it too
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Turn off the cuDNN autotuner and ask for deterministic kernels so that
    # repeated runs pick the same algorithms.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed_everything()

In [None]:
#=======LoRA Class======#

"""
Here we create a LoRA linear class. Since we aren't modifying every layer in RoBERTa, we later pick out the exact layers we want to modify, each with its own rank, via apply_lora_by_names.

"""

class LoraLinear(nn.Module):
    """Linear layer with an optional trainable low-rank (LoRA) update.

    The forward pass computes
    ``linear(x, weight, bias) + scaling * (dropout(x) @ lora_A.T @ lora_B.T)``.
    When ``r`` is not positive the LoRA branch is skipped and the module
    behaves like a plain linear layer.
    """

    def __init__(self, in_features, out_features, r=0, lora_alpha=1.0, lora_dropout=0.1, bias=True, freeze_base=True):
        """
        Build the base linear parameters and, if ``r > 0``, the LoRA factors.

        Args:
            in_features: Size of the last dimension of the input.
            out_features: Size of the last dimension of the output.
            r: LoRA rank. ``r <= 0`` disables the LoRA branch.
            lora_alpha: LoRA scaling numerator; the update is multiplied by
                ``lora_alpha / r`` (or by 1 when LoRA is disabled).
            lora_dropout: Dropout probability applied to the input of the
                ``lora_A`` projection only. ``0`` uses an identity instead.
            bias: Whether the base layer carries a bias vector.
            freeze_base: If True, ``weight`` and ``bias`` do not require grad,
                so only ``lora_A`` / ``lora_B`` are trained.

        The base ``weight`` / ``bias`` start as zeros: they are placeholders
        that the caller is expected to overwrite with pretrained values.
        ``lora_A`` starts at zero and ``lora_B`` is Kaiming-uniform, so the
        LoRA update ``lora_B @ lora_A`` is exactly zero right after
        construction.
        """
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.r = r
        # Store the alpha that was actually requested; it is the same value
        # used for ``scaling`` below.
        self.lora_alpha = lora_alpha
        self.scaling = lora_alpha / r if r > 0 else 1
        # Zero-filled rather than uninitialised memory: deterministic, never
        # NaN/inf, and it does not draw from the global RNG.
        self.weight = nn.Parameter(torch.zeros(out_features, in_features))
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_features))
        else:
            self.register_parameter('bias', None)

        if r > 0:
            self.lora_A = nn.Parameter(torch.zeros(r, in_features))       # A: (r x in_features)
            self.lora_B = nn.Parameter(torch.zeros(out_features, r))      # B: (out_features x r)
            nn.init.kaiming_uniform_(self.lora_B, a=math.sqrt(5))
        else:
            # LoRA disabled: the module is just the (optionally frozen) base layer.
            self.lora_A = None
            self.lora_B = None

        if freeze_base:
            self.weight.requires_grad = False
            if self.bias is not None:
                self.bias.requires_grad = False
        self.lora_dropout = nn.Dropout(p=lora_dropout) if lora_dropout > 0 else nn.Identity()

    def forward(self, x):
        """Return the base linear output plus the scaled LoRA update (if enabled)."""
        result = F.linear(x, self.weight, self.bias)
        if self.r > 0:
            # Dropout touches only the LoRA path, never the base projection.
            lora_out = torch.matmul(self.lora_dropout(x), self.lora_A.T)
            lora_out = torch.matmul(lora_out, self.lora_B.T)
            result = result + lora_out * self.scaling
        return result


def apply_lora_by_names(model, rank_dict, freeze=True, task_type="seq_cls"):
    """
    Replace selected ``nn.Linear`` layers of ``model`` with ``LoraLinear`` in place.

    Args:
        model: Module whose sub-layers are patched.
        rank_dict: Maps parameter names ending in ``".weight"`` (for example
            ``"encoder.layer.0.attention.self.query.weight"``) to a LoRA rank.
            Entries that do not end in ``".weight"``, whose final attribute
            does not exist on the parent, or that do not point at an
            ``nn.Linear`` are skipped. Rank ``0`` keeps the original layer
            (frozen when ``freeze`` is True).
        freeze: If True, every parameter is frozen except LoRA factors and the
            task head selected by ``task_type``.
        task_type: Chooses which name prefix stays trainable:
            ``"seq_cls"`` -> ``classifier``, ``"causal_lm"`` -> ``score``,
            ``"seq2seq"`` -> ``lm_head``. Any other value leaves no head
            trainable.
            NOTE(review): these prefixes are taken as-is from the original
            code; confirm they match the head names of the models in use.

    Returns:
        None. ``model`` is modified in place.
    """

    for param_name, r in rank_dict.items():
        if not param_name.endswith(".weight"):
            continue
        module_name = param_name[:-len(".weight")]
        # rpartition also handles a layer attached directly to ``model``
        # (no dot in the name): the parent is then "" and get_submodule("")
        # returns the model itself.
        parent_name, _, child_name = module_name.rpartition(".")
        parent_module = model.get_submodule(parent_name)
        if not hasattr(parent_module, child_name):
            continue
        old_linear = getattr(parent_module, child_name)
        if not isinstance(old_linear, nn.Linear):
            continue

        if r == 0:
            if freeze:
                old_linear.weight.requires_grad = False
                if old_linear.bias is not None:
                    old_linear.bias.requires_grad = False
            continue

        new_linear = LoraLinear(
            old_linear.in_features,
            old_linear.out_features,
            r=r,
            lora_alpha=2*r,
            lora_dropout=0.0,
            bias=(old_linear.bias is not None),
            freeze_base=freeze
        )
        # Put the replacement on the same device and dtype as the layer it
        # replaces, so patching a model already moved to GPU or cast to
        # another precision does not leave mismatched LoRA factors behind.
        new_linear.to(device=old_linear.weight.device, dtype=old_linear.weight.dtype)
        new_linear.train(old_linear.training)
        with torch.no_grad():
            new_linear.weight.copy_(old_linear.weight)
            if old_linear.bias is not None:
                new_linear.bias.copy_(old_linear.bias)
        setattr(parent_module, child_name, new_linear)

    if freeze:
        for name, param in model.named_parameters():
            if "lora_" in name:
                continue
            if (task_type == "seq_cls" and name.startswith("classifier")) \
               or (task_type == "causal_lm" and name.startswith("score")) \
               or (task_type == "seq2seq" and name.startswith("lm_head")):
                continue
            param.requires_grad = False


In [None]:
#=======Ranks dict========#
"""
You can customize the ranks for each layer here.
rank_dict is a dictionary whose keys are parameter names and whose values are the LoRA ranks.
"""

# One (query_rank, value_rank) pair per encoder layer, listed in layer order.
manual_ranks = [
    (43, 64), (9, 59), (8, 47), (3, 43),
    (4, 50), (1, 61), (48, 62), (46, 60),
    (37, 52), (25, 42), (10, 49), (14, 53),
]

# Expand the pairs into parameter-name -> rank entries: for every layer the
# query projection comes first, then the value projection.
rank_dict = {
    f"roberta.encoder.layer.{layer_idx}.attention.self.{projection}.weight": rank
    for layer_idx, rank_pair in enumerate(manual_ranks)
    for projection, rank in zip(("query", "value"), rank_pair)
}


In [None]:
#=======Load dataset=======#

# Load GLUE SST-2 and keep a fixed, seeded 60k-example slice of the training split.
raw_datasets = load_dataset("glue", "sst2")
train_data = raw_datasets["train"].shuffle(seed=42).select(range(60000))  # adjust range per GPU

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
max_length = 128


def tokenize_function(example):
    """Tokenize a batch of rows, padding/truncating each sentence to ``max_length`` tokens."""
    return tokenizer(
        example["sentence"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )


# Everything except the label is dropped from the mapped dataset; the
# tokenizer's output columns are what map() adds back.
non_label_columns = [col for col in train_data.column_names if col != "label"]
train_dataset = train_data.map(
    tokenize_function,
    batched=True,
    remove_columns=non_label_columns,
)


In [None]:
#=======Load base  and train =======#

# Start from the pretrained checkpoint and swap the configured attention
# projections for LoRA layers; everything but the LoRA factors and the
# classifier head is frozen.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
apply_lora_by_names(model, rank_dict, freeze=True, task_type="seq_cls")

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

# No evaluation-strategy keyword is passed: "no evaluation during training"
# is already the default, and the keyword was renamed from
# `evaluation_strategy` to `eval_strategy` in recent transformers releases.
# Omitting it keeps this cell working on both older and newer versions.
# NOTE(review): output_dir is an absolute, user-specific path — adjust it for
# your machine.
# NOTE(review): optim="adamw_hf" selects the transformers AdamW
# implementation; check that the installed transformers version still
# accepts this value.
training_args = TrainingArguments(
    output_dir="/home/himani/SL_SST",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    optim="adamw_hf",
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_strategy="epoch",
    save_strategy="epoch",
    max_grad_norm=1.0,
    max_steps=-1,  # -1: the run length is governed by num_train_epochs
    warmup_ratio=0.06,
    lr_scheduler_type="cosine",
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)
trainer.train()


In [None]:
#====Evaluation in Benchmarks===#

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ckpt_path = "/home/himani/SL_SST2/Experiment_8/model.safetensors"
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# Rebuild the same LoRA-patched architecture before loading, so that every
# checkpoint key has a matching parameter (load_state_dict is strict).
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
apply_lora_by_names(model, rank_dict, freeze=True)
state_dict = load_file(ckpt_path)
model.load_state_dict(state_dict, strict=True)
model.to(device).eval()

# ======Check if u loaded LoRA properly====== #
# A LoRA factor that is entirely zero is reported as suspicious.
missing_lora_keys = [
    param_name
    for param_name, param in model.named_parameters()
    if ("lora_A" in param_name or "lora_B" in param_name) and param.data.abs().sum() == 0
]

if not missing_lora_keys:
    print("\n[INFO] All LoRA parameters appear non-zero and properly loaded.")
else:
    print("\n[WARNING] The following LoRA parameters are all zeros (likely not loaded or not trained):")
    for key in missing_lora_keys:
        print(f" - {key}")

# ======Benchmark datasets====== #
benchmarks = dict(
    sst2=load_dataset("glue", "sst2", split="validation"),
    imdb=load_dataset("imdb", split="test"),
)

# ======Preprocessing====== #
def preprocess_eval_function(examples, dataset_name):
    """Tokenize a batch from a supported benchmark and attach its labels.

    Args:
        examples: Batch mapping with a ``"label"`` entry plus the text column
            of the chosen benchmark (``"sentence"`` for sst2, ``"text"`` for imdb).
        dataset_name: ``"sst2"`` or ``"imdb"``.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an added
        ``"labels"`` entry copied from ``examples["label"]``.

    Raises:
        ValueError: If ``dataset_name`` is not a supported benchmark.
    """
    # Pick the text column first, then share one tokenizer call.
    if dataset_name == "sst2":
        text_column = "sentence"
    elif dataset_name == "imdb":
        text_column = "text"
    else:
        raise ValueError("Unsupported dataset name.")

    encoded = tokenizer(
        examples[text_column],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    encoded["labels"] = examples["label"]
    return encoded

# ======Accuracy Calculation====== #
def compute_accuracy(dataset, batch_size=32):
    """Return the fraction of examples the module-level ``model`` classifies correctly.

    Uses the module-level ``model`` and ``device``. The dataset is consumed in
    contiguous slices, each of which must yield ``"input_ids"``,
    ``"attention_mask"`` and ``"labels"`` tensors.

    Args:
        dataset: Sized, sliceable dataset formatted to return tensors.
        batch_size: Number of examples per forward pass (default 32).

    Returns:
        Accuracy in ``[0, 1]`` as a float, or ``nan`` when ``dataset`` is
        empty (the accuracy of zero examples is undefined).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer.")

    total = len(dataset)
    if total == 0:
        # Avoid a ZeroDivisionError; nan stays visible in any later averaging.
        return float("nan")

    correct = 0
    for start in range(0, total, batch_size):
        batch = dataset[start:start + batch_size]
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=-1)
        correct += (preds == labels).sum().item()
    return correct / total


results = {}
in_distribution = []
out_distribution = []

for name, dataset in benchmarks.items():
    print(f"\nEvaluating {name}...")
    # Keep the label and raw-text columns through map(); set_format below
    # then exposes only the tensors the model needs.
    unused_columns = [
        col for col in dataset.column_names
        if col not in ["label", "sentence", "text"]
    ]
    tokenized = dataset.map(
        lambda x: preprocess_eval_function(x, name),
        batched=True,
        remove_columns=unused_columns,
    )
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    acc = compute_accuracy(tokenized)
    results[name] = acc
    # SST-2 is treated as in-distribution; every other benchmark as out-of-distribution.
    bucket = in_distribution if name == "sst2" else out_distribution
    bucket.append(acc)

# ======Results ======#
print("\n=== Evaluation Results ===")
for name, acc in results.items():
    print(f"{name}: {acc:.4f}")
print(f"Avg In-Distribution Accuracy: {np.mean(in_distribution):.4f}")
print(f"Avg Out-of-Distribution Accuracy: {np.mean(out_distribution):.4f}")
