In [None]:
import os
# Point the Hugging Face libraries at this user's cache directory via HF_HOME.
# This only affects libraries imported after this line runs.
os.environ['HF_HOME'] = '/home/m_azimi/.cache/huggingface'

In [None]:
# Run the training script as a shell command through IPython's `!` escape.
!python /home/m_azimi/SHA_Diagonal/train.py

In [None]:
stsb-unique: [91.8, 91.2, 92.1, 91, 91.5] -> 91.5
cola-unique: [67.9, 64.8, 67.9, 64.3, 68.6] -> 66.7

roberta-base:

MRPC = [89.71, 88.24, 88.24, 88.24, 89.22]

RTE = [79.71, 78.26, 78.26, 77.54, 77.53]

STSB = [90.1, 90.3, 89.85, 90.1, 89.9]

roberta-large:

STSB = [91.9, 92.1, 91.9, 91.9, 91.8]

CoLA = [69.3, 69.3, 64.5, 65.3, 67]

In [None]:
import os
# Hugging Face cache settings; they are set before `transformers` is imported below.
os.environ['HF_HOME'] = '/home/m_azimi/.cache/huggingface'
os.environ['HF_LOCK_DIR'] = '/home/m_azimi/.cache/huggingface/locks'

import config, GLUE_data_setup, peft_module, engine, utils
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import warnings

best_checkpoint_path = "/home/m_azimi/SHA_Diagonal/best_model_checkpoint.pt"


# Classification head size: 3 for tasks whose name starts with "mnli",
# a single output for "stsb", and 2 for every other task.
num_labels = 3 if config.CONFIG.task.startswith("mnli") else 1 if config.CONFIG.task == "stsb" else 2
model = AutoModelForSequenceClassification.from_pretrained(
    config.CONFIG.model_name, num_labels=num_labels, output_attentions=False,
    output_hidden_states=False, cache_dir="/home/m_azimi/.cache/huggingface"
).to(config.CONFIG.device)

# Rebuild exactly the architecture that produced the checkpoint, so that the
# state dict loaded further down matches the module tree key for key.
for layer in model.roberta.encoder.layer:
    # Build the custom self-attention from the model config only.
    custom_attn = peft_module.CustomRobertaSelfAttention(model.config)
    # Carry the pretrained attention weights over to the replacement module.
    custom_attn.load_state_dict(layer.attention.self.state_dict())
    # Swap the stock attention for the custom one.
    layer.attention.self = custom_attn

# Insert the PEFT adapter layers, then freeze whatever peft_module.freeze_model excludes from training.
peft_module.add_peft_layers(model=model)
peft_module.freeze_model(model)


# NOTE(security): torch.load unpickles arbitrary Python objects. Only load
# dataset files that were produced by this project.
test_dataset = torch.load("/home/m_azimi/SHA_Diagonal/saved_data/test_dataset.pt")

test_loader = DataLoader(
    test_dataset, batch_size=config.CONFIG.valid_batch,
    num_workers=1, shuffle=False, pin_memory=True
)


# Load the best checkpoint and evaluate it on the held-out test split.
# map_location="cpu" makes loading independent of the device the checkpoint
# was saved from; load_state_dict then copies the values into the model's own
# parameters on whatever device they live on.
model.load_state_dict(torch.load(best_checkpoint_path, map_location="cpu"))
# Moves any module that is not yet on the configured device (no-op otherwise).
model.to(config.CONFIG.device)
# Make sure dropout is inactive while scoring. engine.evaluate may already do
# this itself; calling eval() twice is harmless.
model.eval()
final_test_loss, final_test_preds, final_test_true = engine.evaluate(model, test_loader)
final_test_metric = engine.eval_func(final_test_preds, final_test_true)
print(f'Final Test Metric: {final_test_metric}')

In [None]:
import math
from typing import List, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import config

class SHA_DIAGONAL(nn.Module):
    """Linear layer with an additive low-rank branch gated by two learnable vectors.

    forward(x) = pretrained(x) + up_proj(down_proj(dropout(x)) * diagonal_d) * diagonal_b

    ``diagonal_b`` is created as all zeros, so the adapter branch adds zero to
    the pretrained output right after construction.
    """

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        r: int = config.CONFIG.r,
        lora_dropout: float = 0.1,
    ):
        """Build the wrapped linear layer and the adapter branch.

        Args:
            in_dim: input feature size of the wrapped linear layer.
            out_dim: output feature size of the wrapped linear layer.
            r: width of the low-rank bottleneck.
            lora_dropout: dropout probability applied to the adapter input only.
        """
        super().__init__()
        self.r = r
        self.lora_dropout = nn.Dropout(lora_dropout)

        # Placeholder for the wrapped layer; the real weight values are copied in
        # from outside this class. Only the weight is frozen here, the bias keeps
        # its default requires_grad.
        self.pretrained = nn.Linear(in_dim, out_dim, bias=True)
        self.pretrained.weight.requires_grad = False

        # Down projection (in_dim -> r), filled by the seeded uniform init below.
        # NOTE: torch.manual_seed reseeds the global RNG and returns the default
        # generator, so every construction resets the process-wide random state
        # to config.CONFIG.seed.
        self.down_proj = nn.Linear(in_dim, r, bias=False)
        self._kaiming_init(self.down_proj.weight, generator=torch.manual_seed(config.CONFIG.seed))

        # Per-rank scale vector, every entry starts at 0.1.
        self.diagonal_d = nn.Parameter(torch.ones(r))
        nn.init.constant_(self.diagonal_d, 0.1)

        # Up projection (r -> out_dim), filled by the same seeded uniform init
        # (it is NOT zero-initialised).
        self.up_proj = nn.Linear(r, out_dim, bias=False)
        self._kaiming_init(self.up_proj.weight, generator=torch.manual_seed(config.CONFIG.seed))

        # Per-output scale vector, starts at zero.
        self.diagonal_b = nn.Parameter(torch.zeros(out_dim))


    def forward(self, x):
        """Return the pretrained linear output plus the gated low-rank update."""
        frozen_out = self.pretrained(x)

        # Dropout is applied to the adapter input only, not to the pretrained path.
        bottleneck = self.down_proj(self.lora_dropout(x)) * self.diagonal_d
        adapter_out = self.up_proj(bottleneck) * self.diagonal_b

        return frozen_out + adapter_out


    def _kaiming_init(self, tensor: torch.Tensor, generator: torch.Generator):
        """Fill ``tensor`` in place from U(-bound, bound) and return it.

        bound = sqrt(3) * sqrt(2) / sqrt(fan_in), with random numbers drawn
        from ``generator``.
        """
        fan_in = nn.init._calculate_correct_fan(tensor, mode="fan_in")
        bound = math.sqrt(3.0) * (math.sqrt(2.0) / math.sqrt(fan_in))
        with torch.no_grad():
            return tensor.uniform_(-bound, bound, generator=generator)



def freeze_model(model):
    """Turn off gradients for every parameter outside the trainable set.

    A parameter is left untouched when its qualified name contains
    "diagonal_d", "diagonal_b" or "classifier"; every other parameter gets
    ``requires_grad = False``. Nothing is ever switched back on.
    """
    trainable_markers = ("diagonal_d", "diagonal_b", "classifier")
    for param_name, parameter in model.named_parameters():
        if any(marker in param_name for marker in trainable_markers):
            continue
        parameter.requires_grad = False


def create_peft(module):
    """Converts a linear module to a peft linear module.

    Builds a ``SHA_DIAGONAL`` with the same input/output sizes as ``module``
    and copies the pretrained parameters into its ``pretrained`` sub-layer.

    Args:
        module: an ``nn.Linear``-like module exposing ``weight`` and ``bias``
            (``bias`` may be ``None``).

    Returns:
        The new ``SHA_DIAGONAL`` module, placed on the same device as
        ``module.weight``.
    """
    # nn.Linear stores its weight as (out_features, in_features).
    out_features, in_features = module.weight.shape
    peft = SHA_DIAGONAL(in_dim=in_features, out_dim=out_features)
    with torch.no_grad():
        peft.pretrained.weight.copy_(module.weight)
        if module.bias is None:
            # The wrapper always owns a bias; zero it so the pretrained path
            # reproduces a bias-free source layer instead of crashing on copy_(None).
            peft.pretrained.bias.zero_()
        else:
            peft.pretrained.bias.copy_(module.bias)
    # Keep the replacement on the device of the layer it stands in for, so a
    # model that was already moved to an accelerator stays on a single device.
    return peft.to(module.weight.device)



def add_peft_layers(
    model,
    module_names: Tuple=("query", "value"),
    ignore_layers: Tuple[int, ...]=(),
):
    """Replace selected ``nn.Linear`` sub-modules of ``model`` with PEFT wrappers, in place.

    Args:
        model: root module to modify.
        module_names: attribute names of the linear layers to wrap.
        ignore_layers: child names (given as ints, compared as strings) whose
            sub-trees are not searched for layers to replace. Any iterable of
            ints is accepted.

    Side effects:
        Every ``nn.Dropout`` already present under ``model`` gets ``p = 0.0``,
        including those inside ignored sub-trees. Dropout modules owned by the
        newly created wrappers are not touched.
    """
    # Disable dropout in the existing layers. One pass over the whole tree is
    # enough; there is no need to rescan each sub-tree during the recursion.
    for module in model.modules():
        if isinstance(module, nn.Dropout):
            module.p = 0.0

    ignored_names = frozenset(str(i) for i in ignore_layers)
    _swap_in_peft_layers(model, module_names, ignored_names)


def _swap_in_peft_layers(model, module_names, ignored_names):
    """Recursively replace matching ``nn.Linear`` children of ``model`` with PEFT wrappers."""
    # Iterate over a snapshot: setattr below rewrites the child registry.
    for name, module in list(model.named_children()):
        if isinstance(module, nn.Linear) and name in module_names:
            setattr(model, name, create_peft(module))
        elif name not in ignored_names:
            _swap_in_peft_layers(module, module_names, ignored_names)

In [None]:
class SHA_DIAGONAL(nn.Module):
    """Linear layer with an additive, scaled low-rank update.

    forward(x) = pretrained(x) + up_proj(down_proj(dropout(x))) * (lora_alpha / r)

    ``up_proj`` is created as all zeros, so the low-rank branch adds zero to
    the pretrained output right after construction.
    """

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        lora_alpha: int = 8,
        lora_dropout: float = 0.1,
        r: int = config.CONFIG.r,
    ):
        """Build the wrapped linear layer and the low-rank branch.

        Args:
            in_dim: input feature size of the wrapped linear layer.
            out_dim: output feature size of the wrapped linear layer.
            lora_alpha: numerator of the scaling factor ``lora_alpha / r``.
            lora_dropout: dropout probability applied to the low-rank branch input only.
            r: width of the low-rank bottleneck.
        """
        super().__init__()
        self.r = r
        self.lora_alpha = lora_alpha
        # Constant multiplier applied to the low-rank output in forward().
        self.scaling = self.lora_alpha / self.r
        self.lora_dropout = nn.Dropout(lora_dropout)

        # Placeholder for the wrapped layer; the real weight values are copied in
        # from outside this class. Only the weight is frozen here, the bias keeps
        # its default requires_grad.
        self.pretrained = nn.Linear(in_dim, out_dim, bias=True)
        self.pretrained.weight.requires_grad = False

        # Down projection (in_dim -> r), Kaiming-uniform with a = sqrt(5).
        self.down_proj = nn.Linear(in_dim, r, bias=False)
        nn.init.kaiming_uniform_(self.down_proj.weight, a=math.sqrt(5))

        # Up projection (r -> out_dim), all zeros at start.
        self.up_proj = nn.Linear(r, out_dim, bias=False)
        nn.init.constant_(self.up_proj.weight, 0)


    def forward(self, x):
        """Return the pretrained linear output plus the scaled low-rank update."""
        frozen_out = self.pretrained(x)

        # Dropout is applied to the low-rank branch input only.
        low_rank = self.up_proj(self.down_proj(self.lora_dropout(x)))

        return frozen_out + low_rank * self.scaling



def freeze_model(model):
    """Turn off gradients for every parameter outside the trainable set.

    A parameter is left untouched when its qualified name contains
    "down_proj", "up_proj" or "classifier"; every other parameter gets
    ``requires_grad = False``. Nothing is ever switched back on.
    """
    trainable_markers = ("down_proj", "up_proj", "classifier")
    for param_name, parameter in model.named_parameters():
        if any(marker in param_name for marker in trainable_markers):
            continue
        parameter.requires_grad = False



def create_peft(module):
    """Converts a linear module to a peft linear module.

    Builds a ``SHA_DIAGONAL`` with the same input/output sizes as ``module``
    and copies the pretrained parameters into its ``pretrained`` sub-layer.

    Args:
        module: an ``nn.Linear``-like module exposing ``weight`` and ``bias``
            (``bias`` may be ``None``).

    Returns:
        The new ``SHA_DIAGONAL`` module, placed on the same device as
        ``module.weight``.
    """
    # nn.Linear stores its weight as (out_features, in_features).
    out_features, in_features = module.weight.shape
    peft = SHA_DIAGONAL(in_dim=in_features, out_dim=out_features)
    with torch.no_grad():
        peft.pretrained.weight.copy_(module.weight)
        if module.bias is None:
            # The wrapper always owns a bias; zero it so the pretrained path
            # reproduces a bias-free source layer instead of crashing on copy_(None).
            peft.pretrained.bias.zero_()
        else:
            peft.pretrained.bias.copy_(module.bias)
    # Keep the replacement on the device of the layer it stands in for, so a
    # model that was already moved to an accelerator stays on a single device.
    return peft.to(module.weight.device)



def add_peft_layers(
    model,
    module_names: Tuple=("query", "value"),
    ignore_layers: Tuple[int, ...]=(),
):
    """Replace selected ``nn.Linear`` sub-modules of ``model`` with PEFT wrappers, in place.

    Args:
        model: root module to modify.
        module_names: attribute names of the linear layers to wrap.
        ignore_layers: child names (given as ints, compared as strings) whose
            sub-trees are not searched for layers to replace. Any iterable of
            ints is accepted.

    Side effects:
        Every ``nn.Dropout`` already present under ``model`` gets ``p = 0.0``,
        including those inside ignored sub-trees. Dropout modules owned by the
        newly created wrappers are not touched.
    """
    # Disable dropout in the existing layers. One pass over the whole tree is
    # enough; there is no need to rescan each sub-tree during the recursion.
    for module in model.modules():
        if isinstance(module, nn.Dropout):
            module.p = 0.0

    ignored_names = frozenset(str(i) for i in ignore_layers)
    _swap_in_peft_layers(model, module_names, ignored_names)


def _swap_in_peft_layers(model, module_names, ignored_names):
    """Recursively replace matching ``nn.Linear`` children of ``model`` with PEFT wrappers."""
    # Iterate over a snapshot: setattr below rewrites the child registry.
    for name, module in list(model.named_children()):
        if isinstance(module, nn.Linear) and name in module_names:
            setattr(model, name, create_peft(module))
        elif name not in ignored_names:
            _swap_in_peft_layers(module, module_names, ignored_names)