In [2]:
from copy import deepcopy

import torch
from petorch.prebuilt.lora import LoraLinear
from pydantic import PositiveInt, NonNegativeFloat
from torch import nn

from petorch.adapter import AdapterAPI, BaseModelAdaptionConfig, BaseAdapter

In [3]:
class Dummy(nn.Module):
    def __init__(self):
        super().__init__()
        self.input_shape = (8, 8)
        self.conv = nn.Conv2d(3, 3, 3, padding='same')
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(8 * 8 * 3, 256)
        self.fc2 = nn.Linear(256, 100)
        self.fc3 = nn.Linear(100, 16)

    def forward(self, input: torch.Tensor):
        assert input.shape[2:] == self.input_shape, f"{input.shape[:2]}-{self.input_shape}"
        x = self.conv(input)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x


class LoraLinearConfig(BaseModelAdaptionConfig):
    rank: PositiveInt = 8
    alpha: PositiveInt = 16
    dropout: NonNegativeFloat = 0.1

    def dispatch_adapter(self, fpname: str, base_layer: nn.Module, *args, **kwargs) -> BaseAdapter | None:
        if isinstance(base_layer, nn.Linear):
            return LoraLinear(cast(nn.Linear, base_layer), self)


model = Dummy()
org_model = deepcopy(model)
config = LoraLinearConfig()
sample = torch.rand([2, 3, 8, 8])
output = org_model(sample)
org_model

Dummy(
  (conv): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=192, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=16, bias=True)
)

In [4]:
print(AdapterAPI.add_adapter(model, config))
model

['fc1', 'fc2', 'fc3']


Dummy(
  (conv): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): AdaptedLayer(
    (base_layer): Linear(in_features=192, out_features=256, bias=True)
    (active_adapters): ModuleDict()
    (non_active_adapters): ModuleDict(
      (default): LoraLinear(
        (lora_A): Linear(in_features=192, out_features=8, bias=True)
        (lora_B): Linear(in_features=8, out_features=256, bias=True)
        (lora_dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc2): AdaptedLayer(
    (base_layer): Linear(in_features=256, out_features=100, bias=True)
    (active_adapters): ModuleDict()
    (non_active_adapters): ModuleDict(
      (default): LoraLinear(
        (lora_A): Linear(in_features=256, out_features=8, bias=True)
        (lora_B): Linear(in_features=8, out_features=100, bias=True)
        (lora_dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc3): AdaptedLayer(
    (base_layer): Linear(in_features

In [12]:
output2 = model(sample)
assert torch.all(output == output2)

In [13]:
try:
    AdapterAPI.activate_adapter(model, 'abc')
except ValueError as e:
    print(e)

Model does not have adapter named `abc`.


In [14]:
AdapterAPI.activate_adapter(model)

['default']

In [17]:
output3 = model(sample)
assert not torch.all(output == output3)

In [8]:
AdapterAPI.remove_adapter(model, "default")

In [9]:
output4 = model(sample)
torch.all(output == output4)

tensor(True)

In [10]:
from copy import deepcopy
from typing import cast

import torch
from pydantic import PositiveInt, NonNegativeFloat, BaseModel
from torch import nn

from petorch.adapter import BaseModelAdaptionConfig, BaseAdapter, AdapterAPI


class DummyAdapter(BaseAdapter):
    """
    Dummy LinearLora
    """

    def __init__(self, base_layer: nn.Linear, config: "BaseModelAdaptionConfig"):
        assert isinstance(base_layer, nn.Linear), f"Base layer must has type {nn.Linear}, got {type(base_layer)}."
        super().__init__(base_layer, config)

        self.lora_A = nn.Linear(base_layer.in_features, self.rank)
        self.lora_B = nn.Linear(self.rank, base_layer.out_features)
        self.lora_dropout = nn.Dropout(self.dropout)

        self.scale = getattr(self.config, "scale", None) or 1

    @property
    def rank(self) -> int:
        return self.config.rank

    @property
    def alpha(self) -> float:
        return self.config.alpha

    @property
    def dropout(self) -> float:
        return self.config.dropout

    @property
    def scaling(self) -> float:
        return self.scale * self.alpha / self.rank

    @classmethod
    def pre_validate_config(cls, config: "BaseModelAdaptionConfig") -> None:
        class ConfigValidator(BaseModel):
            rank: PositiveInt
            alpha: PositiveInt
            dropout: NonNegativeFloat
            adapter_name: str

        ConfigValidator.model_validate(config, from_attributes=True)

    def forward(self, batch_input: torch.Tensor, **kwargs) -> torch.Tensor:
        output = self.base_layer(batch_input)
        return (output + self.lora_B(self.lora_A(self.lora_dropout(batch_input))) * self.scaling)


class DummySubModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(8 * 8 * 3, 256)
        self.fc2 = nn.Linear(256, 100)

    def forward(self, input: torch.Tensor):
        x = self.fc1(input)
        x = self.fc2(x)
        return x


class Dummy(nn.Module):
    def __init__(self):
        super().__init__()
        self.input_shape = (8, 8)
        self.conv = nn.Conv2d(3, 3, 3, padding="same")
        self.flatten = nn.Flatten()
        self.sub_model = DummySubModel()
        self.fc = nn.Linear(100, 16)

    def forward(self, input: torch.Tensor):
        assert input.shape[2:] == self.input_shape, f"{input.shape[:2]}-{self.input_shape}"
        x = self.conv(input)
        x = self.flatten(x)
        x = self.sub_model(x)
        x = self.fc(x)
        return x


class DummyConfig(BaseModelAdaptionConfig):
    rank: PositiveInt = 8
    alpha: PositiveInt = 16
    dropout: NonNegativeFloat = 0.1

    def dispatch_adapter(self, fpname: str, base_layer: nn.Module, *args, **kwargs) -> BaseAdapter | None:
        if isinstance(base_layer, nn.Linear):
            return DummyAdapter(cast(nn.Linear, base_layer), self)


def test_api():
    adapter_name = "test_adapter"
    model = Dummy()
    org_model = deepcopy(model)
    config = DummyConfig(adapter_name=adapter_name)
    sample = torch.rand([2, 3, 8, 8])

    output1 = org_model(sample)

    # Test add_adapter
    target_fqn = ["fc", "sub_model.fc1", "sub_model.fc2"]
    target_fqn.sort()
    fqn = AdapterAPI.add_adapter(model, config)
    fqn.sort()
    assert fqn == target_fqn


test_api()


In [10]:
from transformers import Qwen2ForCausalLM, TorchAoConfig
from torchao.dtypes import to_nf4
from torchao.quantization import register_quantize_module_handler
from dataclasses import dataclass
from torchao.core.config import AOBaseConfig
import torch
from torch import nn
import types
from torchao.utils import get_model_size_in_bytes


@dataclass
class NF4Config(AOBaseConfig):
    block_size: int = 64
    scaler_block_size: int = 256


def linear_module_repr(module: nn.Linear):
    return f"in_features={module.weight.shape[1]}, out_features={module.weight.shape[0]}, weight={module.weight}, dtype={module.weight.dtype}"


@register_quantize_module_handler(NF4Config)
def _nf4_weight_only_transform(module: torch.nn.Module, config: NF4Config, ) -> torch.nn.Module:
    new_weight = to_nf4(module.weight, config.block_size, config.scaler_block_size)
    module.weight = nn.Parameter(new_weight, requires_grad=False)  # Freeze
    module.extra_repr = types.MethodType(linear_module_repr, module)
    return module


config = TorchAoConfig(NF4Config())

model = quantized_model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", )

quantized_model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", quantization_config=config)
model_size = get_model_size_in_bytes(model)
quantized_model_size = get_model_size_in_bytes(quantized_model)
print(model_size)  # 2520669824
print(quantized_model_size)  # 1273966688
print(quantized_model_size / model_size)  # 0.5054079974577425

2520669824
1273966688
0.5054079974577425


## Compare peft conv and einsum

In [None]:
import torch
from torch import nn
import time

kernel_size = stride = (1,) * 2
c = 512

base = nn.Conv2d(c, int(c / 2), 3, 1, 'same', bias=False)

lora_A = nn.Conv2d(c, 8, 3, 1, 'same', bias=False)
kernel_size = stride = (1,) * 2
lora_B = nn.Conv2d(8, int(c / 2), kernel_size, stride, bias=False)

print(base_w := base.weight.numel())
print(lora_w := (lora_A.weight.numel() + lora_B.weight.numel()))
print(lora_w / base_w)

print("-------------------------")

fw, einw = None, None
start = time.time()
for i in range(5000):
    fw = nn.functional.conv2d(lora_A.weight.transpose(0, 1), lora_B.weight).transpose(0, 1)
print("time for peft method: ", time.time() - start)

start = time.time()
# Faster
for i in range(1000):
    einw = torch.einsum("o r ..., r i ... -> o i ...", lora_B.weight, lora_A.weight)

print("time for einsum method: ", time.time() - start)
print(torch.allclose(fw, einw))
print(fw.shape == base.weight.shape == einw.shape)

In [None]:
sample = torch.randn([2, c, 256, 256])
output = base(sample) + lora_B(lora_A(sample))
output_fw = nn.functional.conv2d(sample, base.weight + fw, padding='same')
output_einw = nn.functional.conv2d(sample, base.weight + einw, padding='same')

print(torch.allclose(output, output_fw, atol=1e-5))  # False, max abs
print(torch.allclose(output, output_einw, atol=1e-5))  # True
print(torch.allclose(output_einw, output_fw, atol=1e-5))

print(abs((output - output_fw)).max())  # 3.0
print(abs(output - output_einw).max())  # 1.3e-5
print(abs(output_fw - output_einw).max())  # 3.0

## Lora embedding

In [31]:
from petorch.utilities import ParamWrapper
from torch import nn
import torch.nn.functional as F
import torch
import time

bl = nn.Embedding(1000, 100)
sample = torch.tensor([3, 10, 100, 888])

lora_A = ParamWrapper(weight=torch.empty([bl.num_embeddings, 8]))
lora_B = ParamWrapper(weight=torch.empty([8, bl.embedding_dim]))

print(lora_A.weight.shape)
print(lora_B.weight.shape)
delta_w = torch.einsum("nr, rd -> nd", lora_A.weight, lora_B.weight)
assert delta_w.shape == bl.weight.shape
base_output = bl(sample)

n=1000
peft_output = None

start = time.time()

start = time.time()
# Faster
for i in range (n):
    # Peft method
    peft_output = base_output + F.embedding(sample, lora_A.weight) @ lora_B.weight
print(time.time()-start)

for i in range(n):
    # Einsum
    delta_w = torch.einsum("nr, rd -> nd", lora_A.weight, lora_B.weight)
    output = base_output + F.embedding(sample, delta_w)
print(time.time()-start)


print(output.shape)
torch.allclose(output, peft_output)

torch.Size([1000, 8])
torch.Size([8, 100])
0.15405726432800293
1.1950695514678955
torch.Size([4, 100])


False

In [25]:
peft_delta = base_output + F.embedding(sample, lora_A.weight) @ lora_B.weight
torch.allclose(output, peft_delta)

tensor([[-3.3432e-01, -2.1172e+00,  4.3166e-01, -1.6295e+00, -1.1220e+00,
          1.7248e+00, -1.8272e-01,  2.0662e-01, -2.8791e-01, -6.0669e-01,
         -2.2729e-01, -1.7357e+00,  8.4946e-01, -5.9496e-02,  1.8398e-01,
          1.7075e-01, -3.8632e-02,  9.8571e-01, -4.9658e-01, -3.8940e-01,
         -7.1554e-01,  9.2620e-01,  6.1269e-01, -2.2790e-01, -6.1774e-02,
          5.4722e-02, -1.5909e-01,  1.0175e+00,  7.0021e-01, -1.7216e+00,
          8.1432e-01, -1.6610e+00, -3.9571e-02, -5.9692e-01,  6.8152e-01,
         -9.6059e-01,  7.6038e-01,  7.4547e-01,  3.4148e-02,  1.4748e+00,
         -3.8180e-01, -3.9374e-01, -1.9129e+00, -5.3340e-01, -1.2263e+00,
          3.9411e-01, -4.8989e-01,  7.5922e-01, -3.4971e-01, -3.0818e-01,
          8.7494e-01,  1.5366e+00, -3.0570e+11, -1.5567e+00, -3.2343e-01,
         -1.0933e+00, -1.1252e+00,  3.4023e-01,  2.3946e+00, -2.0378e-01,
          2.3632e+13, -2.6458e-01,  5.4479e-01, -1.6183e+00,  2.4298e+00,
         -1.5688e-01, -3.1651e-01, -1.

torch.Size([4, 100])

True

In [17]:
F.embedding(sample, lora_B.weight.T) # eA  = R in

IndexError: index out of range in self