In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from torch.nn import functional as F
from torch import nn

In [2]:
# Disabled alternative: run the larger dolly-v2-12b checkpoint end to end through the high-level `pipeline` API.
# generate_text = pipeline(model="databricks/dolly-v2-12b", torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
# res = generate_text("Explain to me the difference between nuclear fission and fusion.")
# print(res[0]["generated_text"])

In [3]:
# Single source of truth for the checkpoint, so the tokenizer and the weights cannot drift apart.
MODEL_NAME = "databricks/dolly-v2-3b"

# Tokenizer configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")
# Weights are loaded in bfloat16; device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", torch_dtype=torch.bfloat16)


In [4]:
# Display the module tree; later cells pick layers out of it by attribute path.
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 2560)
    (layers): ModuleList(
      (0-31): 32 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attention): GPTNeoXAttention(
          (rotary_emb): RotaryEmbedding()
          (query_key_value): Linear(in_features=2560, out_features=7680, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2560, out_features=10240, bias=True)
          (dense_4h_to_h): Linear(in_features=10240, out_features=2560, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
  )
  (embed_out): Linear(in_features=2560, out_features=50280, bias=False)
)

In [5]:
from lora_adapters import LoraMergedLinear, apply_adapter

In [6]:
# Random activation of shape (1, hidden_size), used to probe a single projection layer.
# It is created directly with the model's device and dtype, which avoids a CPU allocation
# followed by two copies and removes the hard-coded width and device.
input_tensor = torch.randn(1, model.config.hidden_size, device=model.device, dtype=model.dtype)

In [7]:
# Fused query/key/value projection of the first transformer block, looked up by dotted path.
layer = model.get_submodule("gpt_neox.layers.0.attention.query_key_value")

In [8]:
# Reference output of the untouched projection. It is only compared against later, never
# back-propagated through, so autograd tracking is switched off.
with torch.no_grad():
    output = layer(input_tensor)

In [9]:
# Wrap the fused projection in a rank-16 LoRA adapter.
# enable_lora has three entries; presumably one flag each for the query, key and value
# slices of the fused output (here: adapt Q and V, leave K alone) — confirm in lora_adapters.
lora_layer = LoraMergedLinear(
    layer,
    rank=16,
    enable_lora=[True, False, True],
    fan_in_fan_out=False,
)

In [10]:
# Show the wrapper's repr to confirm the shape and adapter settings it picked up.
lora_layer

LoraMergedLinear(in_features=2560, out_features=7680, bias=True, rank=16, enable_lora=[True, False, True])

In [11]:
# Same round trip on a projection without a bias term: build a fresh Linear, wrap it,
# and immediately convert the wrapper back with to_regular().
base_layer = nn.Linear(2560, 2560 * 3, bias=False).to(device="cuda", dtype=torch.bfloat16)
LoraMergedLinear(
    base_layer,
    rank=16,
    enable_lora=[True, False, True],
).to_regular()


Linear(in_features=2560, out_features=7680, bias=False)

In [11]:

# Output of the LoRA-wrapped projection for the same input. The value is only used for a
# comparison, so autograd tracking is switched off.
with torch.no_grad():
    lora_output = lora_layer(input_tensor)

In [12]:
# torch.equal is an exact comparison (same shape and identical elements, no tolerance),
# so this checks that wrapping the projection left its output unchanged.
outputs_match = torch.equal(output, lora_output)
# Fail loudly under Run All instead of relying on someone reading the displayed value.
assert outputs_match, "LoRA-wrapped layer no longer reproduces the base layer's output"
outputs_match

True

In [13]:
# to_regular() hands back a plain Linear (see the recorded repr); presumably with the LoRA
# delta folded into the weight — confirm in lora_adapters.
lora_layer.to_regular()

Linear(in_features=2560, out_features=7680, bias=True)

In [14]:
# IPython `??` introspection: shows the signature, docstring and source of forward.
# NOTE(review): development aid only — consider dropping this cell before sharing the notebook.
??lora_layer.forward

Signature: lora_layer.forward(input: torch.Tensor)
Docstring:
Defines the computation performed at every call.

Should be overridden by all subclasses.

.. note::
    Although the recipe for forward pass needs to be defined within
    this function, one should call the :class:`Module` instance afterwards
    instead of this since the former takes care of running the
    registered hooks while the latter silently ignores them.
Source:   
    def forward(self, input: torch.Tensor):
        if self.merged:
            return F.linear(input, self.

In [15]:
# Patch the model with LoRA wrappers; the recorded repr shows layer 0's query_key_value
# replaced by a LoraMergedLinear.
# NOTE(review): ".*0.*query_key_value" only requires a "0" somewhere before the suffix, so
# layer indices such as 10, 20 and 30 presumably match as well as layer 0 (the repr groups
# layers 1-9 on their own, which is consistent with that) — confirm this is intended.
# NOTE(review): `model` is rebound to the result, so re-running this cell feeds an already
# adapted model back in; whether apply_adapter tolerates that is not visible here.
model = apply_adapter(
    model,
    LoraMergedLinear,
    rank=16,
    regex_pattern=".*0.*query_key_value",
    enable_lora=[True, False, True],
)
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 2560)
    (layers): ModuleList(
      (0): GPTNeoXLayer(
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attention): GPTNeoXAttention(
          (rotary_emb): RotaryEmbedding()
          (query_key_value): LoraMergedLinear(in_features=2560, out_features=7680, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2560, out_features=10240, bias=True)
          (dense_4h_to_h): Linear(in_features=10240, out_features=2560, bias=True)
          (act): GELUActivation()
        )
      )
      (1-9): 9 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elemen

In [16]:
# Random token ids (batch of 2, 77 tokens each) for a smoke test of the full model.
# The id range is read from the model config instead of a hard-coded 50280, and the tensor
# is created directly on the model's device rather than on the CPU and then copied.
input_tensor = torch.randint(model.config.vocab_size, (2, 77), device=model.device)

In [17]:
# Only the output shape is inspected, so run the forward pass without autograd tracking.
with torch.no_grad():
    logits = model(input_tensor).logits
logits.shape

torch.Size([2, 77, 50280])