In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from peft import (
    LoraConfig,
    PeftConfig,
    get_peft_model,
    TaskType
)

  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [2]:
!pip show peft

Name: peft
Version: 0.7.1
Summary: Parameter-Efficient Fine-Tuning (PEFT)
Home-page: https://github.com/huggingface/peft
Author: The HuggingFace team
Author-email: sourab@huggingface.co
License: Apache
Location: /Users/id4thomas/miniforge3/envs/torch2/lib/python3.10/site-packages
Requires: accelerate, huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch, tqdm, transformers
Required-by: 


In [3]:
## Load Model & Tokenizer
model_name = "EleutherAI/polyglot-ko-1.3b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.21it/s]


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(30080, 2048)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2048, out_features=8192, bias=True)
          (dense_4h_to_h): Linear(in_features=8192, out_features=2048, bias=True)
  

In [4]:
lora_config = LoraConfig(
	task_type = TaskType.CAUSAL_LM,
	r =  8,
	lora_alpha = 16,
	lora_dropout = 0.1,
	inference_mode=False,
	# init_lora_weights = "gaussian"
	init_lora_weights = False # random init
)

In [5]:
lora_model = get_peft_model(model, lora_config)

In [6]:
## Analyze Lora Model
print(type(lora_model))
# print(lora_model.base_model.model.gpt_neox.layers[0])
print(lora_model.base_model.gpt_neox.layers[0].attention)

<class 'peft.peft_model.PeftModelForCausalLM'>
GPTNeoXAttention(
  (rotary_emb): GPTNeoXRotaryEmbedding()
  (query_key_value): lora.Linear(
    (base_layer): Linear(in_features=2048, out_features=6144, bias=True)
    (lora_dropout): ModuleDict(
      (default): Dropout(p=0.1, inplace=False)
    )
    (lora_A): ModuleDict(
      (default): Linear(in_features=2048, out_features=8, bias=False)
    )
    (lora_B): ModuleDict(
      (default): Linear(in_features=8, out_features=6144, bias=False)
    )
    (lora_embedding_A): ParameterDict()
    (lora_embedding_B): ParameterDict()
  )
  (dense): Linear(in_features=2048, out_features=2048, bias=True)
  (attention_dropout): Dropout(p=0.0, inplace=False)
)


In [22]:
## Layers Before Merge
# following https://github.com/huggingface/peft/blob/bd544bb2ceae4a2b272e583e69b8f5fcdb022ff5/src/peft/tuners/lora/layer.py#L330
qkv_bef = torch.clone(lora_model.base_model.model.gpt_neox.layers[0].attention.query_key_value.weight)
print(qkv_bef.shape)

lora_scale_val = lora_model.base_model.model.gpt_neox.layers[0].attention.query_key_value.scaling["default"]
lora_a = torch.clone(lora_model.base_model.model.gpt_neox.layers[0].attention.query_key_value.lora_A.default.weight)
lora_b = torch.clone(lora_model.base_model.model.gpt_neox.layers[0].attention.query_key_value.lora_B.default.weight)
print(lora_scale_val)
print(lora_a.shape, lora_b.shape)

torch.Size([6144, 2048])
2.0
torch.Size([8, 2048]) torch.Size([6144, 8])


In [24]:
merged_lora = lora_b.matmul(lora_a)
# merged_lora = lora_b@lora_a
merged_lora = merged_lora * lora_scale_val
print(merged_lora.shape)


merged_linear = qkv_bef+merged_lora
print(merged_linear.shape)

torch.Size([6144, 2048])
torch.Size([6144, 2048])


In [28]:
## MERGE
# https://cdn-lfs.huggingface.co/datasets/huggingface/documentation-images/4313422c5f2755897fb8ddfc5b99251358f679647ec0f2d120a3f1ff060defe7?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27lora_diagram.png%3B+filename%3D%22lora_diagram.png%22%3B&response-content-type=image%2Fpng&Expires=1702993287&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMjk5MzI4N319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9kYXRhc2V0cy9odWdnaW5nZmFjZS9kb2N1bWVudGF0aW9uLWltYWdlcy80MzEzNDIyYzVmMjc1NTg5N2ZiOGRkZmM1Yjk5MjUxMzU4ZjY3OTY0N2VjMGYyZDEyMGEzZjFmZjA2MGRlZmU3P3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiZyZXNwb25zZS1jb250ZW50LXR5cGU9KiJ9XX0_&Signature=EMrBr7qZbWFT56xOWykG81wdmcCNckVdh0OnnRNF%7EMiQBqwkUmZcgMdX89hvxHHMN8I6dwhpyqDmi-Ar2MrFW8HllSX3PxY1cugNSOsCLvlnEOpoFcMl1aGWWsVzApO%7E2tajrI5eQXZ56u8lbWMYSNkADJUglKFD882DupJg2M8x4yOsUAyE1kGHvuMwcvTvaFCeccCBce0bpD3Uta30PvnT9NizZ49bKor2m3e1taHtZx4jjfxlPoHitzO15m4UCWPLAcEtBT5t50zlv%7EmYAFMWEXdYPUWHVB12OnYYP1a2aMJFOSVfoZ0l%7EA4oLFkBrcvRa7ivYsIkytMQTTtanw__&Key-Pair-Id=KVTP0A1DKRTAX
"""
X: (batch_size, 2048)
With Adapter: 
query_key_value(X) + lora_B(X*lora_A)
(b, 2048) * (2048, 6144) + (b, 2048)*(2048*8)*(8*6144)

Merged:
merged_layer = query_key_value + lora_B(lora_A)
merged_layer(X)
"""
merged_model = lora_model.merge_and_unload()

In [29]:
print(type(merged_model))
merged_model.gpt_neox.layers[0]

<class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM'>


GPTNeoXLayer(
  (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
  (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
  (post_attention_dropout): Dropout(p=0.0, inplace=False)
  (post_mlp_dropout): Dropout(p=0.0, inplace=False)
  (attention): GPTNeoXAttention(
    (rotary_emb): GPTNeoXRotaryEmbedding()
    (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
    (dense): Linear(in_features=2048, out_features=2048, bias=True)
    (attention_dropout): Dropout(p=0.0, inplace=False)
  )
  (mlp): GPTNeoXMLP(
    (dense_h_to_4h): Linear(in_features=2048, out_features=8192, bias=True)
    (dense_4h_to_h): Linear(in_features=8192, out_features=2048, bias=True)
    (act): GELUActivation()
  )
)

In [30]:
## Test Difference with real merged layer
real_merged_linear = torch.clone(lora_model.base_model.model.gpt_neox.layers[0].attention.query_key_value.weight)
print(real_merged_linear.shape)

torch.Size([6144, 2048])


In [31]:
print("DIFF WITH MY MERGED", torch.sum(real_merged_linear-merged_linear))
print("DIFF WITH OG", torch.sum(real_merged_linear-qkv_bef))

DIFF WITH MY MERGED tensor(0., grad_fn=<SumBackward0>)
DIFF WITH OG tensor(11.0746)
