# peft 0.10.0 layer replication test
* 레이어 복사하여 깊이 늘리는 방식 테스트
* 복사된 레이어 웨이트 메모리 공유하는지 확인

## 내부 구현 방식
* 레이어를 복사 후 (같은 레이어는 메모리 1개 만큼만 차지) lora adapter 달아줘서 어댑터만 학습
	* replicated layers do not take additional memory as they share the underlying weights


In [1]:
import os
# os.environ['KMP_DUPLICATE_LIB_OK']='True'
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [2]:
## decoder checkpoint: yanolja/EEVE-Korean-Instruct-2.8B-v1.0
## (its config reports "num_hidden_layers": 32)
plm_name = "yanolja/EEVE-Korean-Instruct-2.8B-v1.0"

## load the base causal LM in bfloat16
plm_model = AutoModelForCausalLM.from_pretrained(plm_name, torch_dtype=torch.bfloat16)

## names of the linear sub-modules that are passed to LoRA as target_modules
phi_target_modules = ["q_proj", "k_proj", "v_proj", "dense"]

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.16s/it]


In [3]:
## LoRA settings for the decoder experiment.
## layer_replication lists [start, end) ranges of original layer indices, stacked bottom-up:
##   [0, 3] -> layers 0, 1, 2   and   [2, 7] -> layers 2, 3, 4, 5, 6
##   => 0, 1, 2, 2, 3, 4, 5, 6, i.e. an 8-layer model
adapter1_config = LoraConfig(
	r=8,
	lora_alpha=16,
	lora_dropout=0.05,
	init_lora_weights="gaussian",
	target_modules=phi_target_modules,
	layer_replication=[[0, 3], [2, 7]],
)

In [4]:
## wrap the base model with the LoRA + layer-replication config, registered as adapter "copied1"
model_a1 = get_peft_model(plm_model, peft_config=adapter1_config, adapter_name="copied1")

## write the adapter out under the "adapters" directory
model_a1.save_pretrained("adapters")

In [7]:
## display the wrapped model's module tree as the cell output
model_a1

PeftModel(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(58944, 2560)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-7): 8 x PhiDecoderLayer(
            (self_attn): PhiSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=2560, bias=True)
                (lora_dropout): ModuleDict(
                  (copied1): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (copied1): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (copied1): Linear(in_features=8, out_features=2560, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (b

In [10]:
## layer stack of the wrapped decoder, after replication
layers = model_a1.base_model.model.model.layers
## expected to print 8 for layer_replication=[[0,3], [2,7]]
print(len(layers))

8


In [25]:
## Memory-sharing check.
## Replicated positions 2 and 3 both come from layer 2 of the original model, so
## they should share memory; position 4 comes from original layer 3 and should not.
l1, l2, l3 = layers[2], layers[3], layers[4]

## sample weight: the q_proj base weight of l1
l1w = l1.self_attn.q_proj.base_layer.weight
print("LAYER 1 WEIGHT:",l1w)

print("-" * 30)
print("Checking if same tensor")
def is_layer_same(x, y):
	"""Report whether two decoder layers share one q_proj base weight.

	Prints the data pointers of ``self_attn.q_proj.base_layer.weight`` for
	both layers, then returns True when the two pointers are equal.
	"""
	ptr_x, ptr_y = (
		layer.self_attn.q_proj.base_layer.weight.data_ptr() for layer in (x, y)
	)
	print(ptr_x, ptr_y)
	return ptr_x == ptr_y
## compare l1 first with the layer expected to share memory, then with the one expected not to
for tag, other in (("l1 vs l2", l2), ("l1 vs l3", l3)):
	print(tag, is_layer_same(l1, other))

LAYER 1 WEIGHT: Parameter containing:
tensor([[ 0.0106,  0.0288, -0.0092,  ...,  0.0332, -0.0047, -0.0254],
        [ 0.0013, -0.0060,  0.0170,  ..., -0.0425,  0.0139, -0.0199],
        [-0.0028, -0.0187, -0.0078,  ...,  0.0025, -0.0183,  0.0093],
        ...,
        [-0.0156, -0.0244,  0.0049,  ...,  0.0010,  0.0258,  0.0039],
        [ 0.0100, -0.0281, -0.0479,  ...,  0.0312,  0.0125, -0.0063],
        [-0.0889,  0.0188, -0.0138,  ...,  0.0540, -0.0259, -0.0016]],
       dtype=torch.bfloat16)
------------------------------
Checking if same tensor
13802160128 13802160128
l1 vs l2 True
13802160128 13959446528
l1 vs l3 False


## Testing Encoder Models

In [29]:
## testing with encoder models
from transformers import AutoModelForSequenceClassification

## encoder candidates (both noted as "num_hidden_layers": 12):
##   klue/roberta-base -- layer replication fails with:
##     ValueError: Could not locate the layers attribute in the model. Expected Llama, Bert or Falcon compatible architectures.
##   klue/bert-base -- the checkpoint used for this run.
## NOTE(review): the earlier note repeated the roberta ValueError under klue/bert-base
## (labelled "roberta gives err"), but the outputs recorded for this run show
## BertForSequenceClassification being wrapped and replicated to 8 layers, so that
## entry looks like a copy-paste leftover -- confirm.
# plm_name = "klue/roberta-base"
plm_name = "klue/bert-base"

## load the sequence-classification model in bfloat16
plm_model = AutoModelForSequenceClassification.from_pretrained(
	plm_name,
	torch_dtype=torch.bfloat16,
)

## attention projection names passed to LoRA as target_modules; the variable name says
## "roberta", but it is used here with klue/bert-base as well
roberta_target_modules = ["query", "key", "value"]

config.json: 100%|██████████| 425/425 [00:00<00:00, 1.43MB/s]
model.safetensors: 100%|██████████| 445M/445M [01:11<00:00, 6.27MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
## LoRA settings for the encoder experiment.
## layer_replication lists [start, end) ranges of original layer indices, stacked bottom-up:
##   [0, 3] -> layers 0, 1, 2   and   [2, 7] -> layers 2, 3, 4, 5, 6
##   => 0, 1, 2, 2, 3, 4, 5, 6, i.e. an 8-layer model
adapter2_config = LoraConfig(
	r=8,
	lora_alpha=16,
	lora_dropout=0.05,
	init_lora_weights="gaussian",
	target_modules=roberta_target_modules,
	layer_replication=[[0, 3], [2, 7]],
)

In [31]:
## wrap the encoder model with the LoRA + layer-replication config, registered as adapter "copied2"
model_a2 = get_peft_model(plm_model, peft_config=adapter2_config, adapter_name="copied2")

## write the adapter out under the "adapters" directory
model_a2.save_pretrained("adapters")

In [32]:
## display the wrapped model's module tree as the cell output
model_a2

PeftModel(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(32000, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-7): 8 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (copied2): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (copied2): Linear(in_features=768, out

In [33]:
## layer stack of the wrapped BERT encoder, after replication
layers = model_a2.base_model.model.bert.encoder.layer
## expected to print 8 for layer_replication=[[0,3], [2,7]]
print(len(layers))

8


In [34]:
## Memory-sharing check.
## Replicated positions 2 and 3 both come from layer 2 of the original model, so
## they should share memory; position 4 comes from original layer 3 and should not.
l1, l2, l3 = layers[2], layers[3], layers[4]

## sample weight: the attention query base weight of l1
l1w = l1.attention.self.query.base_layer.weight
print("LAYER 1 WEIGHT:",l1w)

print("-" * 30)
print("Checking if same tensor")
def is_layer_same(x, y):
	"""Report whether two encoder layers share one attention query base weight.

	Prints the data pointers of ``attention.self.query.base_layer.weight`` for
	both layers, then returns True when the two pointers are equal.
	"""
	ptr_x, ptr_y = (
		layer.attention.self.query.base_layer.weight.data_ptr() for layer in (x, y)
	)
	print(ptr_x, ptr_y)
	return ptr_x == ptr_y
## compare l1 first with the layer expected to share memory, then with the one expected not to
for tag, other in (("l1 vs l2", l2), ("l1 vs l3", l3)):
	print(tag, is_layer_same(l1, other))

LAYER 1 WEIGHT: Parameter containing:
tensor([[ 0.0273, -0.0165,  0.0703,  ...,  0.0520, -0.0200, -0.0452],
        [ 0.0033,  0.0144,  0.0615,  ...,  0.0232,  0.0457,  0.0359],
        [ 0.0309, -0.0039,  0.0342,  ..., -0.0422, -0.0267, -0.0095],
        ...,
        [ 0.0566, -0.0047, -0.0654,  ..., -0.0283, -0.0206, -0.0286],
        [-0.0308,  0.0320,  0.0364,  ...,  0.0444, -0.0515, -0.0114],
        [-0.0247,  0.0247,  0.0090,  ...,  0.0070,  0.0347,  0.0214]],
       dtype=torch.bfloat16)
------------------------------
Checking if same tensor
4875386880 4875386880
l1 vs l2 True
4875386880 4889542656
l1 vs l3 False
