In [1]:
!pip show peft

Name: peft
Version: 0.10.0
Summary: Parameter-Efficient Fine-Tuning (PEFT)
Home-page: https://github.com/huggingface/peft
Author: The HuggingFace team
Author-email: sourab@huggingface.co
License: Apache
Location: /Users/id4thomas/miniforge3/envs/torch2/lib/python3.10/site-packages
Requires: accelerate, huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch, tqdm, transformers
Required-by: 


# peft 0.10.0 multi-lora inference Test
* 여러 adapter가 모두 load 되어있는 상태에서 서로 다른 어댑터 추론 호출

In [11]:
import os
# os.environ['KMP_DUPLICATE_LIB_OK']='True'
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
plm_name = "yanolja/EEVE-Korean-Instruct-2.8B-v1.0"
plm_model = AutoModelForCausalLM.from_pretrained(
	plm_name,
	torch_dtype = torch.bfloat16
)
phi_target_modules = [
	"q_proj", "k_proj", "v_proj", "dense"
]

config.json: 100%|██████████| 927/927 [00:00<00:00, 1.90MB/s]
model.safetensors.index.json: 100%|██████████| 35.7k/35.7k [00:00<00:00, 9.54MB/s]
model-00001-of-00002.safetensors: 100%|██████████| 4.97G/4.97G [01:19<00:00, 62.2MB/s]
model-00002-of-00002.safetensors: 100%|██████████| 669M/669M [00:11<00:00, 60.1MB/s]
Downloading shards: 100%|██████████| 2/2 [01:32<00:00, 46.04s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.08s/it]


In [4]:
## Adapter1
adapter1_config = LoraConfig(
	init_lora_weights="gaussian",
	r = 8,
	lora_alpha=16,
	lora_dropout=0.05,
	target_modules = phi_target_modules
)

model_a1 = get_peft_model(
	plm_model,
	peft_config = adapter1_config,
	adapter_name = "a1"
)
model_a1.save_pretrained("adapters/a1")

In [5]:
adapter2_config = LoraConfig(
	init_lora_weights="gaussian",
	r = 4,
	lora_alpha=8,
	lora_dropout=0.05,
	target_modules = phi_target_modules
)

model_a2 = get_peft_model(
	plm_model,
	peft_config = adapter2_config,
	adapter_name = "a2"
)
model_a2.save_pretrained("adapters/a2")

In [30]:
print(os.listdir("adapters"))
print(os.listdir("adapters/a1"), os.listdir("adapters/a1/a1"))
print(os.listdir("adapters/a2"), os.listdir("adapters/a2/a2")) # accidently saved 2 adapters

['a2', 'a1']
['README.md', 'a1'] ['adapter_model.safetensors', 'adapter_config.json']
['a2', 'README.md', 'a1'] ['adapter_model.safetensors', 'adapter_config.json']


In [15]:
model_a2.eval()

PeftModel(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(58944, 2560)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=2560, bias=True)
                (lora_dropout): ModuleDict(
                  (a1): Dropout(p=0.05, inplace=False)
                  (a2): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (a1): Linear(in_features=2560, out_features=8, bias=False)
                  (a2): Linear(in_features=2560, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (a1): Linear(in_features=8, out_features=2560, bias=False)
                  (a2): Linear(in_features=4, out_features=2560, bias=Fals

In [12]:
tokenizer = AutoTokenizer.from_pretrained("yanolja/EEVE-Korean-Instruct-2.8B-v1.0")

tokenizer_config.json: 100%|██████████| 1.89k/1.89k [00:00<00:00, 3.11MB/s]
tokenizer.json: 100%|██████████| 2.57M/2.57M [00:01<00:00, 2.28MB/s]
special_tokens_map.json: 100%|██████████| 565/565 [00:00<00:00, 6.01MB/s]


In [None]:
model_a2

In [18]:
inputs = tokenizer(["안녕하세요", "안녕하세요2", "안녕하세요3"], return_tensors = "pt", padding = True)

In [19]:
output = model_a2(**inputs, adapter_names=["a1", "a2", "__base__"])

In [21]:
## len(adapter_names)==len(inputs) 여야함 각 샘플별로 어댑터 정의
generated = model_a2.generate(**inputs,  adapter_names=["a1", "a2", "__base__"], max_new_tokens = 20)

Setting `pad_token_id` to `eos_token_id`:58943 for open-end generation.


In [25]:
tokenizer.batch_decode(generated)

['</s><s> 안녕하세요! 저는 웹사이트를 만들고, 콘텐츠를 만들고, 디자이너로 일',
 '<s> 안녕하세요2! 저는 웹사이트를 만들고자 하는 예비 웹디자이너입니다. 저는',
 '<s> 안녕하세요3D프린팅 챌린지 팀입니다. 3D프린팅 챌린지']

In [20]:
output

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 7.5000,  7.1875,  3.0156,  ..., -1.4297, -1.3125,  3.8125],
         [ 2.8906,  3.2031,  2.9062,  ..., -0.4531,  0.3574,  5.0312],
         [ 8.6875,  5.6562,  6.8750,  ...,  2.8594,  4.2500,  5.1250],
         [17.8750, 11.1875,  1.3906,  ...,  2.5469,  3.0625,  6.9375],
         [19.1250,  8.2500,  5.1562,  ...,  0.5000,  2.7031,  7.0625]],

        [[ 2.8906,  3.2031,  2.9062,  ..., -0.4531,  0.3574,  5.0312],
         [ 8.6875,  5.6250,  6.8125,  ...,  2.8750,  4.2188,  5.2188],
         [17.6250, 11.0000,  1.4062,  ...,  2.4844,  3.0938,  6.8125],
         [19.0000,  8.3125,  5.1562,  ...,  0.4590,  2.5781,  7.0312],
         [14.8750,  6.2812,  5.3125,  ..., -1.0625,  1.2031,  3.3594]],

        [[ 2.8906,  3.2031,  2.9062,  ..., -0.4531,  0.3574,  5.0312],
         [ 8.6875,  5.6250,  6.8125,  ...,  2.8750,  4.2188,  5.2188],
         [17.6250, 11.0000,  1.4062,  ...,  2.4844,  3.0938,  6.8125],
         [19.0000,  8.3125,  5.1