In [1]:
import torch
from functions import get_loader, get_model

device = 'cuda' if torch.cuda.is_available() else 'cpu'
_, _, loader = get_loader()
model, _, _ = get_model()

#保存原模型参数
model.save_pretrained('model/save_pretrained')

model.classifier

Linear(in_features=768, out_features=10, bias=True)

In [2]:
from peft import LoraConfig, TaskType, get_peft_model, LoftQConfig

#lora就是插入一些胶水层,来影响原模型的计算结果
#比如原来有一层的计算过程如下:
#[b, 1024] -> [b, 1024]
#加入lora后变成如下:
#[b, 1024] -> [b, 8] -> [b, 1024] -> [b, 1024]
#中间的两层就是lora层, 这些层可以在前面,中间或者后面.
#这中间的8, 就是下面的参数r
#训练时只训练这些lora层,从而缩小计算量,经过实验论证,也是能得到比较好的结果的

config = LoraConfig(
    #任务类型, SEQ_CLS,SEQ_2_SEQ_LM,CAUSAL_LM,TOKEN_CLS,QUESTION_ANS,FEATURE_EXTRACTION
    task_type=TaskType.SEQ_CLS,
    #是否是推理模式.
    inference_mode=False,
    #降秩矩阵的尺寸,这个参数会影响训练的参数量
    r=8,
    #降秩矩阵的缩放系数,不影响参数量
    lora_alpha=32,
    #降秩矩阵的dropout
    lora_dropout=0.1,
    #指定要对原模型中的那一部分添加lora层,默认是所有位置
    target_modules=['classifier'],
)

model = get_peft_model(model, config)

model.print_trainable_parameters()

model.classifier

trainable params: 13,914 || all params: 251,267,508 || trainable%: 0.005537524573213023


ModulesToSaveWrapper(
  (original_module): lora.Linear(
    (base_layer): Linear(in_features=768, out_features=10, bias=True)
    (lora_dropout): ModuleDict(
      (default): Dropout(p=0.1, inplace=False)
    )
    (lora_A): ModuleDict(
      (default): Linear(in_features=768, out_features=8, bias=False)
    )
    (lora_B): ModuleDict(
      (default): Linear(in_features=8, out_features=10, bias=False)
    )
    (lora_embedding_A): ParameterDict()
    (lora_embedding_B): ParameterDict()
  )
  (modules_to_save): ModuleDict(
    (default): lora.Linear(
      (base_layer): Linear(in_features=768, out_features=10, bias=True)
      (lora_dropout): ModuleDict(
        (default): Dropout(p=0.1, inplace=False)
      )
      (lora_A): ModuleDict(
        (default): Linear(in_features=768, out_features=8, bias=False)
      )
      (lora_B): ModuleDict(
        (default): Linear(in_features=8, out_features=10, bias=False)
      )
      (lora_embedding_A): ParameterDict()
      (lora_embedding_B):

In [3]:
import datetime

#正常训练
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
model.to(device)

now = datetime.datetime.now()
for i, data in enumerate(loader):
    for k, v in data.items():
        data[k] = v.to(device)
    out = model(**data)
    out.loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    optimizer.zero_grad()

    if i % 1 == 0:
        labels = data['labels']
        logits = out['logits'].argmax(1)
        acc = (labels == logits).sum().item() / len(labels)

        print(i, len(loader), out.loss.item(), acc)

datetime.datetime.now() - now

0 62 2.336458683013916 0.03125
1 62 2.335559368133545 0.03125
2 62 2.3020806312561035 0.125
3 62 2.3629519939422607 0.15625
4 62 2.3952343463897705 0.09375
5 62 2.3703486919403076 0.03125
6 62 2.3296892642974854 0.09375
7 62 2.2450196743011475 0.15625
8 62 2.204324722290039 0.25
9 62 2.306922197341919 0.125
10 62 2.308375597000122 0.0625
11 62 2.3042044639587402 0.125
12 62 2.22579026222229 0.25
13 62 2.248403549194336 0.15625
14 62 2.2131221294403076 0.25
15 62 2.1622021198272705 0.25
16 62 2.1667230129241943 0.1875
17 62 2.12541127204895 0.28125
18 62 2.1942343711853027 0.21875
19 62 2.086634874343872 0.34375
20 62 2.1510114669799805 0.25
21 62 2.159356117248535 0.375
22 62 2.141340732574463 0.34375
23 62 2.131850481033325 0.34375
24 62 2.1174545288085938 0.21875
25 62 2.087817668914795 0.53125
26 62 2.1108903884887695 0.375
27 62 2.0811893939971924 0.46875
28 62 2.055372953414917 0.40625
29 62 2.0447239875793457 0.46875
30 62 2.0550787448883057 0.34375
31 62 2.0166432857513428 0.531

datetime.timedelta(seconds=12, microseconds=194751)

In [4]:
#peft保存,保存的文件会很小,因为只保存了lora层
model.save_pretrained('model/peft.save_pretrained')

model.base_model.classifier.modules_to_save.default.weight



Parameter containing:
tensor([[ 0.0019, -0.0047,  0.0170,  ...,  0.0103, -0.0181, -0.0162],
        [ 0.0281,  0.0129,  0.0396,  ..., -0.0123,  0.0515, -0.0117],
        [-0.0530, -0.0161, -0.0173,  ..., -0.0548,  0.0034, -0.0369],
        ...,
        [-0.0228, -0.0049,  0.0235,  ..., -0.0174,  0.0303,  0.0107],
        [-0.0392,  0.0481,  0.0245,  ...,  0.0204, -0.0020,  0.0287],
        [ 0.0116, -0.0089, -0.0318,  ...,  0.0126, -0.0058, -0.0059]],
       device='cuda:0', requires_grad=True)

In [5]:
from transformers import BertForSequenceClassification
from peft import PeftConfig, PeftModel

#重启初始化原模型
model = BertForSequenceClassification.from_pretrained('model/save_pretrained')

#加载保存的config
PeftConfig.from_pretrained('model/peft.save_pretrained')

#插入保存的lora层
model = PeftModel.from_pretrained(model,
                                  './model/peft.save_pretrained',
                                  is_trainable=True)

model.base_model.classifier.modules_to_save.default.weight

Parameter containing:
tensor([[ 0.0019, -0.0047,  0.0170,  ...,  0.0103, -0.0181, -0.0162],
        [ 0.0281,  0.0129,  0.0396,  ..., -0.0123,  0.0515, -0.0117],
        [-0.0530, -0.0161, -0.0173,  ..., -0.0548,  0.0034, -0.0369],
        ...,
        [-0.0228, -0.0049,  0.0235,  ..., -0.0174,  0.0303,  0.0107],
        [-0.0392,  0.0481,  0.0245,  ...,  0.0204, -0.0020,  0.0287],
        [ 0.0116, -0.0089, -0.0318,  ...,  0.0126, -0.0058, -0.0059]],
       requires_grad=True)

In [6]:
#测试模型性能
def test(model):
    model.to(device)
    data = next(iter(loader))
    for k, v in data.items():
        data[k] = v.to(device)
    with torch.no_grad():
        outs = model(**data)
    acc = (outs.logits.argmax(1) == data.labels).sum().item() / len(
        data.labels)
    return acc


test(model)

1.0

In [7]:
#合并lora层到原始模型中,效果不会改变
model_merge = model.merge_and_unload()

type(model_merge), test(model_merge)

(transformers.models.bert.modeling_bert.BertForSequenceClassification, 1.0)