In [1]:
# 导入必要的库和函数
from transformers import AutoModelForSeq2SeqLM, AutoModelForSequenceClassification  # 用于加载和处理序列到序列的语言模型
# 导入必要的库
from adapters import AdapterTrainer
from transformers import TrainingArguments, EvalPrediction
import numpy as np
from transformers import RobertaConfig
from transformers import RobertaTokenizer
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from adapters import AutoAdapterModel, AdapterArguments, AdapterTrainer, AdapterConfig, ConfigUnion, LoRAConfig, SeqBnConfig, PrefixTuningConfig
from datasets import load_dataset


In [2]:
# Experiment configuration.
model_name_or_path = "roberta-base"
task_name = "rotten_tomatoes"
# Derived from the two settings above so the three values cannot drift apart;
# with the current settings this is "out/roberta-base-rotten_tomatoes/".
output_dir = f"out/{model_name_or_path}-{task_name}/"

# Load the dataset identified by task_name.
dataset = load_dataset(task_name)

Using the latest cached version of the dataset since rotten_tomatoes couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\13061\.cache\huggingface\datasets\rotten_tomatoes\default\1.0.0\c699dc617d02c6738bbcb70f35c1875d20011526 (last modified on Thu Mar  7 11:17:13 2024).


In [24]:
# Attach a combined LoRA + sequential-bottleneck adapter to the model.
# NOTE: print_trainable_parameters is defined in a later cell of this notebook;
# that cell must be executed before this one.

# Load the pretrained checkpoint as an adapter-capable model, together with the
# tokenizer of the same checkpoint.
model = AutoAdapterModel.from_pretrained(model_name_or_path)
tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path)

# Parameter counts before any adapter is attached.
print_trainable_parameters(model)

# LoRA configuration.
lora_config = LoRAConfig(
    r=8,  # LoRA rank
    alpha=32,  # LoRA alpha (scaling hyperparameter)
    dropout=0.1,  # dropout used inside the LoRA layers
    leave_out=[6, 7, 8, 9, 10, 11],  # layer indices that get NO LoRA module
)

# Sequential bottleneck adapter configuration.
bn_config = SeqBnConfig(
    reduction_factor=5,  # ratio between hidden size and bottleneck size
    leave_out=[6, 7, 8, 9, 10, 11],  # layer indices that get NO bottleneck module
)

# Combine both methods into a single adapter configuration.
config_list = [lora_config, bn_config]
peft_config = ConfigUnion(*config_list)

# Register the adapter, select it for training and make it the active adapter.
adapter_name = "rotten_tomatoes"
model.add_adapter(adapter_name, peft_config)
model.train_adapter(adapter_name)
model.set_active_adapters(adapter_name)

# Parameter counts after the adapter has been added and selected for training.
print_trainable_parameters(model)

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 125288025 || all params: 125288025 || trainable%: 100.00
trainable params: 2155158 || all params: 126851055 || trainable%: 1.70


In [21]:
# Attach a LoRA-only adapter to a freshly loaded model.
# NOTE: print_trainable_parameters is defined in a later cell of this notebook;
# that cell must be executed before this one.

# Load the pretrained checkpoint as an adapter-capable model, together with the
# tokenizer of the same checkpoint.
model = AutoAdapterModel.from_pretrained(model_name_or_path)
tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path)

# Parameter counts before any adapter is attached.
print_trainable_parameters(model)

# LoRA configuration.
lora_config = LoRAConfig(
    r=8,  # LoRA rank
    alpha=32,  # LoRA alpha (scaling hyperparameter)
    dropout=0.1,  # dropout used inside the LoRA layers
    leave_out=[6, 7, 8, 9, 10, 11],  # layer indices that get NO LoRA module
)

# Register the adapter, select it for training and make it the active adapter.
adapter_name = "rotten_tomatoes"
model.add_adapter(adapter_name, lora_config)
model.train_adapter(adapter_name)
model.set_active_adapters(adapter_name)

# Parameter counts after the adapter has been added and selected for training.
print_trainable_parameters(model)

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 125288025 || all params: 125288025 || trainable%: 100.00
trainable params: 739584 || all params: 125435481 || trainable%: 0.59


In [25]:
# List the qualified name of every submodule yielded by model.named_modules(),
# one name per line.
for module_name, _submodule in model.named_modules():
    print(module_name)


roberta
roberta.embeddings
roberta.embeddings.word_embeddings
roberta.embeddings.position_embeddings
roberta.embeddings.token_type_embeddings
roberta.embeddings.LayerNorm
roberta.embeddings.dropout
roberta.encoder
roberta.encoder.layer
roberta.encoder.layer.0
roberta.encoder.layer.0.attention
roberta.encoder.layer.0.attention.self
roberta.encoder.layer.0.attention.self.query
roberta.encoder.layer.0.attention.self.query.loras
roberta.encoder.layer.0.attention.self.query.loras.rotten_tomatoes
roberta.encoder.layer.0.attention.self.query.loras.rotten_tomatoes.lora_dropout
roberta.encoder.layer.0.attention.self.key
roberta.encoder.layer.0.attention.self.key.loras
roberta.encoder.layer.0.attention.self.value
roberta.encoder.layer.0.attention.self.value.loras
roberta.encoder.layer.0.attention.self.value.loras.rotten_tomatoes
roberta.encoder.layer.0.attention.self.value.loras.rotten_tomatoes.lora_dropout
roberta.encoder.layer.0.attention.self.dropout
roberta.encoder.layer.0.attention.self.pr

In [6]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter must provide
            ``numel()`` and a ``requires_grad`` attribute.

    Prints a single line of the form
    ``trainable params: T || all params: A || trainable%: P`` with P rounded
    to two decimals. A model without any parameters reports ``0.00`` instead
    of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the division: an empty model has nothing trainable.
    percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent:.2f}"
    )

In [52]:
from torchinfo import summary

# Build the torchinfo summary of model.base_model with nesting depth 10 and
# print its text form.
model_summary = summary(model.base_model, depth=10)
print(model_summary)

Layer (type:depth-idx)                                                           Param #
RobertaModel                                                                     --
├─RobertaEmbeddings: 1-1                                                         --
│    └─Embedding: 2-1                                                            38,603,520
│    └─Embedding: 2-2                                                            394,752
│    └─Embedding: 2-3                                                            768
│    └─LayerNorm: 2-4                                                            1,536
│    └─Dropout: 2-5                                                              --
├─RobertaEncoder: 1-2                                                            --
│    └─ModuleList: 2-6                                                           --
│    │    └─RobertaLayer: 3-1                                                    --
│    │    │    └─RobertaAttention: 4-1                

In [35]:
# Import kept for other cells that may rely on the name; it is no longer used here.
from torch.nn.parameter import Parameter

# lora_A parameter of the "rotten_tomatoes" LoRA module on the query projection
# of encoder layer 0.
lora_A_param = model.roberta.encoder.layer[0].attention.self.query.loras.rotten_tomatoes.lora_A

# Detached tensor sharing storage with the parameter (the same kind of tensor
# model.state_dict() hands out), shown before it is modified.
weights = lora_A_param.detach()
print(weights)

# Zero the values in place; because the storage is shared, the model's own
# parameter is zeroed as well.
weights.zero_()

# Freeze the existing parameter rather than swapping in a new Parameter object,
# so anything already holding a reference to it keeps pointing at the live one.
lora_A_param.requires_grad_(False)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [29]:
# Count the entries of `weights` whose absolute value is below 0.001.
num_small_values = (weights.abs() < 0.001).sum()
print(num_small_values)

tensor(174)
