In [7]:
from src.run_model import PEFTModel
from src.peft_search_space import PEFTSearchSpace
from src.dataset_wrapper import PEFTDataset
import torch
from pruning_methods import prune_model
from utils.gpu_memory_plot import get_free_gpu_memory
from src.controllers.baseline_wrapper import baseline_wrapper_single, prune_wrapper_single, baseline_wrapper_double, prune_wrapper_double

import yaml
import argparse
import logging
import time
from copy import deepcopy
import os
import json
import random
import numpy as np

logger = logging.getLogger('controller')
path_method='../method_configs/adapter_lora.yaml'
path_task='../task_configs/glue.yaml'
with open(path_method, 'r') as file:
    method_configs = yaml.safe_load(file)
with open(path_task, 'r') as file:
    task_configs = yaml.safe_load(file)

logger.info(
    f'Start exp for {path_task}:{task_configs}\n{path_method}:{method_configs}')

print(method_configs)
print(task_configs)

if 'LORA' in method_configs:
    peft_type = 'LORA'
else:
    peft_type = 'ADAPTER'
print(peft_type)

ds_meta = task_configs['DATASETS'][0]
dataset_name = ds_meta['DATASET_NAME']
task_name = ds_meta['TASK_NAME']
configs = deepcopy(method_configs)
configs['LOSS'] = ds_meta['LOSS']

dataset = PEFTDataset(
    dataset_name, task_name, train_size=2000, test_size=400).get_dataset()

model = PEFTModel(configs, dataset).half()

{'LORA': [32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32], 'LORA_LR': '1e-6', 'ADAPTER': [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128], 'ADAPTER_LR': '1e-7', 'EPOCHS': 3, 'PRUNE_EPOCHS': 1, 'PRUNE_TURN': 6, 'BACK_BONE': 'roberta-large'}
{'DATASETS': [{'TASK_NAME': 'qnli', 'LOSS': 'cross_entropy', 'DATASET_NAME': 'glue'}, {'TASK_NAME': 'rte', 'LOSS': 'cross_entropy', 'DATASET_NAME': 'glue'}, {'TASK_NAME': 'wnli', 'LOSS': 'cross_entropy', 'DATASET_NAME': 'glue'}, {'TASK_NAME': 'cola', 'LOSS': 'cross_entropy', 'DATASET_NAME': 'glue'}, {'TASK_NAME': 'sst2', 'LOSS': 'cross_entropy', 'DATASET_NAME': 'glue'}, {'TASK_NAME': 'mrpc', 'LOSS': 'cross_entropy', 'DATASET_NAME': 'glue'}, {'TASK_NAME': 'qqp', 'LOSS': 'cross_entropy', 'DATASET_NAME': 'glue'}], 'PRUNE_METHODS': ['zeros', 'values_below_threshold', 'snip', 'minimum_weight', 'activation', 'gradient'], 'TRAIN_SIZE': 200

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


number of label classes: 2


  torch.nn.init.normal(
  torch.nn.init.normal(
  torch.nn.init.normal(


In [6]:
print(dataset['train']['label'])

[1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 

In [13]:
model.model.train()
optimizer_grouped_parameters = [{
    "params": [
        name for name, param in model.model.named_parameters() if "lora" in name
    ],
    "lr": 1e-5,
}, {
    "params": [
        name for name, param in model.model.named_parameters()
        if "adapter" in name
    ],
    "lr": 1e-6,
}, {
    "params": [
        name for name, param in model.model.named_parameters()
        if 'heads' in name
    ],
    "lr": 1e-5
}]
print(optimizer_grouped_parameters)
print(sum([len(i['params']) for i in optimizer_grouped_parameters]))
print(len(list(model.model.parameters())))
# optimizer = torch.optim.AdamW(optimizer_grouped_parameters)

[{'params': ['roberta.encoder.layer.0.attention.self.query.loras.my_module.lora_A', 'roberta.encoder.layer.0.attention.self.query.loras.my_module.lora_B', 'roberta.encoder.layer.0.attention.self.value.loras.my_module.lora_A', 'roberta.encoder.layer.0.attention.self.value.loras.my_module.lora_B', 'roberta.encoder.layer.1.attention.self.query.loras.my_module.lora_A', 'roberta.encoder.layer.1.attention.self.query.loras.my_module.lora_B', 'roberta.encoder.layer.1.attention.self.value.loras.my_module.lora_A', 'roberta.encoder.layer.1.attention.self.value.loras.my_module.lora_B', 'roberta.encoder.layer.2.attention.self.query.loras.my_module.lora_A', 'roberta.encoder.layer.2.attention.self.query.loras.my_module.lora_B', 'roberta.encoder.layer.2.attention.self.value.loras.my_module.lora_A', 'roberta.encoder.layer.2.attention.self.value.loras.my_module.lora_B', 'roberta.encoder.layer.3.attention.self.query.loras.my_module.lora_A', 'roberta.encoder.layer.3.attention.self.query.loras.my_module.lo

In [None]:
from pruning_methods import prune_model
idx, idt = prune_model(
    model.model,
    task_name='my_module',
    opts=['lora', 'adapter'],
    p_method='gradient',
    top_p=12,
    print_names=True,
    gradients=gradients)
logger.info(f'Pruned layer: {idx, idt}')
search_list[int(idx)] = 0

In [7]:
params_need_record = [
    param for name, param in model.model.named_parameters()
    if param.requires_grad
]
print(len(params_need_record))
print(len(list(model.model.parameters())))

132
422


In [8]:
groups= [name for name, para in model.model.named_parameters()]
print(groups)

['roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.query.loras.my_module.lora_A', 'roberta.encoder.layer.0.attention.self.query.loras.my_module.lora_B', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.self.value.loras.my_module.lora_A', 'roberta.encoder.layer.0.attention.self.value.loras.my_module.lora_B', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', '