# bitfit

BitFit：只对模型中的 bias 参数进行微调，其余参数全部冻结。

In [1]:
import torch

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq

In [2]:
# Load the instruction dataset previously saved at ../../datas/alpaca_data_zh
# (columns: output, input, instruction).
dataset = Dataset.load_from_disk("../../datas/alpaca_data_zh")

In [3]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 26858
})

In [4]:
# Load the tokenizer from the local bloom-1b4 checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("../../models/bloom-1b4")

In [7]:
def process_func(example):
    """Turn one instruction record into causal-LM training features.

    The prompt is ``"Human: <instruction>\\n<input>"`` (stripped) followed by
    ``"\\n\\nAssistant: "``; the target is ``example["output"]`` plus the
    tokenizer's EOS token. Prompt and target are tokenized separately with the
    module-level ``tokenizer`` and then concatenated.

    Args:
        example: Mapping with the string fields ``"instruction"``, ``"input"``
            and ``"output"``.

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        Prompt positions in ``"labels"`` are set to -100 (the label value
        conventionally ignored by the loss) so only the response and EOS are
        supervised. When the sequence exceeds 512 tokens, all three lists are
        cut to the first 512 entries.
    """
    MAX_LENGTH = 512

    prompt_lines = ["Human: " + example["instruction"], example["input"]]
    prompt = "\n".join(prompt_lines).strip() + "\n\nAssistant: "
    prompt_enc = tokenizer(prompt)
    answer_enc = tokenizer(example["output"])

    eos_id = tokenizer.eos_token_id
    prompt_ids = prompt_enc["input_ids"]
    answer_ids = answer_enc["input_ids"]

    features = {
        "input_ids": prompt_ids + answer_ids + [eos_id],
        "attention_mask": prompt_enc["attention_mask"] + answer_enc["attention_mask"] + [1],
        # Mask the prompt so that only the answer (and EOS) contribute labels.
        "labels": [-100] * len(prompt_ids) + answer_ids + [eos_id],
    }

    # Over-long samples are truncated from the right; this can drop the EOS.
    if len(features["input_ids"]) > MAX_LENGTH:
        features = {key: seq[:MAX_LENGTH] for key, seq in features.items()}

    return features

In [8]:
# Apply process_func to every example and drop the original text columns, so
# only the fields returned by process_func remain.
tokenized_data = dataset.map(process_func, remove_columns=dataset.column_names)

Map:   0%|          | 0/26858 [00:00<?, ? examples/s]

In [11]:
# Load the causal language model from the same local bloom-1b4 directory
# that the tokenizer was loaded from.
model = AutoModelForCausalLM.from_pretrained("../../models/bloom-1b4")

In [13]:
# BitFit selection: parameters whose name contains "bias" stay trainable and
# are counted; every other parameter is frozen.
params_num = 0

for name, params in model.named_parameters():
    if "bias" in name:
        # Trainable bias tensor: add its element count to the running total.
        params_num += params.numel()
        continue
    # Anything that is not a bias is excluded from gradient updates.
    params.requires_grad = False

print(params_num)

544768


In [14]:
# Fraction of all model parameter elements that remain trainable (bias only).
total_num = sum(p.numel() for p in model.parameters())
params_num / total_num

0.000418051659240749

In [15]:
# Training configuration for the BitFit run.
args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="../../caches/PEFT-bloom1b4-bitfit",
    # Schedule: a single pass over the data, logging every 100 steps.
    num_train_epochs=1,
    logging_steps=100,
    # Batching: 4 samples per device, gradients accumulated over 8 batches.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    # Optimizer choice.
    optim="adafactor",
)

In [16]:
# Build the Trainer for the BitFit run.
# The tokenizer is passed via `processing_class`: the older `tokenizer=`
# keyword of Trainer.__init__ is deprecated in recent transformers releases
# and emits a FutureWarning at construction time.
# NOTE(review): `processing_class` requires transformers >= 4.46 — confirm the
# installed version before applying.
trainer = Trainer(
    args=args,
    model=model,
    processing_class=tokenizer,
    # Collator that batches the variable-length tokenized samples.
    data_collator=DataCollatorForSeq2Seq(tokenizer),
    train_dataset=tokenized_data,
)

  trainer = Trainer(


In [None]:
# Start training with the Trainer built in the previous cell.
trainer.train()

Step,Training Loss
