# Bloom 微调


In [None]:
##!pip install torch transformers==4.42.4 peft==0.11.1 datasets==2.20.0 accelerate==0.32.1 bitsandbytes==0.43.1 faiss-cpu==1.7.4 tensorboard==2.14.0

In [None]:
# 导入数据集
from datasets import Dataset
import json


# 自定义函数来解析 JSON Lines 数据
def parse_jsonl(file_path):
    """Load a JSON Lines file into a list of input/target records.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON object per line.

    Returns:
        A list with one dict per non-blank line, each keeping only the
        ``"input_text"`` and ``"target_text"`` fields of the parsed object.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"input_text"`` or ``"target_text"``.
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines (typically trailing newlines at the end of the
            # file) are not records; json.loads would raise on them.
            if not line:
                continue
            json_obj = json.loads(line)
            records.append(
                {
                    "input_text": json_obj["input_text"],
                    "target_text": json_obj["target_text"],
                }
            )
    return records


# Parse the JSON Lines files into lists of input_text/target_text records.
train_data = parse_jsonl("train_output.jsonl")
test_data = parse_jsonl("test_output.jsonl")

# Wrap the parsed records in datasets used for fine-tuning.
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)


# Print basic information about the training and test splits.
print("训练集信息:")
print(train_dataset)
print("测试集信息:")
print(test_dataset)

# Preview up to five training samples. The bound keeps the preview from
# raising IndexError when the split holds fewer than five rows.
print("训练集前五个样本:")
for i in range(min(5, len(train_dataset))):
    print(train_dataset[i])

# Preview up to five test samples, with the same bound.
print("测试集前五个样本:")
for i in range(min(5, len(test_dataset))):
    print(test_dataset[i])

In [None]:
# 加载预训练模型和分词器
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)


# Checkpoint identifier handed to both from_pretrained calls, so the model
# and the tokenizer always come from the same source.
checkpoint = "bloom-1b4-zh"

model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# Show the loaded model, then the tokenizer configuration.
print(model)
print(tokenizer)


# sum(param.numel() for param in model.parameters())

In [None]:
# Freeze every parameter whose name does not contain "bias", and count the
# elements of the parameters that do (these keep their requires_grad flag).
total_param = sum(p.numel() for p in model.parameters())
num_param = 0
for name, param in model.named_parameters():
    if "bias" in name:
        num_param += param.numel()
        continue
    param.requires_grad = False

# Absolute count of bias elements, then their share of all parameters.
print(num_param)
print(num_param / total_param)

In [None]:
# 定义格式化函数
def tokenize_function(examples, max_length=512):
    """Tokenize prompt/answer pairs into causal-LM training features.

    A causal LM scores each position against the next token of the *same*
    sequence, so ``labels`` must line up with ``input_ids``. Each example is
    therefore encoded as ``prompt + answer (+ EOS)`` in one sequence, and
    the prompt positions of ``labels`` are set to -100 so that only the answer
    tokens contribute to the loss. No padding is added here; sequences keep
    their natural length and are expected to be padded per batch by the data
    collator.

    Args:
        examples: Mapping with ``"input_text"`` and ``"target_text"``. Each
            value is either a list of strings (``Dataset.map(batched=True)``)
            or a single string (unbatched mapping).
        max_length: Maximum number of tokens kept per example. The
            concatenated sequence is truncated on the right.

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        Values are lists of per-example lists for batched input, or flat
        lists for a single example.
    """
    # Label value excluded from the loss.
    ignore_index = -100

    prompts = examples["input_text"]
    answers = examples["target_text"]
    single_example = isinstance(prompts, str)
    if single_example:
        prompts, answers = [prompts], [answers]

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt, answer in zip(prompts, answers):
        prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
        answer_ids = tokenizer(answer, add_special_tokens=False)["input_ids"]
        # Terminate the answer so the model learns where to stop generating.
        if tokenizer.eos_token_id is not None:
            answer_ids = answer_ids + [tokenizer.eos_token_id]

        # NOTE: right truncation drops answer tokens first; a prompt of
        # max_length tokens or more leaves no supervised position at all.
        input_ids = (prompt_ids + answer_ids)[:max_length]
        labels = ([ignore_index] * len(prompt_ids) + answer_ids)[:max_length]

        features["input_ids"].append(input_ids)
        features["attention_mask"].append([1] * len(input_ids))
        features["labels"].append(labels)

    if single_example:
        return {key: rows[0] for key, rows in features.items()}
    return features


# Use a small batch size for the map calls below.
batch_size = 2

# Both splits are tokenized with identical batched-map settings.
map_options = {"batched": True, "batch_size": batch_size}

# Apply the tokenizer to the datasets.
train_dataset = train_dataset.map(tokenize_function, **map_options)
test_dataset = test_dataset.map(tokenize_function, **map_options)

In [4]:
# 配置微调模型
from peft import LoraConfig, TaskType, get_peft_model


# 选择微调配置
# Modules that receive LoRA adapters. Update this list to match the actual
# module names of the loaded model.
# NOTE(review): only transformer block 0 is listed, alongside the word
# embeddings and lm_head; confirm that leaving the other blocks without
# adapters is intended.
lora_target_modules = [
    "transformer.word_embeddings",
    "transformer.h.0.self_attention.query_key_value",
    "transformer.h.0.self_attention.dense",
    "transformer.h.0.mlp.dense_h_to_4h",
    "transformer.h.0.mlp.dense_4h_to_h",
    "lm_head",
]

# LoRA fine-tuning configuration for causal language modelling.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=lora_target_modules,
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Replace the base model with its PEFT-wrapped counterpart.
model = get_peft_model(model, peft_config)

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# model's repr, which is useful for checking module names.
model

In [None]:
# Print the model's trainable-parameter summary.
model.print_trainable_parameters()

# 配置训练参数


In [5]:
# Training options, grouped by concern and expanded into TrainingArguments.
training_options = {
    # Output and logging locations.
    "output_dir": "./BloomFineTune",
    "logging_dir": "./logs",  # log directory
    "logging_steps": 10,  # log every 10 steps
    # Optimisation settings.
    "learning_rate": 5e-5,  # learning rate
    "weight_decay": 0.01,  # weight decay
    "num_train_epochs": 1,
    # One sample per device per step, accumulated over 8 steps.
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    # Use half-precision floats.
    "fp16": True,
}
args = TrainingArguments(**training_options)

# 创建训练器


In [None]:
# Collator built with padding=True so each batch is padded when it is assembled.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

# Bundle the model, training arguments, datasets and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 模型训练


In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()