#### 本文选择微调的基础模型是Llama2-chat-13B-Chinese-50W
##### 微调数据集：Belle_open_source_0.5M.json
原始数据集共有50万条数据，格式：{"instruction":"xxxx", "input":"", "output":"xxxx"}
!wget https://huggingface.co/datasets/BelleGroup/train_0.5M_CN/resolve/main/Belle_open_source_0.5M.json
链接: https://pan.baidu.com/s/1HrUkq7Wb9dbZSfoz6BB7Wg?pwd=h697 提取码: h697 
--来自百度网盘超级会员v4的分享

In [3]:
# Dataset preprocessing: run the external split_json.py script.
!python split_json.py
# Format of the concatenated dataset: {"text":"### Human: xxxx ### Assistant: xxx"}

In [None]:
# Install the required packages.
# NOTE(review): `-U` pulls the latest releases and no versions are pinned;
# confirm the trl/transformers versions installed still accept the keyword
# arguments used by the cells below, or pin known-good versions.
!pip install -q huggingface_hub
!pip install -q -U trl transformers accelerate peft
!pip install -q -U datasets bitsandbytes einops wandb

In [None]:
# Initialize wandb (the training arguments below report to "wandb").
import wandb
wandb.init()

In [None]:
# 导入相关包
from datasets import load_dataset
import torch,einops
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
# Load the already-concatenated dataset from its JSON file as the "train" split.
dataset = load_dataset(
    "json",
    data_files="./Belle_open_source_0.5M_changed_test.json",
    split="train",
)

In [None]:
# Model location and quantization settings.
base_model_name = "./Llama2-chat-13B-Chinese-50W"
bnb_config = BitsAndBytesConfig(
    # Quantize to 4 bits.
    load_in_4bit=True,
    # NF4 (normalized float) instead of plain FP4; per the original author's
    # note, a blog post recommends NF4.
    bnb_4bit_quant_type="nf4",
    # Nested quantization; per the original author's note, saves about
    # 0.4 additional bits per parameter.
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# GPU selection: place the model on GPU index 0.
device_map = {"": 0}
# NOTE(review): the original note here suggested device_map = {"": [0,1,2,3……]}
# for several GPUs. A list value is not a documented device_map form; the
# documented multi-GPU option appears to be device_map="auto" -- confirm
# against the Accelerate/Transformers docs before changing this.

In [None]:
# Load the local base model with the quantization settings.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,  # path of the local model
    quantization_config=bnb_config,  # quantization config defined for this model
    device_map=device_map,  # index of the GPU to use
    trust_remote_code=True,
    # `use_auth_token=True` was removed: the argument is deprecated in
    # transformers (superseded by `token`), and no Hub authentication is
    # needed because the weights are read from a local directory.
)
# Turn off the generation cache on the model config for fine-tuning.
base_model.config.use_cache = False
# NOTE(review): presumably selects the non-tensor-parallel code path of the
# Llama implementation -- confirm against the Llama config documentation.
base_model.config.pretraining_tp = 1

In [None]:
# 配置QLora
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
# Tokenizer of the local model: splits long text into its smallest units (tokens).
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Training hyper-parameters.
# Reference for the TrainingArguments parameters:
# https://blog.csdn.net/qq_33293040/article/details/117376382
output_dir = "./results"
max_seq_length = 512
training_args = TrainingArguments(
    # Where training output is written, and where metrics are reported.
    output_dir=output_dir,
    report_to="wandb",
    # Batch size per GPU.
    per_device_train_batch_size=4,
    # Number of update steps whose gradients are accumulated before a
    # backward/update pass is performed.
    gradient_accumulation_steps=4,
    # Initial learning rate (hyper-parameter): too large makes the model
    # unstable, too small keeps it from converging.
    learning_rate=2e-4,
    # Number of update steps between two log records.
    logging_steps=10,
    # Total number of training steps to run.
    max_steps=100,
)
 
# Build the supervised fine-tuning trainer from the quantized base model,
# the LoRA config, the tokenizer and the dataset's "text" column.
# NOTE(review): newer trl releases may expect dataset_text_field /
# max_seq_length / tokenizer to be supplied differently -- confirm against
# the installed trl version.
trainer = SFTTrainer(
    model=base_model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
)

In [None]:
# Start the fine-tuning run.
trainer.train()

In [None]:
# Save the trained model into a "final_checkpoint" sub-directory of output_dir.
import os
# Keep `output_dir` untouched and use a separate name: overwriting it made this
# cell non-idempotent (a second run would save to
# ./results/final_checkpoint/final_checkpoint).
final_checkpoint_dir = os.path.join(output_dir, "final_checkpoint")
trainer.model.save_pretrained(final_checkpoint_dir)

In [None]:
# 模型合并
！python model_hebing.py