# PEFT高效微调  
  
  
### 一，准备数据集

In [1]:
!pwd


/home/jovyan


In [1]:
%pip install datasets


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from hugging

In [3]:
from datasets import load_dataset

dataset = load_dataset('json', data_files='law_train_short.json')
# 格式参考:{"instruction": "解释机器学习", "input": "", "output": "机器学习是..."}
dataset = dataset["train"].train_test_split(test_size=0.1)  #切分数据集的0.1作为验证

print(dataset["train"][:5])

FileNotFoundError: Unable to find '/home/jovyan/law_train_short.json'

### 二，使用Hugging Face的BitsAndBytesConfig配置4比特量化加载大语言模型

In [3]:
%pip install transformers torch

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [4]:
from transformers import BitsAndBytesConfig
import torch
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                       # 启用4比特量化加载,将模型权重量化为4位,显存减少至FP32的1/8
    bnb_4bit_quant_type="nf4",               # 使用NormalFloat4量化类型
    bnb_4bit_compute_dtype=torch.float16,    # 计算时使用float16精度,指定计算时使用半精度,避免纯4位计算可能导致的精度损失
    llm_int8_enable_fp32_cpu_offload=True    # 允许将部分计算卸载到CPU的FP32精度
)

### 三，加载模型与分词器

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir = "./Qwen2.5-1.5B-Instruct",
    device_map="auto",
    quantization_config=bnb_config      #启动4bit量化配置
)

### 四，LoRA参数配置与关键参数优化  
通常在RTX3090上7B模型全量微调需80GB显存,LoRA仅需约10G  
 不同任务只需替换约0.1%的LoRA参数 


In [7]:
from peft import LoraConfig, get_peft_model

# 配置LoRA微调参数
lora_config = LoraConfig(
    r=8,  # 低秩矩阵的秩,决定可训练参数数量,8-64之间,根据显存调整
    lora_alpha=16, # 缩放因子,控制低秩矩阵对原始权重的调整幅度,一般设为r的2倍
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Qwen2.5的注意力模块
    bias="none",     # 不微调偏置参数
    task_type="CAUSAL_LM",  # 任务类型为因果语言建模
    lora_dropout=0.05   # LoRA层的Dropout概率，防止过拟合
)

# 将LoRA适配器注入原始模型
model = get_peft_model(model, lora_config)
# 打印可训练参数占比
model.print_trainable_parameters()  

trainable params: 2,179,072 || all params: 1,545,893,376 || trainable%: 0.1410


### 五，数据预处理

In [8]:
def tokenize_function(examples):
    # 设置最大序列长度为256,可根据显存设置成512,1024
    max_length = 256
    # 将instruction/input/output字段拼接为结构化文本：
    # 示例输出："Instruction: 翻译句子\nInput: Hello\nOutput: 你好"
    texts = [f"Instruction: {q}\nInput: {i}\nOutput: {o}" 
             for q, i, o in zip(examples["instruction"], examples["input"], examples["output"])]
    # 使用预训练分词器处理文本:
    # truncation=True 自动截断超长文本
    # max_length 限制最大token数
    # padding="longest" 动态填充到当前batch中最长文本长度(节省显存)
    return tokenizer(texts, truncation=True, max_length=max_length, padding="longest")
# 应用分词函数到整个数据集：
# batched=True 启用批量处理(提升效率)
# batch_size=4 每批处理4个样本(根据显存调整)
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=4)

Map: 100%|██████████| 7/7 [00:00<00:00, 514.70 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 143.79 examples/s]


### 六，配置wandb监控

In [None]:
import wandb
from datetime import datetime

wandb.init(
    project="Qwen2.5-Finetune",
    name=f"qwen2.5-1.5b-lora-{datetime.now().strftime('%Y%m%d-%H%M')}",
    config={
        "model": "Qwen2.5-1.5B-Instruct",
        "peft_method": "LoRA",
        "lora_rank": 32,
        "batch_size": 4
    }
)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models


In [1]:
from transformers import TrainingArguments, Trainer,DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./qwen_finetuned",     # 模型/日志/检查点保存路径
    num_train_epochs=3,                # 训练总轮次(小数据集3-5轮,大数据集1-2轮)
    per_device_train_batch_size=4,     # 每个GPU的batch size(根据GPU显存调整)
    per_device_eval_batch_size=8,      # 评估批次大小
    gradient_accumulation_steps=2,     # 梯度累积步数,使得实际有效batch_size=4×2=8,缓解显存不足(模拟更大batch size)
    learning_rate=3e-5,                # 学习率,LLM微调推荐2e-5~5e-5
    weight_decay=0.01,                 # L2正则化系数(防过拟合)
    warmup_ratio=0.05,                 # 预热比例,5%训练步用于学习率线性预热
    fp16=True,                         # 混合精度训练(A100/V100建议启用)
    evaluation_strategy="epoch",       # 每轮结束后评估验证集
    save_strategy="epoch",             # 每轮保存检查点
    logging_steps=50,                  # 50步记录loss/lr等指标
    report_to="wandb",                 # 集成wandb可视化
    optim="adamw_torch_fused",         # 梯度裁剪阈值
    max_grad_norm=1.0,                 # 梯度裁剪
    run_name=wandb.run.name            # 继承W&B实验名称
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,    # 必须与模型匹配的分词器
    mlm=False,              # 禁用Masked Language Modeling
    pad_to_multiple_of=8    # 填充长度对齐8的倍数(优化GPU显存利用率)
)



  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'wandb' is not defined

### 七，启动训练,训练后保存模型

In [2]:
trainer = Trainer(
    model=model,                                    # 加载的预训练模型(如Qwen/Llama等),支持PEFT微调后的模型
    args=training_args,                             # 训练参数配置
    train_dataset=tokenized_dataset["train"],       # 训练数据集
    eval_dataset=tokenized_dataset["test"],         # 验证数据集
    data_collator=data_collator                     # 动态填充batch
)


trainer.train()     # 执行训练流程(自动按training_args配置运行)
wandb.finish()      # 训练完成后关闭Wandb
# 使用LoRA/QLoRA,默认仅保存适配器参数(若使用QLoRA等4-bit量化，需额外保存量化配置)
model.save_pretrained("D:\\AIProjects\\modelscope\\finetuned\\qwen_finetuned") 

NameError: name 'model' is not defined

### 八，将微调后的LoRA适配器与基模合并

In [9]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# 加载基础模型路径
base_model = "D:\\AIProjects\\modelscope\\Qwen\\Qwen2___5-1___5B-Instruct"
# 指定LoRA适配器保存路径(包含训练好的lora权重和adapter_config.json)
adapter_path = "D:\\AIProjects\\modelscope\\finetuned\\qwen_finetuned"

# 加载基础语言模型
model = AutoModelForCausalLM.from_pretrained(
    base_model, 
    torch_dtype=torch.bfloat16,     # 使用bfloat16平衡精度与显存
    device_map="auto"               # 自动分配设备(多GPU时自动切片)
    )

# 注入LoRA适配器到基础模型
model = PeftModel.from_pretrained(
    model, 
    adapter_path    # 加载训练好的LoRA权重
    )

# 将LoRA权重合并到基础模型并移除PEFT包装
model = model.merge_and_unload()
# 加载与基础模型匹配的分词器
tokenizer = AutoTokenizer.from_pretrained(base_model)

### 九，提问

In [None]:
def generate_response(instruction, input_text=""):
    # 构建结构化提示模板(Instruction/Input/Output格式)
    prompt = f"Instruction: {instruction}\nInput: {input_text}\nOutput:"
    # 指定使用第一个CUDA设备
    device = torch.device('cuda:0')
    # 使用分词器处理文本,返回PyTorch张量格式
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # 调用模型生成回答
    outputs = model.generate(
        **inputs,               # 解包输入张量(包含input_ids和attention_mask)
        max_new_tokens=256,     # 限制生成新token数量
        temperature=0.1,        # 控制随机性,0.7平衡创造性与合理性(0.1-1.0，值越大越多样)
        top_p=0.9               # 核采样参数
    )
    # 解码生成结果并提取"Output:"后的内容
    return tokenizer.decode(outputs[0], skip_special_tokens=True).split("Output:")[-1]

print(generate_response("张三贪污了五百万"))

ValueError: `temperature` (=0.0) has to be a strictly positive float, otherwise your next token scores will be invalid. If you're looking for greedy decoding strategies, set `do_sample=False`.

### 十，提供API给系统集成

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
import nest_asyncio  # 解决Jupyter事件循环冲突
import uvicorn  # ASGI服务器

# 必须添加以下两行才能在Jupyter中运行异步服务
nest_asyncio.apply()  
app = FastAPI()

class Request(BaseModel):
    instruction: str
    input_text: str = ""

@app.post("/predict")
async def predict(request: Request):
    return {"response": generate_response(request.instruction, request.input_text)}

# 在Jupyter中启动服务器
uvicorn.run(app, host="127.0.0.1", port=8000)