In [1]:
from peft import prepare_model_for_kbit_training,LoraConfig,get_peft_model,TaskType
from transformers import AutoModelForCausalLM,AutoTokenizer,TrainingArguments,Trainer,default_data_collator,BitsAndBytesConfig
from datasets import load_dataset
import torch
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


### QLoRa配置

In [2]:
_compute_dtype_map = {
    'fp32': torch.float32,
    'fp16': torch.float16,
    'bf16': torch.bfloat16
}

# QLoRA 量化配置
q_config = BitsAndBytesConfig(load_in_4bit=True,
                              bnb_4bit_quant_type='nf4',
                              bnb_4bit_use_double_quant=True,
                              bnb_4bit_compute_dtype=_compute_dtype_map['fp32'],
                             )

### 模型加载

In [3]:
model_name="Qwen/Qwen3-0.6B"
model=AutoModelForCausalLM.from_pretrained(model_name,device_map='auto',torch_dtype="auto",quantization_config=q_config)
tokenizer=AutoTokenizer.from_pretrained(model_name)

Loading weights: 100%|██████████| 311/311 [00:02<00:00, 123.67it/s, Materializing param=model.norm.weight]                              


In [4]:
kbit_model = prepare_model_for_kbit_training(model)

### LoRa配置

In [5]:
lora_config = LoraConfig(
    r=4,  # LoRA矩阵的秩
    lora_alpha=32,  # LoRA alpha参数
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # 要应用LoRA的模块
    lora_dropout=0.05,  # Dropout概率
    bias="none",  # 是否训练偏置
    task_type="CAUSAL_LM",  # 任务类型
)

In [6]:
qlora_model = get_peft_model(kbit_model, lora_config)

In [7]:
qlora_model.print_trainable_parameters()

trainable params: 1,146,880 || all params: 752,779,264 || trainable%: 0.1524


In [8]:
qlora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 1024)
        (layers): ModuleList(
          (0-27): 28 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1024, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.

### 加载模型

In [9]:
data=load_dataset('json',data_files='../../dataset/chinese_law_ft_dataset.json',split="train[:1000]")

In [10]:
dataset = data.train_test_split(
    train_size=0.7,
    shuffle=True,
    seed=7
)

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'id'],
        num_rows: 700
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'id'],
        num_rows: 300
    })
})

### 数据预处理

In [12]:
def process_fun(example):
    content=[]
    for instruction,input,output in zip(example['instruction'],example['input'],example['output']):
        if input.strip():
            text=f'Human:{instruction}\n{input}\nAI:{output}'
            content.append(text)
        else:
            text=f'Human:{instruction}\nAI:{output}'
            content.append(text)
    
    encoded = tokenizer(
        content,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    labels = encoded["input_ids"].clone()
    for index,text in enumerate(content):
        answer_start=text.find('AI:')+len('AI:')
        question=text[:answer_start]
        question_ids=tokenizer.encode(question, add_special_tokens=False)
        question_length=len(question_ids)
        labels[index,:question_length]=-100
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels
    }


In [13]:
train_process_data=dataset['train'].map(process_fun,batched=True,remove_columns=dataset['train'].column_names)
test_process_data=dataset['test'].map(process_fun,batched=True,remove_columns=dataset['test'].column_names)

### 训练超参数

In [14]:
# 定义训练参数
training_args = TrainingArguments(
    output_dir="../../models/qlora",
    logging_steps=100,
    logging_dir='./runs',
    eval_strategy='epoch',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    metric_for_best_model='eval_loss',
    gradient_accumulation_steps=4,  # 如果GPU内存有限
)

`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


### 训练模型

In [16]:
trainer=Trainer(
    model=qlora_model,
    args=training_args,
    eval_dataset=test_process_data,
    train_dataset=train_process_data,
    data_collator=default_data_collator,
)

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,8.024446
2,No log,5.540305
3,No log,4.521588


TrainOutput(global_step=66, training_loss=7.227429939038826, metrics={'train_runtime': 1791.6738, 'train_samples_per_second': 1.172, 'train_steps_per_second': 0.037, 'total_flos': 3852635996160000.0, 'train_loss': 7.227429939038826, 'epoch': 3.0})

In [18]:
trainer.save_model('../../models/qlora')