# Chat-甄嬛模型训练脚本

In [18]:
import pandas as pd
import torch

from datasets import Dataset
from transformers import (AutoTokenizer, 
                          AutoModelForCausalLM, 
                          DataCollatorForSeq2Seq, 
                          TrainingArguments, 
                          Trainer)

from peft import (LoraConfig, TaskType, get_peft_model)

In [19]:
# !pip list | grep transformer       # 4.24.0
# !pip install transformers==4.43.1  # 解决annot import name 'GenerationConfig' from 'transformers'
# !pip install peft==0.11.1

## 模型训练数据处理
- 1.tokenizer处理
- 2.input_ids, attention_mask,label处理

In [20]:
def process_func(example):
    MAX_LENGTH = 384  # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []

    # 拼接instruction和response
    # NOTE: begin_of_text、start_header_id、end_header_id、eot_id是模型自定义的分词符，用于区分不同部分
    instruction = tokenizer(f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n现在你要扮演皇帝身边的女人--甄嬛<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{example['instruction'] + example['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer(f"{example['output']}<|eot_id|>", add_special_tokens=False)

    # 注意input_ids和labels的是不用的
    # input_ids是输入+输出，labels是只保留输出，-100表示不参与模型计算
    input_ids = instruction['input_ids'] + response['input_ids'] + [tokenizer.pad_token_id]
    attention_mask = instruction['attention_mask'] + response['attention_mask'] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  

    # 长度截断
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

In [22]:
if __name__ == '__main__':
    # 加载原始模型
    model_path = ""
    model = AutoModelForCausalLM.from_pretrained(model_path, 
                                                 device_map='auto', 
                                                 torch_dtype=torch.float16)
    model.enable_input_require_grads()  # 开启梯度检查点时，要执行该方法

    # 加载tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    # 加载数据集
    df = pd.read_json('huanuhan.json')
    ds = Dataset.from_pandas(df)
    tokenizer_id = df.map(process_func, remove_columns=ds.column_names)

    # PEFT配置
    config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "v_proj", 'k_proj'],
        inference_mode=False,   # 训练模式
        r=8,
        lora_alpha=32,
        lora_dropout=0.1
    )
    model = get_peft_model(model, config)
    model.print_trainable_parameters()  # 打印总训练参数

    # 模型训练参数
    args = TrainingArguments(
        output_dir='./output/llama3_1_instruct_lora',  # 指定模型和日志文件的保存路径
        per_device_train_batch_size=4,  # 每个GPU一次训练的样本数
        gradient_accumulation_steps=4,  # 梯度累积步数
        logging_steps=10,
        num_train_epochs=3,
        save_steps=100,              # 指定每多少步保存一次模型检查点
        learning_rate=1e-4,
        save_on_each_node=True,      # 指定是否在每个节点上保存模型检查点（适用于分布式训练）
        gradient_checkpointing=True,  # 启用梯度检查点，一种内存优化技术

        lr_scheduler_type="cosine_with_warmup",  # 使用余弦调度器带热身
        warmup_ratio=0.1,  # 热身占比
    )
    # 调参说明://www.bilibili.com/opus/955908361182773287

    # 定义Trainer
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds,
        tokenizer=DataCollatorForSeq2Seq(tokenizer, padding=True)
    )
    # 开启训练
    trainer.train()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/houhailun/anaconda3/envs/torch/lib/python3.10/site-packages/transformers/configuration_utils.py", line 614, in _get_config_dict
    cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
  File "/Users/houhailun/anaconda3/envs/torch/lib/python3.10/site-packages/transformers/utils/hub.py", line 409, in cached_file
    user_agent=user_agent,
  File "/Users/houhailun/anaconda3/envs/torch/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 110, in _inner_fn
  File "/Users/houhailun/anaconda3/envs/torch/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 164, in validate_repo_id
    )
huggingface_hub.utils._validators.HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: ''.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
 