In [1]:
class TrainingConfig:
    def __init__(self) ->None:
        # LoRA配置
        self.lora_r=8 # 核，控制可训练参数数量4-32
        self.lora_alpha=16 #缩放因子，用于控制 LoRA 模块对原始模型参数的影响程度
        self.lora_dropout=0.1 #用于在训练过程中对 LoRA 模块的输入进行随机丢弃，以防止过拟合 0.1-0.3
        self.lora_target_modules="all-linear" #指定哪些模型层的参数需要使用 LoRA 进行微调
        #"q_proj","k_proj", "v_proj","o_proj", "gate_proj","up_proj", "down_proj" # 前馈神经网络，多头注意力
        
        #模型训练配置
        self.torch_dtype ="bf16" # 训练精度，fp16，bf16（187），fp32 
        self.gradient_checkpointing = True
        self.use_flash_attention=True # 长序列数据,推理速度，内存
        self.seed = 666
        
        self.bnb_config={
            "load_in_4bit"=True,
            "bnb_4bit_use_double_quant"=True, # 双重量化，进一步压缩
            "bnb_4bit_quant_type"="nf4",# 量化类型，normal float 4-bit
            "bnb_4bit_compute_dtype"=torch.bfloat16
            "llm_int8_threshold":6.0 #用于指定 8 位量化中的异常值阈值
            "llm_int8_skip_modules":None
            "llm_int8_has_fp16_weight ":False
        }
        
        self.gpu_memory_threshold=24*1024*1024*1024
        self.large_gpu_batch_size=16
        
        self.max_memory = {0:21GB}
        self.offload_folder="offload"
        
        self.optimizer_config={
            "lr":2e-4,
            "betas":(0.9,0.999),# Adam优化器的beta参数
            "eps":1e-8,   #数值确定性小值   
            "weight_decay":0.01
        }
        
        
class ModelConfig:
    def __init__(self) ->None:
        self.cache_dir="/tmp/.cache"
        self.temp_model_dir="/tmp/model"
        self.model_max_shard_size="2GB"
        
        self.safe_serialization=True
        self.save_strategy="no" # steps表示按步保存
        self.save_merged_model=False # 仅保存Lora权重
        
        
class DataConfig:
    def __init__(self) ->None: 
        self.max_seq_length=2048 # 最大序列长度
        self.pad_to_max_length=True
        self.truncation=True #截断

SyntaxError: invalid decimal literal (280960864.py, line 29)

In [None]:
import torch
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
model_id = "EleutherAI/gpt-neox-20b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)


In [None]:
def download_model(model_name)

def initialize_tokenizer(model_name)

def process_datasets(train_ds,test_ds,tokenizer,data_config,accelerator)

def get_fsdp_config(gradient_checkpointing:bool)-> tuple[str,dict]:-
    
def get_training_configuration

def save_model
def initialize_wandb

def setup_model_environment

def create_training_args

def main(args)->None

    load_dotenv()
    training_config=TrainingConfig()
    model_config=ModelConfig()
    data_config = DataConfig()
    
    # 2初始化分布式训练环境
    gradient_accumulation_steps=args.grad_accum_steps
    accelerator = Accelerator(
    )
    local_rank = accelerator.num_processes>1
    #bitsandbytes 量化开源库 peft主要提供rola微调
    