In [1]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig, BitsAndBytesConfig

In [2]:
# 将JSON文件转换为CSV文件
df = pd.read_json('data/reflection_data.json')
ds = Dataset.from_pandas(df)

In [3]:
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2___5-7B', use_fast=False, trust_remote_code=True)

In [4]:
def process_func(example):
    MAX_LENGTH = 4096    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<|im_start|>system\n{example['system']}<|im_end|>\n<|im_start|>user\n{example['prompt']}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer(f"{example['response']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [5]:
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

Map:   0%|          | 0/60121 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 60121
})

In [6]:
tokenizer.decode(tokenized_id[0]['input_ids'])

"<|im_start|>system\nYou are a world-class AI system capable of complex reasoning and reflection. You respond to all questions in the following way-\n<thinking>\nIn this section you understand the problem and develop a plan to solve the problem.\n\nFor easy problems-\nMake a simple plan and use COT\n\nFor moderate to hard problems-\n1. Devise a step-by-step plan to solve the problem. (don't actually start solving yet, just make a plan)\n2. Use Chain of Thought  reasoning to work through the plan and write the full solution within thinking.\n\nYou can use <reflection> </reflection> tags whenever you execute a complex step to verify if your reasoning is correct and if not correct it.\n\n\n</thinking>\n\n<output>\nIn this section, provide the complete answer for the user based on your thinking process. Do not refer to the thinking tag. Include all relevant information and keep the response somewhat verbose, the user will not see what is in the thinking tag.\n</output><|im_end|>\n<|im_star

In [7]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[1]["labels"])))

"<thinking>\nTo address this complex question, I'll need to develop a structured approach and consider multiple factors. Let's break this down into steps:\n\n1. Understand the premise: A text-based internet without easy image/video sharing\n2. Consider the key factors: Technological, social, and economic\n3. Develop three potential alternative outcomes\n4. Provide reasoning for each scenario\n\nLet's start with developing the three scenarios:\n\nScenario 1: Text-Centric Social Networks\nScenario 2: Audio-Focused Social Media\nScenario 3: Virtual Text Worlds\n\nNow, let's explore each scenario in detail:\n\nScenario 1: Text-Centric Social Networks\n- Social media platforms would focus on text-based communication\n- Emphasis on writing skills and articulation\n- Rise of microblogging and collaborative writing platforms\n\nReasoning:\nWithout easy image and video sharing, social media would likely evolve to prioritize text-based content. This could lead to a greater emphasis on writing sk

In [8]:
import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 是否在4位精度下加载模型。如果设置为True，则在4位精度下加载模型。
    bnb_4bit_compute_dtype=torch.half,  # 4位精度计算的数据类型。这里设置为torch.half，表示使用半精度浮点数。
    bnb_4bit_quant_type="nf4", # 4位精度量化的类型。这里设置为"nf4"，表示使用nf4量化类型。
    bnb_4bit_use_double_quant=True  # 是否使用双精度量化。如果设置为True，则使用双精度量化。
)

model = AutoModelForCausalLM.from_pretrained(
        'Qwen/Qwen2___5-7B', 
        device_map="auto",
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config, 
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# 查看各个层的最终量化后的精度
for name, param in model.named_parameters():
    print(name, param.shape, param.dtype)

model.embed_tokens.weight torch.Size([152064, 3584]) torch.bfloat16
model.layers.0.self_attn.q_proj.weight torch.Size([6422528, 1]) torch.uint8
model.layers.0.self_attn.q_proj.bias torch.Size([3584]) torch.bfloat16
model.layers.0.self_attn.k_proj.weight torch.Size([917504, 1]) torch.uint8
model.layers.0.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16
model.layers.0.self_attn.v_proj.weight torch.Size([917504, 1]) torch.uint8
model.layers.0.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16
model.layers.0.self_attn.o_proj.weight torch.Size([6422528, 1]) torch.uint8
model.layers.0.mlp.gate_proj.weight torch.Size([33947648, 1]) torch.uint8
model.layers.0.mlp.up_proj.weight torch.Size([33947648, 1]) torch.uint8
model.layers.0.mlp.down_proj.weight torch.Size([33947648, 1]) torch.uint8
model.layers.0.input_layernorm.weight torch.Size([3584]) torch.bfloat16
model.layers.0.post_attention_layernorm.weight torch.Size([3584]) torch.bfloat16
model.layers.1.self_attn.q_proj.weight torch.S

In [10]:
# 配置LoRA时,打印model所有参数层名,方便观察target_modules
for name, param in model.named_parameters():
    print(name)

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.q_proj.bias
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.k_proj.bias
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.v_proj.bias
model.layers.0.self_attn.o_proj.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.q_proj.bias
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.k_proj.bias
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.v_proj.bias
model.layers.1.self_attn.o_proj.weight
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.2.self_attn.q_proj.weight
model.layers.2.self

In [11]:
model.enable_input_require_grads()

In [12]:
model.dtype

torch.bfloat16

In [13]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.1 # Dropout 比例
)

In [14]:
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.2643


In [15]:
args = TrainingArguments(
    output_dir="./output/Qwen2.5_instruct_lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=3,
    save_steps=500, 
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [16]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,0.582
20,0.5391
30,0.502
40,0.4893
50,0.5308
60,0.5097
70,0.513
80,0.4931
90,0.4873
100,0.4874


OutOfMemoryError: CUDA out of memory. Tried to allocate 9.28 GiB. GPU 