# 模型微调(英译中翻译)

### 1 数据预览

In [None]:
!head -5 News-Commentary-v16-1000.en-zh

### 2 数据处理

In [2]:
import json

In [3]:
# Instruction template wrapped around every English source sentence.
prompt='"{txt}"\n请将上面的句子翻译成中文(仅输出翻译结果):\n'
skipped=0
# Explicit UTF-8 on both files: the corpus and the output contain Chinese text,
# and the platform's default encoding is not guaranteed to be UTF-8.
with open('News-Commentary-v16-1000.en-zh','r',encoding='utf-8') as f, \
     open('sample.jsonl','w',encoding='utf-8') as g:
    for line in f:
        fields=line.strip().split('\t')
        if len(fields)!=2:
            # Blank or malformed line (not exactly "english<TAB>chinese"):
            # skip it and count it instead of aborting the whole conversion.
            skipped+=1
            continue
        en,zh=fields
        # One training sample = a two-element list: user turn, then assistant turn.
        s=[{'user':prompt.format(txt=en)},
           {'assistant':zh}]
        g.write(json.dumps(s,ensure_ascii=False)+'\n')
    # Append five copies of a fixed self-identification exchange after the translation pairs.
    for _ in range(5):
        g.write(json.dumps([{'user':'你是谁？'}, {'assistant':'我是工银智涌大模型。'}],ensure_ascii=False)+'\n')
if skipped:
    print(f'skipped {skipped} line(s) without exactly two tab-separated fields')

### 3 数据转换

In [None]:
# Run the jllm.raw2ids module on sample.jsonl with the Qwen2.5-7B-Instruct tokenizer.
# NOTE(review): the result is presumably written to sample_Qwen2.5-7B-Instruct/ (the path the
# later cells read) — confirm; the meaning of -C and --filter is not visible in this notebook.
!python -m jllm.raw2ids \
    --tokenizer Qwen2.5-7B-Instruct \
    -i sample.jsonl \
    --max_len 8193 -C --filter

### 3.1 数据检查

In [4]:
# transformers provides the tokenizer class; AutoModelForCausalLM is not used in this section.
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load the tokenizer from 'Qwen2.5-7B-Instruct' (presumably a local model folder
# relative to the working directory — confirm).
tokenizer=AutoTokenizer.from_pretrained('Qwen2.5-7B-Instruct')
# pyarrow.parquet is used below to read the converted training data.
import pyarrow.parquet

In [7]:
# Build a system + user conversation and render it to plain text with the
# tokenizer's chat template, including the opening of the assistant turn.
prompt = "Give me a short introduction to large language model."
messages = [
    dict(role="system", content="You are Qwen, created by Alibaba Cloud. You are a helpful assistant."),
    dict(role="user", content=prompt),
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [8]:
# Show the rendered chat-template text so the special tokens can be inspected.
print(text)

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Give me a short introduction to large language model.<|im_end|>
<|im_start|>assistant



In [9]:
# Read the file sample-00000 from the converted-data directory as a parquet table.
# NOTE(review): presumably the first shard written by the raw2ids step — confirm.
data = pyarrow.parquet.read_table('sample_Qwen2.5-7B-Instruct/sample-00000')

In [10]:
# Pull both columns out of the table as Python lists of per-sample sequences.
input_ids = data['input_ids'].to_numpy().tolist()
# Each labels entry is copied so the edits in the following cells touch a private copy.
labels = list(map(lambda row: row.copy(), data['labels'].to_numpy().tolist()))

In [11]:
# Index of the sample to inspect; -1 selects the last sample in the file.
idx=-1

In [12]:
# Overwrite the first label of the selected sample with -100.
# NOTE(review): -100 is presumably the "ignore" marker that the next cell replaces
# before decoding; why position 0 is forced here is not visible — confirm.
labels[idx][0]=-100

In [None]:
import numpy as np  # local import: numpy is otherwise only imported in later cells

# Decode the selected sample's input ids back to text.
print(tokenizer.decode(input_ids[idx]))
# Replace every -100 label with the placeholder token id 9 so the sequence can be decoded.
# np.asarray makes the comparison element-wise even if labels[idx] is a plain list;
# for a list, `labels[idx]==-100` is a scalar False and only index 0 would be overwritten.
labels[idx]=np.where(np.asarray(labels[idx])==-100,9,labels[idx])
print(tokenizer.decode(labels[idx]))

## 4 模型训练

#### 4.1 全参微调

In [None]:
!deepspeed --module jllm.train_pipe \
    --model pretrained_hf \
    --num_train_epochs 2 \
    --train_data sample_Qwen2.5-7B-Instruct \
    --pipe_parallel_size 4 \
    --model_parallel_size 1 \
    --sequence_parallel_size 1 \
    --per_device_train_batch_size 1 \
    --global_batch_size 16 \
    --partition_method fast \
    --split_dlayer \
    --only_ckpt_model \
    --max_num_checkpoints 2 \
    --split_dlayer \
    --learning_rate 1e-5 |tee finetuning.log

# 注释：
# --model 模型路径至少需要包含config.json
# --num_train_epochs 训练轮数
# --train_data 训练数据
# --pipe_parallel_size 流水线并行个数
# --model_parallel_size 张量并行个数
# --per_device_train_batch_size 一次输入训练多少样本
# --global_batch_size 全局训练完多少样本后（累加完多少个梯度后）进行一次参数更新
# --partition_method fast 流水线拆分策略
# --only_ckpt_model 检查点只保存模型参数，此时会直接存成huggingface格式
# --checkpoint checkpoint 模型检查点目录（上面的命令未显式指定）
# --output_dir pretrained 最终模型输出目录（上面的命令未显式指定）
# --max_num_checkpoints 2 最大保留多少个检查点
# --split_dlayer 是否拆分decoder layer
# --learning_rate 1e-5 学习率

In [None]:
!grep  'steps:.*loss:' finetuning.log|awk '{print $2,$4}'>loss
import matplotlib.pyplot as plt
import numpy as np
xy=np.loadtxt('loss')  
plt.plot(xy[:,0], xy[:,1])  
plt.show()

#### 4.2 LORA微调

In [None]:
!deepspeed --module jllm.train_pipe \
    --model pretrained_hf \
    --num_train_epochs 3 \
    --train_data sample_Qwen2.5-7B-Instruct \
    --pipe_parallel_size 4 \
    --model_parallel_size 1 \
    --per_device_train_batch_size 1 \
    --global_batch_size 32 \
    --partition_method fast \
    --split_dlayer \
    --only_ckpt_lora \
    --checkpoint checkpoint_lora \
    --max_num_checkpoints 2 \
    --skip_epoch 1,2 \
    --split_dlayer \
    --lora_dim 32 \
    --lora_alpha 32 \
    --lora_module_name 'qkv_proj,o_proj,gate_up_proj,down_proj' \
    --only_optimize_lora \
    --learning_rate 1e-5 |tee lora.log

#注释：
# --model 模型路径至少需要包含config.json
# --num_train_epochs 训练轮数
# --train_data 训练数据
# --pipe_parallel_size 流水线并行个数
# --model_parallel_size 张量并行个数
# --per_device_train_batch_size 一次输入训练多少样本
# --global_batch_size 训练完多少样本后（累加完多少个梯度后）进行一次参数更新
# --partition_method 流水线拆分策略
# --only_ckpt_lora 检查点只保存lora参数（含义按参数名推断，请以jllm文档为准）
# --checkpoint checkpoint_lora 模型检查点目录
# --output_dir pretrained 最终模型输出目录（上面的命令未显式指定）
# --max_num_checkpoints 2 最大保留多少个检查点
# --skip_epoch 跳过的检查点
# --split_dlayer 是否拆分decoder layer
# --lora_dim lora参数的秩
# --lora_alpha lora参数的权重
# --lora_module_name 对哪些线性层执行lora替换
# --only_optimize_lora 只对被lora替换的参数进行梯度更新
# --learning_rate 1e-5 学习率

In [None]:
!grep  'steps:.*loss:' lora.log|awk '{print $2,$4}'>loss
import matplotlib.pyplot as plt
import numpy as np
xy=np.loadtxt('loss')  
plt.plot(xy[:,0], xy[:,1])  
plt.show()

## 5 推理测试

In [4]:
# torch_npu is never referenced by name in this notebook; it is imported for its side effects
# (presumably registering the Ascend NPU backend with torch — confirm).
import torch_npu
from transformers import AutoModelForCausalLM, AutoTokenizer
# Tokenizer used by infer() in this section.
tokenizer=AutoTokenizer.from_pretrained('Qwen2.5-7B-Instruct')

In [None]:
# Load the causal LM from 'checkpoint/31' (presumably a checkpoint written by the
# training step above — confirm the directory exists); dtype and device placement
# are left to the library via "auto".
model = AutoModelForCausalLM.from_pretrained('checkpoint/31', torch_dtype="auto", device_map="auto")

In [6]:
# English test sentence, wrapped in the same translation instruction that was
# used when building the training samples.
en=(
    'PARIS – As the economic crisis deepens and widens, '
    'the world has been searching for historical analogies '
    'to help us understand what has been happening.'
)
prompt = '"{txt}"\n请将上面的句子翻译成中文(仅输出翻译结果):\n'.format(txt=en)

In [7]:
# Default system message; identical to the text that was previously hard-coded in infer().
_DEFAULT_SYSTEM_PROMPT = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."


def infer(prompt, max_new_tokens=128, system_prompt=_DEFAULT_SYSTEM_PROMPT):
    """Generate the model's reply to a single user prompt.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects created in
    the cells above.

    Args:
        prompt: Text of the user message.
        max_new_tokens: Upper bound on the number of generated tokens, passed
            to ``model.generate``. Defaults to 128, the previously fixed value.
        system_prompt: Content of the system message placed before the user
            message. Defaults to the previously fixed Qwen system message.

    Returns:
        The decoded reply as a string, without the prompt tokens and with
        special tokens skipped.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
    # Each output sequence starts with its own prompt tokens; drop them so only
    # the newly generated continuation is decoded.
    reply_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    return tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]

In [None]:
# Translate the English test sentence; the returned string is shown as the cell output.
infer(prompt)

In [None]:
# Ask the same "who are you?" question that was added five times to the training data.
infer('你是谁？')