In [1]:
from transformers import AutoModel
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live

[2024-03-10 17:47:23,312] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
class CFG:
    """Paths and hyper-parameters shared by every cell of this fine-tuning notebook."""

    # ---- filesystem -------------------------------------------------------
    # Directory the tokenizer and base model are loaded from.
    model_path = '/root/autodl-tmp/weights/chatglm3-6b'
    # Directory the tuned model is written to by save_pretrained.
    output_dir = '/root/autodl-tmp/checkpoints/glm3'
    #output_dir = '/root/autodl-tmp/checkpoints/glm3-full_query_turbo3'

    # ---- dataset ----------------------------------------------------------
    # Active dataset: a JSONL file whose records are read through the two keys below.
    data_path = '/root/autodl-tmp/dataset/psychology-dataset/data/train.jsonl'
    query_key = 'question'
    answer_key = 'response_j'

    # Alternative datasets kept for reference (replace the three values above to use one).
    #data_path = '/root/autodl-tmp/dataset/OESD-GG-zh_cn-1/single_query.jsonl'
    #data_path = '/root/autodl-tmp/dataset/OESD-GG-zh_cn-1/full_query.jsonl'
    #query_key = 'User'
    #answer_key = 'Assisstant'

    #data_path = '/root/autodl-tmp/dataset/zhihu_qa/zhihu_qa_5w.jsonl'
    #query_key = 'question'
    #answer_key = 'answer'

    # ---- sequence / batch shape ------------------------------------------
    batch_size = 2
    # Fixed length every example is truncated/padded to.
    max_tokens = 648
    # Upper bound on the number of query tokens kept per example.
    max_query = 64

    # ---- optimisation -----------------------------------------------------
    lr = 1e-5
    warm_up_steps = 1000
    num_train_epochs = 1
    # Optional early-stop step; None means no early stop.
    max_train_steps = None

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss
from torch.utils.data.distributed import DistributedSampler

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

import sys
import json
import pandas as pd
from tqdm import tqdm

In [4]:
# Render matplotlib figures inline in the notebook, using the SVG figure format.
%matplotlib inline
%config InlineBackend.figure_format='svg'

In [5]:
# Make the project-local helper modules importable from their hard-coded directory.
sys.path.append('/root/tuning_space/Components/')
import interact
import model_tools
# `si` is indexed by special-token name ('[gMASK]', 'sop', 'eop') when the
# dataset class assembles input_ids.
from Static import prompt_dict, st, si

In [6]:
# LoRA adapter settings, later passed to get_peft_model.
config = LoraConfig(
    task_type="CAUSAL_LM",
    # Modules that receive LoRA adapters. The valid names differ from one model
    # architecture to another; print the key_list from the source to see the options.
    target_modules=["query_key_value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

In [7]:
%%time
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(CFG.model_path, trust_remote_code=True).cuda().half()#.float()

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

CPU times: user 11.5 s, sys: 38.9 s, total: 50.4 s
Wall time: 13.8 s


In [8]:
# Print DeepSpeed's ZeRO-3 estimate of the memory needed for params, optimizer
# states and gradients of `model` on a single node with a single GPU.
estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1)

Estimated memory needed for params, optim states and gradients for a:
HW: Setup with 1 node, 1 GPU per node.
SW: Model with 6243M total params, 266M largest layer params.
  per CPU  |  per GPU |   Options
  157.00GB |   0.99GB | offload_param=cpu , offload_optimizer=cpu , zero_init=1
  157.00GB |   0.99GB | offload_param=cpu , offload_optimizer=cpu , zero_init=0
  139.55GB |  12.62GB | offload_param=none, offload_optimizer=cpu , zero_init=1
  139.55GB |  12.62GB | offload_param=none, offload_optimizer=cpu , zero_init=0
    1.49GB | 105.66GB | offload_param=none, offload_optimizer=none, zero_init=1
   34.89GB | 105.66GB | offload_param=none, offload_optimizer=none, zero_init=0


In [9]:
# Apply PEFT LoRA to the base model, profiling parameter counts before and after.
model_tools.model_profile(model)
print('conducting peft lora ---------------')
# Keep the cell idempotent: `model` is rebound to its own wrapped version, so
# re-running the cell without this guard would wrap an already-wrapped model.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, config)
model_tools.model_profile(model)

Total Parameters: 6243584000
Trainable Parameters: 6243584000
Percentage of Trainable Parameters: 100.00%
conducting peft lora ---------------
Total Parameters: 6247483392
Trainable Parameters: 3899392
Percentage of Trainable Parameters: 0.06%


In [10]:
class instruction_dataset(Dataset):
    """Pre-tokenized instruction-tuning dataset built from a JSONL file.

    Every record is turned into one fixed-length example laid out as

        query, [gMASK], <sop>, answer, <eop>, [PAD]...

    The query keeps at most ``max_query_length`` tokens, the answer at most
    ``truncate_length - max_query_length - 3`` tokens (three slots are reserved
    for the special tokens), and the result is padded to exactly
    ``truncate_length`` so examples can be stacked into a batch.

    Args:
        data_path: Path to a JSONL file, one JSON object per line.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            and ``pad_token_id``.
        truncate_length: Length of ``input_ids`` / ``labels`` /
            ``attention_mask`` of every example.
        max_query_length: Maximum number of query tokens kept.
        query_key: Key of the query text inside each JSON record.
        answer_key: Key of the answer text inside each JSON record.

    Raises:
        ValueError: If ``truncate_length`` leaves no room for any answer token.
    """

    def __init__(self, data_path:'str', tokenizer, truncate_length, max_query_length, query_key, answer_key):
        super().__init__()
        self.tokenizer = tokenizer
        self.examples = []

        # Loop-invariant: room left for the answer once the query budget and
        # the three special tokens ([gMASK], <sop>, <eop>) are accounted for.
        max_answer_length = truncate_length - max_query_length - 3
        if max_answer_length < 1:
            raise ValueError(
                'truncate_length must exceed max_query_length + 3 so that at '
                'least one answer token fits'
            )

        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(data_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                sample = json.loads(line)
                self.examples.append(
                    self._build_example(
                        sample[query_key],
                        sample[answer_key],
                        truncate_length,
                        max_query_length,
                        max_answer_length,
                    )
                )

    def _build_example(self, query, answer, truncate_length, max_query_length, max_answer_length):
        """Tokenize one query/answer pair into a padded, fixed-length example."""
        # Slicing is a no-op when the sequence is already short enough.
        query_ids = self.tokenizer.encode(query, add_special_tokens=False)[:max_query_length]
        # Truncate on the number of *tokens*. Comparing the character count of
        # the raw string could let an over-long token sequence through and
        # yield an example longer than truncate_length.
        answer_ids = self.tokenizer.encode(answer, add_special_tokens=False)[:max_answer_length]

        input_ids = query_ids + [si['[gMASK]'], si['sop']] + answer_ids + [si['eop']]
        # Positions follow from the layout, so no search through the ids is
        # needed (a search could match an identical id inside the text).
        sop_index = len(query_ids) + 1
        eop_index = len(input_ids) - 1

        # Only the answer tokens and <eop> carry a label. Everything up to and
        # including <sop>, and all padding, is set to -100 so it does not take
        # part in the loss. Labels stay position-aligned with input_ids.
        # NOTE(review): this presumes the model shifts labels internally - confirm.
        labels = [-100] * (sop_index + 1) + input_ids[sop_index + 1:]
        labels += [-100] * (truncate_length - len(labels))

        # Attend to everything through <eop>; padding is masked out.
        attention_mask = [True] * (eop_index + 1)
        attention_mask += [False] * (truncate_length - len(attention_mask))

        # Pad so that all examples share the same length.
        input_ids = input_ids + [self.tokenizer.pad_token_id] * (truncate_length - len(input_ids))

        return {
            'query' : query,
            'answer' : answer,
            'input_ids' : input_ids,
            'labels' : labels,
            'attention_mask' : attention_mask,
        }

    def __len__(self):
        """Return the number of prepared examples."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return the example dict stored at index ``item``."""
        return self.examples[item]

In [11]:
def coll_fn(batch:list):
    """Collate a list of dataset examples into a dict of stacked CUDA tensors.

    Args:
        batch: Examples as produced by the dataset; each provides equally long
            'input_ids', 'labels' and 'attention_mask' sequences.

    Returns:
        dict: 'input_ids' and 'labels' as int64 tensors and 'attention_mask' as
        a float64 tensor, each stacked along a new leading batch dimension and
        moved to the GPU.
    """
    # The original author noted that int32 ids / a bool mask ought to be
    # sufficient in principle, then retracted it; these dtypes are kept as-is.
    field_dtypes = (
        ('input_ids', torch.long),
        ('labels', torch.long),
        ('attention_mask', torch.float64),
    )
    collated = {}
    for key, dtype in field_dtypes:
        rows = [torch.tensor(example[key], dtype=dtype) for example in batch]
        # A list of tensors has to be combined with stack to form one tensor.
        collated[key] = torch.stack(rows).cuda()
    return collated

In [12]:
%%time
finetuning_instruction = instruction_dataset(CFG.data_path, tokenizer, CFG.max_tokens, CFG.max_query, CFG.query_key, CFG.answer_key)
instruction_loader = DataLoader(finetuning_instruction, batch_size=CFG.batch_size, shuffle=True, collate_fn=coll_fn)

CPU times: user 5.67 s, sys: 57.9 ms, total: 5.73 s
Wall time: 5.73 s


In [13]:
# Parameter optimizer and learning-rate scheduler.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=CFG.lr)

# Linear warm-up, then linear decay; the total step count is the number of
# batches per epoch times the number of epochs.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    CFG.warm_up_steps,
    CFG.num_train_epochs * len(instruction_loader),
)

In [14]:
def lora_tuning():
    """Run the fine-tuning loop and return the losses sampled along the way.

    Operates on the notebook-level ``model``, ``optimizer``, ``lr_scheduler``,
    ``instruction_loader`` and ``CFG``.

    Returns:
        dict: ``{'loss': [...]}`` with the loss tensor observed at every 200th
        step of each epoch. The tensors are detached, so ``.item()`` still
        works on them but they no longer reference the autograd graph.
    """
    status = {'loss': []}
    # Training phase.
    model.train()
    # NOTE(review): GradScaler refuses to unscale fp16 gradients; the base
    # model is cast with .half(), so confirm the trainable weights are fp32.
    scaler = torch.cuda.amp.GradScaler()

    for epoch in range(CFG.num_train_epochs):
        # Wrap the loader itself (not the enumerate object) so tqdm knows the
        # total and can show real progress. `step` restarts at 0 every epoch.
        for step, batch in enumerate(tqdm(instruction_loader)):
            # Forward pass and loss under autocast (mixed precision).
            with torch.cuda.amp.autocast():
                outputs = model(**batch)
                loss = outputs.loss
            if step % 200 == 0:
                print(loss)
                # Store a detached tensor: keeping the live loss would pin the
                # whole computation graph of that step in GPU memory.
                status['loss'].append(loss.detach())
            if CFG.max_train_steps is not None and step >= CFG.max_train_steps:
                return status
            # Backward pass on the scaled loss.
            scaler.scale(loss).backward()
            # Optimizer step, scaler update, gradient reset, LR schedule step.
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            lr_scheduler.step()
    return status

In [15]:
status=lora_tuning()

0it [00:01, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 162.00 MiB. GPU 0 has a total capacty of 23.65 GiB of which 15.56 MiB is free. Process 879804 has 528.00 MiB memory in use. Process 920831 has 530.00 MiB memory in use. Process 16203 has 22.59 GiB memory in use. Of the allocated memory 21.23 GiB is allocated by PyTorch, and 529.33 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Save the model to CFG.output_dir.
model.save_pretrained(CFG.output_dir)

In [None]:
# List the contents of the output directory.
!ls $CFG.output_dir

In [None]:
import matplotlib.pyplot as plt
import torch

# Convert the recorded loss tensors to plain Python floats.
loss = [value.item() for value in status['loss']]

# One marker per recorded loss; the x axis is the index of the recording
# (the training loop records every 200th step), not the raw step number.
fig, ax = plt.subplots(figsize=(10, 5))  # the figure size can be adjusted
ax.scatter(range(len(loss)), loss, marker='o')

# Title and axis labels.
ax.set_title('Loss over Steps')
ax.set_xlabel('Step')
ax.set_ylabel('Loss')

# Show the grid.
ax.grid(True)

# Display the figure.
plt.show()