In [22]:
from transformers import  AutoModel,AutoTokenizer,TrainingArguments,Trainer,DataCollatorForSeq2Seq
import torch
import datasets

In [2]:
ds = datasets.load_from_disk('./alpaca_data_zh')
ds

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 26858
})

In [3]:
ds[:3]

{'output': ['以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。',
  '4/16等于1/4是因为我们可以约分分子分母都除以他们的最大公约数4，得到（4÷4）/ (16÷4）=1/4。分数的约分是用分子和分母除以相同的非零整数，来表示分数的一个相同的值，这因为分数实际上表示了分子除以分母，所以即使两个数同时除以同一个非零整数，分数的值也不会改变。所以4/16 和1/4是两种不同的书写形式，但它们的值相等。',
  '朱利叶斯·凯撒，又称尤利乌斯·恺撒（Julius Caesar）是古罗马的政治家、军事家和作家。他于公元前44年3月15日被刺杀。 \n\n根据历史记载，当时罗马元老院里一些参议员联合起来策划了对恺撒的刺杀行动，因为他们担心恺撒的统治将给罗马共和制带来威胁。在公元前44年3月15日（又称“3月的艾达之日”），恺撒去参加元老院会议时，被一群参议员包围并被攻击致死。据记载，他身中23刀，其中一刀最终致命。'],
 'input': ['', '输入：4/16', ''],
 'instruction': ['保持健康的三个提示。', '解释为什么以下分数等同于1/4', '朱利叶斯·凯撒是如何死亡的？']}

In [4]:
tokenizer = AutoTokenizer.from_pretrained('THUDM/chatglm3-6b',trust_remote_code=True)
tokenizer

Setting eos_token is not supported, use the default one.
Setting pad_token is not supported, use the default one.
Setting unk_token is not supported, use the default one.


ChatGLMTokenizer(name_or_path='THUDM/chatglm3-6b', vocab_size=64798, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	64790: AddedToken("[gMASK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	64792: AddedToken("sop", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	64795: AddedToken("<|user|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	64796: AddedToken("<|assistant|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
}

In [5]:
model = AutoModel.from_pretrained('THUDM/chatglm3-6b',trust_remote_code=True,low_cpu_mem_usage=True,torch_dtype=torch.half)
model

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(65024, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-27): 28 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear(in_features=4096, out_features=4608, bias=True)
            (core_attention): CoreAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_layer): Linear(in_

In [6]:
def process_fn(example):
    MAX_LENGTH = 384
    input_ids, attention_mask, labels = [] , [] , []
    instruction = '\n'.join([example['instruction'],example['input']]).strip()
    instruction = tokenizer.build_chat_input(instruction,history=[],role='user')
    response = tokenizer('\n'+example['output'],add_special_tokens=False)
    input_ids = instruction['input_ids'][0].numpy().tolist() + response['input_ids'] + [tokenizer.eos_token_id]
    attention_mask = instruction['attention_mask'][0].numpy().tolist() + response['attention_mask'] + [1]
    labels = [-100]* len(instruction['input_ids']) + response['input_ids'] + [tokenizer.eos_token_id] #instruction部分的labels 都是看不到的 仅仅计算output部分的loss
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        'input_ids':input_ids,
        'attention_mask':attention_mask,
        'labels':labels
    }

In [7]:
tokenizer.encode('呀',add_special_tokens=True)

[64790, 64792, 30910, 56657]

In [8]:
tokenizer.eos_token,tokenizer.eos_token_id

('</s>', 2)

In [9]:
print(tokenizer.build_chat_input('你好',history=[],role='user'))

{'input_ids': tensor([[64790, 64792, 64795, 30910,    13, 36474, 54591, 64796]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]]), 'position_ids': tensor([[0, 1, 2, 3, 4, 5, 6, 7]])}


In [10]:
tokenizer.decode([64790, 64792, 64795, 30910,    13, 36474, 54591, 64796])

'[gMASK] sop <|user|> \n 你好 <|assistant|>'

'[gMASK] sop <|user|> \n query <|assistant|>' \n response eos_token

In [11]:
?model.chat

[0;31mSignature:[0m
[0mmodel[0m[0;34m.[0m[0mchat[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtokenizer[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mquery[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhistory[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mDict[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrole[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'user'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_length[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m8192[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnum_beams[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdo_sample[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtop_p[0m[0;34m=[0m[0;36m0.8[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtemperature[0m[0;34m=[0m[0;36m0.8[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlogits_processor[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0

In [25]:
tokenized_ds  = ds.map(process_fn,remove_columns=ds.column_names,batch_size=48)
tokenized_ds

Map:   0%|          | 0/26858 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 26858
})

In [13]:
tokenizer.decode(tokenized_ds[1]['input_ids'],skip_special_tokens=True)

'[gMASK] sop <|user|> \n 解释为什么以下分数等同于1/4\n输入：4/16 <|assistant|> \n4/16等于1/4是因为我们可以约分分子分母都除以他们的最大公约数4，得到（4÷4）/ (16÷4）=1/4。分数的约分是用分子和分母除以相同的非零整数，来表示分数的一个相同的值，这因为分数实际上表示了分子除以分母，所以即使两个数同时除以同一个非零整数，分数的值也不会改变。所以4/16 和1/4是两种不同的书写形式，但它们的值相等。'

In [16]:
for name,param in model.named_parameters():
    print(name)

transformer.embedding.word_embeddings.weight
transformer.encoder.layers.0.input_layernorm.weight
transformer.encoder.layers.0.self_attention.query_key_value.weight
transformer.encoder.layers.0.self_attention.query_key_value.bias
transformer.encoder.layers.0.self_attention.dense.weight
transformer.encoder.layers.0.post_attention_layernorm.weight
transformer.encoder.layers.0.mlp.dense_h_to_4h.weight
transformer.encoder.layers.0.mlp.dense_4h_to_h.weight
transformer.encoder.layers.1.input_layernorm.weight
transformer.encoder.layers.1.self_attention.query_key_value.weight
transformer.encoder.layers.1.self_attention.query_key_value.bias
transformer.encoder.layers.1.self_attention.dense.weight
transformer.encoder.layers.1.post_attention_layernorm.weight
transformer.encoder.layers.1.mlp.dense_h_to_4h.weight
transformer.encoder.layers.1.mlp.dense_4h_to_h.weight
transformer.encoder.layers.2.input_layernorm.weight
transformer.encoder.layers.2.self_attention.query_key_value.weight
transformer.enco

In [18]:
from peft import LoraConfig,TaskType,get_peft_model

config = LoraConfig(target_modules=['query_key_value'],task_type=TaskType.CAUSAL_LM)
config

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'query_key_value'}, exclude_modules=None, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [19]:
model = get_peft_model(model,config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): ChatGLMForConditionalGeneration(
      (transformer): ChatGLMModel(
        (embedding): Embedding(
          (word_embeddings): Embedding(65024, 4096)
        )
        (rotary_pos_emb): RotaryEmbedding()
        (encoder): GLMTransformer(
          (layers): ModuleList(
            (0-27): 28 x GLMBlock(
              (input_layernorm): RMSNorm()
              (self_attention): SelfAttention(
                (query_key_value): lora.Linear(
                  (base_layer): Linear(in_features=4096, out_features=4608, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Identity()
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=4608, bias=False)
                  )
         

In [20]:
model.print_trainable_parameters()

trainable params: 1,949,696 || all params: 6,245,533,696 || trainable%: 0.0312


In [32]:
args = TrainingArguments(
    output_dir='./glm3_fintue_model',
    # per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    logging_steps=10,
    num_train_epochs=1,
    learning_rate=1e-4
)

In [33]:
trainer = Trainer(
    model=model,
    args = args,
    train_dataset=tokenized_ds.select(range(1000)),
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer,padding=True)
)

In [34]:
trainer.train()

  0%|          | 0/7 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 18.11 GB, other allocations: 24.78 MB, max allowed: 18.13 GB). Tried to allocate 256 bytes on shared pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).