In [1]:



from transformers import AutoTokenizer, AutoModelForCausalLM

from peft import PeftModel, LoraConfig, TaskType
import os
import torch



In [2]:
"""
加载原始模型和对应的lora模型
"""
lora_model_id = "/root/autodl-tmp/qwen1.5-7b-chat-lora-fine-tuning/checkpoint-6700/"

model_name = "Qwen/Qwen1.5-7B-Chat"
cache_dir = "/root/autodl-tmp/"
model_cache_dir = "Qwen1.5-7B-Chat"

model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto",
                                             torch_dtype=torch.bfloat16,
                                             cache_dir=os.path.join(cache_dir, model_cache_dir))

config = LoraConfig(task_type=TaskType.CAUSAL_LM, 
                   target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
                   inference_mode=False,
                   r=8,   # lora的秩
                   lora_alpha=32,  # lora alpha, lora的缩放，缩放的原则是alpha / r， 目前这个参数是缩放4倍（增大lora的影响和贡献度）
                   lora_dropout=0.1)


peft_model = PeftModel.from_pretrained(model=model,model_id=lora_model_id,
                                      is_trainable=False,  # 默认参数，可以不管
                                       adapter_name="default",   # 默认参数，可以不管
                                       config = config
                                       
                                      )

pert_model = peft_model.cuda()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:

peft_model


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 4096)
        (layers): ModuleList(
          (0-31): 32 x Qwen2DecoderLayer(
            (self_attn): Qwen2SdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=409

In [4]:

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=os.path.join(cache_dir, model_cache_dir),rust_remote_code=True)


example_prompt = "考试有哪些技巧"

messages = [{"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": example_prompt}]


text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)

inputs = tokenizer([text], add_special_tokens=False, return_tensors='pt').to("cuda")

print(inputs)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
考试有哪些技巧<|im_end|>
<|im_start|>assistant

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198, 103960, 104719, 102118, 151645,
            198, 151644,  77091,    198]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}


In [5]:

generated_ids = peft_model.generate(**inputs, do_sample=False, max_length=256, 
                                    max_new_tokens=512)[0]


response =tokenizer.decode(generated_ids)

print(response)




Both `max_new_tokens` (=512) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
考试有哪些技巧<|im_end|>
<|im_start|>assistant
考试技巧因学科和考试类型而异，但以下是一些通用的建议：

1. 熟悉考试格式：了解考试的结构、题型和时间限制，以便更好地安排答题时间。

2. 制定计划：在考试前制定一个详细的复习计划，包括每天要复习的内容和时间。

3. 有效记忆：使用记忆技巧，如联想记忆、故事法等，帮助记忆重要知识点。

4. 阅读理解：仔细阅读题目，理解题意，避免答非所问。

5. 时间管理：合理分配时间，不要在一道题上花费过多时间，以免影响其他题目的解答。

6. 保持冷静：考试时保持冷静，不要紧张，按照自己的节奏答题。

7. 检查答案：完成答题后，检查一遍答案，确保没有遗漏或错误。

8. 保持健康的生活习惯：保证充足的睡眠，保持良好的饮食习惯，避免过度疲劳。

9. 做好心理准备：考试前做好心理准备，保持积极的心态，相信自己能够取得好成绩。<|endoftext|>


In [6]:

generated_ids_valid = []
for input_ids, output_ids in zip(inputs.input_ids, [generated_ids]):
    input_length = len(input_ids)
    print("input_length： ", input_length, len(output_ids))
    generated_ids_valid.append(output_ids[input_length:])

print(tokenizer.batch_decode(generated_ids_valid, skip_special_tokens=True))


input_length：  22 249
['考试技巧因学科和考试类型而异，但以下是一些通用的建议：\n\n1. 熟悉考试格式：了解考试的结构、题型和时间限制，以便更好地安排答题时间。\n\n2. 制定计划：在考试前制定一个详细的复习计划，包括每天要复习的内容和时间。\n\n3. 有效记忆：使用记忆技巧，如联想记忆、故事法等，帮助记忆重要知识点。\n\n4. 阅读理解：仔细阅读题目，理解题意，避免答非所问。\n\n5. 时间管理：合理分配时间，不要在一道题上花费过多时间，以免影响其他题目的解答。\n\n6. 保持冷静：考试时保持冷静，不要紧张，按照自己的节奏答题。\n\n7. 检查答案：完成答题后，检查一遍答案，确保没有遗漏或错误。\n\n8. 保持健康的生活习惯：保证充足的睡眠，保持良好的饮食习惯，避免过度疲劳。\n\n9. 做好心理准备：考试前做好心理准备，保持积极的心态，相信自己能够取得好成绩。']


In [7]:
tokenizer.apply_chat_template??

[0;31mSignature:[0m
[0mtokenizer[0m[0;34m.[0m[0mapply_chat_template[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mconversation[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mstr[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mForwardRef[0m[0;34m([0m[0;34m'Conversation'[0m[0;34m)[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mchat_template[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0madd_generation_prompt[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtokenize[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpadding[0m[0;34m:[0m [0mbool[0m [0;3

In [8]:
peft_model.generate??

[0;31mSignature:[0m [0mpeft_model[0m[0;34m.[0m[0mgenerate[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
    [0;32mdef[0m [0mgenerate[0m[0;34m([0m[0mself[0m[0;34m,[0m [0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0mpeft_config[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mactive_peft_config[0m[0;34m[0m
[0;34m[0m        [0mself[0m[0;34m.[0m[0mbase_model[0m[0;34m.[0m[0mprepare_inputs_for_generation[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mprepare_inputs_for_generation[0m[0;34m[0m
[0;34m[0m        [0;32mif[0m [0mhasattr[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mbase_model[0m[0;34m,[0m [0;34m"model"[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m            [0mself[0m[0;34m.[0m[0mbase_model[0m[0;34m.[0m[0mmodel[0m[0;34m.[0m[0m

In [11]:

# 权重合并：qwen权重与lora权重合并

# merge_model = peft_model.merge_and_unload()


merge_model = peft_model.merge_and_unload()

merge_model.save_pretrained("/root/autodl-tmp/qwen1.5-7b_merge_lora")

In [1]:

from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForCausalLM



In [None]:

TrainingArguments()
