In [1]:
"""
参考：  
1. https://github.com/yuanzhoulvpi2017/zero_nlp/wiki/%E4%BB%8Esft_clm_mlm%E4%B8%89%E7%A7%8D%E8%AE%AD%E7%BB%83%E6%96%B9%E5%BC%8F%E6%9D%A5%E7%9C%8Bdata_collator%E2%80%94%E2%80%94%E3%80%90transformers%E6%BA%90%E7%A0%81%E9%98%85%E8%AF%BB%E3%80%91
2. https://huggingface.co/learn/nlp-course/zh-CN/chapter7/6


利用指令微调的数据（包括：instruction、input、output等字段的数据样本）进行持续预训练，进行的是因果语言模型（CausalLM）， 
需要注意的是：在一般的因果语言模型CLM中，input_ids和label_ids偏移一个位置， 然后组成数据进行预训练
在基于指令微调数据的语言模型sft中，instruction和input组成source，output组成target，input_ids由source和target拼接而成

"""

import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Point Hugging Face downloads at the hf-mirror.com endpoint.
# NOTE(review): presumably this has to be set before the Hugging Face libraries are imported — confirm.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

# Root directory under which downloaded model/tokenizer files are cached (absolute, machine-specific path).
cache_dir = "/root/autodl-tmp/"
import transformers 

# Record the transformers version used for this run (the saved output shows 4.41.2).
print(transformers.__version__)

4.41.2


In [2]:
import logging, torch, click

import numpy as np 
import pandas as pd
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union
from functools import partial

from datetime import datetime
from datasets import Dataset, load_dataset, load_from_disk


from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, PreTrainedTokenizer, Trainer, TrainingArguments, set_seed, DataCollatorForSeq2Seq


from consts import DEFAULT_INPUT_MODEL, DEFAULT_SEED, PROMPT_WITH_INPUT_FORMAT, PROMPT_NO_INPUT_FORMAT, END_KEY, INSTRUCTION_KEY, RESPONSE_KEY, INTRO_KEY


In [3]:

# Module-level logger shared by the helper functions defined in the following cells.
logger = logging.getLogger(__name__)

# ROOT_PATH = "llm-tutorial/llm-ie/"




In [4]:

class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    """Language-modeling collator that computes the loss on the response part only.

    The parent collator builds the batch (including ``labels``).  This subclass
    then overwrites every label up to and including the first occurrence of the
    response key with ``-100`` so that the prompt tokens are ignored by the loss.
    """

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Collate ``examples`` and mask the prompt portion of the labels.

        Args:
            examples: Tokenized examples, in any form accepted by the parent collator.

        Returns:
            The batch produced by the parent collator, with ``batch["labels"]``
            replaced by a copy whose positions up to and including the response
            key are set to ``-100``.

        Raises:
            RuntimeError: If the response key token is not present in an example's labels.
        """
        batch = super().torch_call(examples)

        # Encode the response key on its own.  ``add_special_tokens=False`` keeps tokenizers that
        # automatically prepend/append BOS/EOS-style tokens from putting one of those in position 0,
        # which would make the search below look for the wrong token.
        response_token_ids = self.tokenizer(RESPONSE_KEY, add_special_tokens=False)["input_ids"]
        response_start_token = response_token_ids[0]

        labels = batch["labels"].clone()

        for i in range(len(examples)):
            # Positions at which the response key token occurs in this example (torch-native, so it
            # does not require converting the tensor to a numpy array).
            match_positions = (batch["labels"][i] == response_start_token).nonzero(as_tuple=True)[0]

            if match_positions.numel() == 0:
                raise RuntimeError(
                    f'Could not find response key {response_token_ids} in token IDs {batch["labels"][i]}'
                )

            # Only the first occurrence marks the end of the prompt.
            response_token_ids_end_idx = int(match_positions[0]) + 1

            # Make pytorch loss function ignore all tokens up through the end of the response key
            labels[i, :response_token_ids_end_idx] = -100

        batch["labels"] = labels

        return batch


In [5]:

def preprocess_batch(batch: Dict[str, List], tokenizer: AutoTokenizer, max_length: int) -> dict:
    """Tokenize the ``"text"`` column of a batch.

    Args:
        batch: Batch of columns; only ``batch["text"]`` is read.
        tokenizer: Callable tokenizer applied to the texts.
        max_length: Length every sequence is truncated and padded to.

    Returns:
        Whatever the tokenizer returns for the batch of texts.
    """
    # Every sequence is cut to, and padded up to, exactly ``max_length`` tokens.
    tokenizer_options = {
        "max_length": max_length,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    texts = batch["text"]
    return tokenizer(texts, **tokenizer_options)


def load_training_dataset(path_or_dataset: str = "data/yayi_train_example.json") -> Dataset:
    """Load a JSON instruction dataset and add a fully formatted ``"text"`` column.

    Args:
        path_or_dataset: JSON data file(s) passed to ``load_dataset`` as ``data_files``.

    Returns:
        The ``"train"`` split with an extra ``"text"`` column holding the rendered prompt.

    Raises:
        ValueError: If a record has an empty instruction or an empty output.
    """
    logger.info(f"Loading dataset from {path_or_dataset}")
    raw_dataset = load_dataset("json", data_files=path_or_dataset)["train"]
    print(raw_dataset)
    logger.info("Found %d rows", raw_dataset.num_rows)

    def _add_text(rec):
        instruction, response = rec["instruction"], rec["output"]
        context = rec.get("input", "")

        if not instruction:
            raise ValueError(f"Expected an instruction in: {rec}")

        if not response:
            raise ValueError(f"Expected a response in: {rec}")

        # Some records carry an extra input that gives the instruction its context (e.g. a passage to
        # extract information from); others are plain questions without one.  The template is chosen
        # accordingly, and only the with-input template receives the ``input`` field.
        fields = {"instruction": instruction, "response": response}
        template = PROMPT_NO_INPUT_FORMAT
        if context:
            template = PROMPT_WITH_INPUT_FORMAT
            fields["input"] = context

        rec["text"] = template.format(**fields)
        return rec

    return raw_dataset.map(_add_text)


def load_tokenizer(pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL) -> PreTrainedTokenizer:
    """Load the tokenizer and register the prompt marker tokens.

    Args:
        pretrained_model_name_or_path: Model name or path handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The tokenizer, with its pad token set to its EOS token and the intro, instruction,
        response and end keys added as additional special tokens.
    """
    logger.info(f"Loading tokenizer for {pretrained_model_name_or_path}")
    tokenizer_cache = os.path.join(cache_dir, "yayi_7b_model")
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, cache_dir=tokenizer_cache)

    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    prompt_markers = [INTRO_KEY, INSTRUCTION_KEY, RESPONSE_KEY, END_KEY]
    tokenizer.add_special_tokens({"additional_special_tokens": prompt_markers})
    return tokenizer


def load_model(
    pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL, *, gradient_checkpointing: bool = False
) -> AutoModelForCausalLM:
    """Load the causal language model in float16.

    Args:
        pretrained_model_name_or_path: Model name or path handed to ``from_pretrained``.
        gradient_checkpointing: When true, the model is loaded with ``use_cache=False``;
            otherwise with ``use_cache=True``.

    Returns:
        The loaded model.
    """
    logger.info(f"Loading model for {pretrained_model_name_or_path}")
    load_options = dict(
        trust_remote_code=True,
        torch_dtype=torch.float16,
        cache_dir=os.path.join(cache_dir, "yayi_7b_model"),
        use_cache=not gradient_checkpointing,
    )
    return AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **load_options)


def get_model_tokenizer(
    pretrained_model_name_or_path: str = DEFAULT_INPUT_MODEL, *, gradient_checkpointing: bool = False
) -> Tuple[AutoModelForCausalLM, PreTrainedTokenizer]:
    """Load the tokenizer and the model, and size the model's embeddings to the tokenizer.

    Args:
        pretrained_model_name_or_path: Model name or path used for both tokenizer and model.
        gradient_checkpointing: Forwarded to ``load_model``.

    Returns:
        A ``(model, tokenizer)`` pair.
    """
    tokenizer = load_tokenizer(pretrained_model_name_or_path)
    model = load_model(pretrained_model_name_or_path, gradient_checkpointing=gradient_checkpointing)

    # The tokenizer gained extra special tokens in load_tokenizer, so the embedding table is
    # resized to the tokenizer's current length.
    vocabulary_size = len(tokenizer)
    model.resize_token_embeddings(vocabulary_size)

    return model, tokenizer

def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed=DEFAULT_SEED, path_or_dataset=None) -> Dataset:
    """Loads the training dataset and tokenizes it so it is ready for training.

    Args:
        tokenizer (AutoTokenizer): Tokenizer tied to the model.
        max_length (int): Maximum number of tokens to emit from tokenizer.
        seed: Seed used when shuffling the tokenized dataset.
        path_or_dataset: JSON data file(s) forwarded to ``load_training_dataset``.  When ``None``,
            that function's own default path is used.

    Returns:
        Dataset: HuggingFace dataset
    """

    # Forwarding ``None`` explicitly would override the loader's default path and hand
    # ``data_files=None`` to ``load_dataset``; fall back to the loader's default instead.
    if path_or_dataset is None:
        dataset = load_training_dataset()
    else:
        dataset = load_training_dataset(path_or_dataset=path_or_dataset)

    logger.info("Preprocessing dataset")
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)

    # Drop the raw text columns once they are tokenized.  Only columns that are actually present are
    # listed, because "input" is optional in the source records and removing a missing column fails.
    text_columns = [
        column for column in ("instruction", "input", "output", "text") if column in dataset.column_names
    ]
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=text_columns,
    )
    logger.info(f"datasets after processing: {dataset}")

    # No filtering of truncated records is done here: preprocess_batch pads every record to exactly
    # ``max_length`` tokens, so a ``len(input_ids) < max_length`` filter would discard every row.
    # dataset = dataset.filter(lambda rec: len(rec["input_ids"]) < max_length)
    logger.info("Processed dataset has %d rows", dataset.num_rows)

    logger.info("Shuffling dataset")
    dataset = dataset.shuffle(seed=seed)

    logger.info("Done preprocessing")

    return dataset




# Seed the random number generators before the model and tokenizer are created.
set_seed(DEFAULT_SEED)
model, tokenizer = get_model_tokenizer(DEFAULT_INPUT_MODEL, gradient_checkpointing=False)


# Probe the model config for its sequence length under the attribute names it may use;
# the first truthy value wins, and 1024 is the fallback when none is found.
for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
    max_length = getattr(model.config, length_setting, None)
    if max_length:
        logger.info(f"Found max lenth: {max_length}")
        break
else:
    max_length = 1024
    logger.info(f"Using default max length: {max_length}")


# The value detected above is unconditionally replaced by this fixed length.
max_length = 2048



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


In [6]:
from transformers import default_data_collator  # translated note: "defaults to causal language modelling" — NOTE(review): confirm; the name is not used in the visible cells


# Load the raw JSON example file and keep its "train" split.
dataset = load_dataset("json", data_files="yayi_train_example.json")["train"]
# Show the first raw record (fields: system, instruction, input, output).
dataset[0]





{'system': '',
 'instruction': '你是谁',
 'input': '',
 'output': '我的中文名是雅意，英文名是YaYi，是一个由中科闻歌算法团队训练的语言模型'}

In [7]:

# Load, tokenize and shuffle the example file using the tokenizer and max_length defined earlier.
processed_dataset = preprocess_dataset(tokenizer=tokenizer, max_length=max_length, seed=DEFAULT_SEED, path_or_dataset="yayi_train_example.json")


# Hold out 10% of the rows as a test split, using the same seed.
split_dataset = processed_dataset.train_test_split(test_size=0.1, seed=DEFAULT_SEED)






Dataset({
    features: ['system', 'instruction', 'input', 'output'],
    num_rows: 3
})


In [8]:
# Render the no-input prompt template with sample values to inspect its layout.
print(PROMPT_NO_INPUT_FORMAT.format(instruction="你是谁", response="我是大模型"))

<|System|>:
A chat between a human and an AI assistant named YaYi.
YaYi is a helpful and harmless language model developed by Beijing Wenge Technology Co.,Ltd.

<|Human|>:
你是谁

<|YaYi|>:
我是大模型

<|End|>


In [9]:
# Render the with-input prompt template with sample values; the input appears on the line after the instruction.
print(PROMPT_WITH_INPUT_FORMAT.format(instruction="你是谁", response="我是大模型", input="训练中"))

<|System|>:
A chat between a human and an AI assistant named YaYi.
YaYi is a helpful and harmless language model developed by Beijing Wenge Technology Co.,Ltd.

<|Human|>:
你是谁
训练中

<|YaYi|>:
我是大模型

<|End|>


In [12]:
# Slice the first two tokenized records; the value is not shown because it is not the cell's last expression.
processed_dataset[0:2]

from typing import Mapping


# Check whether a single training record is a Mapping (the recorded output is True).
isinstance(split_dataset["train"][0], (Mapping,))





True

In [13]:


# Training process


# Re-apply the seed at the start of the training section.
set_seed(DEFAULT_SEED)


In [14]:



from trl import DataCollatorForCompletionOnlyLM as OnlyLM  # 主要作用于续写
"""
是一个用于自然语言处理任务中的特定数据批处理工具，特别是在语言模型的完成（completion）任务中。在这种任务中，模型的目标是生成给定前缀或提示的文本的续写部分。
"""

# Three separate sample strings.  Every pair of literals needs a comma between them: adjacent
# string literals are implicitly concatenated into a single list element.
texts = ["我是谁", "我喜欢上海", "我"]

OnlyLM.__init__??






[0;31mSignature:[0m
[0mOnlyLM[0m[0;34m.[0m[0m__init__[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mresponse_template[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minstruction_template[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0margs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmlm[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mignore_index[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;34m-[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m I

In [16]:



"""
分析多轮对话的模版

"""

from transformers import AutoTokenizer

import pandas as pd





In [20]:

# Load the Qwen2-7B-Instruct tokenizer, caching its files under the given local directory.
tk = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct", cache_dir="/root/autodl-tmp/qwen2-7b-instruct/", trust_remote_code=True)


# Placeholder message list: one dict with an empty "role" and three empty dicts.
# NOTE(review): presumably a stub for a chat-template experiment — it is not used in the visible cells.
messages = [{"role": ""}, {}, {}, {}]







Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'input_ids': [104198, 100165], 'attention_mask': [1, 1]}
