# 加载和处理数据集

In [None]:
from datasets import load_dataset

# The data comes from the "eli5_category" dataset.
# trust_remote_code=True permits running the loading script bundled with it.
data = load_dataset("eli5_category", trust_remote_code=True)

# Work on a small subset: shuffle the training split, keep 5000 rows and hold
# out 30% of them as the test split. A fixed seed keeps both the sampled rows
# and the train/test split identical across runs, so results are comparable.
SEED = 42
small_data = (
    data["train"]
    .shuffle(seed=SEED)
    .select(range(5000))
    .train_test_split(test_size=0.3, seed=SEED)
)
# Bare expression so the notebook displays the resulting splits.
small_data

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer from the same location the model is loaded from later.
# NOTE: "your model path" is a placeholder; replace it with the real
# checkpoint location before running this cell.
tokenizer = AutoTokenizer.from_pretrained("your model path")
# Bare expression so the notebook displays the tokenizer.
tokenizer

In [None]:
# Only the "text" field nested under "answers" is needed.
# After flattening, the tokenization step reads it as the "answers.text" column.
flatten_data = small_data.flatten()
# Display the same training record after and before flattening for comparison.
flatten_data["train"][0], small_data["train"][0]

In [None]:
def process_flatten_function(examples):
    """Tokenize a batch after merging each example's answers into one string.

    Each entry of ``examples["answers.text"]`` may contain several answer
    strings; they are joined with single spaces so that every example becomes
    one long text, and the list of merged texts is passed to the tokenizer.

    No length limit is applied at this stage: the tokenizer accepts texts of
    any length, even though the model itself has a maximum input length.
    """
    merged_answers = []
    for answer_texts in examples["answers.text"]:
        merged_answers.append(" ".join(answer_texts))
    return tokenizer(merged_answers)

In [None]:
# Tokenize in batches and drop every original column, so that only the
# tokenizer's outputs remain in the mapped dataset.
original_columns = flatten_data["train"].column_names
join_data = flatten_data.map(
    process_flatten_function,
    batched=True,
    remove_columns=original_columns,
)
# Bare expression so the notebook displays the tokenized dataset.
join_data

In [None]:
from itertools import chain

# block_size depends on the available GPU memory; with enough memory it can be
# set to the maximum sequence length the model accepts.
# Memory use is tied to both the sequence length and the batch size.
block_size = 128


def graup_text(examples):
    """Concatenate a batch of tokenized examples and cut it into blocks.

    Args:
        examples: Mapping from column name (for example ``"input_ids"`` and
            ``"attention_mask"``) to a list of per-example token lists.
            An ``"input_ids"`` column must be present.

    Returns:
        A dict with the same keys plus ``"labels"``. Every value is a list of
        chunks of ``block_size`` tokens taken from the concatenated batch.
        The tail that does not fill a whole block is dropped, except when the
        whole batch is shorter than ``block_size``: it is then kept as a
        single short chunk.
    """
    # Flatten each column into one long list. chain.from_iterable runs in
    # linear time, whereas sum(lists, []) re-copies the growing accumulator
    # for every example and becomes quadratic on large batches.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Total number of tokens in this batch, measured on the first column.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        # Drop the remainder that does not fill a complete block.
        total_length = (total_length // block_size) * block_size
    result = {
        # Slicing past the end of a list is safe, so no index error can occur.
        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels are simply the input ids themselves.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Regroup the tokenized examples, batch by batch, into blocks of block_size
# tokens and attach the labels column.
dataset = join_data.map(graup_text, batched=True)
# Bare expression so the notebook displays the final dataset.
dataset

In [None]:
from transformers import DataCollatorForLanguageModeling
# Printing the tokenizer loaded earlier shows that it has no padding token,
# but this task needs one, so reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
# The GPT-2 tokenizer has no mask token, so masked language modelling is
# turned off (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Bare expression so the notebook displays the collator.
data_collator

# 准备训练

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# The model used here is DistilGPT.
# NOTE: "your model path" is a placeholder; replace it with the real
# checkpoint location (the same one used for the tokenizer) before running.
model = AutoModelForCausalLM.from_pretrained("your model path")
# Bare expression so the notebook displays the model.
model

In [None]:
# Training configuration, with the keyword arguments grouped by purpose.
training_args = TrainingArguments(
    # Directory that receives the training output.
    output_dir="./checkout",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=10,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    # Log every 15 steps.
    logging_strategy="steps",
    logging_steps=15,
    # Evaluate and save once per epoch, with a save limit of 2.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
)

In [None]:
# Bundle the model, the training configuration, both dataset splits and the
# collator into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    # The held-out "test" split is used as the evaluation set.
    eval_dataset=dataset["test"],
    data_collator=data_collator
)

In [None]:
# Run training with the configuration assembled for the trainer.
trainer.train()

In [None]:
import math
# Evaluate with the trainer and read the loss from the returned metrics.
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
# Perplexity is e raised to the power of the loss.
perplexity = math.exp(eval_loss)
print(f"Perplexity: {perplexity:.2f}")