2. 预处理数据并进行 Tokenization

在使用这些数据微调模型之前，我们需要对文本数据进行预处理，这包括以下步骤：

    加载文本数据。
    进行 Tokenization，将文本转换为模型可理解的 token。
    对齐数据格式，比如统一长度、添加 padding 等。

2.1 加载并 Tokenization

可以使用 Hugging Face 的 transformers 库进行数据预处理。以 GPT-2 为例

这里我们将数据集中的每条文本记录进行 Tokenization，并使用 max_length 将文本截断至 128 个 token 长度。

In [8]:
from transformers import GPT2Tokenizer
from datasets import load_dataset

# 1. 加载 GPT-2 Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# 2. 手动读取和编码转换（如果需要）
# 先使用实际编码读取文件
with open('chat_log_data.txt', 'r', encoding='latin1') as f:  # 请确认这里的编码
    content = f.readlines()

# 将内容写入新的 UTF-8 编码文件
with open('chat_log_data_utf8.txt', 'w', encoding='utf-8') as f:
    for line in content:
        f.write(line)

# 3. 加载聊天记录数据集
dataset = load_dataset('text', data_files={'train': 'chat_log_data_utf8.txt'})

# 设置填充标记（如果没有定义的话）
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # 或者你可以定义自己的填充标记，如 '[PAD]'
    
# 确保重新加载分词器后再进行tokenization
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, return_tensors="pt")

# 对数据集进行 Tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 输出 tokenized 数据集的示例
print(tokenized_datasets['train'][0])


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

{'text': 'User: Hey, how are you?', 'input_ids': [12982, 25, 14690, 11, 703, 389, 345, 30, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,

3. 微调模型

为了让你的模型像聊天记录一样回复，可以使用 Hugging Face 的 Trainer 和 LoRA 进行轻量化微调。

In [1]:
from transformers import GPT2LMHeadModel
from peft import get_peft_model, LoraConfig

# 加载 GPT-2 模型
model = GPT2LMHeadModel.from_pretrained("gpt2")

# 定义 LoRA 微调配置
lora_config = LoraConfig(
    r=8,  # LoRA矩阵的秩
    lora_alpha=16,  # LoRA的alpha参数
    target_modules=["c_attn", "c_proj"],  # 使用 Conv1D 层
    lora_dropout=0.05,  # Dropout比例
    fan_in_fan_out=True
)

# 应用 LoRA
lora_model = get_peft_model(model, lora_config)


3.3 定义训练参数

我们使用 Trainer 进行训练

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

class CustomDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        # 设置填充标记（如果没有定义的话）
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token  # 或者你可以定义自己的填充标记，如 '[PAD]'
        # Tokenize inputs
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
        
        # 创建 labels，shift input_ids 以创建下一个 token 的标签
        labels = inputs['input_ids'].clone()
        labels[:, :-1] = inputs['input_ids'][:, 1:]  # Shift right
        labels[:, -1] = -100  # Ignore last token for label
        
        return {
            "input_ids": inputs['input_ids'].squeeze(0),  # (1, sequence_length) -> (sequence_length,)
            "labels": labels.squeeze(0)  # (1, sequence_length) -> (sequence_length,)
        }

# 示例数据
texts = [
    "User: Hey, how are you?\nBot: I'm doing great! How about you?",
    "User: What’s the weather like today?\nBot: It’s sunny with a slight chance of rain later.",
    "User: Do you like music?\nBot: Yes, I love it! What about you?",
    "User: Can you help me with my homework?\nBot: Sure, what do you need help with?",
    "User: What’s your favorite movie?\nBot: I enjoy a lot of movies, but 'Inception' is a top choice."
]

# 创建数据集
dataset = CustomDataset(texts)

# 创建数据加载器
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# 加载模型
model = GPT2LMHeadModel.from_pretrained('gpt2')

# 设置训练参数
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10,
    save_total_limit=1,
)

# 创建 Trainer
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=dataset,
)

# 开始微调模型
trainer.train()



  from pandas.core.computation.check import NUMEXPR_INSTALLED
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss


TrainOutput(global_step=9, training_loss=12.370220608181423, metrics={'train_runtime': 10.6159, 'train_samples_per_second': 1.413, 'train_steps_per_second': 0.848, 'total_flos': 989187932160.0, 'train_loss': 12.370220608181423, 'epoch': 3.0})

In [4]:
from transformers import GPT2Tokenizer
from datasets import load_dataset

# 1. 加载 GPT-2 Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# 保存模型
lora_model.save_pretrained("./gpt2-chat-finetuned")
tokenizer.save_pretrained("./gpt2-chat-finetuned")


('./gpt2-chat-finetuned\\tokenizer_config.json',
 './gpt2-chat-finetuned\\special_tokens_map.json',
 './gpt2-chat-finetuned\\vocab.json',
 './gpt2-chat-finetuned\\merges.txt',
 './gpt2-chat-finetuned\\added_tokens.json')

In [6]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# 加载微调好的模型和 tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-chat-finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-chat-finetuned")

# 输入文本
input_text = "Hey, how are you?"

# 编码输入
inputs = tokenizer(input_text, return_tensors='pt')

# 使用模型生成文本
outputs = model.generate(
    inputs['input_ids'], 
    max_length=30,
  
    repetition_penalty=1.2,  # 添加重复惩罚
    num_return_sequences=1
)

# 解码生成的文本
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Hey, how are you? I'm not sure what to say.
"I don't know if it's a good idea or bad thing."
